** Outline:**
* Визуализация данных
* Генерация данных    

In [1]:
from IPython.core.display import Image
import pandas
from sklearn import datasets
from scipy.stats import gaussian_kde
import seaborn as sns
from matplotlib.colors import ListedColormap

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

<h1 align="center">  Визуализация данных </h1>

# Квартет Энскомба

Подробнее, см. на Википедии.

Наборы данных имеют одинаковое среднее, дисперсию, коэффициент корреляции между признаками. При визуализации на плоскости оказывается, что они имеют принципиально разный характер.


In [2]:
A = array([
[10.0, 8.04, 10.0, 9.14, 10.0, 7.46, 8.0, 6.58],
[8.0, 6.95, 8.0, 8.14, 8.0, 6.77, 8.0, 5.76],
[13.0, 7.58, 13.0, 8.74, 13.0, 12.74, 8.0, 7.71],
[9.0, 8.81, 9.0, 8.77, 9.0, 7.11, 8.0, 8.84],
[11.0, 8.33, 11.0, 9.26, 11.0, 7.81, 8.0, 8.47],
[14.0, 9.96, 14.0, 8.10, 14.0, 8.84, 8.0, 7.04],
[6.0, 7.24, 6.0, 6.13, 6.0, 6.08, 8.0, 5.25],
[4.0, 4.26, 4.0, 3.10, 4.0, 5.39, 19.0, 12.50],
[12.0, 10.84, 12.0, 9.13, 12.0, 8.15, 8.0, 5.56],
[7.0, 4.82, 7.0, 7.26, 7.0, 6.42, 8.0, 7.91],
[5.0, 5.68, 5.0, 4.74, 5.0, 5.73, 8.0, 6.89],
])
data = [A[:, :2], A[:, 2:4], A[:, 4:6], A[:, 6:]]

NameError: name 'array' is not defined

In [None]:
for idx, X in enumerate(data):
    print ('X%d' % (idx+1))
    print ('  Mean:', mean(X, axis=0))
    print ('  Std:', std(X, axis=0))
    print ('  Pearson correlation:', corrcoef(X.T)[0, 1])

In [None]:
imshow(A, interpolation='none')
colorbar()

In [None]:
figure(figsize=(16, 4))
for idx, X in enumerate(data):
    subplot(1, 4, idx+1)
    scatter(X[:, 0], X[:, 1])

# Наборы данных

Стандартные наборы данных и генераторы в SciKit-Learn

In [None]:
diabetes = datasets.load_diabetes()
X = diabetes.data

In [None]:
# sample 1-d data
points1 = random.choice(X[:, 5], 100)

In [None]:
# sample 2-d data
points2x, points2y = X[:,5], X[:,6]

# Одномерные признаки

## Набор точек на прямой

In [None]:
figure(figsize=(15, 1.5)), grid(True, axis='x'), yticks([])
scatter(points1, zeros(len(points1)), s=50, alpha=0.4)

In [None]:
figure(figsize=(15, 1.5)), grid(True, axis='x'), yticks([])
scatter(points1, zeros(len(points1)), s=50, alpha=0.2)

In [None]:
figure(figsize=(15, 1.5)), grid(True, axis='x'), yticks([])
scatter(points1, zeros(len(points1)), s=50, alpha=0.1)

In [None]:
figure(figsize=(15, 1.5)), grid(True, axis='x'), yticks([])
scatter(points1, zeros(len(points1)), s=50, alpha=0.9)

## Jitter: случайно выбираем вертикальную координату, чтобы лучше видеть количество точек

In [None]:
figure(figsize=(15, 1.5)), grid(True, axis='x'), yticks([])
scatter(points1, randn(len(points1)), s=50, marker='*')

## Гистограмма

In [None]:
hist(points1)

## Гистограмма: число бинов

In [None]:
figure(figsize=(17, 4))
subplot(1, 3, 1), hist(points1, bins=5)
subplot(1, 3, 2), hist(points1, bins=20)
subplot(1, 3, 3), hist(points1, bins=40)

## KDE: Kernel Density Estimation

In [None]:
figure(figsize=(12, 6))
density = gaussian_kde(points1)
xs = linspace(min(points1)-0.01, max(points1)+0.01, 100)
plot(xs, density(xs), linewidth=2.5, color='red')
scatter(points1, zeros(len(points1)), s=50, alpha=0.4)

In [None]:
figure(figsize=(17, 4))

xs = linspace(min(points1)-0.01, max(points1)+0.01, 100)

widths = [0.05, 0.2, 2]

for i, width in enumerate(widths):
    subplot(1, 3, i+1)
    density = gaussian_kde(points1, bw_method=width)
    plot(xs, density(xs), linewidth=2.5, color='green')
    scatter(points1, zeros(len(points1)), s=50, alpha=0.4)

## Последовательности

In [None]:
seq1 = [0.035753708, 0.025425873, -0.02886173, -0.062208079, 0.009859905, -0.029191028, 0.015445348, -0.041167612, 0.000661905, 0.022037345, -0.022692465, -0.013708704, 0.000864697, -0.00381506, 0.00566126, 0.046831302, -0.006634978, 0.034566982, -0.020528213, -0.008776701, -0.025919141, 0.015279487, 0.018577796, -0.014132879, 0.036607044, 0.011353209, -0.040542021, -0.022105644, -0.014888368, 0.007026745, -0.011494996, -0.041136038, -0.002631499, 0.024654643, -0.03584061, 0.017303168, 0.001725406, 0.004975853, 0.000671759, -0.005891895, -0.013689039, 0.002192959, 0.007913215, -0.03852223, 0.007958798, -0.007133473, 0.011234009, -0.001410361]

In [None]:
figure(figsize=(15, 3))
plot(seq1, '-^')
grid(True)

In [None]:
figure(figsize=(15, 3))

# Stem plot
stem(seq1)

In [None]:
figure(figsize=(15, 3))
bar(arange(len(seq1)), seq1)

## Polar Plot: циклические данные

In [None]:
r = np.arange(0, 3.0, 0.01)
theta = 2 * np.pi * r

figure(figsize=(5, 5))
polar(theta, r)

# Категориальные данные

In [None]:
# path = 'C:/Users/rusrom.EUROPE/OneDrive/Teaching/MachineLearning/data/digit_recognizer/train.csv' # путь к папке с данными
frame = pandas.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data', 
                        names=['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'rwd', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'spfi', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'])
frame.head()

In [None]:
frame.describe()

In [None]:
frame.describe(include='all')

In [None]:
frame.groupby('body-style').count()

## Многомерные данные

In [None]:
iris = datasets.load_iris()

iris_frame = pandas.DataFrame(iris.data)
iris_frame.columns = iris.feature_names
iris_frame['target'] = iris.target

In [None]:
iris_frame.head()

## Scatter plot

In [None]:
figure(figsize=(7, 7))
scatter(iris_frame.iloc[:, 0], iris_frame.iloc[:, 1], alpha=0.5)
grid(True)

In [None]:
N = len(iris_frame.iloc[:,:4].T)
T=len(iris_frame.iloc[:,:])
color=[]
for t in range(T):
    if iris_frame.ix[t,4]==1.0:
        color+=['blue']
    if iris_frame.ix[t,4]==2.0:
        color+=['red']
    if iris_frame.ix[t,4]==3.0:
        color+=['yellow']
data_name=['Sepal length', 'Seal width', 'Petal length', 'Petal width']
fig = plt.figure(figsize=(15, 10)) 
for i in list(range(N)):
    for j in list(range(N)): 
        ax = fig.add_subplot(N,N,i*N+j+1) 
        if j == 0: 
            ax.set_ylabel(data_name[i],size='12')
        if i == 0: 
            ax.set_title(data_name[j],size='12')
        if i == j: 
            ax.hist(iris_frame.iloc[:,i], 10) 
        else:    
            ax.scatter(iris_frame.iloc[:,i], iris_frame.iloc[:,j] ,c=color[:], alpha=0.5)  

In [None]:
sns.pairplot(iris_frame, hue = 'target')

## Данные какой размерности можно представить одним двумерным скаттер-плотом?

 - координата X
 - координата Y
 - цвет
 - размер
 - форма

In [None]:
diabetes = datasets.load_diabetes()

In [None]:
min(diabetes.data[:, 5]), max(diabetes.data[:, 5])

In [None]:
figure(figsize=(8, 8))

feature = diabetes.data[:, 5]
s = 200*((feature - min(feature)) / (max(feature) - min(feature)))

feature2 = (diabetes.data[:, 6] > 0)

scatter(diabetes.data[feature2, 2], 
        diabetes.data[feature2, 4], 
        c=diabetes.data[feature2, 1], 
        s=s, 
        cmap='rainbow', 
        marker='s',
        alpha=0.7)
scatter(diabetes.data[logical_not(feature2), 2], 
        diabetes.data[logical_not(feature2), 4], 
        c=diabetes.data[logical_not(feature2), 1], 
        s=s, 
        cmap='rainbow', 
        marker='o',
        alpha=0.7)

## Дискретные признаки и scatter

In [None]:
f = (diabetes.data[:,3] - min(diabetes.data[:,3]) )*4
feature = zeros(len(f))
feature[f > 0.2] = 1
feature[f > 0.4] = 2
feature[f > 0.6] = 3
feature[f > 0.8] = 4

In [None]:
figure(figsize=(12, 7))
scatter(diabetes.data[:,2], feature, alpha=0.5, c=diabetes.data[:, 1], s=40)
grid(True)

In [None]:
figure(figsize=(12, 7))
scatter(diabetes.data[:,2], feature + 0.15*randn(len(feature)), alpha=0.5, c=diabetes.data[:, 1], s=40)
grid(True)

<h1 align="center"> Генерация данных </h1>

# Случайные наборы данных

In [None]:
def plot_model_data(blue_data, red_data):
    pylab.figure(figsize=(10, 10))
    pylab.scatter(blue_data[:,0], blue_data[:,1], color="blue", alpha=0.3)
    pylab.scatter(red_data[:,0], red_data[:,1], color="red", alpha=0.3)
    pylab.show()

In [None]:
def generate_linear(point_count, slope, constant):
    points = np.random.rand(point_count, 2)
    above_index = slope * points[:, 0] + constant < points[:, 1]
    blue_data = points[above_index, :]
    red_data = points[~above_index, :]
    return blue_data, red_data

In [None]:
linear1, linerate2 = generate_linear(6000, 1, 0.5)
plot_model_data(linear1, linerate2)

In [None]:
linear1, linerate2 = generate_linear(6000, 2, 0.0)
plot_model_data(linear1, linerate2)

In [None]:
def generate_circle(point_count, center, radius = 0.25):
    points = np.random.rand(point_count, 2)
    x, y = center
    inside_index = (points[:, 0] - x)**2 + (points[:, 1] - y)**2 <= (radius)**2
    blue_data = points[inside_index, :]
    red_data = points[~inside_index, :]
    return blue_data, red_data

In [None]:
blue_data, red_data = generate_circle(6000, (0.5, 0.5))
plot_model_data(blue_data, red_data)

In [None]:
def generate_saw(point_count, width, height):
    points = np.random.rand(point_count, 2)
    subsaw_index = (points[:, 0] / width).astype(int)
    unit = points[:, 0] - subsaw_index * width
    below_index = points[:, 1] < 0.5 - 2.0 * (subsaw_index % 2 - 0.5) * (unit * 2 * height / width - height)
    blue_data = points[below_index, :]
    red_data = points[~below_index, :]
    return blue_data, red_data

In [None]:
blue_data, red_data = generate_saw(5000, 0.05, 0.1)
plot_model_data(blue_data, red_data)

In [None]:
blue_data, red_data = generate_saw(6000, 0.05, 0.1)
plot_model_data(blue_data, red_data)

In [None]:
blue_data, red_data = generate_saw(6000, 0.1, 0.1)
plot_model_data(blue_data, red_data)

In [None]:
def generate_gaussian(point_count, mu, transform_matrix):
    """V = transform_matrix
    function: X = mu + V.T x X
    output: Normal(mu, V.T x V)
    """
    points = np.random.randn(point_count, 2)
    points = mu + np.dot(transform_matrix.T, points.T).T
    return points

In [None]:
blue_data = generate_gaussian(3000, (0, 0), np.array([[0.1, 0.2], [0.1, 0.5]]))
red_data = generate_gaussian(3000, (0.5, 0), np.array([[0.1, 0], [0, 1.0]]))
plot_model_data(blue_data, red_data)

# Генерация выборок в sklearn

**Способы генерации данных:** 
* make_classification
* make_regression
* make_circles
* make_checkerboard
* etc

#### datasets.make_circles

In [None]:
circles = datasets.make_circles()

In [None]:
print ("features: {}".format(circles[0][:10]))
print ("target: {}".format(circles[1][:10]))

In [None]:
# классы точек в датасете
circles[1]

In [None]:
colors = ListedColormap(['red', 'yellow'])

pyplot.figure(figsize(8, 8))
pyplot.scatter(list((x[0] for x in circles[0])), list((x[1] for x in circles[0])), c = circles[1], cmap = colors)

In [None]:
def plot_2d_dataset(data, colors):
    pyplot.figure(figsize(8, 8))
    pyplot.scatter(list(x[0] for x in data[0]), list(x[0] for x in data[0]), c = data[1], cmap = colors)

In [None]:
noisy_circles = datasets.make_circles(noise = 0.15)

In [None]:
plot_2d_dataset(noisy_circles, colors)

#### datasets.make_classification

In [None]:
simple_classification_problem = datasets.make_classification(n_features = 2, n_informative = 1, 
                                                            n_redundant = 1, n_clusters_per_class = 1,
                                                            random_state = 1 )

In [None]:
plot_2d_dataset(simple_classification_problem, colors)

In [None]:
classification_problem = datasets.make_classification(n_features = 2, n_informative = 2, n_classes = 4, 
                                                      n_redundant = 0, n_clusters_per_class = 1, random_state = 1)

colors = ListedColormap(['red', 'blue', 'green', 'yellow'])

In [None]:
plot_2d_dataset(classification_problem, colors)