# Семинар 9 - Кластеризация

In [None]:
import math 
from math import floor
from itertools import product

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN, KMeans, MeanShift, estimate_bandwidth, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and newer
# releases no longer ship the old names, so pick whichever this install provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# One seed, also passed as random_state to several estimators further down.
seed = 42
np.random.seed(seed)

# Подготовим данные

In [None]:
from sklearn.datasets import make_blobs, make_moons

In [None]:
# Centres of the four synthetic 2-D blobs; 750 points are generated around them.
centers = [[1, 1], [-1, -1], [1, -1], [3, 4]]
blobs = make_blobs(n_samples=750, centers=centers, cluster_std=0.5, random_state=seed) 
# Alternative toy dataset, left disabled:
# noisy_moons = make_moons(n_samples=750, noise=0.05, random_state=seed)

# X holds the points, y the reference labels used later by log_calc_metrics.
X, y = blobs
# Rescale the features with StandardScaler before clustering.
X = StandardScaler().fit_transform(X)

In [None]:
def plot_xy_set(X, labels):
    """Scatter-plot 2-D points coloured by their cluster label.

    Parameters
    ----------
    X : array-like of shape (n_samples, >=2)
        Points to draw; only the first two columns are plotted.
    labels : array-like of shape (n_samples,)
        Cluster label of every point. The label ``-1`` is treated as noise
        and drawn in black.

    Notes
    -----
    Draws on the current matplotlib axes and returns nothing. The title
    reports the number of clusters with the noise label excluded.
    """
    # Coerce to arrays so plain lists work too: ``labels == k`` has to be an
    # element-wise mask, which a Python list would not produce.
    X = np.asarray(X)
    labels = np.asarray(labels)

    # np.unique returns the labels sorted, so colours are assigned in a stable order.
    unique_labels = np.unique(labels)
    has_noise = -1 in unique_labels

    colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = [0, 0, 0, 1]

        xy = X[labels == k]
        plt.plot(
            xy[:, 0],
            xy[:, 1],
            "o",
            markerfacecolor=tuple(col),
            markeredgecolor="k",
            markersize=4,
        )

    # Noise is not a cluster, so it must not be counted in the title.
    n_clusters = len(unique_labels) - (1 if has_noise else 0)
    plt.title("Estimated number of clusters: %d" % n_clusters)

In [None]:
# Plot the generated points coloured by their reference labels.
plot_xy_set(X, y)

## Вспомогательная функция оценки сегментации

In [None]:
from sklearn import metrics

In [None]:
def log_calc_metrics(y_true, y_pred):
    """Print five agreement scores between reference and predicted labels.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Labels produced by a clustering algorithm.

    Each score is printed on its own line with three decimals; nothing is
    returned.
    """
    # (format string, scoring function) pairs, in the order they are printed.
    report = (
        ("Homogeneity: %0.3f", metrics.homogeneity_score),
        ("Completeness: %0.3f", metrics.completeness_score),
        ("V-measure: %0.3f", metrics.v_measure_score),
        ("Adjusted Rand Index: %0.3f", metrics.adjusted_rand_score),
        ("Adjusted Mutual Information: %0.3f", metrics.adjusted_mutual_info_score),
    )
    for line_format, scorer in report:
        print(line_format % scorer(y_true, y_pred))

## Agglomerative

In [None]:
# Compute clustering
print("Compute structured hierarchical clustering...")
# NOTE(review): this value is never passed to the model below, which is built
# with n_clusters=None, so it has no effect on the result.
n_clusters = 5  # number of regions
# With n_clusters=None the cluster count is governed by distance_threshold.
ward = AgglomerativeClustering(n_clusters=None,
        linkage='ward', distance_threshold=10).fit(X)

# `label` is reused by the following plotting cell.
label = ward.labels_
print("Number of frames: ", label.size)
print("Number of clusters: ", np.unique(label).size)

# Compare against the reference labels y.
log_calc_metrics(y, label)

In [None]:
# Plot the points coloured by the labels currently stored in `label`.
plot_xy_set(X, label)

## Mean-Shift 

In [None]:
# Estimate the bandwidth from the data with estimate_bandwidth instead of
# choosing it by hand.
bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
# Report on the Mean-Shift result (`labels`). The similarly named `label`
# holds the output of a different cell and must not be used here.
print("Number of frames: ", labels.size)
print("Number of clusters: ", np.unique(labels).size)

log_calc_metrics(y, labels)

In [None]:
# Plot the points coloured by the labels stored in `labels`.
plot_xy_set(X, labels)

## DBSCAN 

In [None]:
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)

# Boolean mask that is True for the core samples reported by DBSCAN.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
label = db.labels_

# Number of clusters in the DBSCAN labels, ignoring noise (-1) if present.
# Both counts must come from `label`; `labels` belongs to a different cell.
n_clusters_ = len(set(label)) - (1 if -1 in label else 0)
n_noise_ = list(label).count(-1)

print("Number of clusters: %d" % n_clusters_)
print("Number of noise points: %d" % n_noise_)

log_calc_metrics(y, label)

In [None]:
# Plot the points coloured by the labels currently stored in `label`.
plot_xy_set(X, label)

# Визуализация данных

In [None]:
# Load the statistics table; the path is relative to the working directory.
data = pd.read_csv('covid_stat.csv')

In [None]:
# Show the first five rows transposed, so each table column is displayed as a row.
data.head(5).T

In [None]:
# Correlate only the numeric (and boolean) columns. Since pandas 2.0,
# DataFrame.corr() no longer drops text columns silently and raises on them,
# so the selection is made explicit here.
df_corr = data.select_dtypes(include=['number', 'bool']).corr()

# The matrix is symmetric: mask the upper triangle (diagonal included) so each
# pair is drawn once.
mask = np.triu(np.ones_like(df_corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(df_corr, mask=mask, cmap=cmap, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

plt.show()

In [None]:
scaler = StandardScaler()

# Use 'Country/Region' as the row index and drop 'Country Code', so only the
# remaining columns are passed to the scaler.
df_orig = data.copy().set_index('Country/Region').drop(columns=['Country Code'])
# Wrap the scaled array back into a DataFrame with the original row and column labels.
df_sc = pd.DataFrame(scaler.fit_transform(df_orig), index=df_orig.index, columns=df_orig.columns)

# Show two rows of the raw and of the scaled table for a quick visual check.
display('original', data.head(2), 'scaled', df_sc.head(2))

## PCA

In [None]:
# Work on a copy of the scaled table.
plotX = df_sc.copy()

# n_components='mle' lets PCA choose the number of components itself.
pca = PCA(n_components='mle', random_state=seed)
principalComponents = pca.fit_transform(plotX)
# Columns get integer names 0, 1, 2, ...; rows keep the index of plotX.
PCA_components = pd.DataFrame(principalComponents, index=plotX.index)

In [None]:
# Explained-variance bar chart followed by pairwise scatter plots of
# components 0, 1 and 2.
features = range(pca.n_components_)

fig, ax = plt.subplots(1, 4, figsize=(16, 4))

# Panel 0: share of variance explained by each component.
ax[0].bar(features, pca.explained_variance_ratio_, color='black')
ax[0].set(xlabel='PCA features', ylabel='variance %', xticks=features)

# Panel 1: component 0 against component 1.
ax[1].scatter(PCA_components[0], PCA_components[1], alpha=.1, color='black')
ax[1].set(xlabel='PCA1', ylabel='PCA2', title='PCA')

# Panel 2: component 1 against component 2.
ax[2].scatter(PCA_components[1], PCA_components[2], alpha=.1, color='black')
ax[2].set(xlabel='PCA2', ylabel='PCA3')

# Panel 3: component 0 against component 2.
ax[3].scatter(PCA_components[0], PCA_components[2], alpha=.1, color='black')
ax[3].set(xlabel='PCA1', ylabel='PCA3')

plt.show()

## KMeans

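Алгоритм KMeans разбивает $n$ объектов выборки на $k$ кластеров с примерно равной дисперсией, минимизируя критерий, известный как **инерция**, или внутрикластерная сумма квадратов расстояний от объектов до ближайшего центроида:

$$\sum_{i=1}^{n}\min_{\mu_j \in C}\left(\|x_i - \mu_j\|^2\right),$$

где $C = \{\mu_1, \dots, \mu_k\}$ — множество центроидов кластеров.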

In [None]:
def elbow_check(df_km, max_k=9, n_dims=3):
    """Plot KMeans inertia against the number of clusters (elbow method).

    Parameters
    ----------
    df_km : pandas.DataFrame
        Table of components, one row per sample.
    max_k : int, default 9
        Largest number of clusters to try; values 1..max_k are fitted.
    n_dims : int, default 3
        Number of leading columns of ``df_km`` used for clustering.

    Notes
    -----
    Shows the figure with ``plt.show()`` and returns nothing.
    """
    features = df_km.iloc[:, :n_dims]
    # KMeans cannot build more clusters than there are samples.
    ks = range(1, min(max_k, len(features)) + 1)

    # A fixed random_state keeps the curve identical between reruns, in line
    # with the other seeded estimators in this notebook.
    inertias = [
        KMeans(n_clusters=k, random_state=seed).fit(features).inertia_
        for k in ks
    ]

    plt.plot(ks, inertias, '-o', color='black')
    plt.title('inertia - sum dist^2 of centroid to samples')
    plt.xlabel('number of clusters, k')
    plt.ylabel('inertia')
    plt.xticks(ks)
    plt.show()

In [None]:
# Inertia curve for KMeans fitted on the PCA components.
elbow_check(PCA_components)

In [None]:
def plot_clusters(cluster_name, cluster_cases, df_comp, dims=2):
    """Draw one KMeans scatter plot per requested number of clusters.

    Parameters
    ----------
    cluster_name : str
        Prefix used in titles, axis labels and group labels (e.g. ``'PCA'``).
    cluster_cases : sequence of int
        Numbers of clusters to try; each value gets its own subplot.
    df_comp : pandas.DataFrame
        Component table. KMeans is fitted on all of its columns; the first
        two columns are used as plot coordinates.
    dims : int, default 2
        Kept for backward compatibility. The plots are always
        two-dimensional, so the value does not affect the output.

    Notes
    -----
    Shows the figure with ``plt.show()`` and returns nothing.
    """
    if not cluster_cases:
        return

    palette = ('green', 'red', 'blue', 'yellow', 'purple', 'brown')
    # squeeze=False keeps the axes array 2-D even for a single case, so the
    # same indexing works for any number of subplots.
    fig, axes = plt.subplots(1, len(cluster_cases), figsize=(16, 4), squeeze=False)

    for panel, n_clusters in zip(axes[0], cluster_cases):
        # Seeded so that the panels do not change between reruns.
        assigned = KMeans(n_clusters=n_clusters, random_state=seed).fit(df_comp).labels_

        for cluster_id in range(n_clusters):
            members = df_comp[assigned == cluster_id]
            panel.scatter(
                members.iloc[:, 0],
                members.iloc[:, 1],
                alpha=0.5,
                # Cycle through the palette instead of silently dropping
                # clusters once it runs out of colours.
                c=palette[cluster_id % len(palette)],
                edgecolors='none',
                s=100,
                label=cluster_name + str(cluster_id),
            )

        panel.set_title(f'{cluster_name} Kmeans {n_clusters} clusters')
        panel.set(xlabel=cluster_name + '1', ylabel=cluster_name + '2')
    plt.show()

In [None]:
# Cluster counts to compare, one subplot per value.
cluster_cases = [2, 3, 4, 5, 6]

plot_clusters('PCA', cluster_cases, PCA_components)

In [None]:
def plot_big(clustering_name, reducer_name, model, df_comp, dims=2, countries=30):
    """Fit a clustering model and draw one large, annotated scatter plot.

    Parameters
    ----------
    clustering_name : str
        Name of the clustering method, used in the title and legend.
    reducer_name : str
        Name of the dimensionality-reduction method, used in the title and
        axis labels.
    model : estimator
        Object with ``fit(X)`` that exposes ``labels_`` afterwards. It is
        (re)fitted on ``df_comp`` inside this function.
    df_comp : pandas.DataFrame
        Component table indexed by country name; the first two columns are
        used as plot coordinates. The frame is not modified.
    dims : int, default 2
        Kept for backward compatibility. The plot is always two-dimensional,
        so the value does not affect the output.
    countries : int, default 30
        Number of randomly sampled rows to annotate in addition to the fixed
        list of highlighted countries.

    Notes
    -----
    Shows the figure with ``plt.show()`` and returns nothing.
    """
    highlighted = [
        'United Kingdom', 'Belgium', 'Canada', 'France', 'Germany', 'Italy',
        'Netherlands', 'Sweden', 'Switzerland', 'Japan', 'United States',
        'China', 'Australia', 'Singapore', 'South Africa', 'Belarus', 'Brazil',
        'Russia', 'India', 'Poland',
    ]
    palette = ('green', 'red', 'yellow', 'cyan', 'purple', 'brown')

    fig, ax = plt.subplots(figsize=(16, 12))
    assigned = np.asarray(model.fit(df_comp).labels_)

    # Iterate over the label values that actually occur. Density-based models
    # mark noise with -1, which range(n_labels) would never visit.
    cluster_ids = [int(c) for c in np.unique(assigned) if c != -1]
    for position, cluster_id in enumerate(cluster_ids):
        members = df_comp[assigned == cluster_id]
        ax.scatter(
            members.iloc[:, 0],
            members.iloc[:, 1],
            alpha=0.5,
            # Cycle through the palette so no cluster is silently dropped.
            c=palette[position % len(palette)],
            edgecolors='none',
            s=500,
            label=clustering_name + str(cluster_id),
        )

    noise_mask = assigned == -1
    if noise_mask.any():
        noise = df_comp[noise_mask]
        ax.scatter(noise.iloc[:, 0], noise.iloc[:, 1], alpha=0.5, c='black',
                   edgecolors='none', s=500, label='noise')

    # Noise is not a cluster, so it is excluded from the count in the title.
    ax.set_title(f'{reducer_name} {clustering_name} {len(cluster_ids)} clusters')
    ax.set(xlabel=f'{reducer_name}1', ylabel=f'{reducer_name}2')

    # Annotate the highlighted countries plus a random sample. The sample size
    # is clamped because DataFrame.sample raises when asked for more rows than
    # exist, and the two selections are merged so no point is labelled twice.
    sampled = df_comp.sample(min(countries, len(df_comp))).index
    to_annotate = df_comp[df_comp.index.isin(highlighted) | df_comp.index.isin(sampled)]
    for name, x, y in zip(to_annotate.index, to_annotate.iloc[:, 0], to_annotate.iloc[:, 1]):
        ax.annotate(name, (x, y))

    ax.legend(loc=1)
    plt.show()

In [None]:
# Six seeded KMeans clusters on the PCA components; plot_big fits the model itself.
model = KMeans(n_clusters=6, random_state=seed)
plot_big('KMeans','PCA', model, PCA_components, countries=50)

## t-SNE Method (T-Distributed Stochastic Neighbor Embedding)

In [None]:
# Fit t-SNE once per perplexity value and show the nine embeddings in a 3x3 grid.
plotX = df_sc.copy()
ps = np.linspace(10, 90, 9).astype(int)

fig, ax = plt.subplots(3, 3, figsize=(12, 10))
plt.subplots_adjust(hspace=.5)

for i, perplexity in enumerate(ps):
    tsne = TSNE(n_components=2, perplexity=perplexity)
    tsneComponents = tsne.fit_transform(plotX)
    TSNE_components = pd.DataFrame(tsneComponents, index=plotX.index)

    # The grid is filled column by column: row i % 3, column i // 3.
    panel = ax[i % 3][i // 3]
    panel.set_title(f'perplexity {perplexity}')
    panel.scatter(TSNE_components[0], TSNE_components[1], alpha=.2, color='black')
    panel.set(xlabel='TSNE1', ylabel='TSNE2')

## Fixing TSNE HyperParams (Perplexity and Comps)

In [None]:
# Final embedding with fixed hyperparameters and a fixed seed; the cells below use it.
tsne = TSNE(n_components=2, perplexity=20, early_exaggeration=10, random_state=seed)
tsneComponents = tsne.fit_transform(plotX)
# Columns get integer names 0 and 1; rows keep the index of plotX.
TSNE_components = pd.DataFrame(tsneComponents, index=plotX.index)
TSNE_components.head()

## KMeans w/ TSNE

In [None]:
# Inertia curve for KMeans fitted on the t-SNE components.
elbow_check(TSNE_components)

## KMeans on TSNE - Zoom with Labels

In [None]:
# Six seeded KMeans clusters on the t-SNE components; plot_big fits the model itself.
model = KMeans(n_clusters=6, random_state=seed)            
plot_big('KMeans','TSNE', model, TSNE_components, countries=100)

## DBScan on TSNE Cluster Analysis

In [None]:
# Grid of DBSCAN runs on the t-SNE embedding: one row per min_samples value,
# one column per eps value.
samp_list = [2, 3, 4]
eps_list = [0.8, 1.0, 1.25, 1.5, 1.75]
rows = len(samp_list)

fig, ax = plt.subplots(len(samp_list), len(eps_list), figsize=(18, 10))
colors = ('green','red','blue','yellow','purple','brown','orange','cyan')

for k, (eps, samp) in enumerate(product(eps_list, samp_list)):
    # product() varies samp fastest, so k % rows is the samp row and
    # k // rows is the eps column.
    panel = ax[k % rows][k // rows]
    panel.tick_params(axis='both',which='both',bottom=False,top=False,left=False,labelbottom=False) 

    # Cluster a fresh copy of the embedding with this parameter pair.
    TSNE_df = TSNE_components.copy()
    clustering = DBSCAN(eps=eps, min_samples=samp).fit(TSNE_df)
    TSNE_df.insert(0, 'cluster', clustering.labels_)

    # Iterate over the label values that actually occur. DBSCAN marks noise
    # with -1, which range(n_labels) would skip while also probing a label
    # that does not exist.
    for cluster_id in np.unique(clustering.labels_):
        members = TSNE_df[TSNE_df['cluster'] == cluster_id]
        if cluster_id == -1:
            color, group = 'black', 'noise'
        else:
            # Cycle through the palette so no cluster is silently dropped.
            color, group = colors[int(cluster_id) % len(colors)], str(cluster_id)
        panel.scatter(members[0], members[1], alpha=0.5, c=color, edgecolors='none', s=100, label=group)

    panel.set_title(f'DBS eps:{eps}, samp:{samp}')

In [None]:
# DBSCAN on the t-SNE components with one fixed parameter pair.
TSNE_df = TSNE_components.copy()
# plot_big calls model.fit again on its own copy of the data, so the fit here
# only constructs the estimator that is passed in.
clustering = DBSCAN(eps=1.75, min_samples=4).fit(TSNE_df)
plot_big('DBScan','TSNE', clustering, TSNE_df, countries=100)

# Оценка важности признаков 


![pipeline](pipeline.png)

In [None]:
# обучите модель понижения размерности

# Ваш код ...

In [None]:
# сделайте кластеризацию

# Ваш код ...

### Feature importance

In [None]:
# обучите модель классификации

# Ваш код ...

In [None]:
# нарисуйте важность признаков

# Ваш код ...