# Overview of classification models
This notebook provides a comparative overview of supervised and unsupervised classification models mainly based on Sklearn implementation.

Methods are:
- Kmeans
- Expectation Maximization
- Hierarchical agglomerative clustering
- Affinity propagation
- Mean-shift
- Spectral clustering
- DBSCAN

# Environment

In [1]:
import nbimporter
from supervised import Dataset
import numpy as np
import matplotlib.pyplot as plt

importing Jupyter notebook from supervised.ipynb


## Synthetic dataset

In [2]:
from sklearn.datasets.samples_generator import make_blobs

samples, training_p = 5000, 0.7
x_t = int(np.ceil(samples*training_p))
seeds = [[1, 0], [0, 1], [1, 1], [2, 2], [2, 0], [0, 2], [2, 1], [1, 2]]
R, E = make_blobs(n_samples=samples, centers=seeds, cluster_std=0.25)
R += np.array([100, 100])
tr_docs, te_docs = range(0, x_t), range(x_t, samples)
tr_labels, te_labels = E[:x_t], E[x_t:]
tr_data, te_data = R[:x_t, :], R[x_t:, :]
SD = Dataset((tr_data, te_data), list(set(E)), (tr_docs, te_docs), (tr_labels, te_labels))

## Set-up the dataset

In [3]:
T = SD
experiments, e_labels, e_time = [], [], []

## KMeans

In [4]:
import sklearn.cluster as cls

experiments.append('KMeans')
km_labels, km_time = T.clustering(cls.KMeans(n_clusters=len(T.categories)))
e_labels.append(km_labels)
e_time.append(km_time)

## Hierarchical clustering

In [5]:
experiments.append('Ward HC')
hw_labels, hw_time = T.clustering(cls.AgglomerativeClustering(n_clusters=len(T.categories), linkage='ward'))
e_labels.append(hw_labels)
e_time.append(hw_time)

experiments.append('Complete HC')
ch_labels, ch_time = T.clustering(cls.AgglomerativeClustering(n_clusters=len(T.categories), linkage='complete'))
e_labels.append(ch_labels)
e_time.append(ch_time)

experiments.append('Average HC')
sh_labels, sh_time = T.clustering(cls.AgglomerativeClustering(n_clusters=len(T.categories), linkage='average'))
e_labels.append(sh_labels)
e_time.append(sh_time)

## Mean-shift

In [6]:
experiments.append('Mean-shift')
mh_labels, mh_time = T.clustering(cls.MeanShift(bin_seeding=False, cluster_all=False))
e_labels.append(mh_labels)
e_time.append(mh_time)

## Spectral clustering

In [7]:
experiments.append('Spectral Clustering')
se_labels, se_time = T.clustering(cls.SpectralClustering(n_clusters=len(T.categories)))
e_labels.append(se_labels)
e_time.append(se_time)

## DBSCAN

In [8]:
experiments.append('DBSCAN')
db_labels, db_time = T.clustering(cls.DBSCAN(eps=0.1, min_samples=6))
e_labels.append(db_labels)
e_time.append(db_time)

# Evaluation

In [9]:
from sklearn import metrics
from IPython.core.display import display, HTML

headers = ['Method', 'Rand', 'MI', 'Prec.', 'Recall', 'F1', 'Time (T)', 'Time (P)', 'Time']
table = "<table style='width: 100%;'>"
table += "<tr><th style='text-align: center;'>" + "</th><th style='text-align: right;'>".join(headers) + "</th></tr>"
for i, e in enumerate(experiments):
    e_l = e_labels[i]
    rand = metrics.adjusted_rand_score(T.testing_labels, e_l)
    mi = metrics.adjusted_mutual_info_score(T.testing_labels, e_l) 
    precision = metrics.homogeneity_score(T.testing_labels, e_l)
    recall = metrics.completeness_score(T.testing_labels, e_l) 
    f = (2*precision*recall) / (precision + recall)
    data = [e, round(rand, 3), round(mi, 3), round(precision, 3), round(recall, 3), round(f, 3), 
            round(e_time[i][0], 3), round(e_time[i][1], 3), round(e_time[i][0]+e_time[i][1], 3)]
    data[5] = "<strong>" + str(data[5]) + "</strong>"
    data[8] = "<strong>" + str(data[8]) + "</strong>"
    table += "<tr><th style='text-align: center;'>"
    table += "</td><td style='text-align: right;'>".join([str(x) for x in data])
    table += "</td></tr>"
table += "</table>"

In [10]:
display(HTML(table))

Method,Rand,MI,Prec.,Recall,F1,Time (T),Time (P),Time
KMeans,0.884,0.88,0.881,0.881,0.881,0.0,0.039,0.039
Ward HC,0.818,0.835,0.836,0.839,0.838,0.0,0.07,0.07
Complete HC,0.728,0.779,0.78,0.791,0.786,0.0,0.044,0.044
Average HC,0.854,0.863,0.864,0.865,0.865,0.0,0.039,0.039
Mean-shift,0.203,0.245,0.246,0.738,0.369,0.0,3.336,3.336
Spectral Clustering,0.869,0.868,0.869,0.869,0.869,0.0,0.301,0.301
DBSCAN,0.667,0.686,0.779,0.692,0.733,0.0,0.011,0.011
