# Exercises on clustering

Here you can find a program to read a data file in CSV format.

The data file has the following heading:
    number of samples, number of features
    list of the names of the features (separated by commas)
    
The remaining lines contain an example per row.
For each row there is a list of comma-separated real numbers, which are the feature values.

In [None]:
import csv
import sys
from os.path import join
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

# Directory that holds the CSV datasets used throughout the exercises.
data_path = "./data/"

# Matplotlib single-letter colour codes; the clustering cells index this
# list with the cluster label of each point.
color = list("bgrcmykw")

# Full paths of the three dataset files.
file_path_1, file_path_2, file_path_3 = (
    join(data_path, file_name)
    for file_name in ("3-clusters.csv", "dataset-DBSCAN.csv", "CURE-complete.csv")
)

In [None]:
def load_data(file_path):
    """Load a numeric dataset from a CSV file that starts with a two-row header.

    Expected layout of the file:

    * row 1: number of samples, number of features;
    * row 2: the feature names;
    * every following row: the feature values of one sample.

    Blank lines after the header are ignored.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A tuple ``(data, feature_names, n_samples, n_features)``. ``data`` is a
        float64 array with one row per sample, ``feature_names`` is an array
        of strings, and the two counts are the integers declared in the first
        header row (they are not cross-checked against ``data``).

    Raises:
        StopIteration: If the file has fewer than two header rows.
        ValueError: If a header count or a data value is not numeric, or if
            the data rows do not all have the same length.
    """
    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation recommends for files passed to csv.reader.
    with open(file_path, newline="") as csv_file:
        reader = csv.reader(csv_file, delimiter=',')

        header = next(reader)
        n_samples = int(header[0])
        n_features = int(header[1])
        feature_names = np.array(next(reader))

        # csv.reader yields an empty list for a blank line; dropping those
        # keeps a stray (e.g. trailing) blank line from making the row list
        # ragged, which np.asarray would reject.
        rows = [row for row in reader if row]
        data = np.asarray(rows, dtype=np.float64)

    return (data, feature_names, n_samples, n_features)

def plot_dataset(x, y, labels, title='Plot', colors=None, single_plot=True, **kwargs):
    """Draw a scatter plot of ``y`` against ``x`` with pyplot.

    Args:
        x: Values plotted along the horizontal axis.
        y: Values plotted along the vertical axis.
        labels: Indexable object; item 0 labels the x axis and item 1 labels
            the y axis.
        title: Figure-level title. It is only used when ``single_plot`` is
            true.
        colors: Passed to ``plt.scatter`` as its ``c`` argument.
        single_plot: When true, a new 8x8 figure is opened for this plot.
            Otherwise the plot is drawn on whatever pyplot considers current
            (e.g. a subplot selected by the caller).
        **kwargs: Extra keyword arguments forwarded to ``plt.scatter``.
    """
    if single_plot:
        # A stand-alone plot gets its own square figure carrying the title.
        plt.figure(figsize=(8, 8)).suptitle(title)

    plt.grid(True, alpha=0.4)
    plt.xlabel(labels[0])
    plt.ylabel(labels[1])
    plt.scatter(x, y, marker='.', c=colors, **kwargs)
 
# Load the three datasets. Each call returns the data array, the feature
# names and the sample/feature counts read from the file's first header row.
data1, feature_names1, n_samples1, n_features1 = load_data(file_path_1)
data2, feature_names2, n_samples2, n_features2 = load_data(file_path_2)
data3, feature_names3, n_samples3, n_features3 = load_data(file_path_3)

# Report the sizes declared in each file's header.
print(f"dataset n. 1 - n. samples: {n_samples1}, n. features: {n_features1}")
print(f"dataset n. 2 - n. samples: {n_samples2}, n. features: {n_features2}")
print(f"dataset n. 3 - n. samples: {n_samples3}, n. features: {n_features3}")

### Plot dataset n.1

In [None]:
# Scatter plot of the first two columns of dataset n. 1.
plot_dataset(
    data1[:, 0],
    data1[:, 1],
    feature_names1,
    title='Dataset n. 1 of data points',
)

### Plot dataset n.2

In [None]:
# Scatter plot of the first two columns of dataset n. 2.
plot_dataset(
    data2[:, 0],
    data2[:, 1],
    feature_names2,
    title='Dataset n. 2 of data points',
)

### Plot dataset n.3

In [None]:
# Scatter plot of the first two columns of dataset n. 3; the extra ``s=2``
# is forwarded to the scatter call as the marker size.
plot_dataset(
    data3[:, 0],
    data3[:, 1],
    feature_names3,
    title='Dataset n. 3 of data points',
    s=2,
)

### Clustering with K-means on the dataset n.1

In [None]:
# Partition dataset n. 1 into k clusters with K-means (fixed random_state).
k = 3
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(data1)

# Translate each point's cluster label into a colour code from the palette.
prediction = [color[label] for label in kmeans.labels_]
plot_dataset(
    data1[:, 0],
    data1[:, 1],
    feature_names1,
    title='Clustered points in dataset n. 1',
    colors=prediction,
)

### Clustering with K-means & DBSCAN on the dataset n.2

In [None]:
# Cluster dataset n. 2 twice: K-means on the raw features and DBSCAN on the
# standardized features, then show the two results side by side.
k = 4
kmeans = KMeans(n_clusters=k, random_state=0).fit(data2)
dbscan = DBSCAN(min_samples=5, eps=0.04).fit(StandardScaler().fit_transform(data2))

# Map every label to a palette entry. The modulo keeps the lookup valid when
# DBSCAN finds more clusters than there are colours (colours then repeat
# instead of raising IndexError). DBSCAN marks noise with the label -1, and
# -1 % len(color) selects the last palette entry, exactly as the plain
# negative index color[-1] did.
km_prediction = [color[label % len(color)] for label in kmeans.labels_]
db_prediction = [color[label % len(color)] for label in dbscan.labels_]

fig = plt.figure(figsize=(20,10))

# Left panel: K-means result.
plt.subplot(1, 2, 1)
plt.title('K-Means Clustering of dataset n.2')
plot_dataset(data2[:,0], data2[:,1], feature_names2, colors=km_prediction, single_plot=False, s=5)

# Right panel: DBSCAN result.
plt.subplot(1, 2, 2)
plt.title('DBSCAN Clustering of dataset n.2')
plot_dataset(data2[:,0], data2[:,1], feature_names2, colors=db_prediction, single_plot=False, s=5)

### Clustering with K-means & DBSCAN on the dataset n.3

In [None]:
# Cluster dataset n. 3: K-means on the whole dataset, DBSCAN on a random
# 10% sample of its rows (standardized), then show both results side by side.
k = 5
kmeans = KMeans(n_clusters=k, random_state=0).fit(data3)

factor = 0.1
subset_width = int(len(data3) * factor)

# A seeded, local generator makes the sampled subset (and therefore the
# DBSCAN panel) identical on every run, in line with random_state=0 used for
# K-means, and leaves NumPy's global random state untouched.
rng = np.random.default_rng(0)
data3_subset = rng.permutation(data3)[:subset_width]
dbscan = DBSCAN(min_samples=40, eps=.14).fit(StandardScaler().fit_transform(data3_subset))

# Map every label to a palette entry. The modulo keeps the lookup valid when
# DBSCAN finds more clusters than there are colours (colours then repeat
# instead of raising IndexError). DBSCAN marks noise with the label -1, and
# -1 % len(color) selects the last palette entry, exactly as the plain
# negative index color[-1] did.
km_prediction = [color[label % len(color)] for label in kmeans.labels_]
db_prediction = [color[label % len(color)] for label in dbscan.labels_]

fig = plt.figure(figsize=(20,10))

# Left panel: K-means result on the full dataset.
plt.subplot(1, 2, 1)
plt.title('K-Means Clustering of dataset n.3')
plot_dataset(data3[:,0], data3[:,1], feature_names3, colors=km_prediction, single_plot=False, s=5)

# Right panel: DBSCAN result on the sampled subset only.
plt.subplot(1, 2, 2)
plt.title('DBSCAN Clustering of dataset n.3')
plot_dataset(data3_subset[:,0], data3_subset[:,1], feature_names3, colors=db_prediction, single_plot=False, s=5)

In the following cells, I suggest that you write a program that computes, using a statistical measure of your choice, a quantitative evaluation of the clusters in the three datasets.

**Note:**
It is advisable to execute K-means a certain number of times (let us try 10 times) and then select the clustering solution that gives the best value of the evaluation measure.

As already done with classification by k-nn, plot in the cell below the quantitative measure of your choice (used above) with respect to an increasing value of k (the number of clusters) so that the best value of k can be selected.


1. Set *Minpts* to a number (say **10**).
2. Compute the **reachability distance** of the **10-th nearest neighbour** for each data-point.
3. Sort the set of reachability distances you obtained in increasing order.
4. Plot the sorted reachability distances.
5. Find the **elbow** of the diagram => it gives the eps value combined with Minpts=10.
6. Try this combined pair of parameters on the dataset you chose, with DBSCAN.