# TP 4 : Apprentissage Non Supervisé : Clustering

<br><br>

# Partie 1 : Données de travail

In [1]:
import pandas as pd

#Reading the file, assigning the first line and column as header and index and parsing according to tabulations
fromage=pd.read_table("fromage.txt",sep="\t",header=0,index_col=0)
#describe gives statistic information about the dataset attributes
print(fromage.describe())
from jupyterthemes import jtplot
jtplot.style(theme=’monokai’, context=’notebook’, ticks=True, grid=False)


SyntaxError: invalid character in identifier (<ipython-input-1-0813157637cc>, line 8)

In [None]:
#listing dataset attributes
print(list(fromage.columns))


In [None]:
#croisement 2 à 2 des variables
pd.plotting.scatter_matrix(fromage,figsize=(9,9))


- Dans cette figure, on peut visualiser la répartition de données de chaque attribut en fonction des autres attributs
- On remarque que quelques attributs sont plus corrélés avec d'autres

<br>

# Partie 2 : Methode des centres mobiles (k-means)

In [None]:
import numpy as np
from sklearn import cluster

# fixing np random results
np.random.seed(0)
#create the model for 4 centers/clusters
kmeans = cluster.KMeans(n_clusters = 4)
#fit the model with our dataset : fromage
kmeans.fit(fromage)
#return indices that would sort the labels
idk = np.argsort(kmeans.labels_)



#show the label/cluster of each frame
kmeansDataFrame = pd.DataFrame(fromage.index[idk],kmeans.labels_[idk])
print(kmeansDataFrame)

#show the distance to the cluster center for each frame
print(kmeans.transform(fromage))

In [2]:
# Attributes of cluster centers
print(kmeans.cluster_centers_)

NameError: name 'kmeans' is not defined

<br>

### Aide à la détection du nombre adéquat de groupes

In [None]:
from sklearn import metrics

#utilisation de la métrique "silhouette"
res = np.arange(9, dtype ="double")
for k in np.arange(9):
    #cluster number ranging 2:10
    km = cluster.KMeans(n_clusters = k+2)
    #fitting the model
    km.fit(fromage)
    #silouhette score for choosing cluster number k
    res[k] = metrics.silhouette_score(fromage,km.labels_)
print (res)


#graphique
import matplotlib.pyplot as plt

plt.title("silhouette")
plt.xlabel("number of clusters")
plt.plot(np.arange(2,11,1),(res))
plt.show


- Selon la métrique silhouette, le meilleur nombre de clusters à choisir est 2 puisque le score est le plus élévé

<br>

# Partie 3 : Classification ascendante hiérarchique

In [None]:
import pandas
import numpy as np


#librairies pour la CAH
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
Z = linkage(fromage,method='ward', metric='euclidean')

#affichage du dendrogramme
plt.title("CAH")
plt.title('CAH avec matérialisation des 4 classes')
#Create the dendrogram of the clustering
dendrogram(Z,labels=fromage.index,orientation='left',color_threshold=255)
plt.show() 

#Forms flat clusters from the hierarchical clustering : shows the order of clustering hierarchy 
groupes_cah = fcluster(Z, t = 255,criterion='distance')
print(groupes_cah)

#index triés des groupes
idg = np.argsort(groupes_cah)

#affichage des observations et leurs groupes
cahDataFrame = pandas.DataFrame(fromage.index[idg],groupes_cah[idg])
print(cahDataFrame)

- 2/ En Analysant l'arboresence, on remarque qu'il existe 4 clusters, ce qui est confirmé par le resultat de methode fcluster

<br>

In [None]:
pd.crosstab(kmeansDataFrame.index,[cahDataFrame['Fromages'].tolist(),kmeansDataFrame['Fromages'].tolist()])


<br><br>
# Partie 4 : Interprétation des classes

In [None]:
from sklearn.decomposition import PCA

#fitting the model for PCA with 4 clusters
km = cluster.KMeans(n_clusters = 4)
km.fit(fromage)
acp = PCA(n_components =2).fit_transform(fromage)

#plotting the results
for couleur,k in zip(['red','blue','lawngreen', 'aqua'],[0,1,2,3]):
    plt.scatter(acp[km.labels_==k,0],acp[km.labels_==k,1],c=couleur)
plt.show()

- Selon l'analyse en composanted principales, les données sont réparties en 4 clusters distinctes

### 2. AgglomerativeClustering

In [None]:

from sklearn.cluster import AgglomerativeClustering

def plot_dendrogram(model, **kwargs):

    # Children of hierarchical clustering
    children = model.children_

    # Distances between each pair of children
    # Since we don't have this information, we can use a uniform one for plotting
    distance = np.arange(children.shape[0])

    # The number of observations contained in each cluster level
    no_of_observations = np.arange(2, children.shape[0]+2)

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)


model = AgglomerativeClustering(n_clusters=4)

model = model.fit(fromage.values)
plt.figure(figsize=(9,9))
plt.title('Hierarchical Clustering Dendrogram')
plot_dendrogram(model, labels=fromage.index,orientation='left')
plt.show()

- La methode AgglomerativeClustering nous fournit le même résultat que la methode CAH clustering de scipy : on obtient le même dendrogramme divisé principalement en 4 clusters

<br>
### 3. Divisive clustering

Afin de réaliser un clustering hiérarchique descendant en utilisant k-means on effectue les étapes suivantes:



In [None]:
from sklearn import cluster
import random

#Kmeans with 2 clusters
kmeans = cluster.KMeans(n_clusters = 2)

def Divisive_clustering(df):
    
    if(len(df.index) < 2):
        return 1
    else:
        
        # 1 . Clustering of data frame into 2 clusters
        kmeans.fit(df)
        
        # 2 . Adding clusters column to the dataframe
        cluster = pd.DataFrame({'cluster': list(map(int, kmeans.labels_))})
        df['cluster'] = list(map(int, kmeans.labels_))
        
        # 3 . Creating 2 dataframes each containing data belonging to a cluster
        df0 = df.loc[df['cluster'] == 0]
        df1 = df.loc[df['cluster'] == 1]
        
        #Output
        print('*****************************************************************************')
        randomcolor = random.choice(['grey','blue','magenta','green','red'])
        print(colored(df0.index,randomcolor))
        print(colored(df1.index,randomcolor))
        
        # 4 . Removing the cluster column to perform a fitting for the 2 subset dataframes
        df0 = df0.iloc[:,0:9]
        df1 = df0.iloc[:,0:9]
        
        # 5 . Recursive Call for both extracted dataframes
        Divisive_clustering(df0)
        Divisive_clustering(df1)
        
        return 1
        

In [None]:
Divisive_clustering(fromage)