<a href="https://colab.research.google.com/github/mnijhuis-dnb/Artificial_Intelligence_and_Machine_Learning_for_SupTech/blob/main/Tutorials/Tutorial%206%20Finding%20clusters%20and%20neighbours.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Artificial Intelligence and Machine Learning for SupTech  
Tutorial 6: Finding clusters and neighbours

*	Implementing K-means and DBSCAN
*	Hierarchical clustering: Bottom-up or Top-down?
*	Visual inspection of results

<br/>

14 March 2023  

**Instructors**  
Prof. Iman van Lelyveld (iman.van.lelyveld@vu.nl)<br/>
Dr. Michiel Nijhuis (m.nijhuis@dnb.nl)  

In [None]:
# Download a file from Google Drive by its file id using the gdown CLI.
# NOTE(review): presumably this fetches the company_data.csv that is read from
# /content below — confirm the id still points at that file.
!gdown 1PCu4jNahysRpZ72z31KHpVkyAOp6nrKj

Today is about applying machine learning methods to examine stock returns in various ways. This involves data processing, standardization, normalization, model fitting, etc. Importantly, we also look at how to evaluate the quality of the resulting clusters.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the company data; the first CSV column becomes the row index.
df = pd.read_csv(
    '/content/company_data.csv',
    index_col=0,
)

In [None]:
# Replace every missing value with -1.
# NOTE(review): confirm -1 cannot be mistaken for a genuine observation.
df = df.fillna(value=-1)

In [None]:
# Display the loaded frame after the missing values were replaced.
df

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fit K-means on the feature frame. `n_clusters` and `df_X` are not defined in
# the cells shown here and must exist before this cell runs.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' (announced with a FutureWarning), so leaving it implicit makes the
# resulting clusters depend on the installed library version.
kmeans = KMeans(
    n_clusters=n_clusters,
    n_init=10,
    random_state=0,
).fit(df_X)

In [None]:
# Attach the fitted K-means label of each observation to the index of df_X.
sr_clusters = pd.Series(
    data=kmeans.labels_,
    index=df_X.index,
    name='clusters',
)
sr_clusters

In [None]:
# Number of observations assigned to each cluster label.
sr_clusters.value_counts()

In [None]:
# Mean of every feature column within each cluster.
(
    df_X
    .join(sr_clusters)
    .groupby('clusters')
    .mean()
)

In [None]:
# Group the labelled observations by cluster and take the per-cluster feature
# means as the centroid coordinates (one row per cluster label).
df_X_clusters = (
    df_X
    .join(sr_clusters)
    .groupby('clusters')
)
centroids = df_X_clusters.mean().to_numpy()

In [None]:
# Keep an independent copy of the K-means labels for later comparison:
# `sr_clusters` is reassigned by the agglomerative clustering cell below.
sr_clusters_km = sr_clusters.copy(deep=True)

In [None]:
# Scatter the observations coloured by their cluster label, then overlay the
# cluster centroids as large, semi-transparent crosses.
# Points are drawn with one plot call per cluster instead of one call per row,
# which avoids creating a separate line artist for every observation.
# df_X is expected to have exactly two feature columns (x and y axis).
fig, ax = plt.subplots()
for cluster, df_cluster in df_X.groupby(sr_clusters):
  ax.plot(
    df_cluster.iloc[:, 0], df_cluster.iloc[:, 1],
    color=colors[cluster],
    marker='o', markersize=3, lw=0
  )
# Row i of `centroids` belongs to cluster label i (rows come from a groupby on
# the labels, which sorts them).
for i, (x1, x2) in enumerate(centroids):
  ax.plot(x1, x2, color=colors[i], marker='X', markersize=40, alpha=.6)
ax.set(xlabel=df_X.columns[0], ylabel=df_X.columns[1], title='K-means clusters');

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Fit agglomerative clustering with the same number of clusters as before.
agg = AgglomerativeClustering(n_clusters=n_clusters)
agg.fit(df_X)

# NOTE: this rebinds `sr_clusters` and `df_X_clusters`, replacing the values
# derived from the K-means fit.
sr_clusters = pd.Series(
    data=agg.labels_,
    index=df_X.index,
    name='clusters',
)
df_X_clusters = (
    df_X
    .join(sr_clusters)
    .groupby('clusters')
)
# Per-cluster feature means, used as cluster centers when plotting.
centers = df_X_clusters.mean().to_numpy()

In [None]:
# Keep an independent copy of the agglomerative labels for later comparison.
sr_clusters_agg = sr_clusters.copy(deep=True)

In [None]:
# Scatter the observations coloured by their cluster label, then overlay the
# cluster centers as large, semi-transparent crosses.
# Points are drawn with one plot call per cluster instead of one call per row,
# which avoids creating a separate line artist for every observation.
# df_X is expected to have exactly two feature columns (x and y axis).
fig, ax = plt.subplots()
for cluster, df_cluster in df_X.groupby(sr_clusters):
  ax.plot(
    df_cluster.iloc[:, 0], df_cluster.iloc[:, 1],
    color=colors[cluster],
    marker='o', markersize=3, lw=0
  )
# Row i of `centers` belongs to cluster label i (rows come from a groupby on
# the labels, which sorts them).
for i, (x1, x2) in enumerate(centers):
  ax.plot(x1, x2, color=colors[i], marker='X', markersize=40, alpha=.6)
ax.set(xlabel=df_X.columns[0], ylabel=df_X.columns[1], title='Agglomerative clusters');

## How to evaluate clustering performance?

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score

In [None]:

# Print the built-in documentation of silhouette_score for reference.
help(silhouette_score)

## KMeans silhouette

In [None]:
# Silhouette plot for the K-means solution: one panel per cluster showing the
# sorted per-sample silhouette values and the cluster average (dashed line).
# The stored K-means labels `sr_clusters_km` are used for the scores, the loop
# and the mask. `sr_clusters` holds the agglomerative labels by this point, so
# using it here would mix values and masks from two different clusterings.
fig, axes = plt.subplots(1, n_clusters, sharey=True, figsize=[15,5])

silhouette_values = silhouette_samples(df_X, sr_clusters_km)

for cluster in sr_clusters_km.unique():
  ax = axes[cluster]
  color = colors[cluster]
  # Boolean mask as a plain array so it indexes the NumPy result positionally.
  in_cluster = (sr_clusters_km == cluster).to_numpy()
  sils = np.sort(silhouette_values[in_cluster])
  ax.bar(range(len(sils)), sils, color=color, width=2)

  sils_avg = sils.mean()
  ax.axhline(sils_avg, lw=3, ls='--', color=color)
  ax.set_title(f'Cluster {cluster}\n(avg. silhouette: {sils_avg:.3f})')

fig.tight_layout()

## Agglomerative clustering silhouette

In [None]:
# Silhouette plot for the agglomerative solution: one panel per cluster showing
# the sorted per-sample silhouette values and the cluster average (dashed line).
# The stored labels `sr_clusters_agg` are used for the scores, the loop and the
# mask, so the cell does not depend on whatever `sr_clusters` currently holds.
fig, axes = plt.subplots(1, n_clusters, sharey=True, figsize=[15,5])

silhouette_values = silhouette_samples(df_X, sr_clusters_agg)

for cluster in sr_clusters_agg.unique():
  ax = axes[cluster]
  color = colors[cluster]
  # Boolean mask as a plain array so it indexes the NumPy result positionally.
  in_cluster = (sr_clusters_agg == cluster).to_numpy()
  sils = np.sort(silhouette_values[in_cluster])
  ax.bar(range(len(sils)), sils, color=color, width=2)

  sils_avg = sils.mean()
  ax.axhline(sils_avg, lw=3, ls='--', color=color)
  ax.set_title(f'Cluster {cluster}\n(avg. silhouette: {sils_avg:.3f})')

fig.tight_layout()