In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

plt.style.use('bmh')
mpl.rcParams.update({
    "grid.linestyle" : "dashed",
    "grid.alpha" : 0,
    "axes.facecolor" : "white",
    "axes.spines.top" : False,
    "axes.spines.bottom" : False,
    "axes.spines.right" : False,
    "axes.spines.left" : False,
    "legend.frameon" : False,
    "figure.figsize" : (8, 5),
    "figure.dpi" : 500,
})

%matplotlib inline

In [None]:
# Read the feature table (the first CSV column becomes the row index) and keep
# every column except the last one, i.e. the dataset without the target feature.
df = pd.read_csv("./data/cad/data_feat.csv", index_col=0, decimal='.').iloc[:, :-1]

### 1- Feature Clustering Using Correlation Matrix

In [None]:
# Absolute Pearson correlation between every pair of features.
corr = df.corr(method='pearson').abs()
# Minimum absolute correlation for two features to be reported as correlated.
threshold = 0.3

# For every feature, rank all features by absolute correlation (strongest first),
# keep those at or above the threshold, and store their names as one
# comma-separated string. A feature normally lists itself first, because its
# self-correlation is 1. NaN correlations never pass the threshold.
correlated_columns = []
for col_name in corr.columns:
    ranked = corr[col_name].sort_values(ascending=False)
    ranked = ranked[ranked >= threshold]
    correlated_columns.append(",".join(map(str, ranked.index)))

# Build the table in a single call: DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row is quadratic anyway.
df_cluster = pd.DataFrame(
    {"correlated_columns": correlated_columns},
    index=list(corr.columns),
)

# Optional export of the table:
# df_cluster.to_csv(f"data/data/features_clustering/feat_cluster_{threshold}.csv")
df_cluster.head()

In [None]:
# Expects `corr` (absolute Pearson matrix) and `threshold` from the previous cell.
#
# The correlation matrix is symmetric and its diagonal is the trivial
# self-correlation, so only the strict upper triangle is used: every unordered
# feature pair is counted exactly once, without relying on `value != 1`
# (which would also discard genuinely perfect off-diagonal correlations).
upper_rows, upper_cols = np.triu_indices_from(corr.values, k=1)
corr1D = pd.Series(corr.values[upper_rows, upper_cols])

# Apply the threshold to the raw values first; rounding to one decimal is only
# used afterwards to bin the values for the bar chart. (Rounding first would
# count pairs in [0.25, 0.3) as being >= 0.3.)
strong_corr = corr1D[corr1D >= threshold]
corr_dist = strong_corr.round(1).value_counts().sort_index()

# An empty Series cannot be drawn as a bar chart, so skip the plot in that case.
if not corr_dist.empty:
    ax = corr_dist.plot(kind="bar", fontsize=6)
    ax.set_title("Correlation Features Distribution")
    ax.set_xlabel("Absolute correlation (rounded to 1 decimal)")
    ax.set_ylabel("Number of feature pairs")

# Number of feature pairs at or above the threshold / total number of feature pairs.
print(f"Correlation (>= {threshold}) size: {len(strong_corr)} / {len(corr1D)}")

### 2- Feature Clustering Using Correlation Matrix + Hierarchical Clustering
[Clustering using a matrix of correlation coefficients](https://stackoverflow.com/questions/38070478/how-to-do-clustering-using-the-matrix-of-correlation-coefficients)

[Spearman vs. Pearson](https://stats.stackexchange.com/questions/8071/how-to-choose-between-pearson-and-spearman-correlatio) considerations:
- Spearman only checks for monotonic relationships (fewer assumptions, more robust to outliers)
- Pearson checks for linear relationships

In [None]:
# Relabel the hyperlipemia feature as dyslipidemia. The old label must match the
# existing column name exactly (embedded newline and spelling included).
renamed_columns = {
    "Hyperlipemia\nHistoty of hyperlipemia": "Dyslipidemia\nHistoty of dyslipidemia",
}
df = df.rename(columns=renamed_columns)

In [None]:
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist, squareform

# 1) Spearman rank correlation as similarity, 1 - |rho| as dissimilarity
#    (strongly correlated features, positively or negatively, are "close").
corr = df.corr(method="spearman")
dist = 1 - corr.abs()

# 2) Pairwise distances between features, in condensed form.
#    pdist uses its default metric (Euclidean) on the ROWS of `dist`, so two
#    features are close when their whole dissimilarity profiles against all
#    other features are similar; it does not merely reshape `dist`.
#    NOTE(review): if the intent was to cluster directly on 1 - |rho|, the
#    condensed vector would be squareform(dist, checks=False); those distances
#    lie in [0, 1], so `threshold` below would have to be re-tuned.
dist_condense = pdist(dist)

# 3) Agglomerative clustering with "weighted" linkage, then cut the tree:
#    merges above `threshold` are not applied, yielding flat cluster labels
#    (one label per feature, in the order of `corr.columns`).
hier = hierarchy.linkage(dist_condense, method="weighted")
threshold = 1.3
cluster_labels = hierarchy.fcluster(hier, threshold, criterion="distance")
print(f"Number of clusters: {len(set(cluster_labels))}")

# Dendrogram of the features; branches below `threshold` are coloured per cluster.
# The threshold can be chosen from this plot or by any other method
# (e.g. a quantile or a desired number of feature groups).
plt.figure(figsize=(15,10), dpi=500)
plt.tick_params(labelbottom=False, bottom=False)
# Labels are passed as a plain list rather than a pandas Index: some SciPy
# releases evaluate the truth value of `labels`, which is ambiguous for an Index.
dend = hierarchy.dendrogram(hier, truncate_mode="level", p=30, color_threshold=threshold,
                            labels=corr.columns.tolist(), orientation="left", leaf_font_size=10)

In [None]:
# Collect the feature names of every cluster into one list per cluster.
# `cluster_labels` follows the order of the clustered correlation matrix, so the
# names are taken from `corr.columns` (this stays aligned even if `df.corr`
# dropped columns that `df.columns` still contains).
feature_clusters = pd.DataFrame({'ClusterID': cluster_labels,
                                 'Feature': corr.columns})

# Result: a Series named 'Feature' with one list of feature names per cluster,
# ordered by ascending ClusterID and re-indexed from 0.
df_cluster = (
    feature_clusters
    .groupby('ClusterID')['Feature']
    .agg(list)
    .reset_index(drop=True)
)

# Persist the grouping (pickled pandas Series) and display it.
df_cluster.to_pickle('data/cad/feat_cluster_hier.df')
df_cluster

In [None]:
from scipy.cluster.hierarchy import cophenet
from sklearn.metrics import silhouette_score

# The cophenetic correlation coefficient measures how faithfully the dendrogram
# preserves the pairwise distances between the original data points;
# the closer it is to 1, the better.
# https://en.wikipedia.org/wiki/Cophenetic_correlation
original_dists = pdist(dist)
cophenet_result = cophenet(hier, original_dists)
# First element: the coefficient itself; second: the condensed cophenetic distances.
c, coph_dists = cophenet_result
c