In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn import datasets
from sqlalchemy import create_engine
import warnings 
warnings.filterwarnings("ignore")
sns.set(style="whitegrid")

In [2]:
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'heartdisease'

In [3]:
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

In [4]:
heartdisease_df = pd.read_sql_query('select * from heartdisease',con=engine)

In [7]:
# Define the features and the outcome
X = heartdisease_df.iloc[:, :13]
y = heartdisease_df.iloc[:, 13]

# Replace missing values (marked by ?) with a 0
X = X.replace(to_replace='?', value=0)

# Binarize y so that 1 means heart disease diagnosis and 0 means no diagnosis
y = np.where(y > 0, 0, 1)

# Normalize
X_std = StandardScaler().fit_transform(X)

If y is equal to 1, then it indicates that the corresponding patient has heart disease and if y is equal to 0, then the patient doesn't have heart disease.

## Produce dendrograms for the heart disease dataset using three different linkage methods: complete, average and ward. Which linkage method do you think produces more reasonable result?

In [6]:
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn import datasets, metrics

Linkage: Complete

In [9]:
# Defining the agglomerative clustering
agg_cluster = AgglomerativeClustering(linkage='complete', 
                                      affinity='cosine',
                                      n_clusters=3)

# Fit model
clusters_complete = agg_cluster.fit_predict(X_std)

In [10]:
print("Adjusted Rand Index of the Agglomerative Clustering solution: {}"
      .format(metrics.adjusted_rand_score(y, clusters_complete)))
print("The silhoutte score of the Agglomerative Clustering solution: {}"
      .format(metrics.silhouette_score(X_std, clusters_complete, metric='euclidean')))

Adjusted Rand Index of the Agglomerative Clustering solution: 0.18301939761943772
The silhoutte score of the Agglomerative Clustering solution: 0.09438720643481781


Linkage: Ward

In [13]:
agg_cluster_ward = AgglomerativeClustering(linkage='ward', 
                                      affinity='euclidean',
                                      n_clusters=3)

# Fit model
clusters_ward = agg_cluster_ward.fit_predict(X_std)

In [14]:
print("Adjusted Rand Index of the Agglomerative Clustering solution: {}"
      .format(metrics.adjusted_rand_score(y, clusters_ward)))
print("The silhoutte score of the Agglomerative Clustering solution: {}"
      .format(metrics.silhouette_score(X_std, clusters_ward, metric='euclidean')))

Adjusted Rand Index of the Agglomerative Clustering solution: 0.14859596899025246
The silhoutte score of the Agglomerative Clustering solution: 0.155240382445262


Linkage: Average

In [17]:
agg_cluster_avg = AgglomerativeClustering(linkage='average', 
                                      affinity='manhattan',
                                      n_clusters=3)

# Fit model
clusters_avg = agg_cluster_ward.fit_predict(X_std)

In [19]:
print("Adjusted Rand Index of the Agglomerative Clustering solution: {}"
      .format(metrics.adjusted_rand_score(y, clusters_avg)))
print("The silhoutte score of the Agglomerative Clustering solution: {}"
      .format(metrics.silhouette_score(X_std, clusters_avg, metric='manhattan')))

Adjusted Rand Index of the Agglomerative Clustering solution: 0.14859596899025246
The silhoutte score of the Agglomerative Clustering solution: 0.19999005076869555


All three linkage methods produced fairly low results on both the ARI and silhoutte scores. The best of the three is the Average with the best set of results of ARI: .1485 and Silhoutte of .19999 

# Apply agglomerative clustering to the heart disease data by setting n_clusters=2. Try the three linkage methods above and get ARI and silhouette scores for each of your solutions. Compare the results with each other and with that of the k-means solution that you implemented in the assignment of the previous checkpoint. Which algorithm and setting does perform better?

Linkage: Complete

In [20]:
agg_cluster = AgglomerativeClustering(linkage='complete', 
                                      affinity='cosine',
                                      n_clusters=2)

# Fit model
clusters_complete = agg_cluster.fit_predict(X_std)

In [21]:
print("Adjusted Rand Index of the Agglomerative Clustering solution: {}"
      .format(metrics.adjusted_rand_score(y, clusters_complete)))
print("The silhoutte score of the Agglomerative Clustering solution: {}"
      .format(metrics.silhouette_score(X_std, clusters_complete, metric='euclidean')))

Adjusted Rand Index of the Agglomerative Clustering solution: 0.21394030618551016
The silhoutte score of the Agglomerative Clustering solution: 0.11730765444448985


Linkage: Ward 

In [24]:
agg_cluster_ward = AgglomerativeClustering(linkage='ward', 
                                      affinity='euclidean',
                                      n_clusters=2)

# Fit model
clusters_ward = agg_cluster_ward.fit_predict(X_std)

In [25]:
print("Adjusted Rand Index of the Agglomerative Clustering solution: {}"
      .format(metrics.adjusted_rand_score(y, clusters_ward)))
print("The silhoutte score of the Agglomerative Clustering solution: {}"
      .format(metrics.silhouette_score(X_std, clusters_ward, metric='euclidean')))

Adjusted Rand Index of the Agglomerative Clustering solution: 0.146129913123814
The silhoutte score of the Agglomerative Clustering solution: 0.1387197366557222


Linkage: Average

In [26]:
agg_cluster_avg = AgglomerativeClustering(linkage='average', 
                                      affinity='manhattan',
                                      n_clusters=2)

# Fit model
clusters_avg = agg_cluster_ward.fit_predict(X_std)

In [27]:
print("Adjusted Rand Index of the Agglomerative Clustering solution: {}"
      .format(metrics.adjusted_rand_score(y, clusters_avg)))
print("The silhoutte score of the Agglomerative Clustering solution: {}"
      .format(metrics.silhouette_score(X_std, clusters_avg, metric='manhattan')))

Adjusted Rand Index of the Agglomerative Clustering solution: 0.146129913123814
The silhoutte score of the Agglomerative Clustering solution: 0.18207989265419602


Comparing all ARI and Silouhette scores, two cluster shows the best results. In the agglomerative clustering of 2, the ARI score went up most significantly among the three methods. In the previous assignment, two clusters also produced the higher results. 