In [None]:
#Mohsin Essani

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
import scipy.cluster.hierarchy as sch
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import silhouette_score as sil, calinski_harabasz_score as chs, silhouette_samples
from sklearn.metrics import adjusted_rand_score, fowlkes_mallows_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics import silhouette_score as sil_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.model_selection import cross_validate, KFold


# Get the silhouette score

In [None]:
# Supressing Warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Data display customization

# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
# None means "never truncate cell contents". The legacy sentinel -1 has been
# deprecated since pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Read the 'data' sheet, rename the two columns whose names contain
# punctuation, and use the ID column as the index -- written as one chain
# rather than a sequence of in-place mutations.
data = (
    pd.read_excel('EastWestAirlines.xlsx', sheet_name='data')
    .rename(columns={'ID#': 'ID', 'Award?': 'Award'})
    .set_index('ID')
)
data

# Hierarchical Clustering

In [None]:
# Scale the data: z-score every column (zero mean, unit standard deviation).
# Column-wise DataFrame reductions are used instead of np.mean/np.std: NumPy
# forwards axis=None to the DataFrame method, and newer pandas releases treat
# that as "reduce over the whole frame", which would subtract one global mean
# from every column.
# ddof=0 keeps the population standard deviation that np.std computes.
data_scaled = (data - data.mean()) / data.std(ddof=0)


In [None]:
# Build the agglomerative merge tree with Ward's minimum-variance criterion
# on Euclidean distances between the scaled rows.
Z = linkage(
    data_scaled,
    metric='euclidean',
    method='ward',
)


In [None]:

# Draw the full merge tree on an explicit, wide Axes.
fig, ax = plt.subplots(figsize=(25, 10))
dendrogram(Z, ax=ax)
ax.set_title('Dendrogram')
ax.set_xlabel('Passenger ID')
ax.set_ylabel('Distance')
plt.show()



In [None]:
# Cut the tree into flat clusters. With criterion='maxclust' the threshold
# argument is the maximum number of clusters to form, not a distance.
k = 3
clusters = fcluster(Z, t=k, criterion='maxclust')


In [None]:

# Attach each row's hierarchical cluster id as a 'cluster' column.
# NOTE(review): `data` now carries a label column; leave it out whenever
# `data` is reused as a feature matrix.
data = data.assign(cluster=clusters)

# Bucket the rows by that id for the per-cluster summaries.
grouped_data = data.groupby(by=['cluster'])


In [None]:

# Per-cluster average of every column.
grouped_data_mean = grouped_data.agg('mean')

# Show the resulting cluster profiles.
print(grouped_data_mean)

In [None]:
# Evaluate the hierarchical clustering (`clusters`) on the scaled features.

# Internal validity: silhouette lies in [-1, 1]; higher is better.
silhouette_score = sil(data_scaled, clusters)
print("Silhouette score:", silhouette_score)

# Agreement with the 'Award' flag, used here as a reference labelling.
ari = adjusted_rand_score(data['Award'], clusters)
print("Adjusted Rand index:", ari)

fmi = fowlkes_mallows_score(data['Award'], clusters)
print("Fowlkes-Mallows index:", fmi)

# Calinski-Harabasz: higher is better.
chi = calinski_harabasz_score(data_scaled, clusters)
print("Calinski-Harabasz index:", chi)

# Davies-Bouldin: lower is better.
dbi = davies_bouldin_score(data_scaled, clusters)
print("Davies-Bouldin index:", dbi)

# How well can a classifier recover the cluster ids from the features?
# The forest and the fold assignment are both seeded so the scores are
# reproducible; shuffling keeps the folds from following the file's row order.
cv_results = cross_val_score(
    RandomForestClassifier(random_state=123),
    data_scaled,
    clusters,
    cv=KFold(n_splits=5, shuffle=True, random_state=123),
)
print("Cross-validation results:", cv_results)


In [None]:
#The metrics above do not show a convincing hierarchical clustering, so we will try other approaches, starting with K-means.

# KMeans Clustering

In [None]:


# Perform scaling on the data
# Standardize only the original features. Other cells store cluster
# assignments on `data` ('cluster', 'LABEL'); scaling those as well would feed
# a previous clustering into K-means and into every score computed from
# `data_scaled`. errors='ignore' keeps this cell valid whether or not those
# columns exist yet.
feature_frame = data.drop(columns=['cluster', 'LABEL'], errors='ignore')
scaler = StandardScaler()
data_scaled = scaler.fit_transform(feature_frame)



In [None]:
# Two-cluster K-means on the scaled features (seeded for repeatability),
# then the cluster index assigned to every row.
kmeans = KMeans(random_state=123, n_clusters=2).fit(data_scaled)
prediction = kmeans.predict(data_scaled)


In [None]:

# Elbow Plot
# Collect the within-cluster sum of squared distances (inertia) for each
# candidate cluster count. The loop binds its models to a separate name so the
# fitted `kmeans` model that produced `prediction` is not overwritten.
# NOTE(review): the scan starts at 15 clusters; small values of k are not covered.
range_n_clusters = [15, 30, 45, 60, 75, 90, 105, 120]
ssd = []
for num_clusters in range_n_clusters:
    elbow_model = KMeans(n_clusters=num_clusters, max_iter=50, random_state=123)
    elbow_model.fit(data_scaled)
    ssd.append(elbow_model.inertia_)
    



In [None]:
# plot the SSDs for each n_clusters
# The x-axis carries the actual cluster counts rather than the list position.
plt.plot(range_n_clusters, ssd, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Sum of squared distances (inertia)')
plt.title('Elbow plot')
plt.show()

# Determining the optimal number of clusters
# The elbow plot shows that 4 clusters would be the optimal number of clusters
# NOTE(review): the scanned cluster counts start at 15, so this curve cannot by
# itself support k = 4 -- confirm with a scan that includes small k.

In [None]:
# Store the K-means assignment next to the original columns.
data = data.assign(LABEL=prediction)

# Cluster sizes: number of rows per label.
label_counts = data["LABEL"].value_counts()
print(label_counts)




In [None]:
# Per-cluster mean of each listed variable.
mean_columns = [
    "Balance", "Qual_miles", "cc1_miles", "cc2_miles", "cc3_miles",
    "Bonus_miles", "Bonus_trans", "Flight_miles_12mo", "Flight_trans_12",
    "Days_since_enroll", "Award",
]
# Same mapping as spelling out {"<column>": "mean"} for every entry, in the same order.
aggregations = dict.fromkeys(mean_columns, "mean")
result = data.groupby(["LABEL"]).agg(aggregations)
print(result)


In [None]:

# Print the centroid matrix of the fitted model under a heading.
# NOTE(review): `kmeans` is whichever model was bound to that name most
# recently -- confirm it is the model behind `prediction`.
print("Centroids:", kmeans.cluster_centers_, sep="\n")



In [None]:
# # Plotting the variables against the centroids
# for i in range(4):
#     plt.scatter(data[data["LABEL"]==i]["Balance"],
#                 data[data["LABEL"]==i]["Qual_miles"],
#                 label="Cluster"+str(i+1),
#                 color=np.random.rand(3,))
# pl

In [None]:
# plot the results
# One scatter call per cluster index 0..2; x is the first column of `data`,
# y is the second.
for cluster_idx, colour in enumerate(('red', 'blue', 'green')):
    members = prediction == cluster_idx
    plt.scatter(data.iloc[members, 0], data.iloc[members, 1],
                s = 100, c = colour, label = f'Cluster {cluster_idx + 1}')


In [None]:

# plot the centroids
# Each notebook cell renders its own figure, so the clustered points are drawn
# again here; otherwise this figure would contain the centroids only.
fig, ax = plt.subplots()
palette = ['red', 'blue', 'green']
for cluster_id in np.unique(prediction):
    members = prediction == cluster_id
    ax.scatter(data.iloc[members, 0], data.iloc[members, 1], s=100,
               c=palette[cluster_id % len(palette)],
               label=f'Cluster {cluster_id + 1}')

# Centroids are expressed in standardized units; map them back to the original
# units so they share axes with the points.
# Assumes `kmeans` is the model that produced `prediction` and `scaler` is the
# transformer that produced its training matrix -- confirm.
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
ax.scatter(centroids[:, 0], centroids[:, 1], s=300, c='yellow', label='Centroids')

ax.set_title('Clusters of passengers')
# Label the axes with the columns actually plotted (first two columns of `data`).
ax.set_xlabel(data.columns[0])
ax.set_ylabel(data.columns[1])
ax.legend()
plt.show()

In [None]:
# Evaluate the K-means assignment (`prediction`). The hierarchical `clusters`
# array must not be scored here, or these numbers describe the wrong model.

# Internal validity: silhouette lies in [-1, 1]; higher is better.
silhouette_score = sil(data_scaled, prediction)
print("Silhouette score:", silhouette_score)

# Agreement with the 'Award' flag, used here as a reference labelling.
ari = adjusted_rand_score(data['Award'], prediction)
print("Adjusted Rand index:", ari)

fmi = fowlkes_mallows_score(data['Award'], prediction)
print("Fowlkes-Mallows index:", fmi)

# Calinski-Harabasz: higher is better.
chi = calinski_harabasz_score(data_scaled, prediction)
print("Calinski-Harabasz index:", chi)

# Davies-Bouldin: lower is better.
dbi = davies_bouldin_score(data_scaled, prediction)
print("Davies-Bouldin index:", dbi)

# How well can a classifier recover the K-means labels from the features?
# Seeded and shuffled so the fold scores are reproducible.
cv_results = cross_val_score(
    RandomForestClassifier(random_state=123),
    data_scaled,
    prediction,
    cv=KFold(n_splits=5, shuffle=True, random_state=123),
)
print("Cross-validation results:", cv_results)

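#In this case, the cross-validation results are all 1. This means a classifier can reproduce the cluster labels perfectly from the scaled features; on its own it does not show that K-means is a good fit for the data or that it identifies meaningful segments among the passengers. A perfect score is a warning sign: before drawing conclusions, check that the labels being scored are the K-means labels and that no cluster-label column is included in the scaled features.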

# Use of DBSCAN 

In [None]:

# Density-based clustering of the scaled features: neighbourhood radius 0.5,
# at least 5 samples in a neighbourhood for a core point.
model = DBSCAN(min_samples=5, eps=0.5).fit(data_scaled)
model


In [None]:
# Obtain the cluster label DBSCAN assigned to each row it was fitted on.
# The plotting code below treats the label -1 as noise and leaves it out.
labels = model.labels_

In [None]:
# Scatter the rows whose DBSCAN label is not -1 in the Balance / Qual_miles
# plane, coloured by cluster; ids without a palette entry fall back to gray.
colors = {0:'red', 1:'blue', 2:'green', 3:'yellow', 4:'black', 5:'orange'}
default_color = 'gray'
clustered = labels != -1
c = [colors.get(label, default_color) for label in labels[clustered]]
fig, ax = plt.subplots()
ax.scatter(data.loc[clustered, 'Balance'], data.loc[clustered, 'Qual_miles'], c=c)
plt.show()

In [None]:
# Evaluate the DBSCAN result (`labels`), not the earlier hierarchical clusters.
# Rows labelled -1 (noise) belong to no cluster, so they are left out of every score.
clustered = labels != -1
dbscan_labels = labels[clustered]
dbscan_features = np.asarray(data_scaled)[clustered]
dbscan_award = data.loc[clustered, 'Award']
n_dbscan_clusters = len(np.unique(dbscan_labels))
print("DBSCAN clusters:", n_dbscan_clusters, "| noise points:", int((~clustered).sum()))

if n_dbscan_clusters < 2:
    # Silhouette, Calinski-Harabasz and Davies-Bouldin all need at least two clusters.
    print("Fewer than two DBSCAN clusters were found; validity scores are undefined.")
else:
    # Internal validity: silhouette lies in [-1, 1]; higher is better.
    silhouette_score = sil(dbscan_features, dbscan_labels)
    print("Silhouette score:", silhouette_score)

    # Agreement with the 'Award' flag, used here as a reference labelling.
    ari = adjusted_rand_score(dbscan_award, dbscan_labels)
    print("Adjusted Rand index:", ari)

    fmi = fowlkes_mallows_score(dbscan_award, dbscan_labels)
    print("Fowlkes-Mallows index:", fmi)

    # Calinski-Harabasz: higher is better.
    chi = calinski_harabasz_score(dbscan_features, dbscan_labels)
    print("Calinski-Harabasz index:", chi)

    # Davies-Bouldin: lower is better.
    dbi = davies_bouldin_score(dbscan_features, dbscan_labels)
    print("Davies-Bouldin index:", dbi)

    # How well can a classifier recover the DBSCAN labels from the features?
    # Seeded and shuffled so the fold scores are reproducible.
    cv_results = cross_val_score(
        RandomForestClassifier(random_state=123),
        dbscan_features,
        dbscan_labels,
        cv=KFold(n_splits=5, shuffle=True, random_state=123),
    )
    print("Cross-validation results:", cv_results)
