In [507]:
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import silhouette_score
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
# Upload the dataset interactively when running on Google Colab.
# Outside Colab the google.colab package is unavailable, so skip the upload
# and let the notebook read 'World Indicators.csv' from the working directory.
try:
    from google.colab import files
except ImportError:
    uploaded = {}
else:
    uploaded = files.upload()

In [508]:
import warnings
warnings.filterwarnings("ignore")

In [509]:
df = pd.read_csv('World Indicators.csv')
# Raw indicator table; the file is read from the current working directory.

In [510]:
# Keep only the rows in which every column has a value
df = df.dropna(axis=0, how='any')

In [511]:
#Formatting dataset to remove $ and % symbols
def _symbol_column_to_float(column, symbol):
    """Return `column` as floats with `symbol` and thousands separators removed.

    Parameters
    ----------
    column : pandas.Series
        Column holding values such as "$1,234" or "12.5%".
    symbol : str
        Literal character(s) to strip (e.g. "$" or "%").

    Notes
    -----
    * ``regex=False`` keeps "$" a literal character instead of the regex
      end-of-string anchor, independent of the pandas version's default.
    * Casting to ``str`` first makes the cell safe to re-run after the column
      has already been converted to float.
    """
    return (
        column.astype(str)
        .str.replace(symbol, "", regex=False)
        .str.replace(",", "", regex=False)
        .astype(float)
    )


df["GDP"] = _symbol_column_to_float(df["GDP"], "$")
df["Business Tax Rate"] = _symbol_column_to_float(df["Business Tax Rate"], "%")
df["Health Exp/Capita"] = _symbol_column_to_float(df["Health Exp/Capita"], "$")

In [512]:
# Independent copy taken before 'Region' and 'Country' are dropped from df;
# later cells read df_2['Country'] to label the clusters.
df_2 = df.copy()

In [513]:
# Remove the identifier columns and the two indicators that are left out of
# the clustering features, in a single drop call.
df = df.drop(columns=['Region', 'Country', 'Energy Usage', 'Lending Interest'])

In [None]:
# Preview the remaining feature columns (first rows only, not the whole frame)
df.head()

In [515]:
# Rescale every feature to the [0, 1] range (MinMaxScaler default)
scaler = MinMaxScaler()
df_normalized = scaler.fit(df).transform(df)

In [None]:
# Wrap the scaled array back into a DataFrame with the original column names
df_pr = pd.DataFrame(df_normalized, columns=df.columns)
# Show only the first rows instead of dumping the full frame
df_pr.head()

Elbow Plot

In [517]:
# Elbow method: record the K-means inertia for k = 1..10 on the first 15
# normalized columns so the bend in the curve can be inspected.
k_rng = range(1,11)
sse = []
for k in k_rng:
    # Fixed seed and explicit n_init keep the curve reproducible across
    # re-runs and scikit-learn versions.
    km1 = KMeans(n_clusters=k, n_init=10, random_state=42)
    # Only inertia_ is needed here, so fit() is sufficient.
    km1.fit(df_pr.iloc[:,0:15])
    sse.append(km1.inertia_)

In [None]:
# Inertia values collected for each k in k_rng (one entry per k)
sse

In [None]:
# Elbow plot: inertia against the number of clusters.
plt.plot(k_rng,sse)
plt.xlabel('K')
plt.ylabel('Sum of Squared Error')
plt.title('Elbow Plot')
# plt.show() renders the figure without echoing the Line2D repr,
# matching the other plotting cells in this notebook.
plt.show()

K-Means Clustering

In [521]:
# K-means silhouette scores for k = 2 to 5 on the first 15 normalized columns
for n in range(2, 6):
    # Seeded so the printed scores are identical on every re-run
    km_test = KMeans(n_clusters=n, n_init=10, random_state=42)
    y_test_predicted = km_test.fit_predict(df_pr.iloc[:,0:15])
    shscore = metrics.silhouette_score(df_pr.iloc[:,0:15], y_test_predicted)
    print(f"SHScore with K equals {n}: {shscore}")

SHScore with K equals 2: 0.3645406509699328
SHScore with K equals 3: 0.26211896985169775
SHScore with K equals 4: 0.26075215366845544
SHScore with K equals 5: 0.2198817699532651


Silhouette score plot using K-Means clusters

In [None]:
# Cluster counts to evaluate (2..9)
cluster_range_1 = range(2, 10)

# Silhouette score and cluster labels for each cluster count
silhouette_scores = []
cluster_labels = []

# Fit one K-means model per cluster count on the first 15 normalized columns
for n_clusters in cluster_range_1:
    # Seeded so the curve is identical on every re-run
    km_test = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels.append(km_test.fit_predict(df_pr.iloc[:,0:15]))
    silhouette_scores.append(silhouette_score(df_pr.iloc[:,0:15], cluster_labels[-1]))


# Plotting the silhouette scores
plt.plot(cluster_range_1, silhouette_scores, 'bo-')
plt.xlabel('No of clusters')
plt.ylabel('SHScore')
plt.show()

Hierarchical clustering

In [None]:
# Silhouette score of agglomerative (hierarchical) clustering for 2 to 5 clusters
for n in range(2, 6):
    # fit() returns the estimator itself, so the model can be built and fitted in one step
    hc_test_1 = AgglomerativeClustering(n_clusters=n).fit(df_pr.iloc[:,0:15])

    # Score the labels assigned during fitting
    sh_score = metrics.silhouette_score(df_pr.iloc[:,0:15], hc_test_1.labels_)
    print(f'SHScore with Hierarchical clusters equals {n}: {sh_score}')

Silhouette score plot using Hierarchical clustering

In [None]:
# Cluster counts to evaluate (2..9)
cluster_range_2 = range(2, 10)

# Per-size results: silhouette score and the label array it was computed from
silhouette_scores_hc = []
cluster_labels_hc = []

# One agglomerative model per cluster count, fitted on the first 15 normalized columns
for n_clusters in cluster_range_2:
    hc_test_1 = AgglomerativeClustering(n_clusters=n_clusters)
    labels_for_size = hc_test_1.fit_predict(df_pr.iloc[:,0:15])
    cluster_labels_hc.append(labels_for_size)
    silhouette_scores_hc.append(silhouette_score(df_pr.iloc[:,0:15], labels_for_size))

# Silhouette score against number of clusters
plt.plot(cluster_range_2, silhouette_scores_hc, 'bo-')
plt.ylabel('SHScore')
plt.xlabel('No of clusters')
plt.show()

Reporting the best clustering solution.

In [526]:
# k=2 gave the highest silhouette score above, so both methods are compared at k=2
# on the first 15 normalized columns.

# K-means with K=2 (seeded so the reported score is reproducible)
km_tested = KMeans(n_clusters=2, n_init=10, random_state=42)
y_tested = km_tested.fit_predict(df_pr.iloc[:,0:15])
km_score_k2 = metrics.silhouette_score(df_pr.iloc[:,0:15], y_tested)
print('SHScore with K equals 2: '+ str(km_score_k2))

# hierarchical clustering with K=2
hc_tested = AgglomerativeClustering(n_clusters=2)
hc_tested.fit(df_pr.iloc[:,0:15])
hc_score_k2 = metrics.silhouette_score(df_pr.iloc[:,0:15], hc_tested.labels_)
print('SHScore with clusters equals 2 : '+ str(hc_score_k2))

# Derive the conclusion from the computed scores instead of hard-coding it.
# NOTE(review): the later cells always use hc_tested.labels_; revisit them if K-means wins.
if hc_score_k2 > km_score_k2:
    print("Since Silhouette score for Hierarchical clustering is the highest , Hierarchical clustering is considered as the best solution")
elif km_score_k2 > hc_score_k2:
    print("Since Silhouette score for K-means clustering is the highest , K-means clustering is considered as the best solution")
else:
    print("Silhouette scores for K-means and Hierarchical clustering are equal , either can be considered as the best solution")

SHScore with K equals 2: 0.3645406509699328
SHScore with clusters equals 2 : 0.37816563381651036
Since Silhouette score for Hierarchical clustering is the highest , Hierarchical clustering is considered as the best solution


Grouping countries based on K-means and hierarchical clustering methods

In [None]:
# Pair each country with the label assigned by the 2-cluster hierarchical model
Hierarchical_grouping = pd.DataFrame(
    dict(Cluster=hc_tested.labels_, Country=df_2['Country'])
)
Hierarchical_grouping

In [None]:
# Pair each country with the label assigned by the 2-cluster K-means model
Kmeans_grouping = pd.DataFrame(
    dict(Cluster=y_tested, Country=df_2['Country'])
)
Kmeans_grouping

In [None]:
# Map each hierarchical cluster label (as a string) to the list of its countries
countries_by_cluster = Hierarchical_grouping.groupby('Cluster')['Country']
clusters_dict = {
    str(label): list(country_names)
    for label, country_names in countries_by_cluster
}

# show the mapping
clusters_dict

3 Scatter plots 

In [530]:
# Min-max scale every column of df_2 except the last two
# (assumes the last two columns are Region and Country — TODO confirm column order)
indicator_values = df_2.iloc[:, :-2]
scaler = MinMaxScaler()
scaled_indicator_values = scaler.fit_transform(indicator_values)
df_2_normalized = pd.DataFrame(scaled_indicator_values, columns=df_2.columns[:-2])

In [None]:
# Preview the scaled indicators (first rows only, not the whole frame)
df_2_normalized.head()

In [533]:
# Attach the country names and hierarchical cluster labels to the scaled frame.
# The Country index is reset so it lines up with the frame's fresh 0..n-1 index.
df_2_normalized = df_2_normalized.assign(
    Country=df_2['Country'].reset_index(drop=True),
    Cluster=hc_tested.labels_,
)

In [None]:
# Preview the scaled indicators with the Country and Cluster columns added
df_2_normalized.head()

1. GDP vs Urban Population

In [None]:
# Rows belonging to hierarchical cluster 0 and cluster 1
t41 = df_2_normalized.loc[df_2_normalized['Cluster'] == 0]
t42 = df_2_normalized.loc[df_2_normalized['Cluster'] == 1]

# Scatter plot: Population Urban (y) against GDP (x), one colour per cluster
for cluster_rows, colour in ((t41, 'blue'), (t42, 'green')):
    plt.scatter(cluster_rows['GDP'], cluster_rows['Population Urban'], c=colour)
plt.title('Population Urban vs GDP')
plt.xlabel('GDP')
plt.ylabel('Population Urban')
plt.show()

2. Birth Rate vs Infant Mortality Rate


In [None]:
# Rows belonging to hierarchical cluster 0 and cluster 1
t43 = df_2_normalized.loc[df_2_normalized['Cluster'] == 0]
t44 = df_2_normalized.loc[df_2_normalized['Cluster'] == 1]

# Scatter plot: Infant Mortality Rate (y) against Birth Rate (x), one colour per cluster
for cluster_rows, colour in ((t43, 'blue'), (t44, 'green')):
    plt.scatter(cluster_rows['Birth Rate'], cluster_rows['Infant Mortality Rate'], c=colour)
plt.title('Birth Rate vs Infant Mortality Rate')
plt.xlabel('Birth Rate')
plt.ylabel('Infant Mortality Rate')
plt.show()

3. Life Expectancy Male vs Life Expectancy Female

In [None]:
# Rows belonging to hierarchical cluster 0 and cluster 1
t45 = df_2_normalized.loc[df_2_normalized['Cluster'] == 0]
t46 = df_2_normalized.loc[df_2_normalized['Cluster'] == 1]

# Scatter plot: Life Expectancy Female (y) against Life Expectancy Male (x), one colour per cluster
for cluster_rows, colour in ((t45, 'blue'), (t46, 'green')):
    plt.scatter(cluster_rows['Life Expectancy Male'], cluster_rows['Life Expectancy Female'], c=colour)
plt.title('Life Expectancy Male vs Life Expectancy Female')
plt.xlabel('Life Expectancy Male')
plt.ylabel('Life Expectancy Female')
plt.show()