In [None]:
# Import Libraies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

In [None]:
# Suppress warnings
# NOTE(review): the blanket 'ignore' filter hides every warning category for the
# rest of the session, including deprecation and pandas warnings raised by later cells.

import warnings
warnings.filterwarnings('ignore')

# Reading Data

In [None]:
# Load the dataset from 'Country-data.csv' (path relative to the working directory)
# and preview the first rows
df=pd.read_csv('Country-data.csv')
df.head()

In [None]:
# Shape of the dataframe as (rows, columns)
df.shape

In [None]:
# Percentage of missing values in each column
missing_pct = df.isna().sum() / len(df) * 100
print(missing_pct)

In [None]:
# Data details: column names, non-null counts and dtypes
df.info()

# EDA

In [None]:
# All the column labels of the dataframe
df.columns

In [None]:
# Distribution plot of every numeric column, one panel per column on a 6x2 grid.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 — confirm the installed version.
plt.figure(figsize=(15,20))
feature=['child_mort', 'exports', 'health', 'imports', 'income','inflation', 'life_expec', 'total_fer', 'gdpp']
for position, column in enumerate(feature, start=1):
    plt.subplot(6, 2, position)
    sns.distplot(df[column])
    

### From the plots above, 'child_mort', 'exports', 'health', 'imports', 'inflation', 'gdpp' and 'life_expec' appear to be approximately normally distributed. For 'income', a large number of countries have an income around 400k, and for 'total_fer' a large number of countries have a total fertility rate around 5.

In [None]:
# 'exports', 'health' and 'imports' are stored as a percentage of 'gdpp';
# convert each of them to its absolute value.
# Note: the columns are overwritten in place, so running this cell a second time
# on the same frame multiplies by gdpp again.
for pct_column in ['exports', 'health', 'imports']:
    df[pct_column] = (df['gdpp'] * df[pct_column]) / 100


# Outlier Treatment

In [None]:
# Boxplot of every numeric column to visualise the outliers.
# Each series is passed as `y=` so the box is drawn vertically no matter how the
# installed seaborn version interprets a positional argument (older releases
# treat it as `x`, newer ones as `data`).
plt.figure(figsize=(15,20))
feature=['child_mort', 'exports', 'health', 'imports', 'income','inflation', 'life_expec', 'total_fer', 'gdpp']
for position, column in enumerate(feature, start=1):
    plt.subplot(6, 2, position)
    sns.boxplot(y=df[column])


### As the boxplots above show, every variable has outliers, so we will cap the outliers rather than remove them, as we do not want to lose any data.

In [None]:
# Cap every numeric column at its 1st and 99th percentiles so that extreme
# values are pulled in without dropping any country.
# The capped series is assigned back through df[column]; the chained form
# df[column][mask] = value writes to an intermediate object and does not update
# df when pandas copy-on-write is active.
for column in ['child_mort', 'exports', 'health', 'imports', 'income',
               'inflation', 'life_expec', 'total_fer', 'gdpp']:
    lower_cap = df[column].quantile(0.01)
    upper_cap = df[column].quantile(0.99)
    # Values <= lower_cap become lower_cap and values >= upper_cap become upper_cap.
    df[column] = df[column].clip(lower=lower_cap, upper=upper_cap)



In [None]:
# Boxplot of every numeric column after capping at the 1st and 99th percentiles.
# Each series is passed as `y=` so the box is drawn vertically no matter how the
# installed seaborn version interprets a positional argument.
plt.figure(figsize=(15,20))
feature=['child_mort', 'exports', 'health', 'imports', 'income','inflation', 'life_expec', 'total_fer', 'gdpp']
for position, column in enumerate(feature, start=1):
    plt.subplot(6, 2, position)
    sns.boxplot(y=df[column])


# Data Preparation

In [None]:
# Preview the first rows of df
df.head()

In [None]:
# Keep an independent copy of df (df1) before the 'country' column is removed from df;
# the cluster labels are attached to df1 later
df1=df.copy()

In [None]:
# Remove the 'country' column from df (df1 still holds it).
# errors='ignore' keeps the cell re-runnable once the column is already gone;
# the in-place drop raised KeyError on a second run.
df = df.drop(columns='country', errors='ignore')

In [None]:
## Scaling the variables
# Fit a StandardScaler on df and transform it; the result (df2) is converted to
# a DataFrame in the next cell
Scale=StandardScaler()
df2=Scale.fit_transform(df)

In [None]:
# Convert the scaled values to a DataFrame.
# Column labels and index are taken from df (the frame that was scaled) rather
# than a hard-coded list, so they always match the real column order.
# np.asarray keeps the cell re-runnable when df2 is already a DataFrame.
df2 = pd.DataFrame(np.asarray(df2), columns=df.columns, index=df.index)
df2.head()

## Hopkins Statistics:
The Hopkins statistic gives a value that indicates the cluster tendency of a data set, in other words, how well the data can be clustered.

- If the value is between {0.01, ...,0.3}, the data is regularly spaced.

- If the value is around 0.5, it is random.

- If the value is between {0.7, ..., 0.99}, it has a high tendency to cluster.

In [None]:
# Hopkin Score
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def hopkins(X, random_state=None):
    """Estimate the Hopkins statistic (cluster tendency) of ``X``.

    ``m = max(1, int(0.1 * n))`` rows are sampled from ``X`` without
    replacement, and ``m`` points are drawn uniformly from the bounding box of
    ``X``. With ``u`` the distances from the uniform points to their nearest
    row of ``X`` and ``w`` the distances from the sampled rows to their nearest
    *other* row, the statistic is ``sum(u) / (sum(u) + sum(w))``.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like of numbers
        Observations in rows, features in columns.
    random_state : int or None, optional
        Seed for the sampling. ``None`` (the default) gives a different draw
        on every call.

    Returns
    -------
    float
        The Hopkins statistic, or ``0.0`` when the denominator is zero or NaN
        (for example when every row is identical).

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional or has fewer than two rows.
    """
    data = np.asarray(X, dtype=float)
    if data.ndim != 2:
        raise ValueError("X must be two-dimensional (rows x features)")
    n, d = data.shape
    if n < 2:
        raise ValueError("X must contain at least two rows")

    # At least one sample, otherwise both sums are empty and the ratio is 0/0.
    m = max(1, int(0.1 * n))
    rng = np.random.RandomState(random_state)
    nbrs = NearestNeighbors(n_neighbors=1).fit(data)

    # Uniform points are not rows of X, so their nearest neighbour is the
    # first (index 0) neighbour returned.
    uniform_points = rng.uniform(data.min(axis=0), data.max(axis=0), size=(m, d))
    u_dist, _ = nbrs.kneighbors(uniform_points, n_neighbors=1, return_distance=True)
    ujd = u_dist[:, 0]

    # Sampled rows are part of X, so neighbour 0 is the row itself (distance 0)
    # and neighbour 1 is the nearest other row.
    sampled_rows = rng.choice(n, size=m, replace=False)
    w_dist, _ = nbrs.kneighbors(data[sampled_rows], n_neighbors=2, return_distance=True)
    wjd = w_dist[:, 1]

    denominator = ujd.sum() + wjd.sum()
    if denominator == 0 or np.isnan(denominator):
        print(ujd, wjd)
        return 0.0

    return float(ujd.sum() / denominator)

In [None]:
# Hopkins score for the scaled data df2
hopkins(df2)

# Choosing the Value of K for K-Means Clustering


### Silhouette Score for K-mean

In [None]:
# Silhouette analysis: fit K-Means for each candidate k and report the mean
# silhouette score of the resulting labels.
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]

for num_clusters in range_n_clusters:

    # Fixed seed (the same value the final k=3 model uses) so that the scores
    # are reproducible from run to run.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=100)
    kmeans.fit(df2)

    cluster_labels = kmeans.labels_

    # silhouette score
    silhouette_avg = silhouette_score(df2, cluster_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))
    
    

In [None]:
# Silhouette score curve for k = 2 to 9 (range(2, 10) excludes 10).
# A fixed seed makes the curve reproducible between runs.
ss=[]
for k in range(2,10):
    kmeans= KMeans(n_clusters= k, random_state=100).fit(df2)
    ss.append([k,silhouette_score(df2, kmeans.labels_)])
ss_curve = pd.DataFrame(ss, columns=['k', 'silhouette'])
plt.plot(ss_curve['k'], ss_curve['silhouette'])
plt.xlabel('Number of clusters (k)')
plt.ylabel('Mean silhouette score');

### Plotting Elbow Curve

In [None]:
# Elbow curve: K-Means inertia (within-cluster sum of squared distances) for k = 2 to 9.
# A fixed seed makes the curve reproducible between runs.
ssd=[]
for k in range(2,10):
    model= KMeans(n_clusters= k, random_state=100).fit(df2)
    ssd.append([k,model.inertia_])
ssd_curve = pd.DataFrame(ssd, columns=['k', 'inertia'])
plt.plot(ssd_curve['k'], ssd_curve['inertia'])
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia');

### So, from the silhouette score curve and the elbow curve, we can conclude that k=3 is significant

In [None]:
# Final K-Means model with k=3 fitted on the scaled data; random_state is fixed
# so the label assignment is repeatable
kmean_= KMeans(n_clusters=3, random_state=100)
kmean_.fit(df2)

In [None]:
# Cluster label assigned to each row of df2
kmean_.labels_

In [None]:
# Concatenate the K-Means labels with the country-level frame.
# The label Series is given df1's index and a name, so the rows are aligned on
# df1's own index (not on an assumed default 0..n-1 index) and the new column
# is not left named 0.
df_cluster=pd.concat([df1,pd.Series(kmean_.labels_, index=df1.index, name='KMean_Labels')],axis=1)
df_cluster.head()

In [None]:
# Name the label column.
# Only the unnamed label column (0) is renamed; the other columns keep their
# existing names instead of being overwritten by position, and the cell still
# runs if further columns have been added to df_cluster.
df_cluster = df_cluster.rename(columns={0: 'KMean_Labels'})
df_cluster.head()

In [None]:
# Number of rows (countries) in each K-Means cluster
df_cluster.KMean_Labels.value_counts()

In [None]:
# Mean of every remaining column per K-Means cluster ('country' is dropped first)
df_cluster.drop('country',axis=1).groupby('KMean_Labels').mean()

In [None]:
# Mean child mortality, income and GDP per capita of each K-Means cluster.
# figsize is passed to DataFrame.plot, which opens its own figure; a preceding
# plt.figure(...) call only leaves an extra empty figure behind.
df_cluster[['child_mort','income','gdpp','KMean_Labels']].groupby('KMean_Labels').mean().plot(kind='bar', figsize=(10, 6));

### After plotting the 3 clusters on the basis of child mortality, income and GDP, it is found that the countries in cluster 2 are the lowest of all, as per K-Means clustering

In [None]:
# K-Means label numbers are arbitrary, so the cluster of interest is selected as
# the one with the lowest mean gdpp rather than by a hard-coded id; its 10
# worst-off countries are then listed.
neediest_kmeans = df_cluster.groupby('KMean_Labels')['gdpp'].mean().idxmin()
df_cluster[df_cluster['KMean_Labels']==neediest_kmeans].sort_values(['gdpp','child_mort','income'],ascending=[True,False,True]).head(10)

### Identified the 10 countries, by K-Means clustering, that are in the direst need of aid, chosen on the basis of [gdpp, child_mort and income]

In [None]:
# Scatter plots of the K-Means clusters for each pair of gdpp / income / child_mort,
# stacked in three rows
fig, axes = plt.subplots(3, 1, figsize=(15, 12))
axis_pairs = [('gdpp', 'income'), ('child_mort', 'income'), ('child_mort', 'gdpp')]
for ax, (x_col, y_col) in zip(axes, axis_pairs):
    sns.scatterplot(x=df_cluster[x_col], y=df_cluster[y_col],
                    hue=df_cluster['KMean_Labels'], palette='Set1', ax=ax)
plt.show()

### Visualising the Clusters on the basis of income, child mortality & gdp by K-mean cluster labels

In [None]:
# Box plots of income, child_mort and gdpp per K-Means cluster, stacked in three rows
fig, axes = plt.subplots(3, 1, figsize=(15, 12))
for ax, y_col in zip(axes, ['income', 'child_mort', 'gdpp']):
    sns.boxplot(x=df_cluster['KMean_Labels'], y=df_cluster[y_col], palette='Set1', ax=ax)
plt.show()

### As we can see, the cluster 2 countries have the lowest income, the highest child mortality and the lowest GDP

## Hierarchical Clustering

In [None]:
# The hierarchical clustering reuses the scaled data (df2) from the K-Means section
df2.head()

In [None]:
# Single linkage: agglomerative clustering of df2 with Euclidean distance,
# drawn as a dendrogram
plt.figure(figsize=(30,10))
mergings = linkage(df2, method="single", metric='euclidean')
dendrogram(mergings)
plt.show()

### As we can see, the dendrogram produced by hierarchical clustering with single linkage is completely unreadable

In [None]:
# Complete linkage: agglomerative clustering of df2 with Euclidean distance,
# drawn as a dendrogram. `mergings` is reassigned here, so later cells that use
# it see the complete-linkage result.
plt.figure(figsize=(30,10))
mergings = linkage(df2, method="complete", metric='euclidean')
dendrogram(mergings)
plt.show()

### We can have a clear view of clusters at Hierarchical clustering by complete linkage

### Silhouette Score for Hierarchical Clustering

In [None]:
# Silhouette score of the hierarchical tree cut into k = 2 to 9 clusters
ssi = []
for k in range(2, 10):
    cluster_labels = cut_tree(mergings, n_clusters=k).ravel()
    score = silhouette_score(df2, cluster_labels)
    ssi.append([k, score])
ssi_frame = pd.DataFrame(ssi)
plt.plot(ssi_frame[0], ssi_frame[1]);

### The silhouette curve shows that 3 clusters are optimal. In the dendrogram for hierarchical clustering with complete linkage, the x axis consists of the countries and the y axis is the Euclidean distance between the clusters; to find the largest distance we count the number of vertical lines crossed on the diagram and so determine the optimal number of clusters. We have therefore chosen 3 clusters as the optimum.

In [None]:
# Cut the tree in `mergings` into 3 clusters and flatten the result to one label per row
cluster_labels = cut_tree(mergings, n_clusters=3).reshape(-1, )
cluster_labels

In [None]:
# Store the hierarchical cluster labels as a new 'Hcal_labels' column of df_cluster
df_cluster['Hcal_labels'] = cluster_labels
df_cluster.head()

In [None]:
# Number of rows (countries) in each hierarchical cluster
df_cluster.Hcal_labels.value_counts()

In [None]:
# Mean of every remaining column per hierarchical cluster
# ('country' and the K-Means label column are dropped first)
df_cluster.drop(['country','KMean_Labels'],axis=1).groupby('Hcal_labels').mean()

In [None]:
# Mean child mortality, income and GDP per capita of each hierarchical cluster.
# figsize is passed to DataFrame.plot, which opens its own figure; a preceding
# plt.figure(...) call only leaves an extra empty figure behind.
df_cluster[['child_mort','income','gdpp','Hcal_labels']].groupby('Hcal_labels').mean().plot(kind='bar', figsize=(10, 6));

### After plotting the 3 clusters on the basis of child mortality, income and GDP, it is found that the countries in cluster 0 are the lowest of all, as per hierarchical clustering

In [None]:
# Select the hierarchical cluster with the lowest mean gdpp instead of relying
# on a hard-coded label id, then list its 10 worst-off countries.
neediest_hier = df_cluster.groupby('Hcal_labels')['gdpp'].mean().idxmin()
df_cluster[df_cluster['Hcal_labels']==neediest_hier].sort_values(['gdpp','child_mort','income'],ascending=[True,False,True]).head(10)

In [None]:
# Scatter plots of the hierarchical clusters for each pair of gdpp / income / child_mort,
# stacked in three rows
fig, axes = plt.subplots(3, 1, figsize=(15, 12))
axis_pairs = [('gdpp', 'income'), ('child_mort', 'income'), ('child_mort', 'gdpp')]
for ax, (x_col, y_col) in zip(axes, axis_pairs):
    sns.scatterplot(x=df_cluster[x_col], y=df_cluster[y_col],
                    hue=df_cluster['Hcal_labels'], palette='Set1', ax=ax)
plt.show()

### Visualising the Clusters on the basis of income, child mortality & gdp by Hierarchical cluster labels

In [None]:
# Box plots of income, child_mort and gdpp per hierarchical cluster, stacked in three rows
fig, axes = plt.subplots(3, 1, figsize=(15, 12))
for ax, y_col in zip(axes, ['income', 'child_mort', 'gdpp']):
    sns.boxplot(x=df_cluster['Hcal_labels'], y=df_cluster[y_col], palette='Set1', ax=ax)
plt.show()

### As we can see, the cluster 0 countries have the lowest income, the highest child mortality and the lowest GDP, as per hierarchical clustering

# Final Suggestion List

In [None]:
# Country list by hierarchical clustering.
# The cluster is selected as the one with the lowest mean gdpp rather than by a
# hard-coded label id.
final_hier_cluster = df_cluster.groupby('Hcal_labels')['gdpp'].mean().idxmin()
df_cluster[df_cluster['Hcal_labels']==final_hier_cluster].sort_values(['gdpp','child_mort','income'],ascending=[True,False,True]).head(5)

In [None]:
# Country list by K-Means clustering.
# K-Means label numbers are arbitrary, so the cluster is selected as the one
# with the lowest mean gdpp rather than by a hard-coded label id.
final_kmeans_cluster = df_cluster.groupby('KMean_Labels')['gdpp'].mean().idxmin()
df_cluster[df_cluster['KMean_Labels']==final_kmeans_cluster].sort_values(['gdpp','child_mort','income'],ascending=[True,False,True]).head(5)

# Final Conclusion:

### Both clustering methods (K-Means and hierarchical) give the same list of countries in the direst need of aid when sorted by these three variables - [gdpp, child_mort and income]. K-Means is nevertheless more suitable, as the data set is large and the value of k can be identified easily, in a statistical way, from the silhouette curve and the elbow curve.