## Clustering Under 5 Mortality

## TO DO LIST:

- Evaluate the optimal number of clusters using HDBSCAN and K-means
- Run clustering on 2011 and predict clusters for 2016
- Take average values for clusters for maternal mortality for 2011 and 2016
- Take average values for all variables in corresponding clustering option for 2011 and 2016
- Take difference for maternal mortality and all other indicators
- Create spreadsheet and share with Marelize

In [1]:
import os
import re
import glob
import conda
import hdbscan
import operator
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from config import Config
from collections import Counter
from matplotlib import pyplot as plt
conda_file_dir = conda.__file__
conda_dir = conda_file_dir.split('lib')[0]
proj_lib = os.path.join(os.path.join(conda_dir, 'share'), 'proj')
os.environ["PROJ_LIB"] = proj_lib
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.lines import Line2D
from matplotlib.collections import PatchCollection
from sklearn.cluster import KMeans

In [2]:
def evaluate_hdbscan(input_df, min_samples, min_cluster_size, 
                     output, cluster_selection_method, 
                     fmin_samples, fmin_cluster_size,
                     prune=False, plot=True):
    samples = list(itertools.product(min_samples, min_cluster_size))
    counter = 0
    models = pd.DataFrame(columns=['min_samples',
                                   'min_cluster_size',
                                   'num_clusters_including_unclustered',
                                   'percent_of_unclustered_geos',
                                   'percent_of_maxclass',],index=range(len(samples)))
    #geo = input_df['index']
    #input_df = input_df.drop('index', axis=1)
    df = input_df.copy(deep=True)
    for iteration in samples:
        model = hdbscan.HDBSCAN(min_samples=int(iteration[0]), 
                                min_cluster_size=int(iteration[1]), 
                                metric='euclidean', 
                                algorithm='best',
                                cluster_selection_method=cluster_selection_method, prediction_data=False).fit(df)
        models.loc[counter,'min_cluster_size'] = iteration[1]
        models.loc[counter, 'min_samples'] = iteration[0]
        models.loc[counter, 'num_clusters_including_unclustered'] = len(Counter(model.labels_))
        tmp_dict = dict(Counter(model.labels_))
        total = sum([v for k,v in tmp_dict.items()])
        tmp_dict = {k:round(v/total*100,2) for k,v in tmp_dict.items()}
        try:
            models.loc[counter, 'percent_of_unclustered_geos'] = tmp_dict.pop(-1)
        except KeyError as error:
            models.loc[counter, 'percent_of_unclustered_geos'] = 0 
        if len(tmp_dict) > 1:
            models.loc[counter, 'percent_of_maxclass'] = tmp_dict[max(tmp_dict.items(), key=operator.itemgetter(1))[0]]
        else:
            models.loc[counter, 'percent_of_maxclass'] = 100
        counter += 1
    if prune:
        out_model = hdbscan.HDBSCAN(min_samples=int(fmin_samples), 
                                min_cluster_size=int(fmin_cluster_size), 
                                metric='euclidean', 
                                algorithm='best',
                                cluster_selection_method=cluster_selection_method, prediction_data=False).fit(df)

    else:
        out_model = None

    if plot:
        plt.rcParams['figure.figsize'] = [20,10]
        plt.plot(models['num_clusters_including_unclustered'], label='Number of clusters including unclustered')
        plt.plot(models['percent_of_unclustered_geos'], label='Percent of unclustered geographies')
        plt.plot(models['percent_of_maxclass'], label='Size of larges cluster (%)')
        plt.xlabel("Iterations", fontsize=20)
        plt.ylabel("Value", fontsize=20)
        plt.savefig(os.path.split(output)[1] + "/finetune_parameteres.jpeg")
        plt.legend()
        plt.show()
    del(input_df, df)
    return models, out_model

In [3]:
DATA2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/under5_mortality_2011.csv'
DATA2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/under5_mortality_2016.csv'
SDATA2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/s_under5_mortality_2011.csv'
SDATA2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/s_under5_mortality_2016.csv'
OUT = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/'

## Import Data

- Under 5 year mortality rate = (Number of deaths under 5 year / Number of live births) * 1000
- Infant mortality rate = (Number of deaths under 1 year / Number of live births) * 1000
- Maternal mortality rate = Number of maternal deaths / Number of live births) * 1000

In [4]:
data2011 = pd.read_csv(DATA2011)
s_data2011 = pd.read_csv(SDATA2011)
data2016 = pd.read_csv(DATA2016)
s_data2016 = pd.read_csv(SDATA2016)

In [5]:
print(data2011['rate_under5y_mortality'].mean())
print(data2016['rate_under5y_mortality'].mean())


42.829375
39.446562500000006


## Evaluate clustering method: HDBSCAN - leaf - 2011

In [6]:
tmp, out = evaluate_hdbscan(input_df=s_data2011.drop('rate_under5y_mortality', axis=1), 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='leaf',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [7]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
102,3,8,3,15.62,48.44
151,4,9,3,15.62,45.31
152,4,10,3,15.62,45.31
153,4,11,3,15.62,45.31
154,4,12,3,15.62,45.31
155,4,13,3,15.62,45.31
156,4,14,3,15.62,45.31
157,4,15,3,15.62,45.31
150,4,8,3,15.62,45.31
158,4,16,3,15.62,45.31


## Evaluate clustering method: HDBSCAN - eom - 2011

In [8]:
tmp, out = evaluate_hdbscan(input_df=s_data2011.drop('rate_under5y_mortality', axis=1), 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='eom',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [9]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
102,3,8,3,15.62,48.44
147,4,5,3,15.62,45.31
148,4,6,3,15.62,45.31
149,4,7,3,15.62,45.31
150,4,8,3,15.62,45.31
151,4,9,3,15.62,45.31
152,4,10,3,15.62,45.31
153,4,11,3,15.62,45.31
154,4,12,3,15.62,45.31
146,4,4,3,15.62,45.31


## Evaluate clustering method: HDBSCAN - leaf - 2016

In [10]:
tmp, out = evaluate_hdbscan(input_df=s_data2016.drop('rate_under5y_mortality', axis=1), 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='leaf',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [11]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
18,1,20,3,7.81,51.56
24,1,26,3,7.81,51.56
23,1,25,3,7.81,51.56
22,1,24,3,7.81,51.56
21,1,23,3,7.81,51.56
20,1,22,3,7.81,51.56
19,1,21,3,7.81,51.56
17,1,19,3,7.81,51.56
16,1,18,3,7.81,51.56
15,1,17,3,7.81,51.56


## Evaluate clustering method: HDBSCAN - eom - 2016

In [12]:
tmp, out = evaluate_hdbscan(input_df=s_data2016.drop('rate_under5y_mortality', axis=1), 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='eom',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [13]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
21,1,23,3,7.81,51.56
24,1,26,3,7.81,51.56
23,1,25,3,7.81,51.56
22,1,24,3,7.81,51.56
20,1,22,3,7.81,51.56
19,1,21,3,7.81,51.56
18,1,20,3,7.81,51.56
17,1,19,3,7.81,51.56
16,1,18,3,7.81,51.56
15,1,17,3,7.81,51.56


# Clustering with K-means

In [14]:
kmeans_model = KMeans(n_clusters=3, random_state=0).fit(s_data2011.drop('rate_under5y_mortality', axis=1))
predicted2016 = kmeans_model.predict(s_data2016.drop('rate_under5y_mortality', axis=1))
data2011['cluster'] = kmeans_model.labels_
data2016['cluster'] = predicted2016

In [15]:
print(Counter(data2011['cluster']))

Counter({1: 22, 0: 21, 2: 21})


In [16]:
print(Counter(data2011['cluster']))

Counter({1: 22, 0: 21, 2: 21})


In [17]:
mean_cluster_values2011 = data2011.groupby('cluster').mean().transpose()
mean_cluster_values2011.head()

cluster,0,1,2
04Newborn: % of female baby admitted in SCANU reported monthly,38.923529,38.912834,39.580112
Fully_Children23M,81.842857,83.118182,84.219048
PENTA1_Children12M,98.952381,98.85,98.971429
prop_live_births,98.850952,98.605,98.90381
OPV1_Children12M,98.790476,98.827273,98.971429


In [18]:
mean_cluster_values2016 = data2016.groupby('cluster').mean().transpose()
mean_cluster_values2016.head()

cluster,0,1,2
04Newborn: % of female baby admitted in SCANU reported monthly,39.219888,40.129679,40.016807
Fully_Children23M,88.2,86.190909,87.666667
PENTA1_Children12M,97.92381,97.381818,98.166667
prop_live_births,96.932381,97.391818,96.491429
OPV1_Children12M,97.92381,97.381818,98.166667


In [19]:
percentdiff_cluster_values = ((mean_cluster_values2016 - mean_cluster_values2011) / mean_cluster_values2011) * 100
percentdiff_cluster_values = percentdiff_cluster_values.round(2)

In [20]:
percentdiff_cluster_values.head()

cluster,0,1,2
04Newborn: % of female baby admitted in SCANU reported monthly,0.76,3.13,1.1
Fully_Children23M,7.77,3.7,4.09
PENTA1_Children12M,-1.04,-1.49,-0.81
prop_live_births,-1.94,-1.23,-2.44
OPV1_Children12M,-0.88,-1.46,-0.81


In [21]:
percentdiff_cluster_values.sort_values(by=0 ,ascending=False)

cluster,0,1,2
imp11subdistr_ngothanaprocessNGO_Percent_PerMale,48.55,26.2,15.6
Imp12DistrNGOMonthThana_Percent_PerMale,48.55,26.2,15.6
07Vaccine&LogisticsstockofUpazilaMunCC: upazilla_epi_Supervision Form closing balance,47.4,3.56,14.7
TT5_Mother0-11MChildren,25.09,32.02,12.21
01MaternalHealth: Institutional C-section rate,17.46,14.95,17.8
TT4_Mother0-11MChildren,14.67,19.09,10.57
rate_death,8.13,-4.37,-6.04
Fully_Children23M,7.77,3.7,4.09
TT3_Mother0-11MChildren,7.47,8.76,5.16
Fully_Children12M,7.01,1.92,1.48


In [22]:
percentdiff_cluster_values.to_csv(OUT+"clustersummary_under5.csv")