## Clustering Maternal Mortality

## TO DO LIST:

- Evaluate the optimal number of clusters using HDBSCAN and K-means
- Run clustering on 2011 and predict clusters for 2016
- Take the average maternal mortality per cluster for 2011 and 2016
- Take the average of all other variables per cluster, for the corresponding clustering option, for 2011 and 2016
- Take the difference between 2016 and 2011 for maternal mortality and all other indicators
- Create spreadsheet and share with Marelize

In [1]:
import os
import re
import glob
import conda
import hdbscan
import operator
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from config import Config
from collections import Counter
from matplotlib import pyplot as plt
conda_file_dir = conda.__file__
conda_dir = conda_file_dir.split('lib')[0]
proj_lib = os.path.join(os.path.join(conda_dir, 'share'), 'proj')
os.environ["PROJ_LIB"] = proj_lib
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.lines import Line2D
from matplotlib.collections import PatchCollection
from sklearn.cluster import KMeans

In [2]:
def evaluate_hdbscan(input_df, min_samples, min_cluster_size,
                     output, cluster_selection_method,
                     fmin_samples, fmin_cluster_size,
                     prune=False, plot=True):
    """Sweep HDBSCAN hyper-parameters and summarise every fit.

    One model is fitted for each pair in the Cartesian product of
    ``min_samples`` x ``min_cluster_size`` (euclidean metric).

    Parameters
    ----------
    input_df : pandas.DataFrame
        Data to cluster. A deep copy is clustered; the argument is not modified.
    min_samples, min_cluster_size : iterable
        Candidate values; each one is cast to ``int`` before fitting.
    output : str
        Directory that receives ``finetune_parameteres.jpeg`` when ``plot`` is true.
    cluster_selection_method : str
        Forwarded unchanged to ``hdbscan.HDBSCAN``.
    fmin_samples, fmin_cluster_size : int-like
        Parameters of the extra model fitted when ``prune`` is true.
    prune : bool, default False
        If true, fit and return one more model with the ``fmin_*`` parameters.
    plot : bool, default True
        If true, plot the three summary columns against the iteration index,
        save the figure into ``output`` and show it.

    Returns
    -------
    models : pandas.DataFrame
        One row per parameter pair, in iteration order, with the columns
        ``min_samples``, ``min_cluster_size``,
        ``num_clusters_including_unclustered`` (number of distinct labels,
        the -1 label included), ``percent_of_unclustered_geos`` (share of rows
        labelled -1, in percent, 0 if none) and ``percent_of_maxclass`` (share
        of all rows held by the largest real cluster, in percent; 0 if no
        cluster was found).
    out_model : hdbscan.HDBSCAN or None
        The extra fitted model when ``prune`` is true, otherwise ``None``.
    """
    columns = ['min_samples',
               'min_cluster_size',
               'num_clusters_including_unclustered',
               'percent_of_unclustered_geos',
               'percent_of_maxclass']
    df = input_df.copy(deep=True)

    def fit_model(n_samples, cluster_size):
        # Single place where the estimator is configured, shared by the sweep
        # and by the optional final ("prune") fit.
        return hdbscan.HDBSCAN(min_samples=int(n_samples),
                               min_cluster_size=int(cluster_size),
                               metric='euclidean',
                               algorithm='best',
                               cluster_selection_method=cluster_selection_method,
                               prediction_data=False).fit(df)

    records = []
    for n_samples, cluster_size in itertools.product(min_samples, min_cluster_size):
        label_counts = Counter(fit_model(n_samples, cluster_size).labels_)
        total = sum(label_counts.values())
        shares = {label: round(count / total * 100, 2)
                  for label, count in label_counts.items()}
        # Label -1 is the "unclustered" bucket; remove it so that only real
        # clusters compete for the largest-cluster share below.
        unclustered_share = shares.pop(-1, 0)
        records.append({
            'min_samples': n_samples,
            'min_cluster_size': cluster_size,
            'num_clusters_including_unclustered': len(label_counts),
            'percent_of_unclustered_geos': unclustered_share,
            # Real share of the biggest cluster, also when there is only one
            # cluster (previously hard-coded to 100 in that case).
            'percent_of_maxclass': max(shares.values(), default=0),
        })
    models = pd.DataFrame(records, columns=columns)

    out_model = fit_model(fmin_samples, fmin_cluster_size) if prune else None

    if plot:
        plt.rcParams['figure.figsize'] = [20,10]
        plt.plot(models['num_clusters_including_unclustered'], label='Number of clusters including unclustered')
        plt.plot(models['percent_of_unclustered_geos'], label='Percent of unclustered geographies')
        plt.plot(models['percent_of_maxclass'], label='Size of larges cluster (%)')
        plt.xlabel("Iterations", fontsize=20)
        plt.ylabel("Value", fontsize=20)
        # The legend must exist before saving, otherwise the file lacks it.
        plt.legend()
        # Join onto the directory itself: os.path.split(output)[1] is '' for a
        # path ending in '/', which used to send the file to the filesystem root.
        plt.savefig(os.path.join(output, "finetune_parameteres.jpeg"))
        plt.show()
    return models, out_model

In [3]:
# Directory that holds the prepared maternal-mortality CSVs and receives the
# results. Set WB_BANGLADESH_OUTPUT_DIR to run the notebook on another machine;
# the fallback is the original hard-coded location.
# Joining with '' guarantees a trailing separator, which code that builds
# paths as `OUT + filename` relies on.
OUT = os.path.join(
    os.environ.get('WB_BANGLADESH_OUTPUT_DIR',
                   '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/'),
    '')
DATA2011 = os.path.join(OUT, 'maternal_mortality_2011.csv')
DATA2016 = os.path.join(OUT, 'maternal_mortality_2016.csv')
# "s_" files are the variants fed to the clustering algorithms
# (presumably scaled/standardised - TODO confirm against the export step).
SDATA2011 = os.path.join(OUT, 's_maternal_mortality_2011.csv')
SDATA2016 = os.path.join(OUT, 's_maternal_mortality_2016.csv')

## Import Data

- Under-5 mortality rate = (Number of deaths under 5 years / Number of live births) * 1000
- Infant mortality rate = (Number of deaths under 1 year / Number of live births) * 1000
- Maternal mortality rate = (Number of maternal deaths / Number of live births) * 1000

In [4]:
# Read the four input tables: the plain and the "s_" variant for each year.
data2011, s_data2011, data2016, s_data2016 = (
    pd.read_csv(csv_path)
    for csv_path in (DATA2011, SDATA2011, DATA2016, SDATA2016)
)

## Evaluate clustering method: HDBSCAN - leaf - 2011

In [5]:
# HDBSCAN parameter sweep on the 2011 data with 'leaf' cluster selection.
# The rate_maternal_mortality column is left out of the clustering input.
tmp, out = evaluate_hdbscan(
    input_df=s_data2011.drop(columns='rate_maternal_mortality'),
    cluster_selection_method='leaf',
    min_samples=Config.tune_min_sample,
    min_cluster_size=Config.tune_min_cluster,
    fmin_samples=3,
    fmin_cluster_size=12,
    prune=True,
    plot=False,
    output=OUT,
)

In [6]:
# Keep the settings that produced fewer than 10 labels (unclustered label
# included), then list the 10 with the smallest unclustered share.
few_labels = tmp['num_clusters_including_unclustered'] < 10
tmp = tmp[few_labels]
tmp.sort_values(by='percent_of_unclustered_geos').head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
69,2,23,3,15.62,42.19
72,2,26,3,15.62,42.19
71,2,25,3,15.62,42.19
70,2,24,3,15.62,42.19
68,2,22,3,15.62,42.19
67,2,21,3,15.62,42.19
66,2,20,3,15.62,42.19
65,2,19,3,15.62,42.19
64,2,18,3,15.62,42.19
25,1,27,3,15.62,42.19


## Evaluate clustering method: HDBSCAN - eom - 2011

In [7]:
# HDBSCAN parameter sweep on the 2011 data with 'eom' cluster selection.
# The rate_maternal_mortality column is left out of the clustering input.
tmp, out = evaluate_hdbscan(
    input_df=s_data2011.drop(columns='rate_maternal_mortality'),
    cluster_selection_method='eom',
    min_samples=Config.tune_min_sample,
    min_cluster_size=Config.tune_min_cluster,
    fmin_samples=3,
    fmin_cluster_size=12,
    prune=True,
    plot=False,
    output=OUT,
)

In [8]:
# Keep the settings that produced fewer than 10 labels (unclustered label
# included), then list the 10 with the smallest unclustered share.
few_labels = tmp['num_clusters_including_unclustered'] < 10
tmp = tmp[few_labels]
tmp.sort_values(by='percent_of_unclustered_geos').head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
69,2,23,3,15.62,42.19
59,2,13,3,15.62,42.19
73,2,27,3,15.62,42.19
72,2,26,3,15.62,42.19
71,2,25,3,15.62,42.19
70,2,24,3,15.62,42.19
68,2,22,3,15.62,42.19
67,2,21,3,15.62,42.19
66,2,20,3,15.62,42.19
65,2,19,3,15.62,42.19


## Evaluate clustering method: HDBSCAN - leaf - 2016

In [9]:
# HDBSCAN parameter sweep on the 2016 data with 'leaf' cluster selection.
# The rate_maternal_mortality column is left out of the clustering input.
tmp, out = evaluate_hdbscan(
    input_df=s_data2016.drop(columns='rate_maternal_mortality'),
    cluster_selection_method='leaf',
    min_samples=Config.tune_min_sample,
    min_cluster_size=Config.tune_min_cluster,
    fmin_samples=3,
    fmin_cluster_size=12,
    prune=True,
    plot=False,
    output=OUT,
)

In [10]:
# Keep the settings that produced fewer than 10 labels (unclustered label
# included), then list the 10 with the smallest unclustered share.
few_labels = tmp['num_clusters_including_unclustered'] < 10
tmp = tmp[few_labels]
tmp.sort_values(by='percent_of_unclustered_geos').head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
56,2,10,3,31.25,50.0
58,2,12,3,31.25,50.0
57,2,11,3,31.25,50.0
8,1,10,3,31.25,50.0
9,1,11,3,31.25,50.0
10,1,12,3,31.25,50.0
105,3,11,3,46.88,34.38
104,3,10,3,46.88,34.38
103,3,9,3,46.88,34.38
102,3,8,3,46.88,34.38


## Evaluate clustering method: HDBSCAN - eom - 2016

In [11]:
# HDBSCAN parameter sweep on the 2016 data with 'eom' cluster selection.
# The rate_maternal_mortality column is left out of the clustering input.
tmp, out = evaluate_hdbscan(
    input_df=s_data2016.drop(columns='rate_maternal_mortality'),
    cluster_selection_method='eom',
    min_samples=Config.tune_min_sample,
    min_cluster_size=Config.tune_min_cluster,
    fmin_samples=3,
    fmin_cluster_size=12,
    prune=True,
    plot=False,
    output=OUT,
)

In [12]:
# Keep the settings that produced fewer than 10 labels (unclustered label
# included), then list the 10 with the smallest unclustered share.
few_labels = tmp['num_clusters_including_unclustered'] < 10
tmp = tmp[few_labels]
tmp.sort_values(by='percent_of_unclustered_geos').head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
54,2,8,3,31.25,50
53,2,7,3,31.25,50
55,2,9,3,31.25,50
57,2,11,3,31.25,50
58,2,12,3,31.25,50
10,1,12,3,31.25,50
9,1,11,3,31.25,50
56,2,10,3,31.25,50
7,1,9,3,31.25,50
6,1,8,3,31.25,50


# Clustering with K-means

In [13]:
# Fit a 3-cluster K-means model on the 2011 "s_" table (seeded for
# repeatability), then assign every 2016 row to one of the 2011 clusters so
# both years share the same cluster definition.
features2011 = s_data2011.drop(columns='rate_maternal_mortality')
features2016 = s_data2016.drop(columns='rate_maternal_mortality')

kmeans_model = KMeans(n_clusters=3, random_state=0)
kmeans_model.fit(features2011)
predicted2016 = kmeans_model.predict(features2016)

# Attach the labels to the plain tables used for the summaries.
data2011['cluster'] = kmeans_model.labels_
data2016['cluster'] = predicted2016

In [14]:
# Number of 2011 rows assigned to each K-means cluster.
cluster_sizes_2011 = Counter(data2011['cluster'])
print(cluster_sizes_2011)

Counter({1: 22, 0: 21, 2: 21})


In [15]:
# Number of 2016 rows per predicted K-means cluster. This cell used to repeat
# the 2011 counts printed just before it, so the 2016 split was never shown.
print(Counter(data2016['cluster']))

Counter({1: 22, 0: 21, 2: 21})


In [16]:
# Per-cluster mean of every numeric column for 2011, transposed so that
# indicators are rows and clusters are columns.
# numeric_only=True keeps this working on pandas >= 2.0, where a non-numeric
# column makes GroupBy.mean raise instead of being silently dropped.
mean_cluster_values2011 = data2011.groupby('cluster').mean(numeric_only=True).transpose()
mean_cluster_values2011.head()

cluster,0,1,2
imp12distr_monthprocessImp12DistrMonthThana_Percent_PerFemale,9.406667,9.143182,10.672381
VitACoverage_Children12-59M,91.119048,91.577273,93.72381
PENTA2_Children23M,97.280952,97.431818,98.033333
Fully_Children23M,81.842857,83.118182,84.219048
imp12distr_monthprocessImp12DistrMonthThana_Percent_Implant,2.427619,2.735909,2.440476


In [17]:
# Per-cluster mean of every numeric column for 2016, transposed so that
# indicators are rows and clusters are columns.
# numeric_only=True keeps this working on pandas >= 2.0, where a non-numeric
# column makes GroupBy.mean raise instead of being silently dropped.
mean_cluster_values2016 = data2016.groupby('cluster').mean(numeric_only=True).transpose()
mean_cluster_values2016.head()

cluster,0,1,2
imp12distr_monthprocessImp12DistrMonthThana_Percent_PerFemale,9.308571,9.024545,10.839048
VitACoverage_Children12-59M,90.014286,89.831818,92.133333
PENTA2_Children23M,97.347619,96.709091,97.928571
Fully_Children23M,88.2,86.190909,87.666667
imp12distr_monthprocessImp12DistrMonthThana_Percent_Implant,4.657619,4.910455,4.939524


In [18]:
# Change of every per-cluster mean from 2011 to 2016, expressed as a
# percentage of the 2011 value and rounded to two decimals.
percentdiff_cluster_values = (
    mean_cluster_values2016
    .sub(mean_cluster_values2011)
    .div(mean_cluster_values2011)
    .mul(100)
    .round(2)
)

In [19]:
# Preview the first five indicators of the percentage-change table.
percentdiff_cluster_values.head(5)

cluster,0,1,2
imp12distr_monthprocessImp12DistrMonthThana_Percent_PerFemale,-1.04,-1.3,1.56
VitACoverage_Children12-59M,-1.21,-1.91,-1.7
PENTA2_Children23M,0.07,-0.74,-0.11
Fully_Children23M,7.77,3.7,4.09
imp12distr_monthprocessImp12DistrMonthThana_Percent_Implant,91.86,79.48,102.4


In [20]:
# Order the indicators by their percentage change in cluster 0, largest first.
percentdiff_cluster_values.sort_values(0, ascending=False)

cluster,0,1,2
03Immunization: Penta 1 to MR 2 drop out Rate,379.48,679.71,494.46
imp12distr_monthprocessImp12DistrMonthThana_Percent_Implant,91.86,79.48,102.40
imp11subdistr_thanaprocessPercent_Implant,91.67,79.41,102.50
imp11subdistr_ngothanaprocessNGO_Percent_Implant,80.73,108.14,91.18
Imp12DistrNGOMonthThana_Percent_Implant,80.73,108.14,91.18
03Immunization: IPV vial wastage rate,56.86,37.21,47.55
imp11subdistr_ngothanaprocessNGO_Percent_PerMale,48.55,26.20,15.60
Imp12DistrNGOMonthThana_Percent_PerMale,48.55,26.20,15.60
03Immunization: EPI Session drop out rate,33.19,-53.80,522.30
04Newborn: Nurse Bed ratio at SCANU (Recommended 0.8),32.67,1.24,83.11


In [21]:
# Write the percentage-change table next to the input data. os.path.join gives
# the same path as before when OUT ends with a separator and still produces a
# valid path when it does not.
percentdiff_cluster_values.to_csv(os.path.join(OUT, "clustersummary_maternal.csv"))