## Clustering Fully Immunized

## TO DO LIST:

- Evaluate the optimal number of clusters using HDBSCAN and K-means
- Run clustering on 2011 and predict clusters for 2016
- Take average values for clusters for maternal mortality for 2011 and 2016
- Take average values for all variables in corresponding clustering option for 2011 and 2016
- Take difference for maternal mortality and all other indicators
- Create spreadsheet and share with Marelize

In [1]:
import os
import re
import glob
import conda
import hdbscan
import operator
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from config import Config
from collections import Counter
from matplotlib import pyplot as plt
conda_file_dir = conda.__file__
conda_dir = conda_file_dir.split('lib')[0]
proj_lib = os.path.join(os.path.join(conda_dir, 'share'), 'proj')
os.environ["PROJ_LIB"] = proj_lib
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.lines import Line2D
from matplotlib.collections import PatchCollection
from sklearn.cluster import KMeans

In [2]:
def evaluate_hdbscan(input_df, min_samples, min_cluster_size, 
                     output, cluster_selection_method, 
                     fmin_samples, fmin_cluster_size,
                     prune=False, plot=True):
    samples = list(itertools.product(min_samples, min_cluster_size))
    counter = 0
    models = pd.DataFrame(columns=['min_samples',
                                   'min_cluster_size',
                                   'num_clusters_including_unclustered',
                                   'percent_of_unclustered_geos',
                                   'percent_of_maxclass',],index=range(len(samples)))
    #geo = input_df['index']
    #input_df = input_df.drop('index', axis=1)
    df = input_df.copy(deep=True)
    for iteration in samples:
        model = hdbscan.HDBSCAN(min_samples=int(iteration[0]), 
                                min_cluster_size=int(iteration[1]), 
                                metric='euclidean', 
                                algorithm='best',
                                cluster_selection_method=cluster_selection_method, prediction_data=False).fit(df)
        models.loc[counter,'min_cluster_size'] = iteration[1]
        models.loc[counter, 'min_samples'] = iteration[0]
        models.loc[counter, 'num_clusters_including_unclustered'] = len(Counter(model.labels_))
        tmp_dict = dict(Counter(model.labels_))
        total = sum([v for k,v in tmp_dict.items()])
        tmp_dict = {k:round(v/total*100,2) for k,v in tmp_dict.items()}
        try:
            models.loc[counter, 'percent_of_unclustered_geos'] = tmp_dict.pop(-1)
        except KeyError as error:
            models.loc[counter, 'percent_of_unclustered_geos'] = 0 
        if len(tmp_dict) > 1:
            models.loc[counter, 'percent_of_maxclass'] = tmp_dict[max(tmp_dict.items(), key=operator.itemgetter(1))[0]]
        else:
            models.loc[counter, 'percent_of_maxclass'] = 100
        counter += 1
    if prune:
        out_model = hdbscan.HDBSCAN(min_samples=int(fmin_samples), 
                                min_cluster_size=int(fmin_cluster_size), 
                                metric='euclidean', 
                                algorithm='best',
                                cluster_selection_method=cluster_selection_method, prediction_data=False).fit(df)

    else:
        out_model = None

    if plot:
        plt.rcParams['figure.figsize'] = [20,10]
        plt.plot(models['num_clusters_including_unclustered'], label='Number of clusters including unclustered')
        plt.plot(models['percent_of_unclustered_geos'], label='Percent of unclustered geographies')
        plt.plot(models['percent_of_maxclass'], label='Size of larges cluster (%)')
        plt.xlabel("Iterations", fontsize=20)
        plt.ylabel("Value", fontsize=20)
        plt.savefig(os.path.split(output)[1] + "/finetune_parameteres.jpeg")
        plt.legend()
        plt.show()
    del(input_df, df)
    return models, out_model

In [3]:
DATA2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/full_immunized_2011.csv'
DATA2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/full_immunized_2016.csv'
SDATA2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/s_full_immunized_2011.csv'
SDATA2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/s_full_immunized_2016.csv'
OUT = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/'

## Import Data

- Under 5 year mortality rate = (Number of deaths under 5 year / Number of live births) * 1000
- Infant mortality rate = (Number of deaths under 1 year / Number of live births) * 1000
- Maternal mortality rate = Number of maternal deaths / Number of live births) * 1000

In [4]:
data2011 = pd.read_csv(DATA2011)
s_data2011 = pd.read_csv(SDATA2011)
data2016 = pd.read_csv(DATA2016)
s_data2016 = pd.read_csv(SDATA2016)

## Evaluate clustering method: HDBSCAN - leaf - 2011

In [5]:
tmp, out = evaluate_hdbscan(input_df=s_data2011.drop('Fully_Children12M', axis=1), 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='leaf',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [6]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
17,1,19,3,32.81,35.94
18,1,20,3,32.81,35.94
16,1,18,3,32.81,35.94
15,1,17,3,32.81,35.94
14,1,16,3,32.81,35.94
13,1,15,3,32.81,35.94
12,1,14,3,32.81,35.94
10,1,12,3,32.81,35.94
11,1,13,3,32.81,35.94
8,1,10,3,32.81,35.94


## Evaluate clustering method: HDBSCAN - eom - 2011

In [7]:
tmp, out = evaluate_hdbscan(input_df=s_data2011.drop('Fully_Children12M', axis=1), 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='eom',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [8]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
0,1,2,4,7.81,85.94
18,1,20,3,32.81,35.94
17,1,19,3,32.81,35.94
16,1,18,3,32.81,35.94
14,1,16,3,32.81,35.94
13,1,15,3,32.81,35.94
12,1,14,3,32.81,35.94
11,1,13,3,32.81,35.94
10,1,12,3,32.81,35.94
15,1,17,3,32.81,35.94


## Evaluate clustering method: HDBSCAN - leaf - 2016

In [9]:
tmp, out = evaluate_hdbscan(input_df=s_data2016.drop('Fully_Children12M', axis=1), 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='leaf',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [10]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
18,1,20,3,18.75,40.62
24,1,26,3,18.75,40.62
23,1,25,3,18.75,40.62
22,1,24,3,18.75,40.62
21,1,23,3,18.75,40.62
20,1,22,3,18.75,40.62
19,1,21,3,18.75,40.62
17,1,19,3,18.75,40.62
16,1,18,3,18.75,40.62
15,1,17,3,18.75,40.62


## Evaluate clustering method: HDBSCAN - eom - 2016

In [11]:
tmp, out = evaluate_hdbscan(input_df=s_data2016.drop('Fully_Children12M', axis=1), 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='eom',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [12]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
20,1,22,3,18.75,40.62
23,1,25,3,18.75,40.62
19,1,21,3,18.75,40.62
18,1,20,3,18.75,40.62
17,1,19,3,18.75,40.62
16,1,18,3,18.75,40.62
15,1,17,3,18.75,40.62
14,1,16,3,18.75,40.62
13,1,15,3,18.75,40.62
12,1,14,3,18.75,40.62


# Clustering with K-means

In [22]:
kmeans_model = KMeans(n_clusters=4, random_state=0).fit(s_data2011.drop('Fully_Children12M', axis=1))
predicted2016 = kmeans_model.predict(s_data2016.drop('Fully_Children12M', axis=1))
data2011['cluster'] = kmeans_model.labels_
data2016['cluster'] = predicted2016

In [23]:
print(Counter(data2011['cluster']))

Counter({1: 17, 0: 17, 2: 16, 3: 14})


In [24]:
print(Counter(data2016['cluster']))

Counter({1: 17, 0: 17, 2: 16, 3: 14})


In [25]:
mean_cluster_values2011 = data2011.groupby('cluster').mean().transpose()
mean_cluster_values2011.head()

cluster,0,1,2,3
07Vaccine&LogisticsstockofUpazilaMunCC: Differences between Pentavalent doses and vial uses,489.588235,172.058824,126.5,-769.357143
04Newborn: % of nurse trained on IMCI working at SCANU,56.459477,34.919608,33.250694,49.642063
PENTA2_Children23M,97.323529,98.117647,97.76875,97.021429
Fully_Children23M,82.629412,84.2,84.10625,81.007143
PENTA1_Children12M,98.788235,99.064706,99.05625,98.764286


In [26]:
mean_cluster_values2016 = data2016.groupby('cluster').mean().transpose()
mean_cluster_values2016.head()

cluster,0,1,2,3
07Vaccine&LogisticsstockofUpazilaMunCC: Differences between Pentavalent doses and vial uses,10568.117647,10232.0,10271.5625,56394.214286
04Newborn: % of nurse trained on IMCI working at SCANU,85.164097,46.945842,48.071121,57.320197
PENTA2_Children23M,97.411765,97.6,96.6875,97.585714
Fully_Children23M,87.205882,87.082353,86.56875,88.671429
PENTA1_Children12M,97.911765,97.864706,97.225,98.321429


In [27]:
percentdiff_cluster_values = ((mean_cluster_values2016 - mean_cluster_values2011) / mean_cluster_values2011) * 100
percentdiff_cluster_values = percentdiff_cluster_values.round(2)

In [28]:
percentdiff_cluster_values.head()

cluster,0,1,2,3
07Vaccine&LogisticsstockofUpazilaMunCC: Differences between Pentavalent doses and vial uses,2058.57,5846.8,8019.81,-7430.04
04Newborn: % of nurse trained on IMCI working at SCANU,50.84,34.44,44.57,15.47
PENTA2_Children23M,0.09,-0.53,-1.11,0.58
Fully_Children23M,5.54,3.42,2.93,9.46
PENTA1_Children12M,-0.89,-1.21,-1.85,-0.45


In [29]:
percentdiff_cluster_values.sort_values(by=0 ,ascending=False)

cluster,0,1,2,3
07Vaccine&LogisticsstockofUpazilaMunCC: Differences between Pentavalent doses and vial uses,2058.57,5846.8,8019.81,-7430.04
imp11subdistr_ngothanaprocessNGO_Percent_Implant,83.78,106.67,100.18,80.07
Imp12DistrNGOMonthThana_Percent_Implant,83.78,106.67,100.18,80.07
04Newborn: % of nurse trained on IMCI working at SCANU,50.84,34.44,44.57,15.47
TT5_Mother0-11MChildren,16.14,13.04,32.59,34.08
TT4_Mother0-11MChildren,12.19,10.03,17.99,20.41
Measles_Children23M,6.22,2.45,3.73,6.71
Fully_Children23M,5.54,3.42,2.93,9.46
TT3_Mother0-11MChildren,5.52,4.18,8.89,10.92
prop_married_women_15.45y,4.82,3.97,3.5,3.2


In [30]:
percentdiff_cluster_values.to_csv(OUT+"clustersummary_fullimmunized.csv")