# Correlation analysis, CCA and feature importance: Maternal mortality

## Outcome variables:
- Maternal mortality rate: rate_anc_mortality
- Under 5 mortality rate: rate_under5y_mortality
- Antenatal coverage (ANC): prop_antenatal_coverage
- Proportion of unmet contraceptive need: prop_unmet_need_family_planing
- ORS: 

In [1]:
import re
import collections
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import CCA
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [2]:
def remove_miss_vars(input_df):
    df = input_df.copy(deep=True)
    remove_list = []
    for var in df.columns:
        if any(df[var].isna()):
            remove_list.append(var)
    return df.drop(remove_list, axis=1)

def impute_miss_vars(input_df):
    df = input_df.copy(deep=True)
    for var in df.columns:
        if any(df[var].isna()):
            df[var].fillna(df[var].mean, inplace=True)
    return df

def intersect_dfs(input_df1, input_df2):
    df1 = input_df1.copy(deep=True)
    df2 = input_df2.copy(deep=True)
    subset_var = list(set(list(df1.columns)).intersection(set(list(df2.columns))))
    return df1[subset_var], df2[subset_var]

## STEP 1: Import data and data processing: remove absolute and remove missing values

In [3]:
DATA2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/all2011.csv'
DATA2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/all2016.csv'
DHIS2_VARS = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/DHIS_Rate_Absolute.csv'
OUT = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/'

In [4]:
d2011 = pd.read_csv(DATA2011)
d2016 = pd.read_csv(DATA2016)
dhis2vars = pd.read_csv(DHIS2_VARS)
print(dhis2vars.shape)
tmp = dhis2vars[dhis2vars['Rate_Absolute'] == 'Absolute']
print(tmp.shape)
vars_remove = list(tmp['Full_name'])

(349, 4)
(279, 4)


In [5]:
d2011 = d2011.drop(vars_remove, axis=1)
d2016 = d2016.drop(vars_remove, axis=1)

In [6]:
d2011.shape
d2011 = d2011.set_index(['DistrictName'])
print(d2011.shape)
d2011 = d2011.drop(['DistrictGeo'], axis=1)
print(d2011.shape)
subset_vars = [var for var, var_type in zip(d2011.dtypes.index, d2011.dtypes) if str(var_type) != 'object'] 
d2011 = d2011[subset_vars]
d2011 = d2011.fillna(d2011.mean())
#d2011 = remove_miss_vars(input_df=d2011)
print(d2011.shape)
d2011.head()

(64, 188)
(64, 187)
(64, 187)


Unnamed: 0_level_0,TT1_Mother0-11MChildren,Measles_Children23M,TT4_Mother0-11MChildren,OPV3_Children12M,OPV2_Children23M,OPV3_Children23M,PENTA2_Children12M,TT3_Mother0-11MChildren,PENTA3_Children23M,VitACoverage_Children12-59M,...,prop_pop_rural.1,prop_institutional_delivery,prop_current_contraceptive,prop_female_head,prop_antenatal_care4.,prop_unmet_need_family_planing,prop_pop_women.1,dependency_ratio.1,prop_registered_under5,sex_ratio.1
DistrictName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bagerhat,90.0,88.1,52.9,93.3,96.0,93.3,96.0,71.4,85.6,96.7,...,11.48,30.66,65.52,11.29,26.56,7.89,44.85,74.37,32.84,93.31
Bandarban,89.0,82.3,54.3,88.8,90.8,89.4,90.2,72.4,84.4,83.3,...,0.0,0.0,45.45,11.29,0.0,27.27,127.03,107.5,31.25,84.44
Barguna,99.0,88.1,47.6,94.7,98.6,95.4,98.6,72.4,87.9,96.7,...,6.48,10.56,72.36,2.97,20.97,11.14,25.33,70.73,41.4,105.44
Barisal,97.1,86.8,52.4,94.5,98.1,94.5,98.1,77.1,86.9,79.0,...,8.22,21.53,64.33,5.1,26.53,13.31,24.48,75.53,38.53,89.02
Bhola,98.6,85.0,67.6,93.0,96.5,93.0,95.8,85.7,86.3,87.1,...,7.4,9.64,68.4,5.65,29.51,10.81,26.74,80.16,21.19,94.76


In [7]:
d2016.shape
d2016 = d2016.set_index(['DistrictName'])
print(d2016.shape)
d2016 = d2016.drop(['DistrictGeo'], axis=1)
print(d2016.shape)
subset_vars = [var for var, var_type in zip(d2016.dtypes.index, d2016.dtypes) if str(var_type) != 'object'] 
d2016 = d2016[subset_vars]
d2016 = d2016.fillna(d2016.mean())
# d2016 = remove_miss_vars(input_df=d2016)
print(d2016.shape)
d2016.head()

(64, 188)
(64, 187)
(64, 187)


Unnamed: 0_level_0,TT1_Mother0-11MChildren,Measles_Children23M,TT4_Mother0-11MChildren,OPV3_Children12M,OPV2_Children23M,OPV3_Children23M,PENTA2_Children12M,TT3_Mother0-11MChildren,PENTA3_Children23M,VitACoverage_Children12-59M,...,prop_pop_rural.1,prop_institutional_delivery,prop_current_contraceptive,prop_female_head,prop_antenatal_care4.,prop_unmet_need_family_planing,prop_pop_women.1,dependency_ratio.1,prop_registered_under5,sex_ratio.1
DistrictName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bagerhat,98.1,92.2,61.9,90.8,97.1,91.0,96.8,85.3,91.0,82.0,...,14.38,26.49,67.82,4.17,25.13,11.0,39.86,56.82,37.56,94.78
Bandarban,94.8,89.8,79.6,87.8,94.8,87.8,94.8,86.1,87.8,84.1,...,0.0,75.0,63.64,25.49,75.0,13.64,77.52,45.16,40.0,84.93
Barguna,98.8,94.9,64.4,93.0,98.3,93.6,97.9,88.4,93.6,96.8,...,9.48,27.54,73.3,8.06,47.46,8.22,28.64,68.24,25.1,87.82
Barisal,100.0,97.1,79.3,95.5,99.3,96.0,99.3,96.7,96.0,100.0,...,28.48,46.33,64.26,6.99,32.39,10.08,30.97,59.68,23.4,94.74
Bhola,100.0,96.6,79.0,94.5,99.8,94.5,99.8,94.1,94.5,98.4,...,9.99,11.27,67.28,3.53,15.62,10.01,26.58,69.35,17.64,101.71


## STEP 2: Outcome variables: Maternal mortality rate

### Maternal mortality rate

In [8]:
print(f"Mean: {d2011['prop_antenatal_coverage'].mean()}",
      f" Standard deviation: {d2011['prop_antenatal_coverage'].std()}")

Mean: 37.97062499999999  Standard deviation: 13.593624307939248


In [9]:
print(f"Mean: {d2016['prop_antenatal_coverage'].mean()}",
      f" Standard deviation: {d2016['prop_antenatal_coverage'].std()}")

Mean: 56.92234374999999  Standard deviation: 17.352643432692975


### Scaled and normalized data


In [10]:
print(d2011.shape)
drop_columns = []
for var in d2011.columns:
    if "index" in var:
        drop_columns.append(var)
drop_columns = list(set(drop_columns))
print(drop_columns)
d2011.drop(drop_columns, inplace=True, axis=1)
print(d2011.shape)

(64, 187)
[]
(64, 187)


In [11]:
print(d2016.shape)
drop_columns = []
for var in d2016.columns:
    if "index" in var:
        drop_columns.append(var)
drop_columns = list(set(drop_columns))
print(drop_columns)
d2016.drop(drop_columns, inplace=True, axis=1)
print(d2016.shape)

(64, 187)
[]
(64, 187)


In [12]:
s_data2011 = StandardScaler().fit_transform(d2011)
s_data2011 = pd.DataFrame(s_data2011, columns=d2011.columns)
print(s_data2011.shape)
s_data2011 = remove_miss_vars(input_df=s_data2011)
print(s_data2011.shape)

(64, 187)
(64, 187)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [13]:
d2016[d2016==np.inf]=np.nan
d2016.fillna(d2016.mean(), inplace=True)
s_data2016 = StandardScaler().fit_transform(d2016)
s_data2016 = pd.DataFrame(s_data2016, columns=d2016.columns)
print(s_data2016.shape)
s_data2016 = remove_miss_vars(input_df=s_data2016)
print(s_data2016.shape)


(64, 187)
(64, 187)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


### Performing PCA on both years 2011 and 2016: Preliminary

In [14]:
pca = PCA(.95)
pca2011 = pca.fit(s_data2011)
print(pca.n_components_)

40


In [15]:
pca = PCA(.95)
pca2016 = pca.fit(s_data2016)
print(pca.n_components_)

42


## Performing Canonical Correlation Analysis (CCA) and correlation analysis for 2011
- The aim of this part is to identify variables highly correlated with maternal mortality rate

### Canonical Correlation Analysis (CCA) 2011

In [16]:
cca_anc = CCA(copy=True, max_iter=1000, n_components=40, scale=True, tol=1e-06)
cca_anc.fit(s_data2011.drop('prop_antenatal_coverage', axis=1),
                 s_data2011['prop_antenatal_coverage'])
print(cca_anc.score(s_data2011.drop('prop_antenatal_coverage', axis=1),
                         s_data2011['prop_antenatal_coverage']))

0.9997458205677789




In [17]:
CCA_coeff_anc = pd.DataFrame({'Indicators': list(s_data2011.drop('prop_antenatal_coverage', axis=1).columns),
                                   'CCA_coeff': cca_anc.coef_[:,0],
                                   'CCA_coeff_abs': np.absolute(cca_anc.coef_[:,0]),})
CCA_coeff_anc.sort_values(by='CCA_coeff_abs', ascending =False).head()
CCA_coeff_anc = CCA_coeff_anc[CCA_coeff_anc['CCA_coeff_abs'] > 0.1]
display(CCA_coeff_anc.head())
print(CCA_coeff_anc.shape)
CCA_coeff_anc.to_csv(OUT+'/cca_antenatal_coverage_2011.csv', index=False, index_label=False)

Unnamed: 0,Indicators,CCA_coeff,CCA_coeff_abs
2,TT4_Mother0-11MChildren,-0.176999,0.176999
15,TT5_Mother0-11MChildren,-0.14678,0.14678
25,prop_women_15.45y_overwomen,-0.134148,0.134148
30,prop_registered_births,0.13442,0.13442
35,rate_maternal_mortality,-0.10195,0.10195


(28, 3)


### Correlation analysis 2011

In [18]:
import scipy.stats  as stats
all(s_data2011.columns == s_data2016.columns)

True

In [19]:
corr2011 = s_data2011.corr()

In [20]:
antenatal_coverage_2011 = corr2011[['prop_antenatal_coverage']]
antenatal_coverage_2011['abs_antenatal_coverage'] = np.absolute(antenatal_coverage_2011['prop_antenatal_coverage'])
antenatal_coverage_2011 = antenatal_coverage_2011.sort_values(by='abs_antenatal_coverage', ascending=False)
antenatal_coverage_2011.drop('prop_antenatal_coverage',axis=0, inplace=True)
antenatal_coverage_2011.reset_index(inplace=True)
corr_pvalues = []
for var in antenatal_coverage_2011['index']:
    pvalue = stats.pearsonr(s_data2011[var], s_data2011['prop_antenatal_coverage'])[1]
    corr_pvalues.append(pvalue)
antenatal_coverage_2011['p_value'] = corr_pvalues
antenatal_coverage_2011 = antenatal_coverage_2011[antenatal_coverage_2011['p_value'] < 0.1]
antenatal_coverage_2011.sort_values(by = 'abs_antenatal_coverage', ascending=False)
display(antenatal_coverage_2011)
print(antenatal_coverage_2011.shape)
antenatal_coverage_2011.to_csv(OUT+'/corr_antenatal_coverage_2011.csv', index=False, index_label=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  r = r_num / r_den


Unnamed: 0,index,prop_antenatal_coverage,abs_antenatal_coverage,p_value
0,prop_caesarean,0.684099,0.684099,4.623235e-10
1,prop_attendant_delivery,0.671220,0.671220,1.280299e-09
2,prop_institutional_delivery,0.632610,0.632610,2.048893e-08
3,prop_women_15.45y_overwomen.1,0.543440,0.543440,3.479477e-06
4,dependency_ratio.1,-0.517718,0.517718,1.180959e-05
5,prop_live_births,0.466903,0.466903,1.005824e-04
6,rate_under5y_mortality,-0.459151,0.459151,1.355096e-04
7,prop_antenatal_care4.,0.455553,0.455553,1.552391e-04
8,imp12distr_monthprocessImp12DistrMonthThana_Pe...,0.420163,0.420163,5.471984e-04
9,imp11subdistr_thanaprocessPercent_PerFemale,0.420026,0.420026,5.497310e-04


(65, 4)


### Combine results

In [21]:
antenatal_coverage_vars_2011 = list(set(CCA_coeff_anc['Indicators']).union(set(antenatal_coverage_2011['index'])))
len(list(set(CCA_coeff_anc['Indicators']).union(set(antenatal_coverage_2011['index']))))

84

## Performing Canonical Correlation Analysis (CCA) and correlation analysis for 2016
- The aim of this part is to identify variables highly correlated with maternal mortality rate

### Canonical Correlation Analysis (CCA) 2016

In [22]:
cca_anc = CCA(copy=True, max_iter=500, n_components=40, scale=True, tol=1e-06)
cca_anc.fit(s_data2016.drop('prop_antenatal_coverage', axis=1),
                 s_data2016['prop_antenatal_coverage'])
print(cca_anc.score(s_data2016.drop('prop_antenatal_coverage', axis=1),
                         s_data2016['prop_antenatal_coverage']))

0.9562493144871785




In [23]:
CCA_coeff_anc = pd.DataFrame({'Indicators': list(s_data2016.drop('prop_antenatal_coverage', axis=1).columns),
                                   'CCA_coeff': cca_anc.coef_[:,0],
                                   'CCA_coeff_abs': np.absolute(cca_anc.coef_[:,0]),})
CCA_coeff_anc.sort_values(by='CCA_coeff_abs', ascending =False).head()
CCA_coeff_anc = CCA_coeff_anc[CCA_coeff_anc['CCA_coeff_abs'] > 0.1]
display(CCA_coeff_anc.head())
print(CCA_coeff_anc.shape)
CCA_coeff_anc.to_csv(OUT+'/cca_antenatal_coverage_2016.csv', index=False, index_label=False)

Unnamed: 0,Indicators,CCA_coeff,CCA_coeff_abs
9,VitACoverage_Children12-59M,-0.205407,0.205407
34,prop_deaths_rural,-0.104371,0.104371
36,prop_pop_women,0.119128,0.119128
37,dependency_ratio,-0.12436,0.12436
38,sex_ratio,-0.115102,0.115102


(28, 3)


### Correlation analysis 2016

In [24]:
import scipy.stats  as stats
all(s_data2011.columns == s_data2016.columns)

True

In [25]:
corr2016 = s_data2016.corr()

In [26]:
antenatal_coverage_2016 = corr2016[['prop_antenatal_coverage']]
antenatal_coverage_2016['abs_antenatal_coverage'] = np.absolute(antenatal_coverage_2016['prop_antenatal_coverage'])
antenatal_coverage_2016 = antenatal_coverage_2016.sort_values(by='abs_antenatal_coverage', ascending=False)
antenatal_coverage_2016.drop('prop_antenatal_coverage',axis=0, inplace=True)
antenatal_coverage_2016.reset_index(inplace=True)
corr_pvalues = []
for var in antenatal_coverage_2016['index']:
    pvalue = stats.pearsonr(s_data2016[var], s_data2016['prop_antenatal_coverage'])[1]
    corr_pvalues.append(pvalue)
antenatal_coverage_2016['p_value'] = corr_pvalues
antenatal_coverage_2016 = antenatal_coverage_2016[antenatal_coverage_2016['p_value'] < 0.1]
antenatal_coverage_2016.sort_values(by = 'abs_antenatal_coverage', ascending=False)
display(antenatal_coverage_2016)
print(antenatal_coverage_2016.shape)
antenatal_coverage_2016.to_csv(OUT+'/corr_antenatal_coverage_2016.csv', index=False, index_label=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  r = r_num / r_den


Unnamed: 0,index,prop_antenatal_coverage,abs_antenatal_coverage,p_value
0,prop_attendant_delivery,0.774294,0.774294,6.170175e-14
1,prop_institutional_delivery,0.772737,0.772737,7.44453e-14
2,prop_caesarean,0.74155,0.74155,2.405134e-12
3,prop_antenatal_care4.,0.49564,0.49564,3.123136e-05
4,dependency_ratio.1,-0.40646,0.40646,0.0008599801
5,prop_women_15.45y_overwomen.1,0.400571,0.400571,0.00103838
6,07Vaccine&LogisticsstockofUpazilaMunCC: upazil...,-0.288421,0.288421,0.02081952
7,VitACoverage_Children12-59M,-0.266893,0.266893,0.03301491
8,Imp12DistrNGOMonthThana_CAR,-0.265818,0.265818,0.03375458
9,imp11subdistr_ngothanaprocessNGO_CAR,-0.265818,0.265818,0.03375458


(18, 4)


### Combine results

In [27]:
antenatal_coverage_vars_2016 = list(set(CCA_coeff_anc['Indicators']).union(set(antenatal_coverage_2016['index'])))
len(list(set(CCA_coeff_anc['Indicators']).union(set(antenatal_coverage_2016['index']))))

36

## Intersect variables

In [28]:
print(len(antenatal_coverage_vars_2011))
print(len(antenatal_coverage_vars_2016))
antenatal_coverage_vars_union = list(set(antenatal_coverage_vars_2011).union(set(antenatal_coverage_vars_2016)))
antenatal_coverage_vars_inter = list(set(antenatal_coverage_vars_2011).intersection(set(antenatal_coverage_vars_2016)))
print(len(antenatal_coverage_vars_union))
print(len(antenatal_coverage_vars_inter))

84
36
101
19


## Performing HDBSCAN or Kmean clustering: Maternal mortality rate 

## TO DO LIST:

- Evaluate the optimal number of clusters using HDBSCAN and K-means
- Run clustering on 2011 and predict clusters for 2016
- Take average values for clusters for maternal mortality for 2011 and 2016
- Take average values for all variables in corresponding clustering option for 2011 and 2016
- Take difference for maternal mortality and all other indicators
- Create spreadsheet and share with Marelize

In [29]:
import os
import re
import glob
import conda
import hdbscan
import operator
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from config import Config
from collections import Counter
from matplotlib import pyplot as plt
conda_file_dir = conda.__file__
conda_dir = conda_file_dir.split('lib')[0]
proj_lib = os.path.join(os.path.join(conda_dir, 'share'), 'proj')
os.environ["PROJ_LIB"] = proj_lib
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.lines import Line2D
from matplotlib.collections import PatchCollection
from sklearn.cluster import KMeans

In [30]:
def evaluate_hdbscan(input_df, min_samples, min_cluster_size, 
                     output, cluster_selection_method, 
                     fmin_samples, fmin_cluster_size,
                     prune=False, plot=True):
    samples = list(itertools.product(min_samples, min_cluster_size))
    counter = 0
    models = pd.DataFrame(columns=['min_samples',
                                   'min_cluster_size',
                                   'num_clusters_including_unclustered',
                                   'percent_of_unclustered_geos',
                                   'percent_of_maxclass',],index=range(len(samples)))
    #geo = input_df['index']
    #input_df = input_df.drop('index', axis=1)
    df = input_df.copy(deep=True)
    for iteration in samples:
        model = hdbscan.HDBSCAN(min_samples=int(iteration[0]), 
                                min_cluster_size=int(iteration[1]), 
                                metric='euclidean', 
                                algorithm='best',
                                cluster_selection_method=cluster_selection_method, prediction_data=False).fit(df)
        models.loc[counter,'min_cluster_size'] = iteration[1]
        models.loc[counter, 'min_samples'] = iteration[0]
        models.loc[counter, 'num_clusters_including_unclustered'] = len(Counter(model.labels_))
        tmp_dict = dict(Counter(model.labels_))
        total = sum([v for k,v in tmp_dict.items()])
        tmp_dict = {k:round(v/total*100,2) for k,v in tmp_dict.items()}
        try:
            models.loc[counter, 'percent_of_unclustered_geos'] = tmp_dict.pop(-1)
        except KeyError as error:
            models.loc[counter, 'percent_of_unclustered_geos'] = 0 
        if len(tmp_dict) > 1:
            models.loc[counter, 'percent_of_maxclass'] = tmp_dict[max(tmp_dict.items(), key=operator.itemgetter(1))[0]]
        else:
            models.loc[counter, 'percent_of_maxclass'] = 100
        counter += 1
    if prune:
        out_model = hdbscan.HDBSCAN(min_samples=int(fmin_samples), 
                                min_cluster_size=int(fmin_cluster_size), 
                                metric='euclidean', 
                                algorithm='best',
                                cluster_selection_method=cluster_selection_method, prediction_data=False).fit(df)

    else:
        out_model = None

    if plot:
        plt.rcParams['figure.figsize'] = [20,10]
        plt.plot(models['num_clusters_including_unclustered'], label='Number of clusters including unclustered')
        plt.plot(models['percent_of_unclustered_geos'], label='Percent of unclustered geographies')
        plt.plot(models['percent_of_maxclass'], label='Size of larges cluster (%)')
        plt.xlabel("Iterations", fontsize=20)
        plt.ylabel("Value", fontsize=20)
        plt.savefig(os.path.split(output)[1] + "/finetune_parameteres.jpeg")
        plt.legend()
        plt.show()
    del(input_df, df)
    return models, out_model

In [31]:
DATA2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/antenatal_coverage_2011.csv'
DATA2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/antenatal_coverage_2016.csv'
SDATA2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/s_antenatal_coverage_2011.csv'
SDATA2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/s_antenatal_coverage_2016.csv'
OUT = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/'

## Evaluate clustering method: HDBSCAN - leaf - 2011

In [32]:
tmp, out = evaluate_hdbscan(input_df=s_data2011[antenatal_coverage_vars_union], 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='leaf',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [33]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
51,2,5,3,56.25,35.94
2,1,4,3,56.25,35.94
3,1,5,3,56.25,35.94
50,2,4,3,56.25,35.94
49,2,3,4,57.81,29.69
1,1,3,4,57.81,29.69
336,8,2,3,75.0,21.88
96,3,2,5,75.0,15.62
144,4,2,5,79.69,9.38
145,4,3,3,85.94,9.38


## Evaluate clustering method: HDBSCAN - eom - 2011

In [34]:
tmp, out = evaluate_hdbscan(input_df=s_data2011[antenatal_coverage_vars_union], 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='eom',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [35]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
0,1,2,5,18.75,71.88
48,2,2,5,18.75,71.88
96,3,2,3,29.69,67.19
144,4,2,3,48.44,48.44
3,1,5,3,56.25,35.94
2,1,4,3,56.25,35.94
51,2,5,3,56.25,35.94
50,2,4,3,56.25,35.94
49,2,3,4,57.81,29.69
1,1,3,4,57.81,29.69


## Evaluate clustering method: HDBSCAN - leaf - 2016

In [36]:
tmp, out = evaluate_hdbscan(input_df=s_data2016[antenatal_coverage_vars_union], 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='leaf',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [37]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
0,1,2,8,73.44,7.81
48,2,2,8,73.44,7.81
96,3,2,3,93.75,3.12
294,7,8,1,100.0,100.0
293,7,7,1,100.0,100.0
292,7,6,1,100.0,100.0
291,7,5,1,100.0,100.0
290,7,4,1,100.0,100.0
289,7,3,1,100.0,100.0
288,7,2,1,100.0,100.0


## Evaluate clustering method: HDBSCAN - eom - 2016

In [38]:
tmp, out = evaluate_hdbscan(input_df=s_data2016[antenatal_coverage_vars_union], 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='eom',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [39]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
0,1,2,4,43.75,50.0
48,2,2,4,43.75,50.0
96,3,2,3,93.75,3.12
294,7,8,1,100.0,100.0
293,7,7,1,100.0,100.0
292,7,6,1,100.0,100.0
291,7,5,1,100.0,100.0
290,7,4,1,100.0,100.0
289,7,3,1,100.0,100.0
288,7,2,1,100.0,100.0


# Clustering with K-means

### Clustering Union List

In [40]:
kmeans_model = KMeans(n_clusters=4, random_state=0).fit(s_data2011[antenatal_coverage_vars_union])
predicted2016 = kmeans_model.predict(s_data2016[antenatal_coverage_vars_union])
d2011['cluster'] = kmeans_model.labels_
d2016['cluster'] = predicted2016

In [41]:
antenatal_coverage_vars_union.append('cluster')
d2011[antenatal_coverage_vars_union].to_csv(OUT+'/clusters_antenatal_coverage_union_2011.csv')
d2016[antenatal_coverage_vars_union].to_csv(OUT+'/clusters_antenatal_coverage_union_2016.csv')
print(d2011[antenatal_coverage_vars_union].shape)
print(d2016[antenatal_coverage_vars_union].shape)

(64, 102)
(64, 102)


In [42]:
print(Counter(d2011['cluster']))

Counter({2: 30, 1: 19, 0: 8, 3: 7})


In [43]:
print(Counter(d2016['cluster']))

Counter({2: 36, 1: 16, 0: 7, 3: 5})


### Clustering Intersection List

In [44]:
d2011 = d2011.drop('cluster', axis=1)
d2016 = d2016.drop('cluster', axis=1)

In [45]:
kmeans_model = KMeans(n_clusters=4, random_state=0).fit(s_data2011[antenatal_coverage_vars_inter])
predicted2016 = kmeans_model.predict(s_data2016[antenatal_coverage_vars_inter])
d2011['cluster'] = kmeans_model.labels_
d2016['cluster'] = predicted2016

In [46]:
print(Counter(d2011['cluster']))

Counter({1: 20, 2: 20, 3: 17, 0: 7})


In [47]:
print(Counter(d2016['cluster']))

Counter({1: 25, 2: 18, 3: 14, 0: 7})


In [48]:
antenatal_coverage_vars_inter.append('cluster')
d2011[antenatal_coverage_vars_inter].to_csv(OUT+'/clusters_antenatal_coverage_intersect_2011.csv')
d2016[antenatal_coverage_vars_inter].to_csv(OUT+'/clusters_antenatal_coverage_intersect_2016.csv')
print(d2011[antenatal_coverage_vars_inter].shape)
print(d2016[antenatal_coverage_vars_inter].shape)

(64, 20)
(64, 20)


In [49]:
tmp1 = d2011[antenatal_coverage_vars_inter]
tmp1 = tmp1.reset_index(col_level='DistricName')
tmp1['year'] = 2011
tmp2 = d2016[antenatal_coverage_vars_inter]
tmp2 = tmp2.reset_index(col_level='DistricName')
tmp2['year'] = 2016
tmp = pd.concat([tmp1, tmp2], axis=0)
print(tmp.shape)
tmp.to_csv(OUT+'clusters_antenatal_coverage_intersect_all.csv', index=False)

(128, 22)


In [50]:
tmp1 = d2011[antenatal_coverage_vars_union]
tmp1 = tmp1.reset_index(col_level='DistricName')
tmp1['year'] = 2011
tmp2 = d2016[antenatal_coverage_vars_union]
tmp2 = tmp2.reset_index(col_level='DistricName')
tmp2['year'] = 2016
tmp = pd.concat([tmp1, tmp2], axis=0)
print(tmp.shape)
tmp.to_csv(OUT+'clusters_antenatal_coverage_union_all.csv', index=False)

(128, 104)
