# Correlation analysis, CCA and feature importance

## Outcome variables:
- Maternal mortality rate: rate_maternal_mortality
- Under 5 mortality rate: rate_under5y_mortality
- Antenatal coverage (ANC): prop_antenatal_coverage
- Proportion of unmet contraceptive need: prop_unmet_need_family_planing
- ORS: 

In [1]:
import re
import collections
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import CCA
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [2]:
def remove_miss_vars(input_df):
    df = input_df.copy(deep=True)
    remove_list = []
    for var in df.columns:
        if any(df[var].isna()):
            remove_list.append(var)
    return df.drop(remove_list, axis=1)

def impute_miss_vars(input_df):
    df = input_df.copy(deep=True)
    for var in df.columns:
        if any(df[var].isna()):
            df[var].fillna(df[var].mean, inplace=True)
    return df

def intersect_dfs(input_df1, input_df2):
    df1 = input_df1.copy(deep=True)
    df2 = input_df2.copy(deep=True)
    subset_var = list(set(list(df1.columns)).intersection(set(list(df2.columns))))
    return df1[subset_var], df2[subset_var]

## STEP 1: Import data and data processing: remove absolute and remove missing values

In [3]:
DATA2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/all2011.csv'
DATA2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/all2016.csv'
DHIS2_VARS = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/DHIS_Rate_Absolute.csv'
OUT = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/'

In [4]:
d2011 = pd.read_csv(DATA2011)
d2016 = pd.read_csv(DATA2016)
dhis2vars = pd.read_csv(DHIS2_VARS)
print(dhis2vars.shape)
tmp = dhis2vars[dhis2vars['Rate_Absolute'] == 'Absolute']
print(tmp.shape)
vars_remove = list(tmp['Full_name'])

(349, 4)
(279, 4)


In [5]:
d2011 = d2011.drop(vars_remove, axis=1)
d2016 = d2016.drop(vars_remove, axis=1)

In [6]:
d2011.shape
d2011 = d2011.set_index(['DistrictName'])
print(d2011.shape)
d2011 = d2011.drop(['DistrictGeo'], axis=1)
print(d2011.shape)
subset_vars = [var for var, var_type in zip(d2011.dtypes.index, d2011.dtypes) if str(var_type) != 'object'] 
d2011 = d2011[subset_vars]
d2011 = d2011.fillna(d2011.mean())
#d2011 = remove_miss_vars(input_df=d2011)
print(d2011.shape)
d2011.head()

(64, 188)
(64, 187)
(64, 187)


Unnamed: 0_level_0,BCG_Children12M,PENTA1_Children23M,OPV2_Children12M,PENTA2_Children12M,PENTA2_Children23M,Fully_Children12M,Measles_Children23M,PENTA1_Children12M,PENTA3_Children12M,TT4_Mother0-11MChildren,...,prop_current_contraceptive,prop_pop_women.1,prop_unmet_need_family_planing,prop_pop_rural_women.1,prop_female_head,prop_pop_rural.1,prop_women_15.45y_overwomen.1,prop_antenatal_coverage,dependency_ratio.1,prop_caesarean
DistrictName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bagerhat,98.1,98.1,96.0,96.0,96.0,78.3,88.1,98.1,85.6,52.9,...,65.52,44.85,7.89,5.67,11.29,11.48,46.74,29.91,74.37,15.74
Bandarban,94.3,94.3,90.8,90.2,90.2,73.1,82.3,94.3,83.8,54.3,...,45.45,127.03,27.27,0.0,11.29,0.0,35.56,30.0,107.5,0.0
Barguna,100.0,100.0,98.6,98.6,98.6,74.4,88.1,100.0,87.3,47.6,...,72.36,25.33,11.14,2.98,2.97,6.48,49.26,37.91,70.73,5.21
Barisal,98.6,98.6,98.1,98.1,98.1,73.4,86.8,98.6,86.9,52.4,...,64.33,24.48,13.31,4.37,5.1,8.22,48.17,35.21,75.53,13.06
Bhola,100.0,100.0,96.5,95.8,95.8,74.7,85.0,100.0,86.3,67.6,...,68.4,26.74,10.81,3.86,5.65,7.4,45.49,25.64,80.16,3.75


In [7]:
d2016.shape
d2016 = d2016.set_index(['DistrictName'])
print(d2016.shape)
d2016 = d2016.drop(['DistrictGeo'], axis=1)
print(d2016.shape)
subset_vars = [var for var, var_type in zip(d2016.dtypes.index, d2016.dtypes) if str(var_type) != 'object'] 
d2016 = d2016[subset_vars]
d2016 = d2016.fillna(d2016.mean())
# d2016 = remove_miss_vars(input_df=d2016)
print(d2016.shape)
d2016.head()

(64, 188)
(64, 187)
(64, 187)


Unnamed: 0_level_0,BCG_Children12M,PENTA1_Children23M,OPV2_Children12M,PENTA2_Children12M,PENTA2_Children23M,Fully_Children12M,Measles_Children23M,PENTA1_Children12M,PENTA3_Children12M,TT4_Mother0-11MChildren,...,prop_current_contraceptive,prop_pop_women.1,prop_unmet_need_family_planing,prop_pop_rural_women.1,prop_female_head,prop_pop_rural.1,prop_women_15.45y_overwomen.1,prop_antenatal_coverage,dependency_ratio.1,prop_caesarean
DistrictName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bagerhat,98.2,97.4,96.8,96.8,97.1,86.8,92.2,97.4,90.8,61.9,...,67.82,39.86,11.0,7.53,4.17,14.38,48.65,48.69,56.82,22.38
Bandarban,99.0,96.3,94.8,94.8,94.8,80.9,89.8,96.3,87.8,79.6,...,63.64,77.52,13.64,0.0,25.49,0.0,54.79,100.0,45.16,50.0
Barguna,99.7,98.8,97.9,97.9,98.3,87.6,94.9,98.8,93.0,64.4,...,73.3,28.64,8.22,5.13,8.06,9.48,44.84,37.97,68.24,22.9
Barisal,99.7,99.1,99.3,99.3,99.3,91.0,97.1,99.1,95.5,79.3,...,64.26,30.97,10.08,13.91,6.99,28.48,51.1,64.01,59.68,31.22
Bhola,99.8,99.8,99.8,99.8,99.8,91.3,96.6,99.8,94.5,79.0,...,67.28,26.58,10.01,5.06,3.53,9.99,47.39,29.44,69.35,4.01


## STEP 2: Outcome variables: Maternal mortality rate

### Maternal mortality rate

In [8]:
print(f"Mean: {d2011['rate_maternal_mortality'].mean()}",
      f" Standard deviation: {d2011['rate_maternal_mortality'].std()}")

Mean: 3.7296874999999994  Standard deviation: 4.17278939432187


In [9]:
print(f"Mean: {d2016['rate_maternal_mortality'].mean()}",
      f" Standard deviation: {d2016['rate_maternal_mortality'].std()}")

Mean: 2.65078125  Standard deviation: 4.957185358804691


### Scaled and normalized data


In [10]:
print(d2011.shape)
drop_columns = []
for var in d2011.columns:
    if "index" in var:
        drop_columns.append(var)
drop_columns = list(set(drop_columns))
print(drop_columns)
d2011.drop(drop_columns, inplace=True, axis=1)
print(d2011.shape)

(64, 187)
[]
(64, 187)


In [11]:
print(d2016.shape)
drop_columns = []
for var in d2016.columns:
    if "index" in var:
        drop_columns.append(var)
drop_columns = list(set(drop_columns))
print(drop_columns)
d2016.drop(drop_columns, inplace=True, axis=1)
print(d2016.shape)

(64, 187)
[]
(64, 187)


In [12]:
s_data2011 = StandardScaler().fit_transform(d2011)
s_data2011 = pd.DataFrame(s_data2011, columns=d2011.columns)
print(s_data2011.shape)
s_data2011 = remove_miss_vars(input_df=s_data2011)
print(s_data2011.shape)

(64, 187)
(64, 187)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [13]:
d2016[d2016==np.inf]=np.nan
d2016.fillna(d2016.mean(), inplace=True)
s_data2016 = StandardScaler().fit_transform(d2016)
s_data2016 = pd.DataFrame(s_data2016, columns=d2016.columns)
print(s_data2016.shape)
s_data2016 = remove_miss_vars(input_df=s_data2016)
print(s_data2016.shape)


(64, 187)
(64, 187)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


### Performing PCA on both years 2011 and 2016: Preliminary

In [14]:
pca = PCA(.95)
pca2011 = pca.fit(s_data2011)
print(pca.n_components_)

40


In [15]:
pca = PCA(.95)
pca2016 = pca.fit(s_data2016)
print(pca.n_components_)

42


## Performing Canonical Correlation Analysis (CCA) and correlation analysis for 2011
- The aim of this part is to identify variables highly correlated with maternal mortality rate

### Canonical Correlation Analysis (CCA) 2011

In [16]:
cca_maternal = CCA(copy=True, max_iter=1000, n_components=40, scale=True, tol=1e-06)
cca_maternal.fit(s_data2011.drop('rate_maternal_mortality', axis=1),
                 s_data2011['rate_maternal_mortality'])
print(cca_maternal.score(s_data2011.drop('rate_maternal_mortality', axis=1),
                         s_data2011['rate_maternal_mortality']))

0.8719748512696558




In [17]:
CCA_coeff_maternal = pd.DataFrame({'Indicators': list(s_data2011.drop('rate_maternal_mortality', axis=1).columns),
                                   'CCA_coeff': cca_maternal.coef_[:,0],
                                   'CCA_coeff_abs': np.absolute(cca_maternal.coef_[:,0]),})
CCA_coeff_maternal.sort_values(by='CCA_coeff_abs', ascending =False).head()
CCA_coeff_maternal = CCA_coeff_maternal[CCA_coeff_maternal['CCA_coeff_abs'] > 0.1]
display(CCA_coeff_maternal.head())
print(CCA_coeff_maternal.shape)
CCA_coeff_maternal.to_csv(OUT+'/cca_maternal_mortality_2016.csv', index=False, index_label=False)

Unnamed: 0,Indicators,CCA_coeff,CCA_coeff_abs
10,TT1_Mother0-11MChildren,-0.271485,0.271485
11,VitACoverage_Children12-59M,-0.10422,0.10422
14,Measles_Children12M,0.109101,0.109101
15,TT2_Mother0-11MChildren,-0.158135,0.158135
19,TT5_Mother0-11MChildren,0.114816,0.114816


(28, 3)


### Correlation analysis 2011

In [18]:
import scipy.stats  as stats
all(s_data2011.columns == s_data2016.columns)

True

In [19]:
corr2011 = s_data2011.corr()

In [20]:
maternal_mortality_2011 = corr2011[['rate_maternal_mortality']]
maternal_mortality_2011['abs_rate_maternal_mortality'] = np.absolute(maternal_mortality_2011['rate_maternal_mortality'])
maternal_mortality_2011 = maternal_mortality_2011.sort_values(by='abs_rate_maternal_mortality', ascending=False)
maternal_mortality_2011.drop('rate_maternal_mortality',axis=0, inplace=True)
maternal_mortality_2011.reset_index(inplace=True)
corr_pvalues = []
for var in maternal_mortality_2011['index']:
    pvalue = stats.pearsonr(s_data2011[var], s_data2011['rate_maternal_mortality'])[1]
    corr_pvalues.append(pvalue)
maternal_mortality_2011['p_value'] = corr_pvalues
maternal_mortality_2011 = maternal_mortality_2011[maternal_mortality_2011['p_value'] < 0.1]
maternal_mortality_2011.sort_values(by = 'abs_rate_maternal_mortality', ascending=False)
display(maternal_mortality_2011)
print(maternal_mortality_2011.shape)
maternal_mortality_2011.to_csv(OUT+'/corr_maternal_mortality_2011.csv', index=False, index_label=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  r = r_num / r_den


Unnamed: 0,index,rate_maternal_mortality,abs_rate_maternal_mortality,p_value
0,TT2_Mother0-11MChildren,-0.39328,0.39328,0.001305
1,TT1_Mother0-11MChildren,-0.379661,0.379661,0.001974
2,TT3_Mother0-11MChildren,-0.33761,0.33761,0.006367
3,04Newborn: Percentage of newborns delivered by...,0.29137,0.29137,0.019494
4,03Immunization: Penta 1 to MR 2 drop out Rate,0.271447,0.271447,0.03003
5,Imp12DistrNGOMonthThana_Percent_Injectable,0.264959,0.264959,0.034356
6,imp11subdistr_ngothanaprocessNGO_Percent_Injec...,0.264959,0.264959,0.034356
7,03Immunization: TT vial wastage rate,-0.252406,0.252406,0.044203
8,04Newborn: Nurse Bed ratio at SCANU (Recommend...,-0.239982,0.239982,0.056129
9,03Immunization: MR 2 Crude Coverage %,-0.224465,0.224465,0.07456


(12, 4)


### Combine results

In [21]:
maternal_mortality_vars_2011 = list(set(CCA_coeff_maternal['Indicators']).union(set(maternal_mortality_2011['index'])))
len(list(set(CCA_coeff_maternal['Indicators']).union(set(maternal_mortality_2011['index']))))

34

## Performing Canonical Correlation Analysis (CCA) and correlation analysis for 2016
- The aim of this part is to identify variables highly correlated with maternal mortality rate

### Canonical Correlation Analysis (CCA) 2016

In [22]:
cca_maternal = CCA(copy=True, max_iter=500, n_components=40, scale=True, tol=1e-06)
cca_maternal.fit(s_data2016.drop('rate_maternal_mortality', axis=1),
                 s_data2016['rate_maternal_mortality'])
print(cca_maternal.score(s_data2016.drop('rate_maternal_mortality', axis=1),
                         s_data2016['rate_maternal_mortality']))

0.8921521460955979




In [23]:
CCA_coeff_maternal = pd.DataFrame({'Indicators': list(s_data2016.drop('rate_maternal_mortality', axis=1).columns),
                                   'CCA_coeff': cca_maternal.coef_[:,0],
                                   'CCA_coeff_abs': np.absolute(cca_maternal.coef_[:,0]),})
CCA_coeff_maternal.sort_values(by='CCA_coeff_abs', ascending =False).head()
CCA_coeff_maternal = CCA_coeff_maternal[CCA_coeff_maternal['CCA_coeff_abs'] > 0.1]
display(CCA_coeff_maternal.head())
print(CCA_coeff_maternal.shape)
CCA_coeff_maternal.to_csv(OUT+'/cca_maternal_mortality_2016.csv', index=False, index_label=False)

Unnamed: 0,Indicators,CCA_coeff,CCA_coeff_abs
9,TT4_Mother0-11MChildren,-0.109596,0.109596
10,TT1_Mother0-11MChildren,-0.136305,0.136305
20,TT3_Mother0-11MChildren,-0.121278,0.121278
26,sex_ratio,0.178354,0.178354
28,prop_married_..15y,-0.109768,0.109768


(32, 3)


### Correlation analysis 2016

In [24]:
import scipy.stats  as stats
all(s_data2011.columns == s_data2016.columns)

True

In [25]:
corr2016 = s_data2016.corr()

In [26]:
maternal_mortality_2016 = corr2016[['rate_maternal_mortality']]
maternal_mortality_2016['abs_rate_maternal_mortality'] = np.absolute(maternal_mortality_2016['rate_maternal_mortality'])
maternal_mortality_2016 = maternal_mortality_2016.sort_values(by='abs_rate_maternal_mortality', ascending=False)
maternal_mortality_2016.drop('rate_maternal_mortality',axis=0, inplace=True)
maternal_mortality_2016.reset_index(inplace=True)
corr_pvalues = []
for var in maternal_mortality_2016['index']:
    pvalue = stats.pearsonr(s_data2016[var], s_data2016['rate_maternal_mortality'])[1]
    corr_pvalues.append(pvalue)
maternal_mortality_2016['p_value'] = corr_pvalues
maternal_mortality_2016 = maternal_mortality_2016[maternal_mortality_2016['p_value'] < 0.1]
maternal_mortality_2016.sort_values(by = 'abs_rate_maternal_mortality', ascending=False)
display(maternal_mortality_2016)
print(maternal_mortality_2016.shape)
maternal_mortality_2016.to_csv(OUT+'/corr_maternal_mortality_2016.csv', index=False, index_label=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  r = r_num / r_den


Unnamed: 0,index,rate_maternal_mortality,abs_rate_maternal_mortality,p_value
0,02ChildHealth: % of neonatal death reported in...,0.33463,0.33463,0.006878
1,sex_ratio.1,-0.270372,0.270372,0.030714
2,03Immunization: PCV vial wastage rate,0.226345,0.226345,0.072099
3,imp11subdistr_thanaprocessPercent_PerMale,-0.223323,0.223323,0.076088
4,imp12distr_monthprocessImp12DistrMonthThana_Pe...,-0.223307,0.223307,0.07611
5,02ChildHealth: IMCI Stunting (%),-0.21333,0.21333,0.090541


(6, 4)


### Combine results

In [27]:
maternal_mortality_vars_2016 = list(set(CCA_coeff_maternal['Indicators']).union(set(maternal_mortality_2016['index'])))
len(list(set(CCA_coeff_maternal['Indicators']).union(set(maternal_mortality_2016['index']))))

35

## Intersect variables

In [28]:
print(len(maternal_mortality_vars_2011))
print(len(maternal_mortality_vars_2016))
maternal_mortality_vars_union = list(set(maternal_mortality_vars_2011).union(set(maternal_mortality_vars_2016)))
maternal_mortality_vars_inter = list(set(maternal_mortality_vars_2011).intersection(set(maternal_mortality_vars_2016)))
print(len(maternal_mortality_vars_union))
print(len(maternal_mortality_vars_inter))

34
35
60
9


## Performing HDBSCAN or Kmean clustering: Maternal mortality rate 

## TO DO LIST:

- Evaluate the optimal number of clusters using HDBSCAN and K-means
- Run clustering on 2011 and predict clusters for 2016
- Take average values for clusters for maternal mortality for 2011 and 2016
- Take average values for all variables in corresponding clustering option for 2011 and 2016
- Take difference for maternal mortality and all other indicators
- Create spreadsheet and share with Marelize

In [29]:
import os
import re
import glob
import conda
import hdbscan
import operator
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from config import Config
from collections import Counter
from matplotlib import pyplot as plt
conda_file_dir = conda.__file__
conda_dir = conda_file_dir.split('lib')[0]
proj_lib = os.path.join(os.path.join(conda_dir, 'share'), 'proj')
os.environ["PROJ_LIB"] = proj_lib
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.lines import Line2D
from matplotlib.collections import PatchCollection
from sklearn.cluster import KMeans

In [30]:
def evaluate_hdbscan(input_df, min_samples, min_cluster_size, 
                     output, cluster_selection_method, 
                     fmin_samples, fmin_cluster_size,
                     prune=False, plot=True):
    samples = list(itertools.product(min_samples, min_cluster_size))
    counter = 0
    models = pd.DataFrame(columns=['min_samples',
                                   'min_cluster_size',
                                   'num_clusters_including_unclustered',
                                   'percent_of_unclustered_geos',
                                   'percent_of_maxclass',],index=range(len(samples)))
    #geo = input_df['index']
    #input_df = input_df.drop('index', axis=1)
    df = input_df.copy(deep=True)
    for iteration in samples:
        model = hdbscan.HDBSCAN(min_samples=int(iteration[0]), 
                                min_cluster_size=int(iteration[1]), 
                                metric='euclidean', 
                                algorithm='best',
                                cluster_selection_method=cluster_selection_method, prediction_data=False).fit(df)
        models.loc[counter,'min_cluster_size'] = iteration[1]
        models.loc[counter, 'min_samples'] = iteration[0]
        models.loc[counter, 'num_clusters_including_unclustered'] = len(Counter(model.labels_))
        tmp_dict = dict(Counter(model.labels_))
        total = sum([v for k,v in tmp_dict.items()])
        tmp_dict = {k:round(v/total*100,2) for k,v in tmp_dict.items()}
        try:
            models.loc[counter, 'percent_of_unclustered_geos'] = tmp_dict.pop(-1)
        except KeyError as error:
            models.loc[counter, 'percent_of_unclustered_geos'] = 0 
        if len(tmp_dict) > 1:
            models.loc[counter, 'percent_of_maxclass'] = tmp_dict[max(tmp_dict.items(), key=operator.itemgetter(1))[0]]
        else:
            models.loc[counter, 'percent_of_maxclass'] = 100
        counter += 1
    if prune:
        out_model = hdbscan.HDBSCAN(min_samples=int(fmin_samples), 
                                min_cluster_size=int(fmin_cluster_size), 
                                metric='euclidean', 
                                algorithm='best',
                                cluster_selection_method=cluster_selection_method, prediction_data=False).fit(df)

    else:
        out_model = None

    if plot:
        plt.rcParams['figure.figsize'] = [20,10]
        plt.plot(models['num_clusters_including_unclustered'], label='Number of clusters including unclustered')
        plt.plot(models['percent_of_unclustered_geos'], label='Percent of unclustered geographies')
        plt.plot(models['percent_of_maxclass'], label='Size of larges cluster (%)')
        plt.xlabel("Iterations", fontsize=20)
        plt.ylabel("Value", fontsize=20)
        plt.savefig(os.path.split(output)[1] + "/finetune_parameteres.jpeg")
        plt.legend()
        plt.show()
    del(input_df, df)
    return models, out_model

In [31]:
DATA2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/maternal_mortality_2011.csv'
DATA2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/maternal_mortality_2016.csv'
SDATA2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/s_maternal_mortality_2011.csv'
SDATA2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/s_maternal_mortality_2016.csv'
OUT = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/'

## Evaluate clustering method: HDBSCAN - leaf - 2011

In [32]:
tmp, out = evaluate_hdbscan(input_df=s_data2011[maternal_mortality_vars_union], 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='leaf',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [33]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
1,1,3,3,26.56,68.75
0,1,2,8,75.0,4.69
48,2,2,4,75.0,18.75
294,7,8,1,100.0,100.0
293,7,7,1,100.0,100.0
292,7,6,1,100.0,100.0
291,7,5,1,100.0,100.0
290,7,4,1,100.0,100.0
289,7,3,1,100.0,100.0
288,7,2,1,100.0,100.0


## Evaluate clustering method: HDBSCAN - eom - 2011

In [34]:
tmp, out = evaluate_hdbscan(input_df=s_data2011[maternal_mortality_vars_union], 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='eom',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [35]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
0,1,2,3,26.56,68.75
1,1,3,3,26.56,68.75
48,2,2,3,34.38,62.5
294,7,8,1,100.0,100.0
293,7,7,1,100.0,100.0
292,7,6,1,100.0,100.0
291,7,5,1,100.0,100.0
290,7,4,1,100.0,100.0
289,7,3,1,100.0,100.0
288,7,2,1,100.0,100.0


## Evaluate clustering method: HDBSCAN - leaf - 2016

In [36]:
tmp, out = evaluate_hdbscan(input_df=s_data2016[maternal_mortality_vars_union], 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='leaf',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [37]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
0,1,2,7,78.12,6.25
294,7,8,1,100.0,100.0
293,7,7,1,100.0,100.0
292,7,6,1,100.0,100.0
291,7,5,1,100.0,100.0
290,7,4,1,100.0,100.0
289,7,3,1,100.0,100.0
288,7,2,1,100.0,100.0
287,6,49,1,100.0,100.0
286,6,48,1,100.0,100.0


## Evaluate clustering method: HDBSCAN - eom - 2016

In [38]:
tmp, out = evaluate_hdbscan(input_df=s_data2016[maternal_mortality_vars_union], 
                       min_samples=Config.tune_min_sample, 
                       min_cluster_size=Config.tune_min_cluster,
                       output=OUT, cluster_selection_method ='eom',
                       fmin_samples=3, fmin_cluster_size=12,
                       prune=True, plot=False)

In [39]:
tmp = tmp[tmp['num_clusters_including_unclustered'] <10]
tmp.sort_values('percent_of_unclustered_geos', ascending=True).head(10)

Unnamed: 0,min_samples,min_cluster_size,num_clusters_including_unclustered,percent_of_unclustered_geos,percent_of_maxclass
0,1,2,4,32.81,60.94
294,7,8,1,100.0,100.0
293,7,7,1,100.0,100.0
292,7,6,1,100.0,100.0
291,7,5,1,100.0,100.0
290,7,4,1,100.0,100.0
289,7,3,1,100.0,100.0
288,7,2,1,100.0,100.0
287,6,49,1,100.0,100.0
286,6,48,1,100.0,100.0


# Clustering with K-means

### Clustering Union List

In [40]:
kmeans_model = KMeans(n_clusters=3, random_state=0).fit(s_data2011[maternal_mortality_vars_union])
predicted2016 = kmeans_model.predict(s_data2016[maternal_mortality_vars_union])
d2011['cluster'] = kmeans_model.labels_
d2016['cluster'] = predicted2016

In [41]:
maternal_mortality_vars_union.append('cluster')
d2011[maternal_mortality_vars_union].to_csv(OUT+'/clusters_maternal_mortality_union_2011.csv')
d2016[maternal_mortality_vars_union].to_csv(OUT+'/clusters_maternal_mortality_union_2016.csv')
print(d2011[maternal_mortality_vars_union].shape)
print(d2016[maternal_mortality_vars_union].shape)

(64, 61)
(64, 61)


In [42]:
print(Counter(d2011['cluster']))

Counter({2: 40, 1: 18, 0: 6})


In [43]:
print(Counter(d2016['cluster']))

Counter({2: 41, 1: 18, 0: 5})


### Clustering Intersection List

In [44]:
d2011 = d2011.drop('cluster', axis=1)
d2016 = d2016.drop('cluster', axis=1)

In [45]:
kmeans_model = KMeans(n_clusters=3, random_state=0).fit(s_data2011[maternal_mortality_vars_inter])
predicted2016 = kmeans_model.predict(s_data2016[maternal_mortality_vars_inter])
d2011['cluster'] = kmeans_model.labels_
d2016['cluster'] = predicted2016

In [46]:
print(Counter(d2011['cluster']))

Counter({1: 50, 2: 11, 0: 3})


In [47]:
print(Counter(d2016['cluster']))

Counter({1: 49, 2: 13, 0: 2})


In [48]:
maternal_mortality_vars_inter.append('cluster')
d2011[maternal_mortality_vars_inter].to_csv(OUT+'/clusters_maternal_mortality_intersect_2011.csv')
d2016[maternal_mortality_vars_inter].to_csv(OUT+'/clusters_maternal_mortality_intersect_2016.csv')
print(d2011[maternal_mortality_vars_inter].shape)
print(d2016[maternal_mortality_vars_inter].shape)

(64, 10)
(64, 10)
