# Correlation analysis, CCA and feature importance

## Outcome variables:
- Maternal mortality rate: rate_maternal_mortality
- Under 5 mortality rate: rate_under5y_mortality
- Antenatal coverage (ANC): prop_antenatal_coverage
- Proportion of unmet contraceptive need: prop_unmet_need_family_planing

In [11]:
import re
import collections
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [12]:
def remove_miss_vars(input_df):
    df = input_df.copy(deep=True)
    remove_list = []
    for var in df.columns:
        if any(df[var].isna()):
            remove_list.append(var)
    return df.drop(remove_list, axis=1)

def impute_miss_vars(input_df):
    df = input_df.copy(deep=True)
    for var in df.columns:
        if any(df[var].isna()):
            df[var].fillna(df[var].mean, inplace=True)
    return df

def intersect_dfs(input_df1, input_df2):
    df1 = input_df1.copy(deep=True)
    df2 = input_df2.copy(deep=True)
    subset_var = list(set(list(df1.columns)).intersection(set(list(df2.columns))))
    return df1[subset_var], df2[subset_var]

## STEP 1: Import data and data processing: remove absolute and remove missing values

In [13]:
DATA2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/all2011.csv'
DATA2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/all2016.csv'
DHIS2_VARS = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/DHIS_Rate_Absolute.csv'
OUT = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/'

In [14]:
d2011 = pd.read_csv(DATA2011)
d2016 = pd.read_csv(DATA2016)
dhis2vars = pd.read_csv(DHIS2_VARS)
print(dhis2vars.shape)
tmp = dhis2vars[dhis2vars['Rate_Absolute'] == 'Absolute']
print(tmp.shape)
vars_remove = list(tmp['Full_name'])

(349, 4)
(279, 4)


In [15]:
d2011 = d2011.drop(vars_remove, axis=1)
d2016 = d2016.drop(vars_remove, axis=1)

In [16]:
d2011.shape
d2011 = d2011.set_index(['DistrictName'])
print(d2011.shape)
d2011 = d2011.drop(['DistrictGeo'], axis=1)
print(d2011.shape)
subset_vars = [var for var, var_type in zip(d2011.dtypes.index, d2011.dtypes) if str(var_type) != 'object'] 
d2011 = d2011[subset_vars]
d2011 = d2011.fillna(d2011.mean())
#d2011 = remove_miss_vars(input_df=d2011)
print(d2011.shape)
d2011.head()

(64, 188)
(64, 187)
(64, 187)


Unnamed: 0_level_0,BCG_Children12M,PENTA1_Children23M,OPV2_Children12M,PENTA2_Children12M,PENTA2_Children23M,Fully_Children12M,Measles_Children23M,PENTA1_Children12M,PENTA3_Children12M,TT4_Mother0-11MChildren,...,prop_current_contraceptive,prop_pop_women.1,prop_unmet_need_family_planing,prop_pop_rural_women.1,prop_female_head,prop_pop_rural.1,prop_women_15.45y_overwomen.1,prop_antenatal_coverage,dependency_ratio.1,prop_caesarean
DistrictName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bagerhat,98.1,98.1,96.0,96.0,96.0,78.3,88.1,98.1,85.6,52.9,...,65.52,44.85,7.89,5.67,11.29,11.48,46.74,29.91,74.37,15.74
Bandarban,94.3,94.3,90.8,90.2,90.2,73.1,82.3,94.3,83.8,54.3,...,45.45,127.03,27.27,0.0,11.29,0.0,35.56,30.0,107.5,0.0
Barguna,100.0,100.0,98.6,98.6,98.6,74.4,88.1,100.0,87.3,47.6,...,72.36,25.33,11.14,2.98,2.97,6.48,49.26,37.91,70.73,5.21
Barisal,98.6,98.6,98.1,98.1,98.1,73.4,86.8,98.6,86.9,52.4,...,64.33,24.48,13.31,4.37,5.1,8.22,48.17,35.21,75.53,13.06
Bhola,100.0,100.0,96.5,95.8,95.8,74.7,85.0,100.0,86.3,67.6,...,68.4,26.74,10.81,3.86,5.65,7.4,45.49,25.64,80.16,3.75


In [17]:
d2016.shape
d2016 = d2016.set_index(['DistrictName'])
print(d2016.shape)
d2016 = d2016.drop(['DistrictGeo'], axis=1)
print(d2016.shape)
subset_vars = [var for var, var_type in zip(d2016.dtypes.index, d2016.dtypes) if str(var_type) != 'object'] 
d2016 = d2016[subset_vars]
d2016 = d2016.fillna(d2016.mean())
# d2016 = remove_miss_vars(input_df=d2016)
print(d2016.shape)
d2016.head()

(64, 188)
(64, 187)
(64, 187)


Unnamed: 0_level_0,BCG_Children12M,PENTA1_Children23M,OPV2_Children12M,PENTA2_Children12M,PENTA2_Children23M,Fully_Children12M,Measles_Children23M,PENTA1_Children12M,PENTA3_Children12M,TT4_Mother0-11MChildren,...,prop_current_contraceptive,prop_pop_women.1,prop_unmet_need_family_planing,prop_pop_rural_women.1,prop_female_head,prop_pop_rural.1,prop_women_15.45y_overwomen.1,prop_antenatal_coverage,dependency_ratio.1,prop_caesarean
DistrictName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bagerhat,98.2,97.4,96.8,96.8,97.1,86.8,92.2,97.4,90.8,61.9,...,67.82,39.86,11.0,7.53,4.17,14.38,48.65,48.69,56.82,22.38
Bandarban,99.0,96.3,94.8,94.8,94.8,80.9,89.8,96.3,87.8,79.6,...,63.64,77.52,13.64,0.0,25.49,0.0,54.79,100.0,45.16,50.0
Barguna,99.7,98.8,97.9,97.9,98.3,87.6,94.9,98.8,93.0,64.4,...,73.3,28.64,8.22,5.13,8.06,9.48,44.84,37.97,68.24,22.9
Barisal,99.7,99.1,99.3,99.3,99.3,91.0,97.1,99.1,95.5,79.3,...,64.26,30.97,10.08,13.91,6.99,28.48,51.1,64.01,59.68,31.22
Bhola,99.8,99.8,99.8,99.8,99.8,91.3,96.6,99.8,94.5,79.0,...,67.28,26.58,10.01,5.06,3.53,9.99,47.39,29.44,69.35,4.01


## STEP 2: Outcome variables: Maternal mortality rate

### Maternal mortality rate

In [88]:
print(f"Mean: {d2011['rate_maternal_mortality'].mean()}",
      f" Standard deviation: {d2011['rate_maternal_mortality'].std()}")

Mean: 3.7296874999999994  Standard deviation: 4.17278939432187


In [89]:
print(f"Mean: {d2016['rate_maternal_mortality'].mean()}",
      f" Standard deviation: {d2016['rate_maternal_mortality'].std()}")

Mean: 2.65078125  Standard deviation: 4.957185358804691


### Under 5 mortality rate

In [90]:
print(f"Mean: {d2011['rate_under5y_mortality'].mean()}",
      f" Standard deviation: {d2011['rate_under5y_mortality'].std()}")

Mean: 42.829375  Standard deviation: 10.648460634706263


In [91]:
print(f"Mean: {d2016['rate_under5y_mortality'].mean()}",
      f" Standard deviation: {d2016['rate_under5y_mortality'].std()}")

Mean: 39.446562500000006  Standard deviation: 19.795472884631746


### ANC 

In [94]:
print(f"Mean: {d2011['prop_antenatal_care4.'].mean()}",
      f" Standard deviation: {d2011['prop_antenatal_care4.'].std()}")

Mean: 22.2790625  Standard deviation: 12.296192302452368


In [95]:
print(f"Mean: {d2016['prop_antenatal_care4.'].mean()}",
      f" Standard deviation: {d2016['prop_antenatal_care4.'].std()}")

Mean: 30.083124999999992  Standard deviation: 16.787184915314192


### Unmet contraceptive need

In [96]:
print(f"Mean: {d2011['prop_unmet_need_family_planing'].mean()}",
      f" Standard deviation: {d2011['prop_unmet_need_family_planing'].std()}")

Mean: 13.562187500000004  Standard deviation: 6.310768189284793


In [97]:
print(f"Mean: {d2016['prop_unmet_need_family_planing'].mean()}",
      f" Standard deviation: {d2016['prop_unmet_need_family_planing'].std()}")

Mean: 11.942968750000002  Standard deviation: 5.531831890296378


## Performing PCA and correlation analysis

### Scaled and normalized data


In [99]:
print(d2011.shape)
drop_columns = []
for var in d2011.columns:
    if "index" in var:
        drop_columns.append(var)
drop_columns = list(set(drop_columns))
print(drop_columns)
d2011.drop(drop_columns, inplace=True, axis=1)
print(d2011.shape)

(64, 187)
[]
(64, 187)


In [100]:
print(d2016.shape)
drop_columns = []
for var in d2016.columns:
    if "index" in var:
        drop_columns.append(var)
drop_columns = list(set(drop_columns))
print(drop_columns)
d2016.drop(drop_columns, inplace=True, axis=1)
print(d2016.shape)

(64, 187)
[]
(64, 187)


In [101]:
s_data2011 = StandardScaler().fit_transform(d2011)
s_data2011 = pd.DataFrame(s_data2011, columns=d2011.columns)
print(s_data2011.shape)
s_data2011 = remove_miss_vars(input_df=s_data2011)
print(s_data2011.shape)

(64, 187)
(64, 187)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [102]:
d2016[d2016==np.inf]=np.nan
d2016.fillna(d2016.mean(), inplace=True)
s_data2016 = StandardScaler().fit_transform(d2016)
s_data2016 = pd.DataFrame(s_data2016, columns=d2016.columns)
print(s_data2016.shape)
s_data2016 = remove_miss_vars(input_df=s_data2016)
print(s_data2016.shape)


(64, 187)
(64, 187)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


### Performing PCA on both years 2011 and 2016: Preliminary

In [34]:
pca = PCA(.95)
pca2011 = pca.fit(s_data2011)
print(pca.n_components_)

40


In [35]:
pca = PCA(.95)
pca2016 = pca.fit(s_data2016)
print(pca.n_components_)

42


- Maternal mortality rate: rate_maternal_mortality
- Under 5 mortality rate: rate_under5y_mortality
- Antenatal coverage (ANC): prop_antenatal_coverage
- Proportion of unmet contraceptive need: prop_unmet_need_family_planing

In [None]:
- Perform CCA and correlation analysis and compare results
- Perform HDBSCAN and Kmeans
- 

In [125]:
s_data2011.shape

(64, 187)

In [137]:
from sklearn.cross_decomposition import CCA
cca_maternal = CCA(copy=True, max_iter=500, n_components=40, scale=True, tol=1e-06)
cca_maternal.fit(s_data2011.drop('rate_maternal_mortality', axis=1), s_data2011['rate_maternal_mortality'])
cca_maternal.score(s_data2011.drop('rate_maternal_mortality', axis=1), s_data2011['rate_maternal_mortality'])
print(sorted(cca_maternal.coef_,reverse=True))
cca_maternal.score

[array([0.22552299]), array([0.19463575]), array([0.14648812]), array([0.13025207]), array([0.11481562]), array([0.11075815]), array([0.10910105]), array([0.10831948]), array([0.10415679]), array([0.10344528]), array([0.10344528]), array([0.10308316]), array([0.09914956]), array([0.09709225]), array([0.09553478]), array([0.09491146]), array([0.09307898]), array([0.09307898]), array([0.09251791]), array([0.09251791]), array([0.08022503]), array([0.08015836]), array([0.07797078]), array([0.0778185]), array([0.0775575]), array([0.07726829]), array([0.07603214]), array([0.07549381]), array([0.07516327]), array([0.06975654]), array([0.0695324]), array([0.0695324]), array([0.06820807]), array([0.06751975]), array([0.0666025]), array([0.06504488]), array([0.06328794]), array([0.06118453]), array([0.0592436]), array([0.05790883]), array([0.05570642]), array([0.05263294]), array([0.05170834]), array([0.04951227]), array([0.04918968]), array([0.04747109]), array([0.04652246]), array([0.04479132]



<bound method RegressorMixin.score of CCA(copy=True, max_iter=500, n_components=40, scale=True, tol=1e-06)>

### Performing correlation analysis on the two subsets

In [36]:
all(s_data2016.columns == s_data2016.columns)

True

In [18]:
corr2011 = s_data2011.corr()
corr2016 = s_data2016.corr()

#### Correlation analysis for 2011

In [None]:
maternal_mortality_2011 = corr2011[['rate_maternal_mortality']]
maternal_mortality_2011['abs_rate_maternal_mortality'] = np.absolute(maternal_mortality_2011['rate_maternal_mortality'])
maternal_mortality_2011 = maternal_mortality_2011.sort_values(by='abs_rate_maternal_mortality', ascending=False)
print(maternal_mortality_2011.shape)
maternal_mortality_2011 = maternal_mortality_2011.loc[maternal_mortality_2011['abs_rate_maternal_mortality'] > 0.05,:]
maternal_mortality_2011.reset_index(inplace=True)
print(maternal_mortality_2011.shape)
maternal_mortality_2011.to_csv(OUT+'corr_maternal_mortality_2011.csv', index=False, index_label=False)

In [None]:
under5_mortality_2011 = corr2011[['rate_under5y_mortality']]
under5_mortality_2011['abs_rate_under5y_mortality'] = np.absolute(under5_mortality_2011['rate_under5y_mortality'])
under5_mortality_2011 = under5_mortality_2011.sort_values(by='abs_rate_under5y_mortality', ascending=False)
print(under5_mortality_2011.shape)
under5_mortality_2011 = under5_mortality_2011.loc[under5_mortality_2011['abs_rate_under5y_mortality'] > 0.1,:]
under5_mortality_2011.reset_index(inplace=True)
print(under5_mortality_2011.shape)
under5_mortality_2011.to_csv(OUT+'corr_rate_under5y_mortality_2011.csv', index=False, index_label=False)

In [None]:
fully_immunized_2011 = corr2011[['Fully_Children12M']]
fully_immunized_2011['abs_Fully_Children12M'] = np.absolute(fully_immunized_2011['Fully_Children12M'])
fully_immunized_2011 = fully_immunized_2011.sort_values(by='abs_Fully_Children12M', ascending=False)
print(fully_immunized_2011.shape)
fully_immunized_2011 = fully_immunized_2011.loc[fully_immunized_2011['abs_Fully_Children12M'] > 0.1,:]
fully_immunized_2011.reset_index(inplace=True)
print(fully_immunized_2011.shape)
fully_immunized_2011.to_csv(OUT+'corr_Fully_Children12M_2011.csv', index=False, index_label=False)

#### Correlation analysis for 2016

In [None]:
maternal_mortality_2016 = corr2016[['rate_maternal_mortality']]
maternal_mortality_2016['abs_rate_maternal_mortality'] = np.absolute(maternal_mortality_2016['rate_maternal_mortality'])
maternal_mortality_2016 = maternal_mortality_2016.sort_values(by='abs_rate_maternal_mortality', ascending=False)
print(maternal_mortality_2016.shape)
maternal_mortality_2016 = maternal_mortality_2016.loc[maternal_mortality_2016['abs_rate_maternal_mortality'] > 0.05,:]
maternal_mortality_2016.reset_index(inplace=True)
print(maternal_mortality_2016.shape)
maternal_mortality_2016.to_csv(OUT+'corr_maternal_mortality_2016.csv', index=False, index_label=False)


In [None]:
under5_mortality_2016 = corr2016[['rate_under5y_mortality']]
under5_mortality_2016['abs_rate_under5y_mortality'] = np.absolute(under5_mortality_2016['rate_under5y_mortality'])
under5_mortality_2016 = under5_mortality_2016.sort_values(by='abs_rate_under5y_mortality', ascending=False)
print(under5_mortality_2016.shape)
under5_mortality_2016 = under5_mortality_2016.loc[under5_mortality_2016['abs_rate_under5y_mortality'] > 0.1,:]
under5_mortality_2016.reset_index(inplace=True)
print(under5_mortality_2016.shape)
under5_mortality_2016.to_csv(OUT+'corr_rate_under5y_mortality_2016.csv', index=False, index_label=False)

In [None]:
fully_immunized_2016 = corr2016[['Fully_Children12M']]
fully_immunized_2016['abs_Fully_Children12M'] = np.absolute(fully_immunized_2016['Fully_Children12M'])
fully_immunized_2016 = fully_immunized_2016.sort_values(by='abs_Fully_Children12M', ascending=False)
print(fully_immunized_2016.shape)
fully_immunized_2016 = fully_immunized_2016.loc[fully_immunized_2016['abs_Fully_Children12M'] > 0.1,:]
fully_immunized_2016.reset_index(inplace=True)
print(fully_immunized_2016.shape)
fully_immunized_2016.to_csv(OUT+'corr_Fully_Children12M_2016.csv', index=False, index_label=False)

## Write out results

### Maternal mortality

In [None]:
full_maternal_mortality = list(set(maternal_mortality_2011['index']).intersection(set(maternal_mortality_2016['index'])))
data_maternal_mortality_2011 = d2011[full_maternal_mortality]
s_data_maternal_mortality_2011 = s_data2011[full_maternal_mortality]
data_maternal_mortality_2016 = d2016[full_maternal_mortality]
s_data_maternal_mortality_2016 = s_data2016[full_maternal_mortality]


In [None]:
print(data_maternal_mortality_2011.shape)
print(data_maternal_mortality_2016.shape)
print(s_data_maternal_mortality_2011.shape)
print(s_data_maternal_mortality_2016.shape)

In [None]:
data_maternal_mortality_2011.to_csv(OUT+'maternal_mortality_2011.csv')
s_data_maternal_mortality_2011.to_csv(OUT+'s_maternal_mortality_2011.csv')
data_maternal_mortality_2016.to_csv(OUT+'maternal_mortality_2016.csv')
s_data_maternal_mortality_2016.to_csv(OUT+'s_maternal_mortality_2016.csv')

### Under 5 mortality

In [None]:
full_under5_mortality = list(set(under5_mortality_2011['index']).intersection(set(under5_mortality_2016['index'])))
data_under5_mortality_2011 = d2011[full_under5_mortality]
s_data_under5_mortality_2011 = s_data2011[full_under5_mortality]
data_under5_mortality_2016 = d2016[full_under5_mortality]
s_data_under5_mortality_2016 = s_data2016[full_under5_mortality]


In [None]:
print(data_under5_mortality_2011.shape)
print(s_data_under5_mortality_2011.shape)
print(data_under5_mortality_2016.shape)
print(s_data_under5_mortality_2016.shape)

In [None]:
data_under5_mortality_2011.to_csv(OUT+'under5_mortality_2011.csv')
s_data_under5_mortality_2011.to_csv(OUT+'s_under5_mortality_2011.csv')
data_under5_mortality_2016.to_csv(OUT+'under5_mortality_2016.csv')
s_data_under5_mortality_2016.to_csv(OUT+'s_under5_mortality_2016.csv')


### Fully immunized children

In [None]:
full_immunized = list(set(fully_immunized_2011['index']).intersection(set(fully_immunized_2016['index'])))
data_full_immunized_2011 = d2011[full_immunized]
s_data_full_immunized_2011 = s_data2011[full_immunized]
data_full_immunized_2016 = d2016[full_immunized]
s_data_full_immunized_2016 = s_data2016[full_immunized]


In [None]:
print(data_full_immunized_2011.shape)
print(s_data_full_immunized_2011.shape)
print(data_full_immunized_2016.shape)
print(s_data_full_immunized_2016.shape)

In [None]:
data_full_immunized_2011.to_csv(OUT+'full_immunized_2011.csv')
s_data_full_immunized_2011.to_csv(OUT+'s_full_immunized_2011.csv')
data_full_immunized_2016.to_csv(OUT+'full_immunized_2016.csv')
s_data_full_immunized_2016.to_csv(OUT+'s_full_immunized_2016.csv')