# Principal Component Analysis of All Data

In [1]:
import re
import collections
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [2]:
def remove_miss_vars(input_df):
    df = input_df.copy(deep=True)
    remove_list = []
    for var in df.columns:
        if any(df[var].isna()):
            remove_list.append(var)
    return df.drop(remove_list, axis=1)

def impute_miss_vars(input_df):
    df = input_df.copy(deep=True)
    for var in df.columns:
        if any(df[var].isna()):
            df[var].fillna(df[var].mean, inplace=True)
    return df

def intersect_dfs(input_df1, input_df2):
    df1 = input_df1.copy(deep=True)
    df2 = input_df2.copy(deep=True)
    subset_var = list(set(list(df1.columns)).intersection(set(list(df2.columns))))
    return df1[subset_var], df2[subset_var]

## STEP 1: Import data, remove absolute and remove missing values

In [3]:
DATA2011 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/all2011.csv'
DATA2016 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/all2016.csv'
DHIS2_VARS = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/DHIS_Rate_Absolute.csv'
OUT = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/all/'

In [4]:
d2011 = pd.read_csv(DATA2011)
d2016 = pd.read_csv(DATA2016)
dhis2vars = pd.read_csv(DHIS2_VARS)
print(dhis2vars.shape)
tmp = dhis2vars[dhis2vars['Rate_Absolute'] == 'Absolute']
print(tmp.shape)
vars_remove = list(tmp['Full_name'])

(349, 4)
(279, 4)


In [5]:
d2011 = d2011.drop(vars_remove, axis=1)
d2016 = d2016.drop(vars_remove, axis=1)

In [6]:
print(d2016['rate_maternal_mortality'].mean())
print(d2011['rate_maternal_mortality'].mean())

2.65078125
3.7296874999999994


In [7]:
source_labels = []
counter = 0
for var in list(d2011.columns):
    if bool(re.search(pattern="index", string=var)):
        counter += 1
        source_labels.append(counter)
    else:
        source_labels.append(counter)
        
vars_labels2011 = pd.DataFrame({'vars':list(d2011.columns), 'data source': source_labels})
vars_labels2011.head()

Unnamed: 0,vars,data source
0,index,1
1,DistrictName,1
2,BCG_Children12M,1
3,TT4_Mother0-11MChildren,1
4,TT1_Mother0-11MChildren,1


In [8]:
source_labels = []
counter = 0
for var in list(d2016.columns):
    if bool(re.search(pattern="index", string=var)):
        counter += 1
        source_labels.append(counter)
    else:
        source_labels.append(counter)
        
vars_labels2016 = pd.DataFrame({'vars':list(d2016.columns), 'data source': source_labels})
vars_labels2016.head()

Unnamed: 0,vars,data source
0,index,1
1,DistrictName,1
2,BCG_Children12M,1
3,TT4_Mother0-11MChildren,1
4,TT1_Mother0-11MChildren,1


In [9]:
d2011.shape
d2011 = d2011.set_index(['DistrictName'])
print(d2011.shape)
d2011 = d2011.drop(['index', 'DistrictGeo'], axis=1)
print(d2011.shape)
subset_vars = [var for var, var_type in zip(d2011.dtypes.index, d2011.dtypes) if str(var_type) != 'object'] 
d2011 = d2011[subset_vars]
d2011 = d2011.fillna(d2011.mean())
#d2011 = remove_miss_vars(input_df=d2011)
print(d2011.shape)
d2011.head()

(64, 175)
(64, 173)
(64, 173)


Unnamed: 0_level_0,BCG_Children12M,TT4_Mother0-11MChildren,TT1_Mother0-11MChildren,TT3_Mother0-11MChildren,PENTA1_Children12M,PENTA2_Children12M,OPV2_Children12M,Measles_Children12M,OPV1_Children23M,Fully_Children23M,...,imp11subdistr_thanaprocessCAR,distr_GOPill_total,distr_GOfemalenormal,imp11subdistr_thanaprocessPercent_PerMale,imp11subdistr_ngothanaprocessNGO_Percent_Pill,distr_GOimplant_remove,imp11subdistr_ngothanaprocessNGO_Percent_PerMale,distr_GOImp_total,imp11subdistr_ngothanaprocessNGO_Percent_PerFemale,distr_GOfemale_total
DistrictName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bagerhat,98.1,52.9,90.0,71.4,98.1,96.0,96.0,86.7,98.1,79.7,...,80.83,3657.7126,3.0646,5.53,53.61,0.0,3.81,8.4006,4.04,3.0646
Bandarban,94.3,54.3,89.0,72.4,94.3,90.2,90.8,79.2,93.8,76.2,...,77.01,655.7759,0.5443,4.76,48.9905,0.0,2.088167,1.7435,6.310333,0.5443
Barguna,100.0,47.6,99.0,72.4,100.0,98.6,98.6,82.6,99.5,79.9,...,75.47,1302.0635,0.4057,7.91,41.21,0.0,3.19,9.565,6.13,0.4057
Barisal,98.6,52.4,97.1,77.1,98.6,98.1,98.1,82.6,98.6,77.4,...,72.62,2935.3205,3.673,2.46,56.44,0.0,0.55,7.5193,6.54,3.673
Bhola,100.0,67.6,98.6,85.7,100.0,95.8,96.5,83.0,99.0,76.7,...,72.23,2403.5347,0.7171,2.71,30.32,0.0,1.48,7.4587,2.57,0.7171


In [10]:
d2016.shape
d2016 = d2016.set_index(['DistrictName'])
print(d2016.shape)
d2016 = d2016.drop(['index', 'DistrictGeo'], axis=1)
print(d2016.shape)
subset_vars = [var for var, var_type in zip(d2016.dtypes.index, d2016.dtypes) if str(var_type) != 'object'] 
d2016 = d2016[subset_vars]
d2016 = d2016.fillna(d2016.mean())
# d2016 = remove_miss_vars(input_df=d2016)
print(d2016.shape)
d2016.head()

(64, 175)
(64, 173)
(64, 173)


Unnamed: 0_level_0,BCG_Children12M,TT4_Mother0-11MChildren,TT1_Mother0-11MChildren,TT3_Mother0-11MChildren,PENTA1_Children12M,PENTA2_Children12M,OPV2_Children12M,Measles_Children12M,OPV1_Children23M,Fully_Children23M,...,imp11subdistr_thanaprocessCAR,distr_GOPill_total,distr_GOfemalenormal,imp11subdistr_thanaprocessPercent_PerMale,imp11subdistr_ngothanaprocessNGO_Percent_Pill,distr_GOimplant_remove,imp11subdistr_ngothanaprocessNGO_Percent_PerMale,distr_GOImp_total,imp11subdistr_ngothanaprocessNGO_Percent_PerFemale,distr_GOfemale_total
DistrictName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bagerhat,98.2,61.9,98.1,85.3,97.4,96.8,96.8,90.3,97.4,88.3,...,81.39,4209.646,1.7114,5.33,44.81,2.5874,4.69,10.1809,4.87,2.3371
Bandarban,99.0,79.6,94.8,86.1,96.3,94.8,94.8,86.3,96.3,83.9,...,77.95,671.4688,0.1996,9.05,44.724032,0.757,2.697097,2.8958,6.479516,0.2255
Barguna,99.7,64.4,98.8,88.4,98.8,97.9,97.9,91.0,98.8,91.1,...,76.12,1196.4034,0.3266,9.46,41.16,1.8511,3.2,8.5741,3.25,0.8348
Barisal,99.7,79.3,100.0,96.7,99.1,99.3,99.3,93.1,99.1,94.6,...,75.77,611.6951,0.3437,2.98,53.3,0.7047,0.97,2.8073,4.07,0.5894
Bhola,99.8,79.0,100.0,94.1,99.8,99.8,99.8,95.4,99.8,91.9,...,79.61,2050.1934,0.0398,3.21,32.15,1.0568,2.31,13.331,2.55,0.3316


## Performing PCA and correlation analysis

### Scaled and normalized data


In [11]:
print(d2011.shape)
drop_columns = []
for var in d2011.columns:
    if "index" in var:
        drop_columns.append(var)
drop_columns = list(set(drop_columns))
print(drop_columns)
d2011.drop(drop_columns, inplace=True, axis=1)
print(d2011.shape)

(64, 173)
['index.1', 'index.3', 'index.2']
(64, 170)


In [12]:
print(d2016.shape)
drop_columns = []
for var in d2016.columns:
    if "index" in var:
        drop_columns.append(var)
drop_columns = list(set(drop_columns))
print(drop_columns)
d2016.drop(drop_columns, inplace=True, axis=1)
print(d2016.shape)

(64, 173)
['index.1', 'index.3', 'index.2']
(64, 170)


In [13]:
s_data2011 = StandardScaler().fit_transform(d2011)
s_data2011 = pd.DataFrame(s_data2011, columns=d2011.columns)
print(s_data2011.shape)
s_data2011 = remove_miss_vars(input_df=s_data2011)
print(s_data2011.shape)

(64, 170)
(64, 170)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [14]:
d2016[d2016==np.inf]=np.nan
d2016.fillna(d2016.mean(), inplace=True)
s_data2016 = StandardScaler().fit_transform(d2016)
s_data2016 = pd.DataFrame(s_data2016, columns=d2016.columns)
print(s_data2016.shape)
s_data2016 = remove_miss_vars(input_df=s_data2016)
print(s_data2016.shape)


(64, 170)
(64, 170)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


### Performing PCA on both years 2011 and 2016: Preliminary

In [15]:
pca = PCA(.95)
pca2011 = pca.fit(s_data2011)
print(pca.n_components_)

39


In [16]:
pca = PCA(.95)
pca2016 = pca.fit(s_data2016)
print(pca.n_components_)

40


### Performing correlation analysis on the two subsets

In [17]:
all(s_data2016.columns == s_data2016.columns)

True

In [18]:
corr2011 = s_data2011.corr()
corr2016 = s_data2016.corr()

In [19]:
for var in corr2011.columns:
    if 'maternal' in var:
        print(var)
for var in corr2011.columns:
    if 'under5' in var:
        print(var)
for var in corr2011.columns:
    if 'Fully_Children12M' in var:
        print(var)

rate_maternal_mortality
01MaternalHealth: % of maternal death reported individually with causes of death
rate_under5y_mortality
Fully_Children12M


#### Correlation analysis for 2011

In [20]:
maternal_mortality_2011 = corr2011[['rate_maternal_mortality']]
maternal_mortality_2011['abs_rate_maternal_mortality'] = np.absolute(maternal_mortality_2011['rate_maternal_mortality'])
maternal_mortality_2011 = maternal_mortality_2011.sort_values(by='abs_rate_maternal_mortality', ascending=False)
print(maternal_mortality_2011.shape)
maternal_mortality_2011 = maternal_mortality_2011.loc[maternal_mortality_2011['abs_rate_maternal_mortality'] > 0.05,:]
maternal_mortality_2011.reset_index(inplace=True)
print(maternal_mortality_2011.shape)
maternal_mortality_2011.to_csv(OUT+'corr_maternal_mortality_2011.csv', index=False, index_label=False)

(170, 2)
(99, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [21]:
under5_mortality_2011 = corr2011[['rate_under5y_mortality']]
under5_mortality_2011['abs_rate_under5y_mortality'] = np.absolute(under5_mortality_2011['rate_under5y_mortality'])
under5_mortality_2011 = under5_mortality_2011.sort_values(by='abs_rate_under5y_mortality', ascending=False)
print(under5_mortality_2011.shape)
under5_mortality_2011 = under5_mortality_2011.loc[under5_mortality_2011['abs_rate_under5y_mortality'] > 0.1,:]
under5_mortality_2011.reset_index(inplace=True)
print(under5_mortality_2011.shape)
under5_mortality_2011.to_csv(OUT+'corr_rate_under5y_mortality_2011.csv', index=False, index_label=False)

(170, 2)
(99, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [22]:
fully_immunized_2011 = corr2011[['Fully_Children12M']]
fully_immunized_2011['abs_Fully_Children12M'] = np.absolute(fully_immunized_2011['Fully_Children12M'])
fully_immunized_2011 = fully_immunized_2011.sort_values(by='abs_Fully_Children12M', ascending=False)
print(fully_immunized_2011.shape)
fully_immunized_2011 = fully_immunized_2011.loc[fully_immunized_2011['abs_Fully_Children12M'] > 0.1,:]
fully_immunized_2011.reset_index(inplace=True)
print(fully_immunized_2011.shape)
fully_immunized_2011.to_csv(OUT+'corr_Fully_Children12M_2011.csv', index=False, index_label=False)

(170, 2)
(98, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


#### Correlation analysis for 2016

In [23]:
maternal_mortality_2016 = corr2016[['rate_maternal_mortality']]
maternal_mortality_2016['abs_rate_maternal_mortality'] = np.absolute(maternal_mortality_2016['rate_maternal_mortality'])
maternal_mortality_2016 = maternal_mortality_2016.sort_values(by='abs_rate_maternal_mortality', ascending=False)
print(maternal_mortality_2016.shape)
maternal_mortality_2016 = maternal_mortality_2016.loc[maternal_mortality_2016['abs_rate_maternal_mortality'] > 0.05,:]
maternal_mortality_2016.reset_index(inplace=True)
print(maternal_mortality_2016.shape)
maternal_mortality_2016.to_csv(OUT+'corr_maternal_mortality_2016.csv', index=False, index_label=False)


(170, 2)
(112, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [24]:
under5_mortality_2016 = corr2016[['rate_under5y_mortality']]
under5_mortality_2016['abs_rate_under5y_mortality'] = np.absolute(under5_mortality_2016['rate_under5y_mortality'])
under5_mortality_2016 = under5_mortality_2016.sort_values(by='abs_rate_under5y_mortality', ascending=False)
print(under5_mortality_2016.shape)
under5_mortality_2016 = under5_mortality_2016.loc[under5_mortality_2016['abs_rate_under5y_mortality'] > 0.1,:]
under5_mortality_2016.reset_index(inplace=True)
print(under5_mortality_2016.shape)
under5_mortality_2016.to_csv(OUT+'corr_rate_under5y_mortality_2016.csv', index=False, index_label=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


(170, 2)
(62, 3)


In [25]:
fully_immunized_2016 = corr2016[['Fully_Children12M']]
fully_immunized_2016['abs_Fully_Children12M'] = np.absolute(fully_immunized_2016['Fully_Children12M'])
fully_immunized_2016 = fully_immunized_2016.sort_values(by='abs_Fully_Children12M', ascending=False)
print(fully_immunized_2016.shape)
fully_immunized_2016 = fully_immunized_2016.loc[fully_immunized_2016['abs_Fully_Children12M'] > 0.1,:]
fully_immunized_2016.reset_index(inplace=True)
print(fully_immunized_2016.shape)
fully_immunized_2016.to_csv(OUT+'corr_Fully_Children12M_2016.csv', index=False, index_label=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


(170, 2)
(78, 3)


## Write out results

### Maternal mortality

In [26]:
full_maternal_mortality = list(set(maternal_mortality_2011['index']).intersection(set(maternal_mortality_2016['index'])))
data_maternal_mortality_2011 = d2011[full_maternal_mortality]
s_data_maternal_mortality_2011 = s_data2011[full_maternal_mortality]
data_maternal_mortality_2016 = d2016[full_maternal_mortality]
s_data_maternal_mortality_2016 = s_data2016[full_maternal_mortality]


In [27]:
print(data_maternal_mortality_2011.shape)
print(data_maternal_mortality_2016.shape)
print(s_data_maternal_mortality_2011.shape)
print(s_data_maternal_mortality_2016.shape)

(64, 65)
(64, 65)
(64, 65)
(64, 65)


In [28]:
data_maternal_mortality_2011.to_csv(OUT+'maternal_mortality_2011.csv')
s_data_maternal_mortality_2011.to_csv(OUT+'s_maternal_mortality_2011.csv')
data_maternal_mortality_2016.to_csv(OUT+'maternal_mortality_2016.csv')
s_data_maternal_mortality_2016.to_csv(OUT+'s_maternal_mortality_2016.csv')

### Under 5 mortality

In [29]:
full_under5_mortality = list(set(under5_mortality_2011['index']).intersection(set(under5_mortality_2016['index'])))
data_under5_mortality_2011 = d2011[full_under5_mortality]
s_data_under5_mortality_2011 = s_data2011[full_under5_mortality]
data_under5_mortality_2016 = d2016[full_under5_mortality]
s_data_under5_mortality_2016 = s_data2016[full_under5_mortality]


In [30]:
print(data_under5_mortality_2011.shape)
print(s_data_under5_mortality_2011.shape)
print(data_under5_mortality_2016.shape)
print(s_data_under5_mortality_2016.shape)

(64, 44)
(64, 44)
(64, 44)
(64, 44)


In [31]:
data_under5_mortality_2011.to_csv(OUT+'under5_mortality_2011.csv')
s_data_under5_mortality_2011.to_csv(OUT+'s_under5_mortality_2011.csv')
data_under5_mortality_2016.to_csv(OUT+'under5_mortality_2016.csv')
s_data_under5_mortality_2016.to_csv(OUT+'s_under5_mortality_2016.csv')


### Fully immunized children

In [32]:
full_immunized = list(set(fully_immunized_2011['index']).intersection(set(fully_immunized_2016['index'])))
data_full_immunized_2011 = d2011[full_immunized]
s_data_full_immunized_2011 = s_data2011[full_immunized]
data_full_immunized_2016 = d2016[full_immunized]
s_data_full_immunized_2016 = s_data2016[full_immunized]


In [33]:
print(data_full_immunized_2011.shape)
print(s_data_full_immunized_2011.shape)
print(data_full_immunized_2016.shape)
print(s_data_full_immunized_2016.shape)

(64, 56)
(64, 56)
(64, 56)
(64, 56)


In [34]:
data_full_immunized_2011.to_csv(OUT+'full_immunized_2011.csv')
s_data_full_immunized_2011.to_csv(OUT+'s_full_immunized_2011.csv')
data_full_immunized_2016.to_csv(OUT+'full_immunized_2016.csv')
s_data_full_immunized_2016.to_csv(OUT+'s_full_immunized_2016.csv')