# Analysis of Child Health Data

Looking at the child health data in Ghana from various perspectives.

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline

In [2]:
# Dataset location.
# Raw string: the Windows backslashes are kept literally instead of being
# parsed as escape sequences (sequences such as "\G" and "\d" are invalid
# escapes that newer Python versions warn about).
database = r'C:\Ghana\Ghana hackfest\Ghana hackfest\dhs-childhealth\dhs_14_93_u5disease_gps.csv'

# Read the CSV file and store the contents in a dataframe (df), then preview
# the first rows.
df = pd.read_csv(database, low_memory=False)
df.head()

Unnamed: 0,caseid,midx,v000,v001,v002,v003,v004,v005,v006,v007,...,adm1name,dhsregco,dhsregna,source,urban_rura,latnum,longnum,alt_gps,alt_dem,datum
0,1 2 1,,GH2,1,2,1,1,1000000,11,93,...,Western,1,western,GAZ,R,5.1,-3.0833,9999,8,WGS84
1,1 3 1,,GH2,1,3,1,1,1000000,11,93,...,Western,1,western,GAZ,R,5.1,-3.0833,9999,8,WGS84
2,1 2 1,,GH2,1,2,1,1,1000000,11,93,...,Western,1,western,GAZ,R,5.1,-3.0833,9999,8,WGS84
3,1 4 1,,GH2,1,4,1,1,1000000,11,93,...,Western,1,western,GAZ,R,5.1,-3.0833,9999,8,WGS84
4,1 7 2,,GH2,1,7,2,1,1000000,11,93,...,Western,1,western,GAZ,R,5.1,-3.0833,9999,8,WGS84


In [3]:
# (rows, columns) of the child-health dataframe
df.shape

(18222, 68)

In [4]:
# Column names as read from the CSV (coded variable names)
df.columns

Index(['caseid', 'midx', 'v000', 'v001', 'v002', 'v003', 'v004', 'v005',
       'v006', 'v007', 'v008', 'v012', 'v024', 'v130', 'v131', 'v133', 'v201',
       'v202', 'v203', 'v213', 'v218', 'v461', 'v501', 'v714', 'b8', 'h11',
       'h22', 'h31', 'hhid', 'hv000', 'hv003', 'hv004', 'hv005', 'hv006',
       'hv007', 'hv008', 'hv009', 'hv014', 'hv025', 'hv204', 'hv205', 'hv213',
       'hv219', 'hv220', 'hv226', 'hv227', 'hv228', 'hv270', 'hv108_01',
       'dhsyear', 'dhsid', 'dhscc', 'ccfips', 'adm1fips', 'adm1fipsna',
       'adm1salbna', 'adm1salbco', 'adm1dhs', 'adm1name', 'dhsregco',
       'dhsregna', 'source', 'urban_rura', 'latnum', 'longnum', 'alt_gps',
       'alt_dem', 'datum'],
      dtype='object')

###  Column Readability

Load the descriptive column labels from a separate variable list and use them to rename the columns for readability.

In [5]:
# Location of the variable list (coded name -> descriptive label).
# Raw string: no backslash needs manual escaping, so "\v" in
# "\variablelistcsv.csv" can no longer be misread as a vertical tab.
dbColumns = r'C:\Ghana\Ghana hackfest\Ghana hackfest\dhs-childhealth\variablelistcsv.csv'
# Read the CSV file and preview the first rows
dfc = pd.read_csv(dbColumns, low_memory=False)
dfc.head()


Unnamed: 0,name,varlab
0,caseid,case identification
1,midx,index to birth history
2,v000,country code and phase
3,v001,cluster number
4,v002,household number


In [6]:
# Collect the descriptive labels (the 'varlab' column) as a plain Python list
new_column_names = dfc.loc[:, 'varlab'].to_list()

In [7]:
# Replace the coded column names with the descriptive labels, then preview.
# The assignment is positional: the label list must have one entry per
# existing column, in the same order.
df.columns = new_column_names
df.head()

Unnamed: 0,case identification,index to birth history,country code and phase,cluster number,household number,respondent's line number,ultimate area unit,women's individual sample weight (6 decimals),month of interview,year of interview,...,ADM1NAME,DHSREGCO,DHSREGNA,SOURCE,URBAN_RURA,LATNUM,LONGNUM,ALT_GPS,ALT_DEM,DATUM
0,1 2 1,,GH2,1,2,1,1,1000000,11,93,...,Western,1,western,GAZ,R,5.1,-3.0833,9999,8,WGS84
1,1 3 1,,GH2,1,3,1,1,1000000,11,93,...,Western,1,western,GAZ,R,5.1,-3.0833,9999,8,WGS84
2,1 2 1,,GH2,1,2,1,1,1000000,11,93,...,Western,1,western,GAZ,R,5.1,-3.0833,9999,8,WGS84
3,1 4 1,,GH2,1,4,1,1,1000000,11,93,...,Western,1,western,GAZ,R,5.1,-3.0833,9999,8,WGS84
4,1 7 2,,GH2,1,7,2,1,1000000,11,93,...,Western,1,western,GAZ,R,5.1,-3.0833,9999,8,WGS84


In [8]:
# Show the full list of descriptive labels currently used as column names
print(new_column_names)

['case identification', 'index to birth history', 'country code and phase', 'cluster number', 'household number', "respondent's line number", 'ultimate area unit', "women's individual sample weight (6 decimals)", 'month of interview', 'year of interview', 'date of interview (cmc)', "respondent's current age", 'region', 'religion', 'ethnicity', 'education in single years', 'total children ever born', 'sons at home', 'daughters at home', 'currently pregnant', 'number of living children', 'respondent slept under mosquito bed net', 'current marital status', 'respondent currently working', 'current age of child', 'had diarrhea recently', 'had fever in last two weeks', 'had cough in last two weeks', 'case identification', 'country code and phase', "respondent's line number (answering household questionnaire)", 'ultimate area unit', 'household sample weight (6 decimals)', 'month of interview', 'year of interview', 'date of interview (cmc)', 'number of household members', 'number of children 5

In [9]:
# Hand-edited column names, one entry per dataframe column (68 in total).
new_column_names = [
    # Shortened names for the first 29 columns
    "caseid", "indextobirthhistory", "countrycode", "cluster", "household",
    "line", "ultimateareaunit", "womensampleweight", "monthofinterview",
    "yearofinterview", "dateofinterview", "age", "region", "religion",
    "ethnicity", "education", "totalchildren", "sons", "daughters",
    "pregnant", "livingchildren", "mosquito", "maritalstatus", "working",
    "ageofchild", "diarrhea", "fever", "cough", "caseidentification",
    # From here on the names appear to be the variable-list labels, unchanged
    "country code and phase",
    "respondent's line number (answering household questionnaire)",
    "ultimate area unit",
    "household sample weight (6 decimals)",
    "month of interview",
    "year of interview",
    "date of interview (cmc)",
    "number of household members",
    "number of children 5 and under (de jure)",
    "type of place of residence",
    "time to get to water source (minutes)",
    "type of toilet facility",
    "main floor material",
    "sex of head of household",
    "age of head of household",
    "type of cooking fuel",
    "has mosquito bed net for sleeping",
    "children under 5 slept under mosquito bed net last night",
    "wealth index",
    "education completed in single years",
    "Survey year",
    # Upper-case geographic / GPS fields
    "DHSID", "DHSCC", "CCFIPS", "ADM1FIPS", "ADM1FIPSNA", "ADM1SALBNA",
    "ADM1SALBCO", "ADM1DHS", "ADM1NAME", "DHSREGCO", "DHSREGNA", "SOURCE",
    "URBAN_RURA", "LATNUM", "LONGNUM", "ALT_GPS", "ALT_DEM", "DATUM",
]

In [10]:
# Apply the hand-edited column names (positional, one per column) and preview
df.columns = new_column_names
df.head()

Unnamed: 0,caseid,indextobirthhistory,countrycode,cluster,household,line,ultimateareaunit,womensampleweight,monthofinterview,yearofinterview,...,ADM1NAME,DHSREGCO,DHSREGNA,SOURCE,URBAN_RURA,LATNUM,LONGNUM,ALT_GPS,ALT_DEM,DATUM
0,1 2 1,,GH2,1,2,1,1,1000000,11,93,...,Western,1,western,GAZ,R,5.1,-3.0833,9999,8,WGS84
1,1 3 1,,GH2,1,3,1,1,1000000,11,93,...,Western,1,western,GAZ,R,5.1,-3.0833,9999,8,WGS84
2,1 2 1,,GH2,1,2,1,1,1000000,11,93,...,Western,1,western,GAZ,R,5.1,-3.0833,9999,8,WGS84
3,1 4 1,,GH2,1,4,1,1,1000000,11,93,...,Western,1,western,GAZ,R,5.1,-3.0833,9999,8,WGS84
4,1 7 2,,GH2,1,7,2,1,1000000,11,93,...,Western,1,western,GAZ,R,5.1,-3.0833,9999,8,WGS84


# New Combined GLSS Data Set

We now have a full data set that combines Income, Lighting and Deforestation Data.

In [11]:
# Dataset location.
# Raw string: the Windows backslashes are kept literally instead of being
# parsed as escape sequences (sequences such as "\G" and "\c" are invalid
# escapes that newer Python versions warn about).
maindb = r'C:\Ghana\Ghana hackfest\Ghana hackfest\combinedGLSS\CSV2MAINglss4_5_6_alignedfclmal_nl.csv'

# Read the CSV file and store the contents in a dataframe (df2), then preview
# the first rows.
df2 = pd.read_csv(maindb, low_memory=False, sep=',')
df2.head()


Unnamed: 0,hhid,glss,year,region,clust,nh,pid,sex,mining_region,age,...,nl_2011_sum_1km,nl_2011_mean_1km,nl_2011_media_1km,nl_2012_sum_1km,nl_2012_mean_1km,nl_2012_media_1km,nl_2013_sum_1km,nl_2013_mean_1km,nl_2013_media_1km,_merge_glss_mal_nl1
0,1.0,4,98.0,1.0,4002,1.0,2.0,2.0,1.0,51.0,...,12.0,4.0,4.0,10.0,3.333333,5.0,10.0,3.333333,5.0,matched (3)
1,1.0,4,98.0,1.0,4002,1.0,6.0,1.0,1.0,17.0,...,12.0,4.0,4.0,10.0,3.333333,5.0,10.0,3.333333,5.0,matched (3)
2,1.0,4,98.0,1.0,4002,1.0,1.0,1.0,1.0,57.0,...,12.0,4.0,4.0,10.0,3.333333,5.0,10.0,3.333333,5.0,matched (3)
3,1.0,4,98.0,1.0,4002,1.0,4.0,1.0,1.0,22.0,...,12.0,4.0,4.0,10.0,3.333333,5.0,10.0,3.333333,5.0,matched (3)
4,1.0,4,98.0,1.0,4002,1.0,3.0,2.0,1.0,26.0,...,12.0,4.0,4.0,10.0,3.333333,5.0,10.0,3.333333,5.0,matched (3)


In [None]:
# (rows, columns) of the combined GLSS dataframe
df2.shape

(136285, 700)

In [None]:
# Pairwise correlations between the numeric (and boolean) columns only.
# Text columns such as `_merge_glss_mal_nl1` cannot be correlated; recent
# pandas versions raise on them instead of silently dropping them, so they
# are excluded explicitly.
corrdf = df2.select_dtypes(include=['number', 'bool']).corr()

### Create a Correlation Heatmap

In [None]:
# Correlation matrix heatmap

corr = corrdf
# sns.heatmap returns the matplotlib Axes it drew on; seaborn itself has no
# `title` function, so the title is set on that Axes.
ax = sns.heatmap(corr,
                 xticklabels=corr.columns.values,
                 yticklabels=corr.columns.values)
ax.set_title('Heatmap of Correlation Matrix')
# Display the correlation matrix itself below the plot
corr

In [None]:
# Render any open matplotlib figures.
# NOTE(review): with `%matplotlib inline` the heatmap is normally rendered by
# the cell that draws it, so this call may be redundant — confirm.
plt.show()

## Split Data
Split the data column-wise to better visualize the uncorrelated features, taking the first 190 columns (approximately the first 30% of the 700 features).

In [None]:
# Independent copy of the first 190 columns of df2, selected by position
# (equivalent to selecting by label as long as column names are unique).
dfA = df2.iloc[:, :190].copy()

In [None]:
# Correlation matrix heatmap for the column subset.
# Only numeric (and boolean) columns are correlated: recent pandas versions
# raise on text columns instead of silently dropping them.
corr2 = dfA.select_dtypes(include=['number', 'bool']).corr()
corr = corr2
# sns.heatmap returns the matplotlib Axes it drew on; seaborn itself has no
# `title` function, so the title is set on that Axes.
ax = sns.heatmap(corr,
                 xticklabels=corr.columns.values,
                 yticklabels=corr.columns.values)
ax.set_title('Heatmap of Correlation Matrix')
# Display the correlation matrix itself below the plot
corr

In [None]:
# Names of the columns kept in the subset
dfA.columns

In [None]:
# Per-column flag: True where the column contains at least one missing value
dfA.isna().any()

In [None]:
# Summary statistics for the column subset
dfA.describe()

In [None]:
# Data type of each column in the subset
dfA.dtypes

In [None]:
# Save the column subset as CSV (relative path, so it is written to the
# current working directory). The row index is written too, as pandas does
# by default.
# NOTE(review): the file name says "200" but dfA holds the first 190 columns
# of df2 — confirm the intended name.
dfA.to_csv('200FeatureRedMainData.csv')