In [33]:
from __future__ import division
import scipy.stats as stats
import pandas as pd
import numpy as np

# Individual-level survey recode. low_memory=False asks pandas to infer each
# column's dtype from the whole file instead of chunk by chunk.
file = pd.read_csv("individual_Recode.csv", low_memory=False)

# Rename by label, not by position: read_csv returns the selected columns in
# the order they appear in the file and ignores the order of `usecols`, so a
# positional `population.columns = [...]` could silently swap the two labels.
population = pd.read_csv(
    "population_info.csv", usecols=['DHSCLUST', 'All_Population_Count_2010']
).rename(columns={'DHSCLUST': 'V001', 'All_Population_Count_2010': 'Population'})

# Cluster -> admin-region lookups, keyed on 'V001' in preparation for joining.
clust_admin_1 = pd.read_csv("cluster_to_admin_1.csv").rename(columns={'DHSCLUST': 'V001'})
clust_admin_2 = pd.read_csv("cluster_to_admin_2.csv").rename(columns={'DHSCLUST': 'V001'})
clust_admin_3 = pd.read_csv("cluster_to_admin_3.csv").rename(columns={'DHSCLUST': 'V001'})

# Voronoi polygon -> admin-region mappings, keyed on 'V001' as well.
voronoi_admin_1 = pd.read_csv("voronoi_admin_1.csv").rename(columns={'DHSCLUST': 'V001'})
voronoi_admin_2 = pd.read_csv("voronoi_admin_2.csv").rename(columns={'DHSCLUST': 'V001'})
voronoi_admin_3 = pd.read_csv("voronoi_admin_3.csv").rename(columns={'DHSCLUST': 'V001'})

# Join the dataframes. Inner joins: a survey row is kept only if its cluster
# appears in every lookup table and in the population table.
c1 = pd.merge(file, clust_admin_1, how='inner', on='V001')
c2 = pd.merge(c1, clust_admin_2, how='inner', on='V001')
c3 = pd.merge(c2, clust_admin_3, how='inner', on='V001')
c3 = pd.merge(c3, population, how='inner', on='V001')

# Keep only the columns the aggregation functions read.
keep_columns = ['V001', 'V002', 'V005', 'Population', 'M14$1', 'ID_1', 'ID_2', 'ID_3']
c3 = c3[keep_columns]

c3.to_csv("complete_health_access_data.csv", index=False)

In [34]:
def compute_simple_wha_aggregate(region_name, col_name, data, cluster_file):
    """Sum antenatal-care figures per administrative region.

    Each survey row is assigned to the region found in its ``col_name`` value
    and contributes to six running totals for that region.

    Unlike the previous implementation, ``data`` is never modified: its index,
    its columns and its ``'M14$1'`` values are left exactly as the caller
    passed them, so the same frame can be reused for further calls.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column holding the region id. It is read from ``cluster_file`` to get
        the list of regions, and from ``data`` (as a column, or as the index
        if no such column exists) to assign rows to regions.
    data : pandas.DataFrame
        Survey rows with the columns ``'M14$1'``, ``'V005'`` and
        ``'Population'``.
    cluster_file : pandas.DataFrame
        Lookup table; the unique values of ``cluster_file[col_name]`` define
        the rows of the result.

    Returns
    -------
    pandas.DataFrame
        One row per region, sorted ascending, index named ``region_name``.
        Regions without any usable survey row hold zeros. Columns:

        * ``Zero_Care_Row_Count`` - number of rows whose ``'M14$1'`` is 0
        * ``Zero_Care_Pop_Count`` - sum of ``1 / Population`` over those rows
        * ``Zero_Care_SW_Count``  - sum of ``V005 / 1000000`` over those rows
        * ``Care_SW_Count``       - sum of ``(V005 / 1000000) * M14$1``
        * ``Care_Pop_Count``      - sum of ``M14$1 / Population``
        * ``Care_Row_Count``      - sum of ``M14$1``

    Raises
    ------
    KeyError
        If a usable survey row belongs to a region that does not occur in
        ``cluster_file[col_name]``.
    """
    out_columns = ['Zero_Care_Row_Count', 'Zero_Care_Pop_Count', 'Zero_Care_SW_Count',
                   'Care_SW_Count', 'Care_Pop_Count', 'Care_Row_Count']

    # reset_index() returns a new frame, so the caller's DataFrame stays intact.
    # The old index is kept (as a column) only when it is what carries the key.
    survey = data.reset_index(drop=(col_name in data.columns))

    regions = pd.Index(cluster_file[col_name].unique(), name=region_name).sort_values()

    # Values that are not numeric become -1 and are ignored below, as are
    # negative values.
    # NOTE(review): every non-negative code is summed as a visit count; confirm
    # that 'M14$1' carries no special codes (e.g. "don't know") in this recode.
    visits = pd.to_numeric(survey['M14$1'], errors='coerce').fillna(-1).astype(int)
    answered = visits >= 0

    visits = visits[answered].astype(float)
    sample_weight = survey.loc[answered, 'V005'] / 1000000
    population = survey.loc[answered, 'Population']
    region_keys = survey.loc[answered, col_name]
    no_care = visits == 0

    region_pos = regions.get_indexer(region_keys)
    unknown = region_pos < 0
    if unknown.any():
        raise KeyError("{0} values missing from cluster_file: {1}".format(
            col_name, list(region_keys[unknown].unique())))

    per_row = {
        'Zero_Care_Row_Count': no_care.astype(float),
        'Zero_Care_Pop_Count': (1.0 / population).where(no_care, 0.0),
        'Zero_Care_SW_Count': sample_weight.where(no_care, 0.0),
        'Care_SW_Count': sample_weight * visits,
        'Care_Pop_Count': visits / population,
        'Care_Row_Count': visits,
    }

    # bincount adds the rows of each region in order, so NaN / inf in a
    # contribution still propagates to that region's total, as a running
    # `prev + value` accumulation would.
    totals = {
        name: np.bincount(region_pos,
                          weights=np.asarray(values, dtype=float),
                          minlength=len(regions))
        for name, values in per_row.items()
    }
    return pd.DataFrame(totals, index=regions, columns=out_columns)

In [35]:
# Region totals for each administrative level, using the cluster lookup tables.
admin_1 = compute_simple_wha_aggregate(
    region_name="Admin_1_Region", col_name="ID_1", data=c3, cluster_file=clust_admin_1)
admin_2 = compute_simple_wha_aggregate(
    region_name="Admin_2_Region", col_name="ID_2", data=c3, cluster_file=clust_admin_2)
admin_3 = compute_simple_wha_aggregate(
    region_name="Admin_3_Region", col_name="ID_3", data=c3, cluster_file=clust_admin_3)


print(admin_1.head())

# Persist every level; the region id is the index, so it is written out too.
for csv_path, region_totals in (
    ("wha_aggregate_1.csv", admin_1),
    ("wha_aggregate_2.csv", admin_2),
    ("wha_aggregate_3.csv", admin_3),
):
    region_totals.to_csv(csv_path, index=True)

                Zero_Care_Row_Count  Zero_Care_Pop_Count  Zero_Care_SW_Count  \
Admin_1_Region                                                                 
1                               0.0             0.000000            0.000000   
2                              47.0             0.009310           15.109753   
3                              39.0             0.013531           52.367440   
4                              48.0             0.012789           20.282583   
5                              18.0             0.001206           23.113793   

                Care_SW_Count  Care_Pop_Count  Care_Row_Count  
Admin_1_Region                                                 
1                  485.523894        0.011555           299.0  
2                   74.710150        0.034567           256.0  
3                 1369.349947        0.341173          1491.0  
4                  184.832714        0.077267           430.0  
5                 1812.066880        0.118231          

In [36]:
def wha_voronoi_aggregate(region_name, col_name, wha_recode, voronoi_data):
    """Distribute per-cluster antenatal-care totals over administrative regions.

    Works in two stages:

    1. Survey rows are summed per cluster (``'V001'``).
    2. Every row of ``voronoi_data`` whose cluster occurs in the survey adds
       ``Proportion * cluster total`` to the region given by ``col_name``.

    Neither input frame is modified. The previous implementation re-indexed
    both in place; in particular it added an extra index column to
    ``voronoi_data`` on every call, which made a repeated call with the same
    frame fail.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of ``voronoi_data`` holding the region id.
    wha_recode : pandas.DataFrame
        Survey rows with ``'M14$1'``, ``'V005'``, ``'Population'`` and
        ``'V001'`` (as a column, or as the index if no such column exists).
    voronoi_data : pandas.DataFrame
        Rows with ``'V001'``, ``col_name`` and ``'Proportion'``.

    Returns
    -------
    pandas.DataFrame
        One row per unique ``voronoi_data[col_name]`` value, sorted ascending,
        index named ``region_name``. Columns (each weighted by
        ``Proportion``):

        * ``Care_Row_Count`` - sum of ``M14$1``
        * ``Care_SW_Count``  - sum of ``(V005 / 1000000) * M14$1``
        * ``Care_Pop_Count`` - sum of ``M14$1 / Population``
        * ``Zero_Row_Count`` - number of rows whose ``'M14$1'`` is 0
        * ``Zero_SW_Count``  - sum of ``V005 / 1000000`` over those rows
        * ``Zero_Pop_Count`` - sum of ``1 / Population`` over those rows
    """
    out_columns = ['Care_Row_Count', 'Care_SW_Count', 'Care_Pop_Count',
                   'Zero_Row_Count', 'Zero_SW_Count', 'Zero_Pop_Count']

    # reset_index() returns new frames, so the caller's objects stay intact.
    # An existing index is kept (as a column) only if it carries a needed key.
    survey = wha_recode.reset_index(drop=('V001' in wha_recode.columns))
    polygons = voronoi_data.reset_index(
        drop=('V001' in voronoi_data.columns and col_name in voronoi_data.columns))

    # ---- Stage 1: totals per survey cluster --------------------------------
    # Values that are not numeric become -1 and are ignored, as are negatives.
    # NOTE(review): every non-negative code is summed as a visit count; confirm
    # that 'M14$1' carries no special codes (e.g. "don't know") in this recode.
    visits = pd.to_numeric(survey['M14$1'], errors='coerce').fillna(-1).astype(int)
    answered = visits >= 0

    visits = visits[answered].astype(float)
    sample_weight = survey.loc[answered, 'V005'] / 1000000
    population = survey.loc[answered, 'Population']
    no_care = visits == 0

    # Every cluster seen in the survey gets a row, even if none of its rows is
    # usable (its totals are then zero).
    cluster_ids = survey['V001'].unique()
    cluster_pos = pd.Index(cluster_ids).get_indexer(survey.loc[answered, 'V001'])

    per_row = {
        'Care_Row_Count': visits,
        'Care_SW_Count': sample_weight * visits,
        'Care_Pop_Count': visits / population,
        'Zero_Row_Count': no_care.astype(float),
        'Zero_SW_Count': sample_weight.where(no_care, 0.0),
        'Zero_Pop_Count': (1.0 / population).where(no_care, 0.0),
    }
    cluster_totals = pd.DataFrame(
        {name: np.bincount(cluster_pos,
                           weights=np.asarray(values, dtype=float),
                           minlength=len(cluster_ids))
         for name, values in per_row.items()},
        columns=out_columns)
    cluster_totals['V001'] = cluster_ids

    # ---- Stage 2: spread cluster totals over regions -----------------------
    needed = ['V001', 'Proportion']
    if col_name not in needed:
        needed.append(col_name)
    # Inner join: polygons of clusters absent from the survey contribute nothing.
    merged = pd.merge(polygons[needed], cluster_totals, how='inner', on='V001')

    regions = pd.Index(polygons[col_name].unique(), name=region_name).sort_values()
    region_pos = regions.get_indexer(merged[col_name])
    share = merged['Proportion']

    # bincount adds the rows of each region in order, so a NaN contribution
    # still propagates to that region's total.
    totals = {
        name: np.bincount(region_pos,
                          weights=np.asarray(share * merged[name], dtype=float),
                          minlength=len(regions))
        for name in out_columns
    }
    return pd.DataFrame(totals, index=regions, columns=out_columns)

In [37]:
# Voronoi-weighted region totals for each administrative level.
admin_vor_1 = wha_voronoi_aggregate(
    region_name="Admin_Region_1", col_name="ID_1", wha_recode=c3, voronoi_data=voronoi_admin_1)
admin_vor_2 = wha_voronoi_aggregate(
    region_name="Admin_Region_2", col_name="ID_2", wha_recode=c3, voronoi_data=voronoi_admin_2)
admin_vor_3 = wha_voronoi_aggregate(
    region_name="Admin_Region_3", col_name="ID_3", wha_recode=c3, voronoi_data=voronoi_admin_3)

print(admin_vor_1.head())
print()

# Kendall's tau between the Voronoi-based totals and the totals in admin_1 /
# admin_2 (the "VOR" vs "PIP" figures named in the printed captions).
print("---------------------HOW TO AGGREGATE-----------------")
print("-----NUMBER OF ANTENATAL VISITS-------")
for caption, voronoi_frame, simple_frame, trailer in (
    ("Admin 1 - (Visits row count VOR, Visits count PIP): ", admin_vor_1, admin_1, ""),
    ("Admin 2 - (Visits row count VOR, Visits count PIP): ", admin_vor_2, admin_2, "\n"),
):
    x, y = stats.kendalltau(voronoi_frame['Care_Row_Count'], simple_frame['Care_Row_Count'])
    print(caption + repr(x) + trailer)

print("----NUMBER OF ZER0 ANTENATAL VISITS-----")
for caption, voronoi_frame, simple_frame, trailer in (
    ("Admin 1 - (Zero Care row count VOR, Zero Care count PIP): ", admin_vor_1, admin_1, ""),
    ("Admin 2 - (Zero Care row count VOR, Zero Care count PIP): ", admin_vor_2, admin_2, "\n"),
):
    x, y = stats.kendalltau(voronoi_frame['Zero_Row_Count'], simple_frame['Zero_Care_Row_Count'])
    print(caption + repr(x) + trailer)
# admin_vor_3 and admin_3 are not the same size, so Kendall's tau cannot be
# computed for level 3.

# Kendall's tau between the plain row count and the two weighted variants,
# within the Voronoi-based totals only.
print("------------------HOW TO COUNT-------------------")
print("------NUMBER OF ANTENATAL VISITS-------")
for caption, voronoi_frame, weighted_column, trailer in (
    ("Admin 1 - (Visits row count VOR, Visits Sample Weight VOR): ", admin_vor_1, 'Care_SW_Count', ""),
    ("Admin 1 - (Visits row count VOR, Visits pop VOR): ", admin_vor_1, 'Care_Pop_Count', "\n"),
    ("Admin 2 - (Visits row count VOR, Visits Sample Weight VOR): ", admin_vor_2, 'Care_SW_Count', ""),
    ("Admin 2 - (Visits row count VOR, Visits pop VOR): ", admin_vor_2, 'Care_Pop_Count', ""),
):
    x, y = stats.kendalltau(voronoi_frame['Care_Row_Count'], voronoi_frame[weighted_column])
    print(caption + repr(x) + trailer)

# Persist every level; the region id is the index, so it is written out too.
for csv_path, region_totals in (
    ("wha_vor_aggregate_1.csv", admin_vor_1),
    ("wha_vor_aggregate_2.csv", admin_vor_2),
    ("wha_vor_aggregate_3.csv", admin_vor_3),
):
    region_totals.to_csv(csv_path, index=True)



                Care_Row_Count  Care_SW_Count  Care_Pop_Count  Zero_Row_Count  \
Admin_Region_1                                                                  
1                   489.512976     576.116919        0.033532        0.841524   
2                   223.761491      62.220590        0.029902       35.626242   
3                  1444.188317    1362.324606        0.330037       34.602809   
4                   472.500382     208.388983        0.077181       54.638931   
5                  1293.623200    1718.622061        0.116313       22.337735   

                Zero_SW_Count  Zero_Pop_Count  
Admin_Region_1                                 
1                    1.397492        0.000032  
2                   11.183156        0.007424  
3                   50.211500        0.012805  
4                   23.713599        0.013263  
5                   23.026124        0.002127  

---------------------HOW TO AGGREGATE-----------------
-----NUMBER OF ANTENATAL VISITS-------
A