In [26]:
from __future__ import division
import scipy.stats as stats
import pandas as pd
import numpy as np

In [27]:
# Counts the number of relevant responses and assigns to each region.
def compute_simple_wha_aggregate(region_name, col_name, data):
    """Total the antenatal-care answers of every respondent per admin region.

    Each row of ``data`` is attributed entirely to the region found in its
    ``col_name`` column.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of ``data`` holding the region identifier of each row.
    data : pandas.DataFrame
        Must contain ``col_name``, ``'M14$1'`` (number of antenatal care
        visits) and ``'V005'`` (sample weight, divided by 1,000,000 before
        use). The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct region, sorted ascending by region, with columns

        * ``Zero_Care_Row_Count`` - number of rows reporting exactly 0 visits
        * ``Zero_Care_SW_Count``  - sum of the sample weights of those rows
        * ``Care_SW_Count``       - sum of ``weight * visits`` over rows with
          0 < visits < 98
        * ``Care_Row_Count``      - sum of ``visits`` over the same rows

    Notes
    -----
    Answers that cannot be parsed as a number are recoded to -1 and, like
    answers of 98 or more, contribute to none of the totals.
    NOTE(review): 98+ are presumably special "don't know"/missing codes -
    confirm against the survey codebook.
    Rows whose region is missing (NaN) are left out, because ``groupby``
    drops missing keys.
    """
    column_order = ['Zero_Care_Row_Count', 'Zero_Care_SW_Count',
                    'Care_SW_Count', 'Care_Row_Count']

    # Blank / non-numeric answers become -1 so that they match neither mask.
    visits = pd.to_numeric(data['M14$1'], errors='coerce').fillna(-1).astype(int)
    sample_weight = data['V005'] / 1000000

    has_care = (visits > 0) & (visits < 98)
    no_care = visits == 0

    # Per-row contribution to each total; rows outside a mask contribute 0,
    # which keeps regions without any qualifying row in the result.
    contributions = pd.DataFrame({
        region_name: data[col_name],
        'Zero_Care_Row_Count': no_care.astype(int),
        'Zero_Care_SW_Count': sample_weight.where(no_care, 0.0),
        'Care_SW_Count': (sample_weight * visits).where(has_care, 0.0),
        'Care_Row_Count': visits.where(has_care, 0),
    }, columns=[region_name] + column_order)

    # groupby sorts the region keys ascending and names the index region_name.
    admin_data = contributions.groupby(region_name).sum()
    return admin_data[column_order]

In [28]:
# Assigns each region total number of positive responses which have been normalised by the population of the clusters.
def wha_voronoi_aggregate(region_name, col_name, wha_recode, voronoi_data, population):
    """Distribute per-cluster antenatal-care totals over admin regions.

    Responses are first totalled per cluster (``V001``). Each cluster total,
    and the cluster population, is then shared between regions according to
    the ``Proportion`` column of ``voronoi_data``. Finally the row-count
    totals are expressed per million of the population attributed to the
    region.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of ``voronoi_data`` holding the region identifier.
    wha_recode : pandas.DataFrame
        Must contain ``'V001'`` (cluster), ``'M14$1'`` (number of antenatal
        care visits) and ``'V005'`` (sample weight, divided by 1,000,000
        before use).
    voronoi_data : pandas.DataFrame
        Must contain ``'V001'``, ``col_name`` and ``'Proportion'``.
    population : pandas.DataFrame
        Must contain ``'V001'`` and ``'Population'``.

    None of the inputs is modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``voronoi_data[col_name]`` value, sorted
        ascending, with columns ``Care_Row_Count``, ``Care_SW_Count``,
        ``Care_Pop_Count``, ``Zero_Row_Count``, ``Zero_SW_Count``,
        ``Zero_Pop_Count`` and ``Population``.

    Notes
    -----
    Answers that cannot be parsed as a number are recoded to -1 and, like
    answers of 98 or more, are ignored.
    Clusters that are absent from ``wha_recode`` or ``population`` are dropped
    by the inner merges. A region left with a population of 0 gets NaN in
    ``Care_Pop_Count`` and ``Zero_Pop_Count`` instead of a division by zero.
    """
    cluster_columns = ['No_Care_Count_Aggregate', 'No_Care_SW_Aggregate',
                       'Care_Count_Aggregate', 'Care_SW_Aggregate']
    result_columns = ['Care_Row_Count', 'Care_SW_Count', 'Care_Pop_Count',
                      'Zero_Row_Count', 'Zero_SW_Count', 'Zero_Pop_Count',
                      'Population']

    # ---- 1. Totals per cluster -------------------------------------------
    # Blank / non-numeric answers become -1 so that they match neither mask.
    visits = pd.to_numeric(wha_recode['M14$1'], errors='coerce').fillna(-1).astype(int)
    sample_weight = wha_recode['V005'] / 1000000

    has_care = (visits > 0) & (visits < 98)
    no_care = visits == 0

    per_row = pd.DataFrame({
        'V001': wha_recode['V001'],
        'No_Care_Count_Aggregate': no_care.astype(int),
        'No_Care_SW_Aggregate': sample_weight.where(no_care, 0.0),
        'Care_Count_Aggregate': visits.where(has_care, 0),
        'Care_SW_Aggregate': (sample_weight * visits).where(has_care, 0.0),
    }, columns=['V001'] + cluster_columns)
    cluster_data = per_row.groupby('V001').sum().reset_index()

    # ---- 2. Attach cluster totals and populations to the overlap table ----
    # Resulting rows: (cluster, region, proportion, cluster totals, population).
    vor_data = pd.merge(voronoi_data, cluster_data, how='inner', on='V001')
    vor_data = pd.merge(vor_data, population, how='inner', on='V001')

    # ---- 3. Share every cluster figure between regions by overlap ---------
    share = vor_data['Proportion']
    contributions = pd.DataFrame({
        region_name: vor_data[col_name],
        'Care_Row_Count': share * vor_data['Care_Count_Aggregate'],
        'Care_SW_Count': share * vor_data['Care_SW_Aggregate'],
        'Zero_Row_Count': share * vor_data['No_Care_Count_Aggregate'],
        'Zero_SW_Count': share * vor_data['No_Care_SW_Aggregate'],
        'Population': share * vor_data['Population'],
    })
    region_totals = contributions.groupby(region_name).sum()

    # Keep every region listed in voronoi_data, even those whose clusters
    # were all dropped by the inner merges (their totals are 0).
    regions = pd.Index(voronoi_data[col_name].unique(), name=region_name).sort_values()
    admin_region_data = region_totals.reindex(regions, fill_value=0)
    admin_region_data.index.name = region_name

    # ---- 4. Row-count totals per million inhabitants ----------------------
    region_population = admin_region_data['Population']
    # Mask zero populations so the rate is NaN rather than a division by zero.
    usable_population = region_population.where(region_population != 0)
    admin_region_data['Zero_Pop_Count'] = (
        admin_region_data['Zero_Row_Count'] / usable_population * (10**6))
    admin_region_data['Care_Pop_Count'] = (
        admin_region_data['Care_Row_Count'] / usable_population * (10**6))

    return admin_region_data[result_columns]

In [29]:
# --------------------------------------COTE D'IVOIRE COMPUTATIONS--------------------------
recode_path = "Cote D'Ivoire Data SPSS/ciir62sv/"
path_to_voronoi = "Cote D'Ivoire Data SPSS/Voronoi clusters/Proportions/"
path_to_clusters = "Cote D'Ivoire Data SPSS/CLUSTER_TO_REGION/"

# Population of every DHS cluster, with the columns renamed to the names the
# aggregation functions expect ('V001' for the cluster, 'Population').
population_file = pd.read_csv("Cote D'Ivoire Data SPSS/Region_Population/voronoi.csv").rename(
    columns={'DHSCLUST': 'V001', '_count': 'Population'})

# Survey answers: only the cluster, household, weight and antenatal-visit columns.
file = pd.read_csv(recode_path + "individual_Recode.csv",
                   usecols=['V001', 'V002', 'V005', 'M14$1'])

# Cluster -> admin region (levels 1 to 3) lookup.
clust_admin = pd.read_csv(path_to_clusters + "admin_3.csv",
                          usecols=['DHSCLUST', 'ID_1', 'ID_2', 'ID_3']).rename(
    columns={'DHSCLUST': 'V001'})

# Voronoi polygon <-> admin region overlap tables, one per admin level.
voronoi_admin_1 = pd.read_csv(path_to_voronoi + "voronoi_admin_1.csv").rename(columns={'DHSCLUST': 'V001'})
voronoi_admin_2 = pd.read_csv(path_to_voronoi + "voronoi_admin_2.csv").rename(columns={'DHSCLUST': 'V001'})
voronoi_admin_3 = pd.read_csv(path_to_voronoi + "voronoi_admin_3.csv").rename(columns={'DHSCLUST': 'V001'})

# Attach the admin regions of its cluster to every survey row.
clust_file = file.merge(clust_admin, how='inner', on='V001')

# ---------------------------SIMPLE AGGREGATES USING POINT IN POLYGON-------------------------------------
admin_1 = compute_simple_wha_aggregate("Admin_1_Region", "ID_1", clust_file)
admin_2 = compute_simple_wha_aggregate("Admin_2_Region", "ID_2", clust_file)
admin_3 = compute_simple_wha_aggregate("Admin_3_Region", "ID_3", clust_file)

admin_1.to_csv(recode_path + "wha_aggregate_1.csv", index=True)
admin_2.to_csv(recode_path + "wha_aggregate_2.csv", index=True)
admin_3.to_csv(recode_path + "wha_aggregate_3.csv", index=True)

# ---------------------------AGGREGATES USING VORONOI CALCULATIONS-------------------------------------
admin_vor_1 = wha_voronoi_aggregate("Admin_Region_1", "ID_1", clust_file, voronoi_admin_1, population_file)
admin_vor_2 = wha_voronoi_aggregate("Admin_Region_2", "ID_2", clust_file, voronoi_admin_2, population_file)
admin_vor_3 = wha_voronoi_aggregate("Admin_Region_3", "ID_3", clust_file, voronoi_admin_3, population_file)

# Kendall's tau between the Voronoi (VOR) and point-in-polygon (PIP) totals.
# Only the first value returned by kendalltau (x) is reported.
# NOTE(review): the two series are compared position by position, so both
# frames are assumed to list the same regions in the same order - confirm.
print("---------------------HOW TO AGGREGATE-----------------")
print("-----NUMBER OF ANTENATAL VISITS-------")
x, y = stats.kendalltau(admin_vor_1['Care_Row_Count'], admin_1['Care_Row_Count'])
print("Admin 1 - (Visits row count VOR, Visits count PIP): {!r}".format(x))

x, y = stats.kendalltau(admin_vor_2['Care_Row_Count'], admin_2['Care_Row_Count'])
print("Admin 2 - (Visits row count VOR, Visits count PIP): {!r}\n".format(x))

print("----NUMBER OF ZER0 ANTENATAL VISITS-----")
x, y = stats.kendalltau(admin_vor_1['Zero_Row_Count'], admin_1['Zero_Care_Row_Count'])
print("Admin 1 - (Zero Care row count VOR, Zero Care count PIP): {!r}".format(x))

x, y = stats.kendalltau(admin_vor_2['Zero_Row_Count'], admin_2['Zero_Care_Row_Count'])
print("Admin 2 - (Zero Care row count VOR, Zero Care count PIP): {!r}\n".format(x))

# Kendall's tau between the three ways of counting, within the Voronoi totals.
print("------------------HOW TO COUNT-------------------")
print("------NUMBER OF ANTENATAL VISITS-------")
x, y = stats.kendalltau(admin_vor_1['Care_Row_Count'], admin_vor_1['Care_SW_Count'])
print("Admin 1 - (Visits row count VOR, Visits Sample Weight VOR): {!r}".format(x))

x, y = stats.kendalltau(admin_vor_1['Care_Row_Count'], admin_vor_1['Care_Pop_Count'])
print("Admin 1 - (Visits row count VOR, Visits pop VOR): {!r}\n".format(x))

x, y = stats.kendalltau(admin_vor_2['Care_Row_Count'], admin_vor_2['Care_SW_Count'])
print("Admin 2 - (Visits row count VOR, Visits Sample Weight VOR): {!r}".format(x))

x, y = stats.kendalltau(admin_vor_2['Care_Row_Count'], admin_vor_2['Care_Pop_Count'])
print("Admin 2 - (Visits row count VOR, Visits pop VOR): {!r}\n".format(x))

x, y = stats.kendalltau(admin_vor_3['Care_Row_Count'], admin_vor_3['Care_SW_Count'])
print("Admin 3 - (Visits row count VOR, Visits Sample Weight VOR): {!r}".format(x))

x, y = stats.kendalltau(admin_vor_3['Care_Row_Count'], admin_vor_3['Care_Pop_Count'])
print("Admin 3 - (Visits row count VOR, Visits pop VOR): {!r}\n".format(x))

# Persist the Voronoi-based aggregates next to the recode data.
admin_vor_1.to_csv(recode_path + "wha_vor_aggregate_1.csv", index=True)
admin_vor_2.to_csv(recode_path + "wha_vor_aggregate_2.csv", index=True)
admin_vor_3.to_csv(recode_path + "wha_vor_aggregate_3.csv", index=True)

---------------------HOW TO AGGREGATE-----------------
-----NUMBER OF ANTENATAL VISITS-------
Admin 1 - (Visits row count VOR, Visits count PIP): 0.9064327485380117
Admin 2 - (Visits row count VOR, Visits count PIP): 0.80147085571754906

----NUMBER OF ZER0 ANTENATAL VISITS-----
Admin 1 - (Zero Care row count VOR, Zero Care count PIP): 0.92354539019687687
Admin 2 - (Zero Care row count VOR, Zero Care count PIP): 0.86106102394459927

------------------HOW TO COUNT-------------------
------NUMBER OF ANTENATAL VISITS-------
Admin 1 - (Visits row count VOR, Visits Sample Weight VOR): 0.48538011695906425
Admin 1 - (Visits row count VOR, Visits pop VOR): 0.391812865497076

Admin 2 - (Visits row count VOR, Visits Sample Weight VOR): 0.48244897959183675
Admin 2 - (Visits row count VOR, Visits pop VOR): 0.29959183673469386

Admin 3 - (Visits row count VOR, Visits Sample Weight VOR): 0.59964747356051706
Admin 3 - (Visits row count VOR, Visits pop VOR): 0.45757931844888367



In [30]:
#  ------------------------------------SENEGAL COMPUTATIONS--------------------------------------------------
recode_path = "SN_2012-13/SNIR6DSV/"
path_to_voronoi = "SN_2012-13/Voronoi/Proportions/"
path_to_clusters = "SN_2012-13/Cluster_To_Region/"

# Population of all DHS clusters, renamed to the column names the aggregation
# function expects ('V001' for the cluster, 'Population').
population_file = pd.read_csv("SN_2012-13/Region_Populations/voronoi_pop.csv")
population_file = population_file.rename(columns = {'DHSCLUST':'V001'})
population_file = population_file.rename(columns = {'_count':'Population'})

# Survey answers: only the cluster, household, weight and antenatal-visit columns.
file = pd.read_csv(recode_path + "individual_recode.csv", usecols=['V001','V002','V005','M14$1'])

# Cluster -> admin region (levels 1 to 3) lookup.
clust_admin = pd.read_csv(path_to_clusters + "admin_3.csv", usecols=['DHSCLUST', 'ID_1', 'ID_2', 'ID_3'])
clust_admin = clust_admin.rename(columns = {'DHSCLUST':'V001'})

# Voronoi Polygon & Admin Region Mapping.
voronoi_admin_1 = pd.read_csv(path_to_voronoi + "voronoi_admin_1.csv")
voronoi_admin_2 = pd.read_csv(path_to_voronoi + "voronoi_admin_2.csv")
voronoi_admin_3 = pd.read_csv(path_to_voronoi + "voronoi_admin_3.csv")

voronoi_admin_1 = voronoi_admin_1.rename(columns = {'DHSCLUST':'V001'})
voronoi_admin_2 = voronoi_admin_2.rename(columns = {'DHSCLUST':'V001'})
voronoi_admin_3 = voronoi_admin_3.rename(columns = {'DHSCLUST':'V001'})

# Attach the admin regions of its cluster to every survey row.
clust_file = pd.merge(file, clust_admin, how='inner', on='V001')

#  Set empty cells to -1.
clust_file['M14$1'] = pd.to_numeric(clust_file['M14$1'], errors='coerce').fillna(-1).astype(int)
clust_file.to_csv(recode_path + "complete_health_access_data.csv", index=False)

# Use the Senegal cluster populations loaded above. The previous argument,
# `pop`, is not defined anywhere in this cell, so it either raised NameError
# or silently picked up a stale variable left in the kernel.
admin_vor_1 = wha_voronoi_aggregate("Admin_Region_1", "ID_1", clust_file, voronoi_admin_1, population_file)
admin_vor_2 = wha_voronoi_aggregate("Admin_Region_2", "ID_2", clust_file, voronoi_admin_2, population_file)
admin_vor_3 = wha_voronoi_aggregate("Admin_Region_3", "ID_3", clust_file, voronoi_admin_3, population_file)

# Persist the Voronoi-based aggregates next to the recode data.
admin_vor_1.to_csv(recode_path + "wha_vor_aggregate_1.csv", index=True)
admin_vor_2.to_csv(recode_path + "wha_vor_aggregate_2.csv", index=True)
admin_vor_3.to_csv(recode_path + "wha_vor_aggregate_3.csv", index=True)

                Care_Row_Count  Care_SW_Count  Care_Pop_Count  Zero_Row_Count  \
Admin_Region_1                                                                  
2631                 954.81752    2755.138333     8257.887639         1.00000   
2632                1388.20868    1495.669286     2678.673179        22.32572   
2633                1236.51777     972.300120     1256.456698         7.90154   
2634                2007.85961    1447.015313     1094.522536        38.48251   
2635                1878.66417    1010.970448      759.904604        41.10051   

                Zero_SW_Count  Zero_Pop_Count    Population  
Admin_Region_1                                               
2631                 4.652251        8.648655  1.156249e+05  
2632                23.309948       43.079479  5.182449e+05  
2633                 6.276549        8.028953  9.841308e+05  
2634                25.902290       20.977550  1.834462e+06  
2635                22.124021       16.624827  2.472237e+06 