In [1]:
from __future__ import division
import scipy.stats as stats
import pandas as pd
import numpy as np

In [2]:
def compute_simple_wha_aggregate(region_name, col_name, data):
    """Aggregate antenatal-care visit counts per administrative region.

    Every row of ``data`` is assigned to the region found in its ``col_name``
    column. ``M14$1`` is converted to an integer; non-numeric or missing
    entries become -1 so that they fall into neither category below.

    * Rows with ``0 < M14$1 < 98`` add their ``M14$1`` value to
      ``Care_Row_Count`` and ``M14$1 * V005 / 1000000`` to ``Care_SW_Count``.
    * Rows with ``M14$1 == 0`` add 1 to ``Zero_Care_Row_Count`` and
      ``V005 / 1000000`` to ``Zero_Care_SW_Count``.

    ``Zero_Care_Pop_Count`` and ``Care_Pop_Count`` are returned as zero-filled
    placeholder columns; nothing is accumulated into them.

    The input frame is never modified, so the function can be called
    repeatedly (also with the same ``col_name``) on the same frame.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of ``data`` that holds the region identifier of each row.
    data : pandas.DataFrame
        Must provide ``col_name``, ``'M14$1'`` and ``'V005'``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of ``data[col_name]``, sorted ascending,
        with columns ``Zero_Care_Row_Count``, ``Zero_Care_Pop_Count``,
        ``Zero_Care_SW_Count``, ``Care_SW_Count``, ``Care_Pop_Count`` and
        ``Care_Row_Count``.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``data``.
    """
    columns = ['Zero_Care_Row_Count', 'Zero_Care_Pop_Count', 'Zero_Care_SW_Count',
               'Care_SW_Count', 'Care_Pop_Count', 'Care_Row_Count']

    # If the caller's frame carries a named index, expose it as columns on a
    # copy. The caller's frame itself is left exactly as it was passed in.
    frame = data
    if any(level is not None for level in data.index.names):
        frame = data.reset_index()

    # Non-numeric / empty answers become -1 and are ignored by both masks.
    visits = pd.to_numeric(frame['M14$1'], errors='coerce').fillna(-1).astype(int).values
    weight = (frame['V005'] / 1000000).values

    has_care = (visits > 0) & (visits < 98)
    no_care = visits == 0

    # Per-row contributions; rows outside a category contribute 0 to it.
    per_row = pd.DataFrame({
        'Zero_Care_Row_Count': no_care.astype(float),
        'Zero_Care_SW_Count': np.where(no_care, weight, 0.0),
        'Care_SW_Count': np.where(has_care, weight * visits, 0.0),
        'Care_Row_Count': np.where(has_care, visits, 0).astype(float),
    })
    totals = per_row.groupby(frame[col_name].values).sum()

    # Keep one row for every region present in the data, including regions
    # whose rows all fell outside both categories.
    regions = pd.Index(frame[col_name].unique(), name=region_name)
    admin_data = totals.reindex(regions, fill_value=0.0)

    # Placeholder columns kept so the output layout stays unchanged.
    admin_data['Zero_Care_Pop_Count'] = 0
    admin_data['Care_Pop_Count'] = 0

    return admin_data[columns].sort_index(ascending=True)

In [3]:
def wha_voronoi_aggregate(region_name, col_name, wha_recode, voronoi_data):
    """Aggregate antenatal-care counts per region using Voronoi proportions.

    Step 1 sums the survey rows of ``wha_recode`` per cluster (``V001``).
    ``M14$1`` is converted to an integer, with non-numeric or missing entries
    becoming -1 so that they are ignored:

    * rows with ``0 < M14$1 < 98`` add ``M14$1`` to the cluster's visit total
      and ``M14$1 * V005 / 1000000`` to its weighted visit total;
    * rows with ``M14$1 == 0`` add 1 to the cluster's zero-care total and
      ``V005 / 1000000`` to its weighted zero-care total.

    Step 2 inner-joins those cluster totals to ``voronoi_data`` on ``V001``,
    multiplies each total by the row's ``Proportion`` and sums the products
    per ``col_name`` value.

    ``Care_Pop_Count`` and ``Zero_Pop_Count`` are returned as zero-filled
    placeholder columns; nothing is accumulated into them.

    Neither input frame is modified, so the function can be called any number
    of times with the same frames.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of ``voronoi_data`` holding the region identifier.
    wha_recode : pandas.DataFrame
        Must provide ``'V001'``, ``'M14$1'`` and ``'V005'``.
    voronoi_data : pandas.DataFrame
        Must provide ``'V001'``, ``col_name`` and ``'Proportion'``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of ``voronoi_data[col_name]``, sorted
        ascending, with columns ``Care_Row_Count``, ``Care_SW_Count``,
        ``Care_Pop_Count``, ``Zero_Row_Count``, ``Zero_SW_Count`` and
        ``Zero_Pop_Count``. Regions whose clusters have no survey rows are
        kept with zero totals.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    columns = ['Care_Row_Count', 'Care_SW_Count', 'Care_Pop_Count',
               'Zero_Row_Count', 'Zero_SW_Count', 'Zero_Pop_Count']

    # If an input carries a named index, expose it as columns on a copy.
    # The caller's frames themselves are left exactly as they were passed in.
    wha = wha_recode
    if any(level is not None for level in wha_recode.index.names):
        wha = wha_recode.reset_index()
    vor = voronoi_data
    if any(level is not None for level in voronoi_data.index.names):
        vor = voronoi_data.reset_index()

    # Non-numeric / empty answers become -1 and are ignored by both masks.
    visits = pd.to_numeric(wha['M14$1'], errors='coerce').fillna(-1).astype(int).values
    weight = (wha['V005'] / 1000000).values

    has_care = (visits > 0) & (visits < 98)
    no_care = visits == 0

    # Step 1: per-cluster totals. Rows outside a category contribute 0 to it.
    per_row = pd.DataFrame({
        'No_Care_Count_Aggregate': no_care.astype(float),
        'No_Care_SW_Aggregate': np.where(no_care, weight, 0.0),
        'Care_Count_Aggregate': np.where(has_care, visits, 0).astype(float),
        'Care_SW_Aggregate': np.where(has_care, weight * visits, 0.0),
    })
    cluster_data = per_row.groupby(wha['V001'].values).sum()
    cluster_data.index.name = 'V001'
    cluster_data = cluster_data.reset_index()

    # Step 2: spread every cluster total over the regions it overlaps.
    merged = pd.merge(vor, cluster_data, how='inner', on='V001')
    share = merged['Proportion'].values
    contributions = pd.DataFrame({
        'Care_Row_Count': share * merged['Care_Count_Aggregate'].values,
        'Care_SW_Count': share * merged['Care_SW_Aggregate'].values,
        'Zero_Row_Count': share * merged['No_Care_Count_Aggregate'].values,
        'Zero_SW_Count': share * merged['No_Care_SW_Aggregate'].values,
    })
    totals = contributions.groupby(merged[col_name].values).sum()

    # Keep one row for every region listed in the Voronoi table, even when the
    # inner join left it without any contributing cluster.
    regions = pd.Index(vor[col_name].unique(), name=region_name)
    admin_region_data = totals.reindex(regions, fill_value=0.0)

    # Placeholder columns kept so the output layout stays unchanged.
    admin_region_data['Care_Pop_Count'] = 0
    admin_region_data['Zero_Pop_Count'] = 0

    return admin_region_data[columns].sort_index(ascending=True)

In [4]:
# --------------------------------------COTE D'IVOIRE COMPUTATIONS--------------------------
# Builds region-level antenatal-care aggregates for Cote d'Ivoire in two ways
# (cluster-to-region lookup and Voronoi proportions), prints Kendall tau
# correlations between the resulting columns, and writes every aggregate to
# CSV files under rel_path.
rel_path = "Cote D'Ivoire Data SPSS/ciir62sv/"
vor_path = "Cote D'Ivoire Data SPSS/Voronoi clusters/Proportions/"
pop_path = "Cote D'Ivoire Data SPSS/cigc61fl/"
clu_path = "Cote D'Ivoire Data SPSS/CLUSTER_TO_REGION/"

# Survey rows: only the cluster id (V001), V002, the sample weight (V005) and
# the antenatal-visit answer (M14$1) are loaded.
file = pd.read_csv(rel_path + "individual_Recode.csv", usecols=['V001','V002','V005','M14$1'])
# Per-cluster population; columns are renamed so the cluster id is called V001.
population = pd.read_csv(pop_path + "CIGC61FL.csv", usecols=['DHSCLUST', 'All_Population_Count_2010'])
population.columns = ['V001', 'Population']

# Cluster -> admin region lookup for the three admin levels.
clust_admin = pd.read_csv(clu_path + "admin_3.csv", usecols=['DHSCLUST', 'ID_1', 'ID_2', 'ID_3'])
clust_admin = clust_admin.rename(columns = {'DHSCLUST':'V001'})

# Voronoi Polygon & Admin Region Mapping.
voronoi_admin_1 = pd.read_csv(vor_path + "voronoi_admin_1.csv")
voronoi_admin_2 = pd.read_csv(vor_path + "voronoi_admin_2.csv")
voronoi_admin_3 = pd.read_csv(vor_path + "voronoi_admin_3.csv")

# Rename the cluster id column so it matches the survey data.
voronoi_admin_1 = voronoi_admin_1.rename(columns = {'DHSCLUST':'V001'})
voronoi_admin_2 = voronoi_admin_2.rename(columns = {'DHSCLUST':'V001'})
voronoi_admin_3 = voronoi_admin_3.rename(columns = {'DHSCLUST':'V001'})

# Join the dataframes on the cluster id.
# NOTE(review): both joins are inner joins, so survey rows whose cluster is
# missing from the region lookup or from the population file are dropped -
# confirm that this is intended.
clust_file = pd.merge(file, clust_admin, how='inner', on='V001')
clust_file = pd.merge(clust_file, population, how='inner', on='V001')

# Persist the joined table next to the source recode file.
clust_file.to_csv(rel_path + "complete_health_access_data.csv", index=False)

# ---------------------------SIMPLE AGGREGATES USING POINT IN POLYGON-------------------------------------
# One aggregate per admin level, keyed by the ID_1 / ID_2 / ID_3 columns.
admin_1 = compute_simple_wha_aggregate("Admin_1_Region", "ID_1", clust_file)
admin_2 = compute_simple_wha_aggregate("Admin_2_Region", "ID_2", clust_file)
admin_3 = compute_simple_wha_aggregate("Admin_3_Region", "ID_3", clust_file)

print(admin_1.head())
admin_1.to_csv(rel_path + "wha_aggregate_1.csv", index=True)
admin_2.to_csv(rel_path + "wha_aggregate_2.csv", index=True)
admin_3.to_csv(rel_path + "wha_aggregate_3.csv", index=True)

# ---------------------------AGGREGATES USING VORONOI CALCULATIONS-------------------------------------
# Same survey rows, distributed over regions with the Voronoi proportion tables.
admin_vor_1 = wha_voronoi_aggregate("Admin_Region_1", "ID_1", clust_file, voronoi_admin_1)
admin_vor_2 = wha_voronoi_aggregate("Admin_Region_2", "ID_2", clust_file, voronoi_admin_2)
admin_vor_3 = wha_voronoi_aggregate("Admin_Region_3", "ID_3", clust_file, voronoi_admin_3)

print(admin_vor_1.head())
print()

# Kendall tau between the Voronoi (VOR) and point-in-polygon (PIP) aggregates.
# stats.kendalltau returns the tau statistic and a p-value; only tau (x) is printed.
# NOTE(review): the two columns come from different frames and are passed as-is;
# confirm both frames list the same regions in the same order.
print("---------------------HOW TO AGGREGATE-----------------")
print("-----NUMBER OF ANTENATAL VISITS-------")
x, y = stats.kendalltau(admin_vor_1['Care_Row_Count'], admin_1['Care_Row_Count'])
print("Admin 1 - (Visits row count VOR, Visits count PIP): " +  repr(x))

x, y = stats.kendalltau(admin_vor_2['Care_Row_Count'], admin_2['Care_Row_Count'])
print("Admin 2 - (Visits row count VOR, Visits count PIP): " +  repr(x) + "\n")

print("----NUMBER OF ZER0 ANTENATAL VISITS-----")
x, y = stats.kendalltau(admin_vor_1['Zero_Row_Count'], admin_1['Zero_Care_Row_Count'])
print("Admin 1 - (Zero Care row count VOR, Zero Care count PIP): " +  repr(x))

x, y = stats.kendalltau(admin_vor_2['Zero_Row_Count'], admin_2['Zero_Care_Row_Count'])
print("Admin 2 - (Zero Care row count VOR, Zero Care count PIP): " +  repr(x) + "\n")

# admin_vor_3 and admin_3 do not have the same number of rows, so Kendall's tau
# is not computed for admin level 3.

# Kendall tau between the unweighted and the sample-weighted Voronoi totals.
print("------------------HOW TO COUNT-------------------")
print("------NUMBER OF ANTENATAL VISITS-------")
x, y = stats.kendalltau(admin_vor_1['Care_Row_Count'], admin_vor_1['Care_SW_Count'])
print("Admin 1 - (Visits row count VOR, Visits Sample Weight VOR): " + repr(x))

# Disabled: comparison against the population-based count.
# x, y = stats.kendalltau(admin_vor_1['Care_Row_Count'], admin_vor_1['Care_Pop_Count'])
# print("Admin 1 - (Visits row count VOR, Visits pop VOR): " + repr(x) + "\n")

x, y = stats.kendalltau(admin_vor_2['Care_Row_Count'], admin_vor_2['Care_SW_Count'])
print("Admin 2 - (Visits row count VOR, Visits Sample Weight VOR): " + repr(x))

# Disabled: comparison against the population-based count.
# x, y = stats.kendalltau(admin_vor_2['Care_Row_Count'], admin_vor_2['Care_Pop_Count'])
# print("Admin 2 - (Visits row count VOR, Visits pop VOR): " + repr(x))

# Write the Voronoi aggregates; the region id is the index and is kept in the CSV.
admin_vor_1.to_csv(rel_path + "wha_vor_aggregate_1.csv", index=True)
admin_vor_2.to_csv(rel_path + "wha_vor_aggregate_2.csv", index=True)
admin_vor_3.to_csv(rel_path + "wha_vor_aggregate_3.csv", index=True)

clust file columns
Index(['V001', 'V002', 'V005', 'M14$1', 'ID_1', 'ID_2', 'ID_3', 'Population'], dtype='object')
                Zero_Care_Row_Count  Zero_Care_Pop_Count  Zero_Care_SW_Count  \
Admin_1_Region                                                                 
1                               0.0                    0            0.000000   
2                              47.0                    0           15.109753   
3                              39.0                    0           52.367440   
4                              48.0                    0           20.282583   
5                              18.0                    0           23.113793   

                Care_SW_Count  Care_Pop_Count  Care_Row_Count  
Admin_1_Region                                                 
1                  485.523894               0           299.0  
2                   74.710150               0           256.0  
3                 1283.531104               0          1392.0  
4    

In [5]:
#  ------------------------------------SENEGAL COMPUTATIONS--------------------------------------------------
# Builds the Voronoi-based antenatal-care aggregates for Senegal at the three
# admin levels, prints the first rows of each and writes them to CSV under rel_path.
# Unlike the Cote d'Ivoire cell, no population file is joined and no
# cluster-to-region (simple) aggregate is computed here.
rel_path = "Senegal Data SPSS/snir7qsv/"
vor_path = "Senegal Data SPSS/Voronoi/Proportions/"
clu_path = "Senegal Data SPSS/Cluster_To_Region/"

# Survey rows: cluster id (V001), V002, sample weight (V005) and the
# antenatal-visit answer (M14$1).
file = pd.read_csv(rel_path + "individual_recode.csv", usecols=['V001','V002','V005','M14$1'])

# Cluster -> admin region lookup for the three admin levels.
clust_admin = pd.read_csv(clu_path + "admin_3.csv", usecols=['DHSCLUST', 'ID_1', 'ID_2', 'ID_3'])
clust_admin = clust_admin.rename(columns = {'DHSCLUST':'V001'})

# Voronoi Polygon & Admin Region Mapping.
voronoi_admin_1 = pd.read_csv(vor_path + "voronoi_admin_1.csv")
voronoi_admin_2 = pd.read_csv(vor_path + "voronoi_admin_2.csv")
voronoi_admin_3 = pd.read_csv(vor_path + "voronoi_admin_3.csv")

# Rename the cluster id column so it matches the survey data.
voronoi_admin_1 = voronoi_admin_1.rename(columns = {'DHSCLUST':'V001'})
voronoi_admin_2 = voronoi_admin_2.rename(columns = {'DHSCLUST':'V001'})
voronoi_admin_3 = voronoi_admin_3.rename(columns = {'DHSCLUST':'V001'})

# Inner join: survey rows whose cluster is missing from the lookup are dropped.
clust_file = pd.merge(file, clust_admin, how='inner', on='V001')

# Convert M14$1 to integers; empty or non-numeric cells become -1.
# wha_voronoi_aggregate applies the same conversion again, so this mainly
# affects the CSV written on the next line.
clust_file['M14$1'] = pd.to_numeric(clust_file['M14$1'], errors='coerce').fillna(-1).astype(int)
clust_file.to_csv(rel_path + "complete_health_access_data.csv", index=False)

admin_vor_1 = wha_voronoi_aggregate("Admin_Region_1", "ID_1", clust_file, voronoi_admin_1)
admin_vor_2 = wha_voronoi_aggregate("Admin_Region_2", "ID_2", clust_file, voronoi_admin_2)
admin_vor_3 = wha_voronoi_aggregate("Admin_Region_3", "ID_3", clust_file, voronoi_admin_3)

print(admin_vor_1.head())
print(admin_vor_2.head())
print(admin_vor_3.head())

# NOTE(review): these files are named "malaria_vor_aggregate_*" although the
# frames come from wha_voronoi_aggregate (antenatal-care data); the
# Cote d'Ivoire cell writes "wha_vor_aggregate_*". Confirm the intended file
# names before relying on them downstream.
admin_vor_1.to_csv(rel_path + "malaria_vor_aggregate_1.csv", index=True)
admin_vor_2.to_csv(rel_path + "malaria_vor_aggregate_2.csv", index=True)
admin_vor_3.to_csv(rel_path + "malaria_vor_aggregate_3.csv", index=True)

[2631 2641 2632 2633 2638 2639 2634 2640 2636 2635 2637]
Index(['V001', 'V002', 'V005', 'M14$1', 'ID_1', 'ID_2', 'ID_3'], dtype='object')
                Care_Row_Count  Care_SW_Count  Care_Pop_Count  Zero_Row_Count  \
Admin_Region_1                                                                  
2631                894.248477    2800.007650               0        3.000000   
2632               1311.356324    1592.557545               0        8.886526   
2633               1336.802102    1039.215031               0        2.482577   
2634               2297.827618    1521.168481               0       44.459511   
2635               2387.497951    1241.303335               0       13.246921   

                Zero_SW_Count  Zero_Pop_Count  
Admin_Region_1                                 
2631                12.404689               0  
2632                15.622492               0  
2633                 2.590607               0  
2634                27.405048               0  
2635  