In [6]:
from __future__ import division
import scipy.stats as stats
import pandas as pd
import os

In [7]:
# Counts the number of children who died within 3 months of birth and assigns to each region.
def compute_cm_simple_aggregate(region_name, col_name, data):
    """Count early child deaths per region (simple point-in-polygon aggregate).

    A row of ``data`` is counted when ``100 <= B6 <= 203`` and
    ``0 <= B7 <= 3`` (bounds inclusive).  ``B6``/``B7`` values that cannot be
    parsed as numbers are treated as -1 and therefore never match.
    NOTE(review): this is the notebook's encoding of "died within 3 months of
    birth"; presumably B6 uses the DHS unit+value coding (1xx = days,
    2xx = months) -- confirm against the recode manual.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of ``data`` holding the region id of every row.
    data : pandas.DataFrame
        Must contain ``col_name``, ``B6``, ``B7`` and ``V005``.  It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of ``data[col_name]``, sorted ascending,
        with two columns: ``CM_Row_Count`` (int, number of matching rows) and
        ``CM_SW_Count`` (float, sum of ``V005 / 1000000`` over matching rows).
        Regions without a matching row get 0 in both columns.  Rows whose
        region id is missing are not counted.
    """
    # Work on a detached slice with a clean positional index so that nothing
    # below depends on (or alters) the caller's index.
    records = data[[col_name, 'V005', 'B6', 'B7']].reset_index(drop=True)

    # Unparseable / missing ages become -1, which lies outside both windows.
    age_days = pd.to_numeric(records['B6'], errors='coerce').fillna(-1).astype(int)
    age_mths = pd.to_numeric(records['B7'], errors='coerce').fillna(-1).astype(int)
    died_early = age_days.between(100, 203) & age_mths.between(0, 3)

    deaths = records.loc[died_early, [col_name, 'V005']]
    sample_weights = deaths['V005'] / 1000000.0
    by_region = sample_weights.groupby(deaths[col_name])

    # Every region present in the input is reported, including those with no
    # qualifying death.
    regions = pd.Index(records[col_name].unique(), name=region_name).sort_values()

    admin_data = pd.DataFrame(
        {
            'CM_Row_Count': by_region.size().reindex(index=regions, fill_value=0).astype(int),
            'CM_SW_Count': by_region.sum().reindex(index=regions, fill_value=0.0).astype(float),
        },
        index=regions,
        columns=['CM_Row_Count', 'CM_SW_Count'],
    )
    admin_data.index.name = region_name
    return admin_data

In [8]:
# Assigns each region total number of positive responses which have been normalised by the population of the clusters.
def voronoi_aggregate(region_name, col_name, cm_recode, voronoi_data, population):
    """Apportion per-cluster early-death counts to regions by Voronoi overlap.

    Steps:

    1. Per cluster (``V001``) of ``cm_recode``, count the rows with
       ``100 <= B6 <= 203`` and ``0 <= B7 <= 3`` (inclusive; unparseable
       values become -1 and never match) and sum their ``V005 / 1000000``.
    2. Inner-join ``voronoi_data`` with those cluster totals and with
       ``population`` on ``V001``.
    3. For every joined row add ``Proportion`` times the cluster's weighted
       count, row count and ``Population`` to the row's region
       (``col_name``).
    4. ``CM_Pop_Count = CM_Row_Count / Population * 10**6`` per region.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of ``voronoi_data`` holding the region id.
    cm_recode : pandas.DataFrame
        Must contain ``V001``, ``V005``, ``B6`` and ``B7``.
    voronoi_data : pandas.DataFrame
        Must provide ``V001``, ``col_name`` and ``Proportion``.
    population : pandas.DataFrame
        Must contain ``V001`` and ``Population``.

    None of the three input frames is modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``voronoi_data[col_name]``, sorted ascending,
        with float columns ``CM_SW_Count``, ``CM_Pop_Count``, ``Population``
        and ``CM_Row_Count`` (in that order).  Regions none of whose clusters
        survive the joins get 0 for the sums; a region whose apportioned
        population is 0 gets NaN/inf in ``CM_Pop_Count`` instead of raising.
    """
    # Detached slice with a positional index: alignment below cannot be
    # affected by whatever index the caller's frame carries.
    births = cm_recode[['V001', 'V005', 'B6', 'B7']].reset_index(drop=True)

    # Unparseable / missing ages become -1, which lies outside both windows.
    age_days = pd.to_numeric(births['B6'], errors='coerce').fillna(-1).astype(int)
    age_mths = pd.to_numeric(births['B7'], errors='coerce').fillna(-1).astype(int)
    died_early = age_days.between(100, 203) & age_mths.between(0, 3)

    # Keep non-matching rows with a zero contribution so that every cluster
    # seen in cm_recode appears in cluster_data and survives the inner join.
    per_birth = pd.DataFrame({
        'V001': births['V001'],
        'SW_Aggregate': (births['V005'] / 1000000.0).where(died_early, 0.0),
        'Row_Aggregate': died_early.astype(int),
    })
    cluster_data = (
        per_birth.groupby('V001')[['SW_Aggregate', 'Row_Aggregate']]
        .sum()
        .reset_index()
    )

    # reset_index() returns a new frame; the caller's voronoi_data must not
    # accumulate 'index' / 'level_0' columns from repeated calls.
    vor_data = voronoi_data.reset_index()

    # Result is (cluster, region, proportion, SW_Aggregate, Row_Aggregate, Population).
    merged = pd.merge(vor_data, cluster_data, how='inner', on='V001')
    merged = pd.merge(merged, population, how='inner', on='V001')

    # Split each cluster's totals across regions by its overlap proportion.
    share = merged['Proportion']
    contributions = pd.DataFrame({
        'CM_SW_Count': share * merged['SW_Aggregate'],
        'CM_Row_Count': share * merged['Row_Aggregate'],
        'Population': share * merged['Population'],
    })
    totals = contributions.groupby(merged[col_name]).sum()

    # Report every region named in voronoi_data, even if no cluster matched.
    regions = pd.Index(vor_data[col_name].unique(), name=region_name).sort_values()
    admin_region_data = totals.reindex(index=regions, fill_value=0.0).astype(float)
    admin_region_data.index.name = region_name

    # Normalise the row count by the apportioned population of the region.
    admin_region_data['CM_Pop_Count'] = (
        admin_region_data['CM_Row_Count'] / admin_region_data['Population'] * (10 ** 6)
    )
    return admin_region_data[['CM_SW_Count', 'CM_Pop_Count', 'Population', 'CM_Row_Count']]

In [9]:
# ---------------------------COTE D'IVOIRE COMPUTATIONS--------------------------------------------
# Input/output locations for this survey.
recode_path = "IC_DHS/cibr62sv/"
path_to_voronoi = "IC_DHS/Voronoi_clusters/Proportions/"
path_to_clusters = "IC_DHS/CLUSTER_TO_REGION/"


def _report_tau(label, left, right, suffix=""):
    """Print ``label`` + repr(Kendall tau of the two columns) + ``suffix``.

    Returns the (tau, p-value) pair so the caller can keep it.
    """
    tau, p_value = stats.kendalltau(left, right)
    print(label + repr(tau) + suffix)
    return tau, p_value


# Population of all DHS clusters, keyed by V001 like the other tables.
population_file = pd.read_csv("IC_DHS/Region_Population/voronoi.csv").rename(
    columns={'DHSCLUST': 'V001', '_count': 'Population'})

# DHS Birth recode (only the columns the aggregations read).
file = pd.read_csv(recode_path + "births_recode.csv",
                   usecols=['V001', 'V002', 'V005', 'B6', 'B7'])

# Cluster -> admin region ids (point-in-polygon assignment).
clust_admin = pd.read_csv(
    path_to_clusters + "admin_3.csv",
    usecols=['DHSCLUST', 'ID_1', 'ID_2', 'ID_3'],
).rename(columns={'DHSCLUST': 'V001'})

# Voronoi Polygon & Admin Region Mapping, one table per admin level.
voronoi_admin_1 = pd.read_csv(path_to_voronoi + "voronoi_admin_1.csv").rename(columns={'DHSCLUST': 'V001'})
voronoi_admin_2 = pd.read_csv(path_to_voronoi + "voronoi_admin_2.csv").rename(columns={'DHSCLUST': 'V001'})
voronoi_admin_3 = pd.read_csv(path_to_voronoi + "voronoi_admin_3.csv").rename(columns={'DHSCLUST': 'V001'})

# Attach the region ids to every birth record.
clust_file = file.merge(clust_admin, how='inner', on='V001')

# ------------------------------SIMPLE AGGREGATES USING POINT IN POLYGON-------------------------------
admin_1 = compute_cm_simple_aggregate("Admin_1_Region", "ID_1", clust_file)
admin_2 = compute_cm_simple_aggregate("Admin_2_Region", "ID_2", clust_file)
admin_3 = compute_cm_simple_aggregate("Admin_3_Region", "ID_3", clust_file)

admin_1.to_csv(recode_path + "cm_aggregate_1.csv", index=True)
admin_2.to_csv(recode_path + "cm_aggregate_2.csv", index=True)
admin_3.to_csv(recode_path + "cm_aggregate_3.csv", index=True)

# ---------------------------AGGREGATION USING VORONOI CALCULATIONS--------------------------------------
admin_vor_1 = voronoi_aggregate("Admin_Region_1", "ID_1", clust_file, voronoi_admin_1, population_file)
admin_vor_2 = voronoi_aggregate("Admin_Region_2", "ID_2", clust_file, voronoi_admin_2, population_file)
admin_vor_3 = voronoi_aggregate("Admin_Region_3", "ID_3", clust_file, voronoi_admin_3, population_file)

# Rank agreement between the two aggregation methods.
# NOTE(review): kendalltau pairs values by position, so this assumes the
# Voronoi and point-in-polygon frames list the same regions in the same
# order -- confirm.
print("---------------------HOW TO AGGREGATE-----------------")
x, y = _report_tau("Admin 1 - (CM row count VOR, CM count PIP): ",
                   admin_vor_1['CM_Row_Count'], admin_1['CM_Row_Count'])
x, y = _report_tau("Admin 2 - (CM row count VOR, CM count PIP): ",
                   admin_vor_2['CM_Row_Count'], admin_2['CM_Row_Count'], "\n")

# Rank agreement between the counting schemes within the Voronoi results.
print("------------------HOW TO COUNT------------------------")
x, y = _report_tau("Admin 1 - (CM row count VOR, CM Sample Weight VOR): ",
                   admin_vor_1['CM_Row_Count'], admin_vor_1['CM_SW_Count'])
x, y = _report_tau("Admin 1 - (CM row count VOR, CM pop VOR): ",
                   admin_vor_1['CM_Row_Count'], admin_vor_1['CM_Pop_Count'], "\n")

x, y = _report_tau("Admin 2 - (CM row count VOR, CM Sample Weight VOR): ",
                   admin_vor_2['CM_Row_Count'], admin_vor_2['CM_SW_Count'])
x, y = _report_tau("Admin 2 - (CM row count VOR, CM pop VOR): ",
                   admin_vor_2['CM_Row_Count'], admin_vor_2['CM_Pop_Count'], "\n")

x, y = _report_tau("Admin 3 - (CM row count VOR, CM Sample Weight VOR): ",
                   admin_vor_3['CM_Row_Count'], admin_vor_3['CM_SW_Count'])
x, y = _report_tau("Admin 3 - (CM row count VOR, CM pop VOR): ",
                   admin_vor_3['CM_Row_Count'], admin_vor_3['CM_Pop_Count'])

# Persist the Voronoi aggregates next to the recode files.
admin_vor_1.to_csv(recode_path + "cm_vor_aggregate_1.csv", index=True)
admin_vor_2.to_csv(recode_path + "cm_vor_aggregate_2.csv", index=True)
admin_vor_3.to_csv(recode_path + "cm_vor_aggregate_3.csv", index=True)

---------------------HOW TO AGGREGATE-----------------
Admin 1 - (CM row count VOR, CM count PIP): 0.88499040675598584
Admin 2 - (CM row count VOR, CM count PIP): 0.82205048235490863

------------------HOW TO COUNT------------------------
Admin 1 - (CM row count VOR, CM Sample Weight VOR): 0.55555555555555547
Admin 1 - (CM row count VOR, CM pop VOR): 0.49707602339181284

Admin 2 - (CM row count VOR, CM Sample Weight VOR): 0.61469387755102045
Admin 2 - (CM row count VOR, CM pop VOR): 0.57714285714285707

Admin 3 - (CM row count VOR, CM Sample Weight VOR): 0.59059929494712105
Admin 3 - (CM row count VOR, CM pop VOR): 0.56462984723854281


In [10]:
#  -------------------------------SENEGAL COMPUTATIONS-----------------------------
# Input/output locations for this survey.
recode_path = "SEN_DHS/SNBR6DSV/"
path_to_voronoi = "SEN_DHS/Voronoi/Proportions/"
path_to_clusters = "SEN_DHS/Cluster_To_Region/"

# Population of all DHS clusters, keyed by V001 like the other tables.
population_file = pd.read_csv("SEN_DHS/Region_Populations/voronoi_pop.csv").rename(
    columns={'DHSCLUST': 'V001', '_count': 'Population'})

# Birth recode (only the columns the aggregation reads).
file = pd.read_csv(recode_path + "birth_recode.csv",
                   usecols=['V001', 'V002', 'V005', 'B6', 'B7'])

# Cluster -> admin region ids.
clust_admin = pd.read_csv(
    path_to_clusters + "admin_3.csv",
    usecols=['DHSCLUST', 'ID_1', 'ID_2', 'ID_3'],
).rename(columns={'DHSCLUST': 'V001'})

# Voronoi Polygon & Admin Region Mapping, one table per admin level.
voronoi_admin_1 = pd.read_csv(path_to_voronoi + "voronoi_admin_1.csv").rename(columns={'DHSCLUST': 'V001'})
voronoi_admin_2 = pd.read_csv(path_to_voronoi + "voronoi_admin_2.csv").rename(columns={'DHSCLUST': 'V001'})
voronoi_admin_3 = pd.read_csv(path_to_voronoi + "voronoi_admin_3.csv").rename(columns={'DHSCLUST': 'V001'})

# Attach the region ids to every birth record, then coerce the two
# age-at-death columns to integers (-1 where the value is not numeric).
clust_file = file.merge(clust_admin, how='inner', on='V001')
for age_col in ('B6', 'B7'):
    clust_file[age_col] = pd.to_numeric(clust_file[age_col], errors='coerce').fillna(-1).astype(int)

# AGGREGATION USING VORONOI CALCULATIONS
admin_vor_1 = voronoi_aggregate("Admin_Region_1", "ID_1", clust_file, voronoi_admin_1, population_file)
admin_vor_2 = voronoi_aggregate("Admin_Region_2", "ID_2", clust_file, voronoi_admin_2, population_file)
admin_vor_3 = voronoi_aggregate("Admin_Region_3", "ID_3", clust_file, voronoi_admin_3, population_file)

# Persist the Voronoi aggregates next to the recode files.
admin_vor_1.to_csv(recode_path + "cm_vor_aggregate_1.csv", index=True)
admin_vor_2.to_csv(recode_path + "cm_vor_aggregate_2.csv", index=True)
admin_vor_3.to_csv(recode_path + "cm_vor_aggregate_3.csv", index=True)