In [3]:
from __future__ import division
import scipy.stats as stats
import pandas as pd
import os

In [4]:
def compute_cm_simple_aggregate(region_name, col_name, data):
    """Aggregate early child deaths per admin region (point-in-polygon).

    A row of ``data`` is counted when ``100 <= B6 <= 203`` and
    ``0 <= B7 <= 3``.  B6/B7 are presumably the DHS age-at-death code and
    the age at death in months -- TODO confirm against the DHS recode manual.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of ``data`` holding the region id of every row.
    data : pandas.DataFrame
        Must contain the columns ``col_name``, ``B6``, ``B7`` and ``V005``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of ``data[col_name]``, sorted ascending,
        index named ``region_name``, with the columns

        * ``CM_Row_Count`` -- number of qualifying rows (float),
        * ``CM_SW_Count``  -- sum of ``V005 / 1000000`` over qualifying rows,
        * ``CM_Pop_Count`` -- always 0 (population normalisation is disabled).

    Notes
    -----
    ``data['B6']`` and ``data['B7']`` are replaced in place by integer columns
    (values that cannot be parsed become -1).  This side effect is kept on
    purpose: the same frame is handed to ``voronoi_aggregate`` afterwards,
    which compares these columns numerically.

    Nothing else on ``data`` is modified.  The index and the column set are
    left alone, so the function can be called repeatedly on the same frame,
    including with the same ``col_name``.
    """
    # Normalise the age-at-death columns on the caller's frame (see Notes).
    for code_column in ('B6', 'B7'):
        data[code_column] = (
            pd.to_numeric(data[code_column], errors='coerce').fillna(-1).astype(int)
        )

    # Target children that died within 3 months of birth.
    # Plain numpy masks are used so the result does not depend on data's index.
    qualifies = (
        data['B6'].between(100, 203).values & data['B7'].between(0, 3).values
    )

    # One record per qualifying death: its region and its sample weight.
    deaths = pd.DataFrame({
        'region': data[col_name].values[qualifies],
        'weight': (data['V005'] / 1000000).values[qualifies],
    })
    weight_by_region = deaths.groupby('region')['weight']

    # Every region present in the input gets a row, even with zero deaths.
    regions = pd.Index(data[col_name].unique(), name=region_name).sort_values()

    admin_data = pd.DataFrame(index=regions)
    #  Sum up the count in the rows.
    admin_data['CM_Row_Count'] = (
        weight_by_region.size().reindex(regions, fill_value=0).values.astype(float)
    )
    #  Use sample weights to compute aggregates.
    admin_data['CM_SW_Count'] = (
        weight_by_region.sum().reindex(regions, fill_value=0).values.astype(float)
    )
    # Normalising by cluster population is currently disabled.
    admin_data['CM_Pop_Count'] = 0

    return admin_data

In [5]:
def _frame_with_columns(frame, required):
    """Return ``frame`` when every name in ``required`` is a regular column,
    otherwise a new frame with the index moved back into the columns."""
    if all(name in frame.columns for name in required):
        return frame
    return frame.reset_index()


def voronoi_aggregate(region_name, col_name, cm_recode, voronoi_data):
    """Aggregate early child deaths per admin region via Voronoi proportions.

    Deaths are first summed per cluster (``V001``).  Each cluster total is
    then multiplied by the ``Proportion`` of every (cluster, region) row of
    ``voronoi_data`` and the products are summed per region.

    A row of ``cm_recode`` is counted when ``100 <= B6 <= 203`` and
    ``0 <= B7 <= 3``.  B6/B7 are presumably the DHS age-at-death code and
    the age at death in months -- TODO confirm against the DHS recode manual.
    Both columns are coerced to integers on a private copy (unparseable
    values become -1), as ``compute_cm_simple_aggregate`` does.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of ``voronoi_data`` holding the region id.
    cm_recode : pandas.DataFrame
        Needs ``V001``, ``B6``, ``B7`` and ``V005``.
    voronoi_data : pandas.DataFrame
        Needs ``V001``, ``col_name`` and ``Proportion``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of ``voronoi_data[col_name]``, sorted
        ascending, index named ``region_name``, with the columns
        ``CM_SW_Count`` (proportion-weighted sum of ``V005 / 1000000``),
        ``CM_Pop_Count`` (always 0; population normalisation is disabled)
        and ``CM_Row_Count`` (proportion-weighted death count).

    Notes
    -----
    Neither input frame is modified.  The previous version reset their
    indexes in place, which added ``index`` / ``level_0`` columns to the
    caller's ``voronoi_data`` and made a second call with the same frame
    fail with ``ValueError: cannot insert level_0, already exists``.
    """
    births = _frame_with_columns(cm_recode, ('V001', 'B6', 'B7', 'V005'))
    polygons = _frame_with_columns(voronoi_data, ('V001', col_name, 'Proportion'))

    # Numeric copies of the age-at-death columns; the caller's frame is untouched.
    age_code = pd.to_numeric(births['B6'], errors='coerce').fillna(-1).astype(int)
    age_months = pd.to_numeric(births['B7'], errors='coerce').fillna(-1).astype(int)
    qualifies = age_code.between(100, 203).values & age_months.between(0, 3).values

    # Computes the aggregate for each cluster. (Unique Cluster, CM Aggregate)
    deaths = pd.DataFrame({
        'V001': births['V001'].values[qualifies],
        'SW_Aggregate': (births['V005'] / 1000000).values[qualifies],
    })
    per_cluster = deaths.groupby('V001')['SW_Aggregate']
    clusters = pd.Index(births['V001'].unique(), name='V001')
    cluster_data = pd.DataFrame({
        'V001': clusters.values,
        'SW_Aggregate': per_cluster.sum().reindex(clusters, fill_value=0).values,
        'Row_Aggregate': per_cluster.size().reindex(clusters, fill_value=0).values,
    })

    # Merge the Voronoi / admin region mapping with the per-cluster aggregates.
    merged = pd.merge(
        polygons[['V001', col_name, 'Proportion']],
        cluster_data,
        how='inner',
        on='V001',
    )

    # Share of each cluster aggregate that falls into each region, summed per region.
    shares = merged[['SW_Aggregate', 'Row_Aggregate']].multiply(
        merged['Proportion'], axis=0
    )
    per_region = shares.groupby(merged[col_name].values).sum()

    # Every region of the mapping gets a row, even if no cluster matched it.
    regions = pd.Index(polygons[col_name].unique(), name=region_name).sort_values()
    per_region = per_region.reindex(regions, fill_value=0)

    admin_region_data = pd.DataFrame(index=regions)
    admin_region_data['CM_SW_Count'] = per_region['SW_Aggregate'].values.astype(float)
    # Normalising by cluster population is currently disabled.
    admin_region_data['CM_Pop_Count'] = 0
    admin_region_data['CM_Row_Count'] = per_region['Row_Aggregate'].values.astype(float)

    return admin_region_data

In [6]:
# ---------------------------COTE D'IVOIRE COMPUTATIONS--------------------------------------------
# Inputs: births recode, cluster population, cluster -> admin region lookup,
# and one Voronoi polygon / admin region proportion table per admin level.
rel_path = "Cote D'Ivoire Data SPSS/cibr62sv/"
vor_path = "Cote D'Ivoire Data SPSS/Voronoi clusters/Proportions/"
pop_path = "Cote D'Ivoire Data SPSS/cigc61fl/"
clu_path = "Cote D'Ivoire Data SPSS/CLUSTER_TO_REGION/"

file = pd.read_csv(rel_path + "births_recode.csv", usecols=['V001', 'V002', 'V005','B6', 'B7'])

population = pd.read_csv(pop_path + "CIGC61FL.csv", usecols=['DHSCLUST', 'All_Population_Count_2010'])
# Positional relabelling: relies on the CSV listing DHSCLUST before the population column.
population.columns = ['V001', 'Population']

clust_admin = (
    pd.read_csv(clu_path + "admin_3.csv", usecols=['DHSCLUST', 'ID_1', 'ID_2', 'ID_3'])
    .rename(columns={'DHSCLUST': 'V001'})
)

# Voronoi Polygon & Admin Region Mapping, keyed on V001 like the other tables.
voronoi_admin_1, voronoi_admin_2, voronoi_admin_3 = [
    pd.read_csv(vor_path + csv_name).rename(columns={'DHSCLUST': 'V001'})
    for csv_name in ("voronoi_admin_1.csv", "voronoi_admin_2.csv", "voronoi_admin_3.csv")
]

# Join births -> admin regions -> population.  Both joins are inner joins, so
# births in clusters missing from either lookup are dropped.
clust_file = (
    file
    .merge(clust_admin, how='inner', on='V001')
    .merge(population, how='inner', on='V001')
)

clust_file.to_csv(rel_path + "complete_child_mortality_data.csv", index=False)

# ------------------------------SIMPLE AGGREGATES USING POINT IN POLYGON-------------------------------
# These calls must run before the Voronoi aggregation below: they operate on
# the shared clust_file frame.
admin_1, admin_2, admin_3 = [
    compute_cm_simple_aggregate(region_label, id_column, clust_file)
    for region_label, id_column in (
        ("Admin_1_Region", "ID_1"),
        ("Admin_2_Region", "ID_2"),
        ("Admin_3_Region", "ID_3"),
    )
]

print(admin_1.head())

for aggregate, csv_name in (
    (admin_1, "cm_aggregate_1.csv"),
    (admin_2, "cm_aggregate_2.csv"),
    (admin_3, "cm_aggregate_3.csv"),
):
    aggregate.to_csv(rel_path + csv_name, index=True)

# ---------------------------AGGREGATION USING VORONOI CALCULATIONS--------------------------------------
admin_vor_1, admin_vor_2, admin_vor_3 = [
    voronoi_aggregate(region_label, id_column, clust_file, mapping)
    for region_label, id_column, mapping in (
        ("Admin_Region_1", "ID_1", voronoi_admin_1),
        ("Admin_Region_2", "ID_2", voronoi_admin_2),
        ("Admin_Region_3", "ID_3", voronoi_admin_3),
    )
]

print(admin_vor_1.head())
print()

# Kendall's tau pairs the two series by position, so both frames have to list
# the same regions in the same order.
print("---------------------HOW TO AGGREGATE-----------------")
vor_rows_1 = admin_vor_1['CM_Row_Count']
vor_rows_2 = admin_vor_2['CM_Row_Count']

x, y = stats.kendalltau(vor_rows_1, admin_1['CM_Row_Count'])
print("Admin 1 - (CM row count VOR, CM count PIP): " +  repr(x))

x, y = stats.kendalltau(vor_rows_2, admin_2['CM_Row_Count'])
print("Admin 2 - (CM row count VOR, CM count PIP): " +  repr(x) + "\n")

print("------------------HOW TO COUNT------------------------")
x, y = stats.kendalltau(vor_rows_1, admin_vor_1['CM_SW_Count'])
print("Admin 1 - (CM row count VOR, CM Sample Weight VOR): " + repr(x))

x, y = stats.kendalltau(vor_rows_2, admin_vor_2['CM_SW_Count'])
print("Admin 2 - (CM row count VOR, CM Sample Weight VOR): " + repr(x))

# The population-normalised comparisons (CM_Pop_Count) are disabled.
# admin_vor_3 and admin_3 are not the same size, so Kendall's tau cannot be
# computed for admin level 3.

for aggregate, csv_name in (
    (admin_vor_1, "cm_vor_aggregate_1.csv"),
    (admin_vor_2, "cm_vor_aggregate_2.csv"),
    (admin_vor_3, "cm_vor_aggregate_3.csv"),
):
    aggregate.to_csv(rel_path + csv_name, index=True)

                CM_Row_Count  CM_SW_Count  CM_Pop_Count
Admin_1_Region                                         
1                       13.0    23.106243             0
2                       70.0    20.837361             0
3                       74.0    71.486759             0
4                      100.0    45.954260             0
5                      127.0   183.134456             0
                CM_SW_Count  CM_Pop_Count  CM_Row_Count
Admin_Region_1                                         
1                 33.582070             0     28.555392
2                 16.977715             0     59.916540
3                 73.369762             0     71.495573
4                 53.686163             0    114.187819
5                169.631390             0    123.493092

---------------------HOW TO AGGREGATE-----------------
Admin 1 - (CM row count VOR, CM count PIP): 0.88499040675598584
Admin 2 - (CM row count VOR, CM count PIP): 0.82205048235490863

------------------HOW TO COUNT-

In [7]:
#  -------------------------------SENEGAL COMPUTATIONS-----------------------------
# Has no population data, so only the Voronoi aggregation is run here.
rel_path = "Senegal Data SPSS/snbr7qsv/"
vor_path = "Senegal Data SPSS/Voronoi/Proportions/"
clu_path = "Senegal Data SPSS/Cluster_To_Region/"

file = pd.read_csv(rel_path + "birth_recode.csv", usecols=['V001', 'V002', 'V005', 'B6', 'B7'])

clust_admin = (
    pd.read_csv(clu_path + "admin_3.csv", usecols=['DHSCLUST', 'ID_1', 'ID_2', 'ID_3'])
    .rename(columns={'DHSCLUST': 'V001'})
)

# Voronoi Polygon & Admin Region Mapping, keyed on V001 like the other tables.
voronoi_admin_1, voronoi_admin_2, voronoi_admin_3 = [
    pd.read_csv(vor_path + csv_name).rename(columns={'DHSCLUST': 'V001'})
    for csv_name in ("voronoi_admin_1.csv", "voronoi_admin_2.csv", "voronoi_admin_3.csv")
]

# Join births to their admin regions, then make the age-at-death columns
# numeric (anything unparseable or missing becomes -1).
clust_file = file.merge(clust_admin, how='inner', on='V001')
for age_column in ('B6', 'B7'):
    clust_file[age_column] = (
        pd.to_numeric(clust_file[age_column], errors='coerce').fillna(-1).astype(int)
    )

print(clust_file.head())
clust_file.to_csv(rel_path + "complete_child_mortality_data.csv", index=False)

# AGGREGATION USING VORONOI CALCULATIONS
admin_vor_1, admin_vor_2, admin_vor_3 = [
    voronoi_aggregate(region_label, id_column, clust_file, mapping)
    for region_label, id_column, mapping in (
        ("Admin_Region_1", "ID_1", voronoi_admin_1),
        ("Admin_Region_2", "ID_2", voronoi_admin_2),
        ("Admin_Region_3", "ID_3", voronoi_admin_3),
    )
]

for aggregate in (admin_vor_1, admin_vor_2, admin_vor_3):
    print(aggregate.head())

for aggregate, csv_name in (
    (admin_vor_1, "cm_vor_aggregate_1.csv"),
    (admin_vor_2, "cm_vor_aggregate_2.csv"),
    (admin_vor_3, "cm_vor_aggregate_3.csv"),
):
    aggregate.to_csv(rel_path + csv_name, index=True)

   V001  V002     V005  B6  B7  ID_1   ID_2   ID_3
0     1     1  3466369  -1  -1  2631  31077  92463
1     1     2  3466369  -1  -1  2631  31077  92463
2     1     3  3466369  -1  -1  2631  31077  92463
3     1     3  3466369  -1  -1  2631  31077  92463
4     1     3  3466369  -1  -1  2631  31077  92463
                CM_SW_Count  CM_Pop_Count  CM_Row_Count
Admin_Region_1                                         
2631             108.402199             0     44.175577
2632             106.528206             0     85.486884
2633              67.337644             0     85.474542
2634              82.278833             0    125.995467
2635             103.715581             0    201.986663
                CM_SW_Count  CM_Pop_Count  CM_Row_Count
Admin_Region_2                                         
31077             17.518055             0      4.625000
31078             66.409947             0     17.145115
31079             24.026587             0     22.302014
31080              9.2