In [11]:
from __future__ import division
import scipy.stats as stats
import pandas as pd
import numpy as np

In [12]:
def compute_malaria_simple_aggregate(region_name, col_name, data, lab_cols=None, rapid_cols=None):
    """Aggregate positive malaria test results per admin region.

    A test cell counts as positive when, after numeric coercion (unparseable or
    empty cells become -1) and truncation to int, it equals 1.  For every
    distinct value of ``data[col_name]`` the function sums

    * the number of positive cells (``Lab_Row_Count`` / ``Rapid_Row_Count``) and
    * the positive cells weighted by ``HV005 / 1000000``
      (``Lab_SW_Count`` / ``Rapid_SW_Count``).

    ``Lab_Pop_Count`` and ``Rapid_Pop_Count`` are kept in the output for
    compatibility but are always 0 (population weighting is not computed).

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of ``data`` that holds the region identifier.
    data : pandas.DataFrame
        Must contain ``col_name``, ``HV005`` and every test column.  The frame
        is NOT modified (earlier versions re-indexed it in place and overwrote
        the test columns, which leaked state into later cells).
    lab_cols, rapid_cols : sequence of str, optional
        Lab / rapid test column names.  When omitted, the module-level
        ``lab_test`` / ``rapid_test`` lists are used, as before.

    Returns
    -------
    pandas.DataFrame
        Indexed by region id (ascending, index named ``region_name``) with the
        columns ``Lab_Row_Count, Lab_SW_Count, Lab_Pop_Count, Rapid_Row_Count,
        Rapid_SW_Count, Rapid_Pop_Count``.
    """
    lab_cols = list(lab_test if lab_cols is None else lab_cols)
    rapid_cols = list(rapid_test if rapid_cols is None else rapid_cols)

    def positives_per_row(cols):
        # Number of test cells equal to 1 in each row, as a plain array so that
        # nothing below depends on the (possibly non-unique) index of `data`.
        hits = np.zeros(len(data), dtype=int)
        for col in cols:
            # Set empty / unparseable cells to -1, exactly like the row loop did.
            values = pd.to_numeric(data[col], errors='coerce').fillna(-1).astype(int)
            hits = hits + (values == 1).values.astype(int)
        return hits

    sample_weight = (data['HV005'] / 1000000).values
    lab_hits = positives_per_row(lab_cols)
    rapid_hits = positives_per_row(rapid_cols)

    per_row = pd.DataFrame({
        'Lab_Row_Count': lab_hits,
        'Lab_SW_Count': lab_hits * sample_weight,
        'Rapid_Row_Count': rapid_hits,
        'Rapid_SW_Count': rapid_hits * sample_weight,
    })

    # groupby sorts the region ids ascending, matching the old sort_index().
    admin_data = per_row.groupby(data[col_name].values).sum().astype(float)

    # Population-based counts are not computed; keep the columns as zeros.
    admin_data['Lab_Pop_Count'] = 0
    admin_data['Rapid_Pop_Count'] = 0

    admin_data = admin_data[['Lab_Row_Count', 'Lab_SW_Count', 'Lab_Pop_Count',
                             'Rapid_Row_Count', 'Rapid_SW_Count', 'Rapid_Pop_Count']]
    admin_data.index.name = region_name
    return admin_data

In [13]:
def voronoi_aggregate(region_name, col_name, malaria_recode, voronoi_data, lab_cols=None, rapid_cols=None):
    """Aggregate positive malaria tests per admin region via Voronoi proportions.

    Step 1 sums, per cluster (``HV001``), the number of positive test cells
    and the positives weighted by ``HV005 / 1000000``.  A cell is positive when,
    after numeric coercion (empty / unparseable cells become -1) and truncation
    to int, it equals 1.

    Step 2 inner-joins those cluster totals to ``voronoi_data`` on ``HV001`` and
    adds ``Proportion * cluster_total`` to the region named in ``col_name``.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of ``voronoi_data`` holding the admin-region identifier.
    malaria_recode : pandas.DataFrame
        Needs ``HV001``, ``HV005`` and every test column.  Not modified.
    voronoi_data : pandas.DataFrame
        Needs ``HV001``, ``col_name`` and ``Proportion``.  Not modified.
        (Earlier versions called ``reset_index(inplace=True)`` on this frame,
        which added an ``index``/``level_0`` column on every call and made a
        second call with the same frame raise ``ValueError``.)
    lab_cols, rapid_cols : sequence of str, optional
        Lab / rapid test column names.  When omitted, the module-level
        ``lab_test`` / ``rapid_test`` lists are used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``voronoi_data[col_name]`` value (ascending, index
        named ``region_name``) with the columns ``Lab_Row_Count, Lab_SW_Count,
        Lab_Pop_Count, Rapid_Row_Count, Rapid_SW_Count, Rapid_Pop_Count``.
        Regions without any matching cluster are reported as 0; the two
        ``*_Pop_Count`` columns are always 0 (population weighting is disabled).
    """
    lab_cols = list(lab_test if lab_cols is None else lab_cols)
    rapid_cols = list(rapid_test if rapid_cols is None else rapid_cols)

    # Older callers may hand in frames whose key column was moved into the
    # index; restore it on a copy instead of mutating the caller's object.
    malaria_data = malaria_recode
    if 'HV001' not in malaria_data.columns:
        malaria_data = malaria_data.reset_index()
    vor_data = voronoi_data
    if any(col not in vor_data.columns for col in ('HV001', col_name, 'Proportion')):
        vor_data = vor_data.reset_index()

    def positives_per_row(cols):
        # Number of test cells equal to 1 in each household row.
        hits = np.zeros(len(malaria_data), dtype=int)
        for col in cols:
            # Set empty / unparseable cells to -1.
            values = pd.to_numeric(malaria_data[col], errors='coerce').fillna(-1).astype(int)
            hits = hits + (values == 1).values.astype(int)
        return hits

    sample_weight = (malaria_data['HV005'] / 1000000).values
    lab_hits = positives_per_row(lab_cols)
    rapid_hits = positives_per_row(rapid_cols)

    # Step 1: totals per cluster.
    per_household = pd.DataFrame({
        'HV001': malaria_data['HV001'].values,
        'Lab_SW_Aggregate': lab_hits * sample_weight,
        'Lab_Count_Aggregate': lab_hits,
        'Rapid_SW_Aggregate': rapid_hits * sample_weight,
        'Rapid_Count_Aggregate': rapid_hits,
    })
    cluster_data = per_household.groupby('HV001', as_index=False).sum()

    # Step 2: share each cluster total out to the regions its polygon overlaps.
    merged = pd.merge(vor_data[['HV001', col_name, 'Proportion']], cluster_data,
                      how='inner', on='HV001')
    shares = pd.DataFrame({
        'Lab_Row_Count': merged['Proportion'] * merged['Lab_Count_Aggregate'],
        'Lab_SW_Count': merged['Proportion'] * merged['Lab_SW_Aggregate'],
        'Rapid_Row_Count': merged['Proportion'] * merged['Rapid_Count_Aggregate'],
        'Rapid_SW_Count': merged['Proportion'] * merged['Rapid_SW_Aggregate'],
    })
    region_totals = shares.groupby(merged[col_name].values).sum()

    # Every region listed in the Voronoi table gets a row, even when none of its
    # clusters appear in the malaria data.
    all_regions = pd.Index(vor_data[col_name].unique()).sort_values()
    admin_region_data = region_totals.reindex(all_regions, fill_value=0).astype(float)

    # Population-based counts are not computed; keep the columns as zeros.
    admin_region_data['Lab_Pop_Count'] = 0
    admin_region_data['Rapid_Pop_Count'] = 0

    admin_region_data = admin_region_data[['Lab_Row_Count', 'Lab_SW_Count', 'Lab_Pop_Count',
                                           'Rapid_Row_Count', 'Rapid_SW_Count', 'Rapid_Pop_Count']]
    admin_region_data.index.name = region_name
    return admin_region_data
        

In [14]:
# --------------------------------------COTE D'IVOIRE COMPUTATIONS--------------------------
# Input / output locations (relative to the notebook's working directory).
rel_path = "Cote D'Ivoire Data SPSS/cihr62sv/"
vor_path = "Cote D'Ivoire Data SPSS/Voronoi clusters/Proportions/"
pop_path = "Cote D'Ivoire Data SPSS/cigc61fl/"
clu_path = "Cote D'Ivoire Data SPSS/CLUSTER_TO_REGION/"

# Household recode plus the per-cluster population count, keyed by cluster id (HV001).
file = pd.read_csv(rel_path + "household_recode.csv", low_memory=False)
population = pd.read_csv(pop_path + "CIGC61FL.csv", usecols=['DHSCLUST', 'All_Population_Count_2010'])
population.columns = ['HV001', 'Population']

# Cluster -> admin region ids (point in polygon).
clust_admin = pd.read_csv(clu_path + "admin_3.csv", usecols=['DHSCLUST', 'ID_1', 'ID_2', 'ID_3'])
clust_admin = clust_admin.rename(columns = {'DHSCLUST':'HV001'})

# Voronoi Polygon & Admin Region Mapping.
voronoi_admin_1 = pd.read_csv(vor_path + "voronoi_admin_1.csv")
voronoi_admin_2 = pd.read_csv(vor_path + "voronoi_admin_2.csv")
voronoi_admin_3 = pd.read_csv(vor_path + "voronoi_admin_3.csv")

voronoi_admin_1 = voronoi_admin_1.rename(columns = {'DHSCLUST':'HV001'})
voronoi_admin_2 = voronoi_admin_2.rename(columns = {'DHSCLUST':'HV001'})
voronoi_admin_3 = voronoi_admin_3.rename(columns = {'DHSCLUST':'HV001'})

clust_file = pd.merge(file, clust_admin, how='inner', on='HV001')
clust_file = pd.merge(clust_file, population, how='inner', on='HV001')

# Columns to keep: identifiers, every lab (HML32$nn) and rapid (HML35$nn) test
# column, and the three admin region ids.
ids = ['HV001', 'HV002', 'HV005', 'Population']
lab_test = [col for col in file.columns if "HML32$" in col]
rapid_test = [col for col in file.columns if "HML35$" in col]
admin_regions = ['ID_1', 'ID_2', 'ID_3']

keep_columns = ids + lab_test + rapid_test + admin_regions
# .copy() so that later writes go to an independent frame, not a slice of the merge result.
clust_file = clust_file[keep_columns].copy()

clust_file.to_csv(rel_path + "complete_malaria_data.csv", index=False)

# ---------------------------SIMPLE AGGREGATES USING POINT IN POLYGON-------------------------------------
# Each level gets its own index label (levels 2 and 3 used to be labelled
# "Admin_1_Region" as well, which mislabelled the exported CSV headers).
admin_1 = compute_malaria_simple_aggregate("Admin_1_Region", "ID_1", clust_file)
admin_2 = compute_malaria_simple_aggregate("Admin_2_Region", "ID_2", clust_file)
admin_3 = compute_malaria_simple_aggregate("Admin_3_Region", "ID_3", clust_file)

print(admin_1.head())
admin_1.to_csv(rel_path + "malaria_aggregate_1.csv", index=True)
admin_2.to_csv(rel_path + "malaria_aggregate_2.csv", index=True)
admin_3.to_csv(rel_path + "malaria_aggregate_3.csv", index=True)

# ---------------------------AGGREGATES USING VORONOI CALCULATIONS-------------------------------------
admin_vor_1 = voronoi_aggregate("Admin_Region_1", "ID_1", clust_file, voronoi_admin_1)
admin_vor_2 = voronoi_aggregate("Admin_Region_2", "ID_2", clust_file, voronoi_admin_2)
admin_vor_3 = voronoi_aggregate("Admin_Region_3", "ID_3", clust_file, voronoi_admin_3)

print(admin_vor_1.head())
print()

# Voronoi (VOR) vs point-in-polygon (PIP): the two tables are paired on their
# shared region ids (inner align) so each region is compared with itself even
# if one table has regions the other lacks.
print("---------------------HOW TO AGGREGATE-----------------")
print("-----LAB TEST-------")
x, y = stats.kendalltau(*admin_vor_1['Lab_Row_Count'].align(admin_1['Lab_Row_Count'], join='inner'))
print("Admin 1 - (Lab Test row count VOR, Lab Test count PIP): " +  repr(x))

x, y = stats.kendalltau(*admin_vor_2['Lab_Row_Count'].align(admin_2['Lab_Row_Count'], join='inner'))
print("Admin 2 - (Lab Test row count VOR, Lab Test count PIP): " +  repr(x) + "\n")

print("----RAPID TEST-----")
x, y = stats.kendalltau(*admin_vor_1['Rapid_Row_Count'].align(admin_1['Rapid_Row_Count'], join='inner'))
print("Admin 1 - (Rapid Test row count VOR, Rapid Test count PIP): " +  repr(x))

x, y = stats.kendalltau(*admin_vor_2['Rapid_Row_Count'].align(admin_2['Rapid_Row_Count'], join='inner'))
print("Admin 2 - (Rapid Test row count VOR, Rapid Test count PIP): " +  repr(x) + "\n")

# Row count vs sample-weighted count within the Voronoi aggregates.
# (The population-weighted comparison is disabled: the *_Pop_Count columns are not populated.)
print("------------------HOW TO COUNT-------------------")
print("------LAB TEST-------")
x, y = stats.kendalltau(admin_vor_1['Lab_Row_Count'], admin_vor_1['Lab_SW_Count'])
print("Admin 1 - (Lab Test row count VOR, Lab Test Sample Weight VOR): " + repr(x))

x, y = stats.kendalltau(admin_vor_2['Lab_Row_Count'], admin_vor_2['Lab_SW_Count'])
print("Admin 2 - (Lab Test row count VOR, Lab Test Sample Weight VOR): " + repr(x))

# admin_vor_3 and admin_3 are not the same size, so no Kendall correlation is reported for level 3.

admin_vor_1.to_csv(rel_path + "malaria_vor_aggregate_1.csv", index=True)
admin_vor_2.to_csv(rel_path + "malaria_vor_aggregate_2.csv", index=True)
admin_vor_3.to_csv(rel_path + "malaria_vor_aggregate_3.csv", index=True)

                Lab_Row_Count  Lab_SW_Count  Lab_Pop_Count  Rapid_Row_Count  \
Admin_1_Region                                                                
1                         6.0     11.729821              0             24.0   
2                        24.0      8.401179              0             65.0   
3                        63.0     63.319608              0            188.0   
4                         9.0      4.332841              0             43.0   
5                        63.0     86.279201              0            163.0   

                Rapid_SW_Count  Rapid_Pop_Count  
Admin_1_Region                                   
1                    37.830807                0  
2                    20.417205                0  
3                   174.498329                0  
4                    16.976166                0  
5                   199.087297                0  
                Lab_Row_Count  Lab_SW_Count  Lab_Pop_Count  Rapid_Row_Count  \
Admin_Region_1   

In [15]:
#  ------------------------------------SENEGAL COMPUTATIONS--------------------------------------------------
# Input / output locations (relative to the notebook's working directory).
rel_path = "Senegal Data SPSS/snhr7qsv/"
vor_path = "Senegal Data SPSS/Voronoi/Proportions/"
clu_path = "Senegal Data SPSS/Cluster_To_Region/"

file = pd.read_csv(rel_path + "household_recode.csv", low_memory=False)

# Cluster -> admin region ids, keyed by cluster id (HV001).
clust_admin = pd.read_csv(clu_path + "admin_3.csv", usecols=['DHSCLUST', 'ID_1', 'ID_2', 'ID_3'])
clust_admin = clust_admin.rename(columns = {'DHSCLUST':'HV001'})

# Voronoi Polygon & Admin Region Mapping.
voronoi_admin_1 = pd.read_csv(vor_path + "voronoi_admin_1.csv")
voronoi_admin_2 = pd.read_csv(vor_path + "voronoi_admin_2.csv")
voronoi_admin_3 = pd.read_csv(vor_path + "voronoi_admin_3.csv")

voronoi_admin_1 = voronoi_admin_1.rename(columns = {'DHSCLUST':'HV001'})
voronoi_admin_2 = voronoi_admin_2.rename(columns = {'DHSCLUST':'HV001'})
voronoi_admin_3 = voronoi_admin_3.rename(columns = {'DHSCLUST':'HV001'})

clust_file = pd.merge(file, clust_admin, how='inner', on='HV001')

# Columns to keep: identifiers, every lab (HML32$nn) and rapid (HML35$nn) test
# column, and the three admin region ids.
ids = ['HV001', 'HV002', 'HV005']
lab_test = [col for col in file.columns if "HML32$" in col]
rapid_test = [col for col in file.columns if "HML35$" in col]
admin_regions = ['ID_1', 'ID_2', 'ID_3']

keep_columns = ids + lab_test + rapid_test + admin_regions
# .copy() because the test columns are overwritten just below; assigning into a
# plain column-subset slice is the chained-assignment pattern pandas warns about.
clust_file = clust_file[keep_columns].copy()

#  Set empty cells to -1.   
for col in (lab_test + rapid_test):
    clust_file[col] = pd.to_numeric(clust_file[col], errors='coerce').fillna(-1).astype(int)
    
print(clust_file.columns)
clust_file.to_csv(rel_path + "complete_malaria_data.csv", index=False)

# Voronoi-weighted aggregates for the three admin levels.
admin_vor_1 = voronoi_aggregate("Admin_Region_1", "ID_1", clust_file, voronoi_admin_1)
admin_vor_2 = voronoi_aggregate("Admin_Region_2", "ID_2", clust_file, voronoi_admin_2)
admin_vor_3 = voronoi_aggregate("Admin_Region_3", "ID_3", clust_file, voronoi_admin_3)

print(admin_vor_1.head())
print(admin_vor_2.head())
print(admin_vor_3.head())

admin_vor_1.to_csv(rel_path + "malaria_vor_aggregate_1.csv", index=True)
admin_vor_2.to_csv(rel_path + "malaria_vor_aggregate_2.csv", index=True)
admin_vor_3.to_csv(rel_path + "malaria_vor_aggregate_3.csv", index=True)

Index(['HV001', 'HV002', 'HV005', 'HML32$01', 'HML32$02', 'HML32$03',
       'HML32$04', 'HML32$05', 'HML32$06', 'HML32$07',
       ...
       'HML35$60', 'HML35$61', 'HML35$62', 'HML35$63', 'HML35$64', 'HML35$65',
       'HML35$66', 'ID_1', 'ID_2', 'ID_3'],
      dtype='object', length=138)
                Lab_Row_Count  Lab_SW_Count  Lab_Pop_Count  Rapid_Row_Count  \
Admin_Region_1                                                                
2631                 0.000000      0.000000              0         0.000000   
2632                 0.000000      0.000000              0         2.124573   
2633                 0.921526      0.806921              0         3.556937   
2634                 7.762934      4.730096              0         6.443435   
2635                 4.067431      1.867810              0        26.000797   

                Rapid_SW_Count  Rapid_Pop_Count  
Admin_Region_1                                   
2631                  0.000000                0  
263