In [19]:
from __future__ import division
import scipy.stats as stats
import pandas as pd
import numpy as np

# ---- Input tables -----------------------------------------------------------
# Household recode plus the per-cluster population counts. The two population
# columns are relabelled so the cluster id shares the recode's HV001 key.
file = pd.read_csv("household_recode.csv", low_memory=False)
population = pd.read_csv(
    "population_info.csv",
    usecols=['DHSCLUST', 'All_Population_Count_2010'],
)
population.columns = ['HV001', 'Population']

# Cluster -> admin region lookups for levels 1-3, with the cluster id column
# renamed to HV001 ready for joining.
clust_admin_1 = pd.read_csv("cluster_to_admin_1.csv").rename(columns={'DHSCLUST': 'HV001'})
clust_admin_2 = pd.read_csv("cluster_to_admin_2.csv").rename(columns={'DHSCLUST': 'HV001'})
clust_admin_3 = pd.read_csv("cluster_to_admin_3.csv").rename(columns={'DHSCLUST': 'HV001'})

# Voronoi polygon -> admin region mappings, renamed the same way.
voronoi_admin_1 = pd.read_csv("voronoi_admin_1.csv").rename(columns={'DHSCLUST': 'HV001'})
voronoi_admin_2 = pd.read_csv("voronoi_admin_2.csv").rename(columns={'DHSCLUST': 'HV001'})
voronoi_admin_3 = pd.read_csv("voronoi_admin_3.csv").rename(columns={'DHSCLUST': 'HV001'})

# ---- Join on the cluster id -------------------------------------------------
# Inner joins throughout: a household row survives only if its cluster is
# present in all three admin lookups and in the population table.
c1 = file.merge(clust_admin_1, on='HV001', how='inner')
c2 = c1.merge(clust_admin_2, on='HV001', how='inner')
c3 = (
    c2.merge(clust_admin_3, on='HV001', how='inner')
      .merge(population, on='HV001', how='inner')
)

# ---- Keep only the columns used downstream ----------------------------------
ids = ['HV001', 'HV002', 'HV005', 'Population']
lab_test = [name for name in file.columns if "HML32$" in name]
rapid_test = [name for name in file.columns if "HML35$" in name]
admin_regions = ['ID_1', 'ID_2', 'ID_3']

keep_columns = ids + lab_test + rapid_test + admin_regions
c3 = c3[keep_columns]

c3.to_csv("complete_malaria_data.csv", index=False)

In [20]:
def compute_malaria_simple_aggregate(region_name, col_name, data, cluster_file):
    """Total the positive malaria test results of every admin region.

    Every positive result (a value of 1 in one of the module-level `lab_test`
    or `rapid_test` columns) adds to the region found in the row's `col_name`
    column:

    * 1 to ``<Test>_Row_Count`` (so this counts positive cells, not rows),
    * ``HV005 / 1000000`` to ``<Test>_SW_Count``,
    * ``1 / Population`` to ``<Test>_Pop_Count``.

    Test values that are missing or not numeric are treated as -1, i.e. as
    not positive.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column holding the region id in both `data` and `cluster_file`.
    data : pandas.DataFrame
        Needs 'HV005', 'Population', `col_name` and the `lab_test` /
        `rapid_test` columns. `col_name` may also be an index level.
        The frame is NOT modified.
    cluster_file : pandas.DataFrame
        Its `col_name` column supplies the set of regions to report.

    Returns
    -------
    pandas.DataFrame
        Indexed by region id (ascending, index named `region_name`) with the
        float columns Lab_Row_Count, Lab_SW_Count, Lab_Pop_Count,
        Rapid_Row_Count, Rapid_SW_Count and Rapid_Pop_Count. Regions without
        any positive result hold 0.

    Raises
    ------
    KeyError
        If a row with a positive result belongs to a region that does not
        appear in `cluster_file`.
    """
    # Never touch the caller's frame: the previous in-place reset_index /
    # set_index calls re-indexed `data` as a side effect. Index levels are
    # only turned into columns (on a copy) when the region id is not already
    # a column.
    malaria_data = data if col_name in data.columns else data.reset_index()

    regions = pd.Index(cluster_file[col_name].unique(), name=region_name).sort_values()
    # Position of each row's region inside `regions`; -1 marks unknown regions.
    region_codes = regions.get_indexer(malaria_data[col_name])

    sample_weight = np.asarray(malaria_data['HV005'], dtype=float) / 1000000
    population = np.asarray(malaria_data['Population'], dtype=float)

    admin_data = pd.DataFrame(
        0.0,
        index=regions,
        columns=['Lab_Row_Count', 'Lab_SW_Count', 'Lab_Pop_Count',
                 'Rapid_Row_Count', 'Rapid_SW_Count', 'Rapid_Pop_Count'],
    )

    for prefix, test_columns in (('Lab', lab_test), ('Rapid', rapid_test)):
        # Number of positive results per row for this kind of test.
        hits = np.zeros(len(malaria_data))
        for col in test_columns:
            # Empty / non-numeric cells become -1 so they never equal 1.
            cleaned = pd.to_numeric(malaria_data[col], errors='coerce').fillna(-1).astype(int)
            hits += (cleaned == 1).values

        # Only rows with a positive result contribute; restricting to them
        # also keeps 0/0 out of the population term.
        positive = hits > 0
        codes = region_codes[positive]
        if (codes < 0).any():
            unknown = np.asarray(malaria_data[col_name])[positive][codes < 0]
            raise KeyError(unknown[0])

        count = hits[positive]
        contributions = (
            ('_Row_Count', count),
            ('_SW_Count', count * sample_weight[positive]),
            ('_Pop_Count', count / population[positive]),
        )
        for suffix, values in contributions:
            # bincount sums the weights per region code and, like the
            # original running sum, lets NaN / inf propagate.
            admin_data[prefix + suffix] = np.bincount(
                codes, weights=values, minlength=len(regions))

    return admin_data

In [21]:
# Cluster-based aggregates for admin levels 1-3. Each frame's index label now
# names its own admin level (levels 2 and 3 were previously both labelled
# "Admin_1_Region"), so the exported CSV headers identify the level.
admin_1 = compute_malaria_simple_aggregate("Admin_1_Region", "ID_1", c3, clust_admin_1)
admin_2 = compute_malaria_simple_aggregate("Admin_2_Region", "ID_2", c3, clust_admin_2)
admin_3 = compute_malaria_simple_aggregate("Admin_3_Region", "ID_3", c3, clust_admin_3)

print(admin_1.tail())
admin_1.to_csv("malaria_aggregate_1.csv", index=True)
admin_2.to_csv("malaria_aggregate_2.csv", index=True)
admin_3.to_csv("malaria_aggregate_3.csv", index=True)

                Lab_Row_Count  Lab_SW_Count  Lab_Pop_Count  Rapid_Row_Count  \
Admin_1_Region                                                                
15                       19.0     43.222158       0.001133             32.0   
16                        2.0      3.531939       0.000067              7.0   
17                       70.0     42.131361       0.006898            176.0   
18                       41.0     13.505650       0.007569            177.0   
19                       55.0     39.709757       0.007787            194.0   

                Rapid_SW_Count  Rapid_Pop_Count  
Admin_1_Region                                   
15                   71.324227         0.001907  
16                   13.010884         0.000418  
17                  131.057430         0.020075  
18                   67.905609         0.034655  
19                  112.479409         0.026420  


In [22]:
def voronoi_aggregate(region_name, col_name, malaria_recode, voronoi_data):
    """Aggregate positive malaria results to admin regions via Voronoi shares.

    Works in two steps:

    1. Per cluster (HV001), every positive result (a value of 1 in one of the
       module-level `lab_test` / `rapid_test` columns) adds 1 to the cluster's
       count, ``HV005 / 1000000`` to its sample-weight total and
       ``1 / Population`` to its population total. Missing or non-numeric
       test values are treated as -1, i.e. as not positive.
    2. Every row of `voronoi_data` whose HV001 occurs in `malaria_recode`
       adds ``Proportion * cluster total`` to the region found in its
       `col_name` column.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of `voronoi_data` holding the region id.
    malaria_recode : pandas.DataFrame
        Needs 'HV001', 'HV005', 'Population' and the `lab_test` /
        `rapid_test` columns. 'HV001' may also be an index level.
        The frame is NOT modified.
    voronoi_data : pandas.DataFrame
        Needs 'HV001', `col_name` and 'Proportion'. The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by region id (ascending, index named `region_name`) with the
        float columns Lab_Row_Count, Lab_SW_Count, Lab_Pop_Count,
        Rapid_Row_Count, Rapid_SW_Count and Rapid_Pop_Count.
    """
    # Read the inputs without mutating them. The earlier in-place
    # reset_index calls kept adding 'index' / 'level_0' columns to the
    # caller's frames, which made a second call with the same voronoi frame
    # fail with "cannot insert level_0, already exists".
    if 'HV001' in malaria_recode.columns:
        malaria_data = malaria_recode
    else:
        malaria_data = malaria_recode.reset_index()

    needed = ['HV001', col_name, 'Proportion']
    if all(name in voronoi_data.columns for name in needed):
        vor_data = voronoi_data
    else:
        vor_data = voronoi_data.reset_index()

    # Step 1 bookkeeping: one slot per surveyed cluster.
    clusters = pd.Index(malaria_data['HV001'].unique())
    cluster_codes = clusters.get_indexer(malaria_data['HV001'])
    sample_weight = np.asarray(malaria_data['HV005'], dtype=float) / 1000000
    population = np.asarray(malaria_data['Population'], dtype=float)

    # Step 2 bookkeeping: one slot per admin region named in the voronoi table.
    regions = pd.Index(vor_data[col_name].unique(), name=region_name).sort_values()

    # Equivalent of the inner merge on HV001: keep only voronoi rows whose
    # cluster was surveyed, remembering each row's cluster and region slot.
    vor_cluster_codes = clusters.get_indexer(vor_data['HV001'])
    matched = vor_cluster_codes >= 0
    vor_cluster_codes = vor_cluster_codes[matched]
    region_codes = regions.get_indexer(vor_data[col_name])[matched]
    proportion = np.asarray(vor_data['Proportion'], dtype=float)[matched]

    admin_region_data = pd.DataFrame(
        0.0,
        index=regions,
        columns=['Lab_Row_Count', 'Lab_SW_Count', 'Lab_Pop_Count',
                 'Rapid_Row_Count', 'Rapid_SW_Count', 'Rapid_Pop_Count'],
    )

    for prefix, test_columns in (('Lab', lab_test), ('Rapid', rapid_test)):
        # Number of positive results per row for this kind of test.
        hits = np.zeros(len(malaria_data))
        for col in test_columns:
            # Empty / non-numeric cells become -1 so they never equal 1.
            cleaned = pd.to_numeric(malaria_data[col], errors='coerce').fillna(-1).astype(int)
            hits += (cleaned == 1).values

        # Only rows with a positive result contribute; restricting to them
        # also keeps 0/0 out of the population term.
        positive = hits > 0
        codes = cluster_codes[positive]
        count = hits[positive]
        contributions = (
            ('_Row_Count', count),
            ('_SW_Count', count * sample_weight[positive]),
            ('_Pop_Count', count / population[positive]),
        )
        for suffix, values in contributions:
            # Per-cluster total, then its Proportion-weighted share summed
            # per region. bincount lets NaN / inf propagate like the
            # original running sums did.
            cluster_total = np.bincount(codes, weights=values, minlength=len(clusters))
            admin_region_data[prefix + suffix] = np.bincount(
                region_codes,
                weights=proportion * cluster_total[vor_cluster_codes],
                minlength=len(regions),
            )

    return admin_region_data
    
    

In [23]:
def _paired_by_region(left, right):
    """Return both series restricted to their shared region ids, in the same order.

    stats.kendalltau pairs observations by position, so two series coming from
    different frames are matched on their index labels first.
    """
    return left.align(right, join='inner')


admin_vor_1 = voronoi_aggregate("Admin_Region_1", "ID_1", c3, voronoi_admin_1)
admin_vor_2 = voronoi_aggregate("Admin_Region_2", "ID_2", c3, voronoi_admin_2)
admin_vor_3 = voronoi_aggregate("Admin_Region_3", "ID_3", c3, voronoi_admin_3)

print(admin_vor_1.head())
print()

# Kendall's tau between the Voronoi-weighted aggregates and the cluster-based
# aggregates (admin_1 / admin_2). The two frames are built from different
# region lists, so regions are paired by id rather than by row position.
print("---------------------HOW TO AGGREGATE-----------------")
print("-----LAB TEST-------")
x, y = stats.kendalltau(*_paired_by_region(admin_vor_1['Lab_Row_Count'], admin_1['Lab_Row_Count']))
print("Admin 1 - (Lab Test row count VOR, Lab Test count PIP): " +  repr(x))

x, y = stats.kendalltau(*_paired_by_region(admin_vor_2['Lab_Row_Count'], admin_2['Lab_Row_Count']))
print("Admin 2 - (Lab Test row count VOR, Lab Test count PIP): " +  repr(x) + "\n")

print("----RAPID TEST-----")
x, y = stats.kendalltau(*_paired_by_region(admin_vor_1['Rapid_Row_Count'], admin_1['Rapid_Row_Count']))
print("Admin 1 - (Rapid Test row count VOR, Rapid Test count PIP): " +  repr(x))

x, y = stats.kendalltau(*_paired_by_region(admin_vor_2['Rapid_Row_Count'], admin_2['Rapid_Row_Count']))
print("Admin 2 - (Rapid Test row count VOR, Rapid Test count PIP): " +  repr(x) + "\n")

# Kendall's tau between the three counting schemes inside the same Voronoi
# frame; both columns share one index, so no pairing step is needed.
print("------------------HOW TO COUNT-------------------")
print("------LAB TEST-------")
x, y = stats.kendalltau(admin_vor_1['Lab_Row_Count'], admin_vor_1['Lab_SW_Count'])
print("Admin 1 - (Lab Test row count VOR, Lab Test Sample Weight VOR): " + repr(x))

x, y = stats.kendalltau(admin_vor_1['Lab_Row_Count'], admin_vor_1['Lab_Pop_Count'])
print("Admin 1 - (Lab Test row count VOR, Lab Test pop VOR): " + repr(x) + "\n")

x, y = stats.kendalltau(admin_vor_2['Lab_Row_Count'], admin_vor_2['Lab_SW_Count'])
print("Admin 2 - (Lab Test row count VOR, Lab Test Sample Weight VOR): " + repr(x))

x, y = stats.kendalltau(admin_vor_2['Lab_Row_Count'], admin_vor_2['Lab_Pop_Count'])
print("Admin 2 - (Lab Test row count VOR, Lab Test pop VOR): " + repr(x))

# Admin 3 is not compared: admin_vor_3 and admin_3 are not the same size, so
# the two frames do not describe the same set of regions.

admin_vor_1.to_csv("malaria_vor_aggregate_1.csv", index=True)
admin_vor_2.to_csv("malaria_vor_aggregate_2.csv", index=True)
admin_vor_3.to_csv("malaria_vor_aggregate_3.csv", index=True)

                Lab_Row_Count  Lab_SW_Count  Lab_Pop_Count  Rapid_Row_Count  \
Admin_Region_1                                                                
1                   13.806899     13.893442       0.000961        42.674490   
2                   18.055503      6.414634       0.003480        51.779670   
3                   60.854734     64.913256       0.015197       177.990509   
4                   13.091964      6.521630       0.001961        50.684188   
5                   61.152236     82.651709       0.004561       156.348332   

                Rapid_SW_Count  Rapid_Pop_Count  
Admin_Region_1                                   
1                    43.666085         0.003118  
2                    16.110664         0.009809  
3                   172.804288         0.048123  
4                    21.124478         0.010605  
5                   185.268808         0.010975  

---------------------HOW TO AGGREGATE-----------------
-----LAB TEST-------
Admin 1 - (Lab Test