In [1]:
from __future__ import division
import scipy.stats as stats
import pandas as pd
import numpy as np

In [2]:
# Counts the number of positive malaria test responses and assigns them to each region.
def compute_malaria_simple_aggregate(region_name, col_name, data, lab_cols=None, rapid_cols=None):
    """Aggregate positive malaria test results per region.

    A test cell counts as positive when, after numeric coercion, it equals 1.
    Blank or non-numeric cells are coerced to -1 and therefore never count.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of ``data`` holding the region id of every row.
    data : pandas.DataFrame
        Must contain ``col_name``, ``HV005`` and every test column.
        The frame is not modified.
    lab_cols, rapid_cols : list of str, optional
        Lab / rapid test result columns.  When omitted, the notebook-level
        ``lab_test`` / ``rapid_test`` lists are used, so those must already be
        defined in that case.

    Returns
    -------
    pandas.DataFrame
        Indexed by region id (ascending, index named ``region_name``) with
        numeric columns ``Lab_Row_Count``, ``Lab_SW_Count``,
        ``Rapid_Row_Count`` and ``Rapid_SW_Count``.  ``*_Row_Count`` is the
        number of positive cells; ``*_SW_Count`` is the same count with each
        positive weighted by ``HV005 / 1000000``.

    Notes
    -----
    Rows whose region id is missing are not attributed to any region, and a
    row with a missing ``HV005`` contributes nothing to the weighted totals.
    """
    # Fall back to the notebook globals only when the caller gives no columns.
    if lab_cols is None:
        lab_cols = lab_test
    if rapid_cols is None:
        rapid_cols = rapid_test

    def count_positive(columns):
        # Number of positive (== 1) cells per row across the given columns.
        hits = np.zeros(len(data), dtype=int)
        for col in columns:
            cleaned = pd.to_numeric(data[col], errors='coerce').fillna(-1).astype(int)
            hits += (cleaned.values == 1)
        return hits

    sample_weight = data['HV005'].values / 1000000.0
    lab_hits = count_positive(lab_cols)
    rapid_hits = count_positive(rapid_cols)

    # One record per input row; a row with k positives contributes k to the
    # row count and k * weight to the sample-weighted count.
    out_columns = [region_name, 'Lab_Row_Count', 'Lab_SW_Count', 'Rapid_Row_Count', 'Rapid_SW_Count']
    per_row = pd.DataFrame({
        region_name: data[col_name].values,
        'Lab_Row_Count': lab_hits,
        'Lab_SW_Count': lab_hits * sample_weight,
        'Rapid_Row_Count': rapid_hits,
        'Rapid_SW_Count': rapid_hits * sample_weight,
    }, columns=out_columns)

    # groupby sorts the region ids ascending and keeps regions with zero
    # positives (their sums are simply 0).
    admin_data = per_row.groupby(region_name).sum()
    return admin_data

In [None]:
# Assigns each region the positive responses of the clusters overlapping it (weighted by the
# overlap proportion) and normalises the totals by the apportioned cluster population.
def voronoi_aggregate(region_name, col_name, malaria_recode, voronoi_data, population,
                      lab_cols=None, rapid_cols=None):
    """Aggregate positive malaria tests per region using cluster/region overlap proportions.

    Steps:
      1. Count positive lab / rapid test cells per row of ``malaria_recode``
         (a cell is positive when it equals 1 after numeric coercion; blank or
         non-numeric cells become -1) and sum them per cluster (``HV001``),
         both as plain counts and weighted by ``HV005 / 1000000``.
      2. Inner-join the cluster totals with ``voronoi_data`` and
         ``population`` on ``HV001``.
      3. Give each region ``Proportion`` times the cluster's totals and
         ``Proportion`` times the cluster's ``Population``.
      4. Divide the region row counts by the region population and multiply
         by 10**6.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of ``voronoi_data`` holding the region id.
    malaria_recode : pandas.DataFrame
        Must contain ``HV001``, ``HV005`` and the test columns.  Not modified.
    voronoi_data : pandas.DataFrame
        Must contain ``HV001``, ``col_name`` and ``Proportion``.  Not modified.
    population : pandas.DataFrame
        Must contain ``HV001`` and ``Population``.  Not modified.
    lab_cols, rapid_cols : list of str, optional
        Lab / rapid test result columns.  When omitted, the notebook-level
        ``lab_test`` / ``rapid_test`` lists are used.

    Returns
    -------
    pandas.DataFrame
        Indexed by every region id found in ``voronoi_data[col_name]``
        (ascending, index named ``region_name``) with columns
        ``Lab_Row_Count``, ``Lab_SW_Count``, ``Lab_Pop_Count``,
        ``Rapid_Row_Count``, ``Rapid_SW_Count``, ``Rapid_Pop_Count`` and
        ``Population``.  Regions with no joined cluster get zero totals and
        NaN ``*_Pop_Count`` values (no population to normalise by).

    Notes
    -----
    Rows of ``voronoi_data`` whose region id is missing are ignored, and a
    row of ``malaria_recode`` with a missing ``HV005`` contributes nothing to
    the weighted totals.
    """
    # Fall back to the notebook globals only when the caller gives no columns.
    if lab_cols is None:
        lab_cols = lab_test
    if rapid_cols is None:
        rapid_cols = rapid_test

    def count_positive(columns):
        # Number of positive (== 1) cells per row across the given columns.
        hits = np.zeros(len(malaria_recode), dtype=int)
        for col in columns:
            cleaned = pd.to_numeric(malaria_recode[col], errors='coerce').fillna(-1).astype(int)
            hits += (cleaned.values == 1)
        return hits

    sample_weight = malaria_recode['HV005'].values / 1000000.0
    lab_hits = count_positive(lab_cols)
    rapid_hits = count_positive(rapid_cols)

    # Contains the aggregate for each cluster.
    cluster_columns = ['HV001', 'Rapid_SW_Aggregate', 'Lab_SW_Aggregate',
                       'Lab_Count_Aggregate', 'Rapid_Count_Aggregate']
    per_row = pd.DataFrame({
        'HV001': malaria_recode['HV001'].values,
        'Rapid_SW_Aggregate': rapid_hits * sample_weight,
        'Lab_SW_Aggregate': lab_hits * sample_weight,
        'Lab_Count_Aggregate': lab_hits,
        'Rapid_Count_Aggregate': rapid_hits,
    }, columns=cluster_columns)
    cluster_data = per_row.groupby('HV001', as_index=False).sum()

    # One row per (cluster, region) overlap carrying the cluster aggregates
    # and the cluster population.  Inner joins drop clusters that lack either.
    vor_data = pd.merge(voronoi_data, cluster_data, how='inner', on='HV001')
    vor_data = pd.merge(vor_data, population, how='inner', on='HV001')

    # Apportion responses and population to regions by overlap proportion.
    proportion = vor_data['Proportion']
    share_columns = [region_name, 'Lab_Row_Count', 'Lab_SW_Count',
                     'Rapid_Row_Count', 'Rapid_SW_Count', 'Population']
    shares = pd.DataFrame({
        region_name: vor_data[col_name],
        'Lab_Row_Count': proportion * vor_data['Lab_Count_Aggregate'],
        'Lab_SW_Count': proportion * vor_data['Lab_SW_Aggregate'],
        'Rapid_Row_Count': proportion * vor_data['Rapid_Count_Aggregate'],
        'Rapid_SW_Count': proportion * vor_data['Rapid_SW_Aggregate'],
        'Population': proportion * vor_data['Population'],
    }, columns=share_columns)
    region_totals = shares.groupby(region_name).sum()

    # Report every region listed in the voronoi table, including those that
    # lost all their clusters in the inner joins (zero totals).
    all_regions = pd.Index(sorted(voronoi_data[col_name].dropna().unique()), name=region_name)
    admin_region_data = region_totals.reindex(all_regions, fill_value=0.0)
    admin_region_data.index.name = region_name

    # Normalise row counts by the apportioned population (per million).
    # A non-positive population is masked so the result is NaN rather than a
    # division by zero.
    region_pop = admin_region_data['Population']
    safe_pop = region_pop.where(region_pop > 0)
    admin_region_data['Lab_Pop_Count'] = admin_region_data['Lab_Row_Count'] / safe_pop * (10**6)
    admin_region_data['Rapid_Pop_Count'] = admin_region_data['Rapid_Row_Count'] / safe_pop * (10**6)

    ordered = ['Lab_Row_Count', 'Lab_SW_Count', 'Lab_Pop_Count',
               'Rapid_Row_Count', 'Rapid_SW_Count', 'Rapid_Pop_Count', 'Population']
    return admin_region_data[ordered]
        

In [None]:
# --------------------------------------COTE D'IVOIRE COMPUTATIONS--------------------------
# Loads the household recode, the cluster-to-region mapping, the voronoi overlap
# proportions and the per-cluster population, computes the malaria aggregates per
# admin level with both methods, compares them with Kendall's tau and writes the
# aggregates to CSV next to the recode file.
recode_path = "Cote D'Ivoire Data SPSS/cihr62sv/"
path_to_voronoi = "Cote D'Ivoire Data SPSS/Voronoi clusters/Proportions/"
path_to_clusters = "Cote D'Ivoire Data SPSS/CLUSTER_TO_REGION/"

# Per-cluster population; columns renamed to the names voronoi_aggregate reads.
population_file = pd.read_csv("Cote D'Ivoire Data SPSS/Region_Population/voronoi.csv")
population_file = population_file.rename(columns={'DHSCLUST': 'HV001', '_count': 'Population'})

file = pd.read_csv(recode_path + "household_recode.csv", low_memory=False)

# Cluster -> admin region ids (point in polygon).
clust_admin = pd.read_csv(path_to_clusters + "admin_3.csv", usecols=['DHSCLUST', 'ID_1', 'ID_2', 'ID_3'])
clust_admin = clust_admin.rename(columns={'DHSCLUST': 'HV001'})

# Voronoi Polygon & Admin Region Mapping.
voronoi_admin_1 = pd.read_csv(path_to_voronoi + "voronoi_admin_1.csv").rename(columns={'DHSCLUST': 'HV001'})
voronoi_admin_2 = pd.read_csv(path_to_voronoi + "voronoi_admin_2.csv").rename(columns={'DHSCLUST': 'HV001'})
voronoi_admin_3 = pd.read_csv(path_to_voronoi + "voronoi_admin_3.csv").rename(columns={'DHSCLUST': 'HV001'})

clust_file = pd.merge(file, clust_admin, how='inner', on='HV001')

# Column groups.  lab_test / rapid_test are also read as globals by the
# aggregate functions, so they must be defined before those are called.
ids = ['HV001', 'HV002', 'HV005']
lab_test = [col for col in file.columns if "HML32$" in col]
rapid_test = [col for col in file.columns if "HML35$" in col]
admin_regions = ['ID_1', 'ID_2', 'ID_3']

keep_columns = ids + lab_test + rapid_test + admin_regions
clust_file = clust_file[keep_columns]

# ---------------------------SIMPLE AGGREGATES USING POINT IN POLYGON-------------------------------------
# Each admin level gets its own index label.  Levels 2 and 3 were previously
# labelled "Admin_1_Region" by copy-paste, which wrote a wrong header into
# their CSV files.
admin_1 = compute_malaria_simple_aggregate("Admin_1_Region", "ID_1", clust_file)
admin_2 = compute_malaria_simple_aggregate("Admin_2_Region", "ID_2", clust_file)
admin_3 = compute_malaria_simple_aggregate("Admin_3_Region", "ID_3", clust_file)

admin_1.to_csv(recode_path + "malaria_aggregate_1.csv", index=True)
admin_2.to_csv(recode_path + "malaria_aggregate_2.csv", index=True)
admin_3.to_csv(recode_path + "malaria_aggregate_3.csv", index=True)

# ---------------------------AGGREGATES USING VORONOI CALCULATIONS-------------------------------------
admin_vor_1 = voronoi_aggregate("Admin_Region_1", "ID_1", clust_file, voronoi_admin_1, population_file)
admin_vor_2 = voronoi_aggregate("Admin_Region_2", "ID_2", clust_file, voronoi_admin_2, population_file)
admin_vor_3 = voronoi_aggregate("Admin_Region_3", "ID_3", clust_file, voronoi_admin_3, population_file)

# Rank agreement (Kendall's tau) between the voronoi (VOR) and point-in-polygon
# (PIP) row counts.  x is the tau statistic; y is the second value returned by
# kendalltau and is not reported.
print("---------------------HOW TO AGGREGATE-----------------")
for heading, test_name in (("-----LAB TEST-------", "Lab"), ("----RAPID TEST-----", "Rapid")):
    print(heading)
    count_col = test_name + "_Row_Count"
    for level, vor, pip, trailer in ((1, admin_vor_1, admin_1, ""), (2, admin_vor_2, admin_2, "\n")):
        x, y = stats.kendalltau(vor[count_col], pip[count_col])
        print("Admin %d - (%s Test row count VOR, %s Test count PIP): " % (level, test_name, test_name)
              + repr(x) + trailer)

# Rank agreement between the different ways of counting within the voronoi aggregates.
print("------------------HOW TO COUNT-------------------")
print("------LAB TEST-------")
for level, vor, trailer in ((1, admin_vor_1, "\n"), (2, admin_vor_2, "\n"), (3, admin_vor_3, "")):
    x, y = stats.kendalltau(vor['Lab_Row_Count'], vor['Lab_SW_Count'])
    print("Admin %d - (Lab Test row count VOR, Lab Test Sample Weight VOR): " % level + repr(x))

    x, y = stats.kendalltau(vor['Lab_Row_Count'], vor['Lab_Pop_Count'])
    print("Admin %d - (Lab Test row count VOR, Lab Test pop VOR): " % level + repr(x) + trailer)

admin_vor_1.to_csv(recode_path + "malaria_vor_aggregate_1.csv", index=True)
admin_vor_2.to_csv(recode_path + "malaria_vor_aggregate_2.csv", index=True)
admin_vor_3.to_csv(recode_path + "malaria_vor_aggregate_3.csv", index=True)

In [None]:
#  ------------------------------------SENEGAL COMPUTATIONS--------------------------------------------------
# Loads the household recode, the cluster-to-region mapping, the voronoi overlap
# proportions and the per-cluster population, computes the voronoi-based malaria
# aggregates per admin level and writes them to CSV next to the recode file.
recode_path = "SN_2012-13/SNHR6DSV/"
path_to_voronoi = "SN_2012-13/Voronoi/Proportions/"
path_to_clusters = "SN_2012-13/Cluster_To_Region/"

# Per-cluster population; columns renamed to the names voronoi_aggregate reads.
population_file = pd.read_csv("SN_2012-13/Region_Populations/voronoi_pop.csv")
population_file = population_file.rename(columns={'DHSCLUST': 'HV001', '_count': 'Population'})

file = pd.read_csv(recode_path + "household_recode.csv", low_memory=False)

# Cluster -> admin region ids.
clust_admin = pd.read_csv(path_to_clusters + "admin_3.csv", usecols=['DHSCLUST', 'ID_1', 'ID_2', 'ID_3'])
clust_admin = clust_admin.rename(columns={'DHSCLUST': 'HV001'})

# Voronoi Polygon & Admin Region Mapping.
voronoi_admin_1 = pd.read_csv(path_to_voronoi + "voronoi_admin_1.csv").rename(columns={'DHSCLUST': 'HV001'})
voronoi_admin_2 = pd.read_csv(path_to_voronoi + "voronoi_admin_2.csv").rename(columns={'DHSCLUST': 'HV001'})
voronoi_admin_3 = pd.read_csv(path_to_voronoi + "voronoi_admin_3.csv").rename(columns={'DHSCLUST': 'HV001'})

clust_file = pd.merge(file, clust_admin, how='inner', on='HV001')

# Column groups.  lab_test / rapid_test are also read as globals by
# voronoi_aggregate, so they must be defined before it is called.
ids = ['HV001', 'HV002', 'HV005']
lab_test = [col for col in file.columns if "HML32$" in col]
rapid_test = [col for col in file.columns if "HML35$" in col]
admin_regions = ['ID_1', 'ID_2', 'ID_3']

keep_columns = ids + lab_test + rapid_test + admin_regions
# Explicit copy: the columns are overwritten just below, and assigning into a
# bare column slice of the merged frame is a chained assignment.
clust_file = clust_file[keep_columns].copy()

#  Set empty cells to -1.
for col in (lab_test + rapid_test):
    clust_file[col] = pd.to_numeric(clust_file[col], errors='coerce').fillna(-1).astype(int)

admin_vor_1 = voronoi_aggregate("Admin_Region_1", "ID_1", clust_file, voronoi_admin_1, population_file)
admin_vor_2 = voronoi_aggregate("Admin_Region_2", "ID_2", clust_file, voronoi_admin_2, population_file)
admin_vor_3 = voronoi_aggregate("Admin_Region_3", "ID_3", clust_file, voronoi_admin_3, population_file)

admin_vor_1.to_csv(recode_path + "malaria_vor_aggregate_1.csv", index=True)
admin_vor_2.to_csv(recode_path + "malaria_vor_aggregate_2.csv", index=True)
admin_vor_3.to_csv(recode_path + "malaria_vor_aggregate_3.csv", index=True)