In [8]:
from __future__ import division
import scipy.stats as stats
import pandas as pd

In [9]:
# Counts the number of positive responses and assigns to each region.
def compute_simple_aggregate(region_name, col_name, aids_recode):
    """Aggregate positive HIV test results per administrative region.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of ``aids_recode`` holding the region id of each row.
    aids_recode : pandas.DataFrame
        One row per tested respondent. Must contain ``col_name``, ``HIV03``
        (a value of 1 is counted as a positive test) and ``HIV05`` (the
        sample weight, which is divided by 1,000,000 before being summed).
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct region id found in ``aids_recode[col_name]``,
        sorted ascending, index named ``region_name``, with the columns

        * ``Positive_Row_Count`` - number of rows with ``HIV03 == 1``
        * ``Positive_HIV_SW`` - sum of the scaled sample weights of those rows

        Regions without any positive row get 0 in both columns. Rows whose
        region id is missing (NaN) are left out, which is the default
        behaviour of ``groupby``.
    """
    is_positive = aids_recode['HIV03'] == 1
    scaled_weight = aids_recode['HIV05'] / 1000000

    # Build one contribution per input row (zero for non-positive rows) so a
    # single groupby yields every region, including those with no positives.
    # ``.values`` sidesteps index alignment in case the input index has
    # duplicate labels.
    value_columns = ['Positive_Row_Count', 'Positive_HIV_SW']
    contributions = pd.DataFrame(
        {
            region_name: aids_recode[col_name].values,
            'Positive_Row_Count': is_positive.astype(int).values,
            'Positive_HIV_SW': scaled_weight.where(is_positive, 0.0).values,
        },
        columns=[region_name] + value_columns,
    )

    # groupby sorts the group keys ascending and names the index after the
    # grouping column, matching the layout callers expect.
    return contributions.groupby(region_name).sum()

In [10]:
# Assigns each region total number of positive responses which have been normalised by the population of the clusters.
def voronoi_aggregate(region_name, col_name, aids_recode, voronoi_data, population):
    """Distribute per-cluster HIV positives onto regions via Voronoi overlap.

    Positive results are first totalled per cluster, then each cluster's
    totals and population are shared out to regions according to the
    ``Proportion`` of the cluster assigned to each region.

    Parameters
    ----------
    region_name : str
        Name given to the index of the returned frame.
    col_name : str
        Column of ``voronoi_data`` holding the region id.
    aids_recode : pandas.DataFrame
        One row per tested respondent with ``HIVCLUST``, ``HIV03`` (1 is
        counted as positive) and ``HIV05`` (sample weight, divided by
        1,000,000 before being summed).
    voronoi_data : pandas.DataFrame
        One row per (cluster, region) overlap with ``HIVCLUST``, ``col_name``
        and ``Proportion``.
    population : pandas.DataFrame
        Per-cluster population with ``HIVCLUST`` and ``Population``. Any
        other columns are ignored so they cannot clash with the columns of
        ``voronoi_data`` during the merge.

    Returns
    -------
    pandas.DataFrame
        One row per distinct non-missing region id in
        ``voronoi_data[col_name]``, sorted ascending, index named
        ``region_name``, with float columns ``Positive_HIV_SW``,
        ``Positive_Row_Count``, ``Population`` and ``Positive_HIV_Pop``
        (``Positive_Row_Count / Population * 10**6``). Regions that receive
        no contribution (their clusters are absent from ``aids_recode`` or
        ``population``) hold zeros; a zero ``Population`` gives NaN or inf in
        ``Positive_HIV_Pop`` through ordinary float division rather than an
        exception.

    None of the input frames is modified.
    """
    # --- Per-cluster totals: (cluster, SW aggregate, count aggregate). ---
    is_positive = aids_recode['HIV03'] == 1
    per_respondent = pd.DataFrame(
        {
            'HIVCLUST': aids_recode['HIVCLUST'].values,
            'SW_Aggregate': (aids_recode['HIV05'] / 1000000).where(is_positive, 0.0).values,
            'Count_Aggregate': is_positive.astype(int).values,
        },
        columns=['HIVCLUST', 'SW_Aggregate', 'Count_Aggregate'],
    )
    # Grouping every row (not only the positives) keeps clusters with zero
    # positives, so they still contribute their population further down.
    cluster_data = per_respondent.groupby('HIVCLUST').sum().reset_index()

    # --- (cluster, region, proportion, SW aggregate, count, population). ---
    vor_data = pd.merge(voronoi_data, cluster_data, how='inner', on='HIVCLUST')
    vor_data = pd.merge(
        vor_data,
        population[['HIVCLUST', 'Population']],
        how='inner',
        on='HIVCLUST',
    )

    # --- Share each cluster's totals out to regions by overlap proportion. ---
    value_columns = ['Positive_HIV_SW', 'Positive_Row_Count', 'Population']
    contributions = pd.DataFrame(
        {
            region_name: vor_data[col_name].values,
            'Positive_HIV_SW': (vor_data['Proportion'] * vor_data['SW_Aggregate']).values,
            'Positive_Row_Count': (vor_data['Proportion'] * vor_data['Count_Aggregate']).values,
            'Population': (vor_data['Proportion'] * vor_data['Population']).values,
        },
        columns=[region_name] + value_columns,
    )
    region_totals = contributions.groupby(region_name).sum()

    # Every region known to the Voronoi table gets a row, even when the inner
    # merges above dropped all of its clusters.
    regions = pd.Index(
        voronoi_data[col_name].dropna().unique(), name=region_name
    ).sort_values()
    admin_region_data = region_totals.reindex(regions, fill_value=0).astype(float)
    admin_region_data.index.name = region_name

    # Positives per million inhabitants. The columns are float, so a zero
    # population produces NaN/inf instead of raising ZeroDivisionError.
    rate = admin_region_data['Positive_Row_Count'] / admin_region_data['Population']
    admin_region_data['Positive_HIV_Pop'] = rate * (10**6)
    return admin_region_data

In [11]:
# ----------------------------------COTE D'IVOIRE COMPUTATIONS-------------------------------------
# Input/output locations, relative to the notebook's working directory.
recode_path = "Cote D'Ivoire Data SPSS/ciar61sv/"
path_to_voronoi = "Cote D'Ivoire Data SPSS/Voronoi_fclusters/Proportions/"
path_to_clusters = "Cote D'Ivoire Data SPSS/CLUSTER_TO_REGION/"

# The CSV inputs key clusters by DHSCLUST, whereas the aggregation functions
# look the cluster up under HIVCLUST.
cluster_key = {'DHSCLUST': 'HIVCLUST'}

# Population of all DHS clusters.
population_file = pd.read_csv("Cote D'Ivoire Data SPSS/Region_Population/voronoi.csv").rename(
    columns={'DHSCLUST': 'HIVCLUST', '_count': 'Population'})

# DHS Aids recode file.
file = pd.read_csv(recode_path + "aids_recode.csv")

# Contains point in polygon mapping of cluster -> region.
clust_admin = pd.read_csv(
    path_to_clusters + "admin_3.csv",
    usecols=['DHSCLUST', 'ID_1', 'ID_2', 'ID_3'],
).rename(columns=cluster_key)

# Voronoi Polygons for all administrative levels.
voronoi_admin_1 = pd.read_csv(path_to_voronoi + "voronoi_admin_1.csv").rename(columns=cluster_key)
voronoi_admin_2 = pd.read_csv(path_to_voronoi + "voronoi_admin_2.csv").rename(columns=cluster_key)
voronoi_admin_3 = pd.read_csv(path_to_voronoi + "voronoi_admin_3.csv").rename(columns=cluster_key)

# Relevant columns in Aids recode file.
keep_columns = ['HIVCLUST', 'HIVNUMB','HIV03', 'HIV05', 'ID_1', 'ID_2', 'ID_3']

# Attach region ids to every respondent; the inner join against the population
# table drops respondents whose cluster has no population record.
clust_file = file.merge(clust_admin, how='inner', on='HIVCLUST')
clust_file = clust_file.merge(population_file, how='inner', on='HIVCLUST')
clust_file = clust_file[keep_columns]

# -------------------------------SIMPLE AGGREGATIONS USING POINTS IN POLYGON--------------------------------
admin_1 = compute_simple_aggregate("Admin_1_Region", "ID_1", clust_file)
admin_2 = compute_simple_aggregate("Admin_2_Region", "ID_2", clust_file)
admin_3 = compute_simple_aggregate("Admin_3_Region", "ID_3", clust_file)

for frame, filename in (
    (admin_1, "aids_aggregate_1.csv"),
    (admin_2, "aids_aggregate_2.csv"),
    (admin_3, "aids_aggregate_3.csv"),
):
    frame.to_csv(recode_path + filename, index=True)

# ------------------------------AGGREGATIONS USING VORONOI CALCULATIONS--------------------------------------
admin_vor_1 = voronoi_aggregate("Admin_Region_1", "ID_1", clust_file, voronoi_admin_1, population_file)
admin_vor_2 = voronoi_aggregate("Admin_Region_2", "ID_2", clust_file, voronoi_admin_2, population_file)
admin_vor_3 = voronoi_aggregate("Admin_Region_3", "ID_3", clust_file, voronoi_admin_3, population_file)

# -------------------------------------------KENDALL CORRELATIONS-----------------------------------------------
def _print_kendall(label, left, right, tail=""):
    """Print Kendall's tau and its p-value for two series; return both values."""
    tau, p_value = stats.kendalltau(left, right)
    print(label + repr(tau) + "p: " + str(p_value) + tail)
    return tau, p_value


print("---------------------HOW TO AGGREGATE-----------------")
# Voronoi-based counts against point-in-polygon counts. The two series are
# paired by position, not by region label.
x, y = _print_kendall(
    "Admin 1 - (HIV row count VOR, HIV count PIP): ",
    admin_vor_1['Positive_Row_Count'], admin_1['Positive_Row_Count'])
x, y = _print_kendall(
    "Admin 2 - (HIV row count VOR, HIV count PIP): ",
    admin_vor_2['Positive_Row_Count'], admin_2['Positive_Row_Count'], "\n")

print("------------------HOW TO COUNT------------------------")
# Raw Voronoi counts against the two normalised variants, per admin level.
# The tail adds the blank line that separates levels in the printed report.
for level, vor_frame, tail in (
    (1, admin_vor_1, "\n"),
    (2, admin_vor_2, "\n"),
    (3, admin_vor_3, ""),
):
    prefix = "Admin " + str(level) + " - "
    x, y = _print_kendall(
        prefix + "(HIV row count VOR, HIV Sample Weight VOR): ",
        vor_frame['Positive_Row_Count'], vor_frame['Positive_HIV_SW'])
    x, y = _print_kendall(
        prefix + "(HIV row count VOR, HIV pop VOR): ",
        vor_frame['Positive_Row_Count'], vor_frame['Positive_HIV_Pop'], tail)

for frame, filename in (
    (admin_vor_1, "aids_vor_aggregate_1.csv"),
    (admin_vor_2, "aids_vor_aggregate_2.csv"),
    (admin_vor_3, "aids_vor_aggregate_3.csv"),
):
    frame.to_csv(recode_path + filename, index=True)

---------------------HOW TO AGGREGATE-----------------
Admin 1 - (HIV row count VOR, HIV count PIP): 0.92039002302622519p: 4.55662863383e-08
Admin 2 - (HIV row count VOR, HIV count PIP): 0.79853451572757839p: 2.25720636336e-15

------------------HOW TO COUNT------------------------
Admin 1 - (HIV row count VOR, HIV Sample Weight VOR): 0.50877192982456132p: 0.00233653321942
Admin 1 - (HIV row count VOR, HIV pop VOR): 0.55555555555555547p: 0.000888542259375

Admin 2 - (HIV row count VOR, HIV Sample Weight VOR): 0.55265306122448976p: 1.48741607408e-08
Admin 2 - (HIV row count VOR, HIV pop VOR): 0.5379591836734694p: 3.53890299387e-08

Admin 3 - (HIV row count VOR, HIV Sample Weight VOR): 0.73656176860843969p: 1.80656513407e-49
Admin 3 - (HIV row count VOR, HIV pop VOR): 0.6892307692307692p: 1.43437807453e-43
