In [1]:
from __future__ import division
import scipy.stats as stats
import pandas as pd

In [2]:
def compute_simple_aggregate(region_name, col_name, aids_recode):

    admin_data = pd.DataFrame(columns=[region_name,'Positive_Row_Count', 'Positive_HIV_Pop', 'Positive_HIV_SW'])
    admin_data[region_name] = list(aids_recode[col_name].unique())
    
    # Set index to Admin Region.
    admin_data.set_index(region_name, inplace=True)
    admin_data.fillna(0, inplace=True)
    admin_data.sort_index(ascending=True, inplace=True)

    # Set index to Admin Region.   
    aids_data = aids_recode
    aids_data.reset_index(inplace=True)
    aids_data.set_index(col_name, inplace=True)
    
    for index, row in aids_data.iterrows():
        region = index;
        pop = row['Population']
        result = row['HIV03']
        sample_weight = row['HIV05'] / 1000000
        
        if(result == 1):
            # Counting.            
            pos = admin_data.loc[region, 'Positive_Row_Count'] + 1
            admin_data.loc[region,'Positive_Row_Count'] = pos
            
            # Normalise using cluster population count.
            prev = admin_data.loc[region, 'Positive_HIV_Pop']
            admin_data.loc[region, 'Positive_HIV_Pop'] = prev + (1 / pop)
            
            # Normalise using sample weights.
            sample_pos = admin_data.loc[region, 'Positive_HIV_SW'] + sample_weight
            admin_data.loc[region, 'Positive_HIV_SW'] = sample_pos
              
    return admin_data

In [3]:
# region name: Admin_Region_x, cluster_data: c3, voronoi_data: proportions data, col_name: "ID_1"
def voronoi_aggregate(region_name, col_name, aids_recode, voronoi_data, pop):
    # Aids Recode data. ()
    aids_data = aids_recode
    aids_data.reset_index(inplace=True)
    
    pop_data = pop
    
    # Contais HIV_Positive aggregate per cluster
    cluster_data = pd.DataFrame(columns=['HIVCLUST','SW_Aggregate','Count_Aggregate'])
    cluster_data['HIVCLUST'] = list(aids_recode['HIVCLUST'].unique())

    # Set index to Cluster.
    cluster_data.set_index('HIVCLUST', inplace=True)
    cluster_data.fillna(0, inplace=True)
    cluster_data.sort_index(ascending=True, inplace=True)
    
    vor_data = voronoi_data
    vor_data.reset_index(inplace=True)
    
    # Contains Admin Region and the HIV_Positive aggregate. HIV Count uses (result / population)  
    admin_region_data = pd.DataFrame(columns=[region_name, 'Positive_HIV_SW', 'Positive_Row_Count', 'Population',
                                             'Positive_HIV_Pop'])
    admin_region_data[region_name] = list(voronoi_data[col_name].unique())
    
    # Set index to Admin Region.    
    admin_region_data.set_index(region_name, inplace=True)
    admin_region_data.fillna(0, inplace=True)
    admin_region_data.sort_index(ascending=True, inplace=True)

    aids_data.set_index('HIVCLUST', inplace=True)
    
    # Computes the aggregate for each cluster to create cluster data. (Unique Cluster, Aids Aggregate)    
    for index, row in aids_data.iterrows():
        result = row['HIV03']
        pop = row['Population']
        sample_weight = row['HIV05'] / 1000000
        
        if(result == 1):
            total = cluster_data.loc[index, 'SW_Aggregate'] + sample_weight
            cluster_data.loc[index, 'SW_Aggregate'] = total
            
#             prev = cluster_data.loc[index, 'Pop_Aggregate']
#             cluster_data.loc[index, 'Pop_Aggregate'] = prev + (1 / pop)
            
            prev = cluster_data.loc[index, 'Count_Aggregate']
            cluster_data.loc[index, 'Count_Aggregate'] = prev + 1

    cluster_data.reset_index(inplace=True)
    
    # Merge Voronoi & Admin Region Mapping to HIV Aggregate per cluster.
    vor_data = pd.merge(voronoi_data, cluster_data, how='inner', on='HIVCLUST')
    vor_data = pd.merge(vor_data, pop_data, how='inner', on='HIVCLUST')
    vor_data.reset_index(inplace=True)
    vor_data.set_index(col_name, inplace=True)
    vor_data.sort_index(ascending=True, inplace=True)
    
    print(vor_data.columns)
    for ind, row in vor_data.iterrows():
        proportion = row['Proportion'] * row['SW_Aggregate']
        previous_value = admin_region_data.loc[ind, 'Positive_HIV_SW']
        admin_region_data.loc[ind, 'Positive_HIV_SW'] = proportion + previous_value
        
#         prop = row['Proportion'] * row['Pop_Aggregate']
#         prev_count = admin_region_data.loc[ind, 'Positive_HIV_Pop']
#         admin_region_data.loc[ind, 'Positive_HIV_Pop'] = prop + prev_count
    
        prop = row['Proportion'] * row['Count_Aggregate']
        prev_count = admin_region_data.loc[ind, 'Positive_Row_Count']
        admin_region_data.loc[ind, 'Positive_Row_Count'] = prop + prev_count
        
        prop = row['Proportion'] * row['Population']
        prev_count = admin_region_data.loc[ind, 'Population']
        admin_region_data.loc[ind, 'Population'] = prop + prev_count
        
    for ind, row in admin_region_data.iterrows():
        value = row['Positive_Row_Count'] / row['Population']
        admin_region_data.loc[ind, 'Positive_HIV_Pop'] = (value * (10**6))
        
    return admin_region_data   

In [4]:
# ----------------------------------COTE D'IVOIRE COMPUTATIONS-------------------------------------
rel_path = "Cote D'Ivoire Data SPSS/ciar61sv/"
vor_path = "Cote D'Ivoire Data SPSS/Voronoi clusters/Proportions/"
pop_path = "Cote D'Ivoire Data SPSS/cigc61fl/"
clu_path = "Cote D'Ivoire Data SPSS/CLUSTER_TO_REGION/"

file = pd.read_csv(rel_path + "aids_recode.csv")
population = pd.read_csv(pop_path + "CIGC61FL.csv", usecols=['DHSCLUST', 'All_Population_Count_2010'])
population.columns = ['HIVCLUST', 'Population']

clust_admin = pd.read_csv(clu_path + "admin_3.csv", usecols=['DHSCLUST', 'ID_1', 'ID_2', 'ID_3'])
clust_admin = clust_admin.rename(columns = {'DHSCLUST':'HIVCLUST'})

# Voronoi Polygon & Admin Region Mapping.
voronoi_admin_1 = pd.read_csv(vor_path + "voronoi_admin_1.csv")
voronoi_admin_2 = pd.read_csv(vor_path + "voronoi_admin_2.csv")
voronoi_admin_3 = pd.read_csv(vor_path + "voronoi_admin_3.csv")

voronoi_admin_1 = voronoi_admin_1.rename(columns = {'DHSCLUST':'HIVCLUST'})
voronoi_admin_2 = voronoi_admin_2.rename(columns = {'DHSCLUST':'HIVCLUST'})
voronoi_admin_3 = voronoi_admin_3.rename(columns = {'DHSCLUST':'HIVCLUST'})

clust_file = pd.merge(file, clust_admin, how='inner', on='HIVCLUST')
clust_file = pd.merge(clust_file, population, how='inner', on='HIVCLUST')

# Relevant columns in Aids recode file. 
keep_columns = ['HIVCLUST', 'HIVNUMB', 'Population','HIV03', 'HIV05', 'ID_1', 'ID_2', 'ID_3']
clust_file = clust_file[keep_columns]

clust_file.to_csv(rel_path + "complete_aids_data.csv", index=False)

# -------------------------------SIMPLE AGGREGATIONS USING POINTS IN POLYGON--------------------------------
admin_1 = compute_simple_aggregate("Admin_1_Region", "ID_1", clust_file)
admin_2 = compute_simple_aggregate("Admin_2_Region", "ID_2", clust_file)
admin_3 = compute_simple_aggregate("Admin_3_Region", "ID_3", clust_file)

admin_1.to_csv(rel_path + "aids_aggregate_1.csv", index=True)
admin_2.to_csv(rel_path + "aids_aggregate_2.csv", index=True)
admin_3.to_csv(rel_path + "aids_aggregate_3.csv", index=True)

# ------------------------------AGGREGATIONS USING VORONOI CALCULATIONS--------------------------------------
admin_vor_1 = voronoi_aggregate("Admin_Region_1", "ID_1", clust_file, voronoi_admin_1, population)
admin_vor_2 = voronoi_aggregate("Admin_Region_2", "ID_2", clust_file, voronoi_admin_2, population)
admin_vor_3 = voronoi_aggregate("Admin_Region_3", "ID_3", clust_file, voronoi_admin_3, population)

print(admin_vor_1.head())
print()

# -------------------------------------------KENDALL CORRELATIONS-----------------------------------------------
print("---------------------HOW TO AGGREGATE-----------------")
x, y = stats.kendalltau(admin_vor_1['Positive_Row_Count'], admin_1['Positive_Row_Count'])
print("Admin 1 - (HIV row count VOR, HIV count PIP): " +  repr(x))

x, y = stats.kendalltau(admin_vor_2['Positive_Row_Count'], admin_2['Positive_Row_Count'])
print("Admin 2 - (HIV row count VOR, HIV count PIP): " +  repr(x) + "\n")

print("------------------HOW TO COUNT------------------------")
x, y = stats.kendalltau(admin_vor_1['Positive_Row_Count'], admin_vor_1['Positive_HIV_SW'])
print("Admin 1 - (HIV row count VOR, HIV Sample Weight VOR): " + repr(x))

x, y = stats.kendalltau(admin_vor_1['Positive_Row_Count'], admin_vor_1['Positive_HIV_Pop'])
print("Admin 1 - (HIV row count VOR, HIV pop VOR): " + repr(x) + "\n")

x, y = stats.kendalltau(admin_vor_2['Positive_Row_Count'], admin_vor_2['Positive_HIV_SW'])
print("Admin 2 - (HIV row count VOR, HIV Sample Weight VOR): " + repr(x))

x, y = stats.kendalltau(admin_vor_2['Positive_Row_Count'], admin_vor_2['Positive_HIV_Pop'])
print("Admin 2 - (HIV row count VOR, HIV pop VOR): " + repr(x))

# Admin_vor_3 and Admin 3 are not the same size so correlation calculation not possible using Kendell.

admin_vor_1.to_csv(rel_path + "aids_vor_aggregate_1.csv", index=True)
admin_vor_2.to_csv(rel_path + "aids_vor_aggregate_2.csv", index=True)
admin_vor_3.to_csv(rel_path + "aids_vor_aggregate_3.csv", index=True)


Index(['level_0', 'index', 'HIVCLUST', 'O_Area_km2', 'P_Area_km2',
       'Proportion', 'SW_Aggregate', 'Count_Aggregate', 'Population'],
      dtype='object')
Index(['level_0', 'index', 'HIVCLUST', 'O_Area_km2', 'P_Area_km2',
       'Proportion', 'SW_Aggregate', 'Count_Aggregate', 'Population'],
      dtype='object')
Index(['level_0', 'index', 'HIVCLUST', 'O_Area_km2', 'P_Area_km2',
       'Proportion', 'SW_Aggregate', 'Count_Aggregate', 'Population'],
      dtype='object')
                Positive_HIV_SW  Positive_Row_Count     Population  \
Admin_Region_1                                                       
1                     13.977581           11.201658  222505.683121   
2                      0.759120            2.589420   49035.472674   
3                     31.356746           26.013104  318221.906373   
4                      1.829151            5.030108  114605.282271   
5                     18.758965           22.960091  600577.377124   

                Positive_HIV_