In [None]:
import glob
import pandas as pd
import numpy as np
import os
import csv
import sys
import math as m

# Year of the synthetic population data; it is spliced into the input and
# output paths used further down.
year='2021'

# The health-status assignment below draws random samples with
# DataFrame.sample() and no explicit random_state, which falls back to NumPy's
# global RNG. Seeding it here makes a "Restart Kernel -> Run All" reproducible.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
# Load the persons table (socio-demographic attributes) for the selected year.
# The directory is machine-specific: point it at your own copy of the data.
df_persons_NE_dir = (
    '/home/manon/Documents/Leeds/RAIM/code/RAIM/data/westmidlands/synthetic_pop_SPENSER/'
    + year
    + '/NE_only'
)
df_persons_NE_file = os.path.join(df_persons_NE_dir, "df_persons_NE_sociodemographics.csv")

df_persons_NE = pd.read_csv(
    df_persons_NE_file,
    header=0,        # the first row of the file holds the column names
    index_col=None,  # do not use any column of the file as the index
)

In [None]:
# Age bands as (min_age, max_age), both bounds inclusive.
# Every table below holds exactly one value per band, in this order.
age_range = [(0,0),(1,4),(5,9),(10,14),(15,19),(20,24),(25,29),(30,34),(35,39),(40,44),(45,49),(50,54),(55,59),(60,64),(65,69),(70,74),(75,79),(80,84),(85,89),(90,120)]

# Probabilities from CT0794 - Age by long-term health problem or disability
# by general health by sex - Census 2011.
# One table per sex and health category; within a sex, the three categories
# of an age band add up to ~1 (shares of the whole age/sex group).

# Share of men, by age band, in very good / good health
men_good_percentage_list = [0.967,0.962,0.957,0.957,0.954,0.937,0.919,
                            0.896,0.866,0.825,0.780,0.721,0.648,0.591,
                            0.555,0.491,0.416,0.337,0.277,0.239]

# Share of women, by age band, in very good / good health
women_good_percentage_list = [0.971,0.969,0.968,0.966,0.949,0.930,0.909,
                              0.886,0.846,0.805,0.756,0.697,0.630,0.600,
                              0.545,0.461,0.374,0.287,0.222,0.180]

# Share of men, by age band, in fair health
men_fair_percentage_list = [0.025,0.029,0.033,0.033,0.035,0.049,0.060,
                            0.075,0.095,0.123,0.152,0.191,0.229,0.264,
                            0.305,0.347,0.388,0.431,0.444,0.457]

# Share of women, by age band, in fair health
women_fair_percentage_list = [0.020,0.023,0.025,0.027,0.041,0.057,0.072,
                              0.088,0.113,0.140,0.169,0.202,0.235,0.274,
                              0.312,0.363,0.416,0.455,0.476,0.490]

# Share of men, by age band, in bad / very bad health
men_bad_percentage_list = [0.008,0.009,0.010,0.010,0.011,0.014,0.021,
                           0.029,0.039,0.052,0.067,0.089,0.123,0.144,
                           0.140,0.162,0.196,0.232,0.279,0.303]

# Share of women, by age band, in bad / very bad health
women_bad_percentage_list = [0.009,0.007,0.007,0.008,0.010,0.013,0.019,
                             0.027,0.041,0.055,0.075,0.101,0.135,0.127,
                             0.143,0.176,0.210,0.258,0.302,0.331]

# Sanity checks: the sampling loops index these tables by age-band position,
# so a missing or extra value would silently shift every later band.
_health_tables = {
    "men": (men_good_percentage_list, men_fair_percentage_list, men_bad_percentage_list),
    "women": (women_good_percentage_list, women_fair_percentage_list, women_bad_percentage_list),
}
for _sex, _tables in _health_tables.items():
    for _table in _tables:
        assert len(_table) == len(age_range), (
            f"{_sex}: expected {len(age_range)} values per table, got {len(_table)}"
        )
    # The published shares are rounded to 3 decimals, hence the small tolerance.
    for _band, (_good, _fair, _bad) in zip(age_range, zip(*_tables)):
        assert abs(_good + _fair + _bad - 1.0) <= 0.005, (
            f"{_sex} {_band}: health shares sum to {_good + _fair + _bad:.3f}, expected ~1"
        )
del _health_tables, _sex, _tables, _table, _band, _good, _fair, _bad

In [None]:
# Add a "Health_status" column to the persons table, initialised to 0 for
# everybody. The cells below overwrite it for the persons sampled as
# good/very good health (2) and fair health (1); whoever is left keeps 0.
# The column is added in place on df_persons_NE.
df_persons_NE["Health_status"] = 0
df_persons_NE

In [None]:
# Assign "good / very good" health (code 2).
# For every age band and sex, draw at random the census share of persons in
# good health from the full population and collect the draws.
good_health_list = []

for age_index, (age_min, age_max) in enumerate(age_range):
    # Both bounds of the age band are inclusive.
    in_age_band = (df_persons_NE['Age'] >= age_min) & (df_persons_NE['Age'] <= age_max)

    for gender in (1, 2):
        # Sex code 1 reads the men's table, any other code the women's table.
        if gender == 1:
            proba = men_good_percentage_list[age_index]
        else:
            proba = women_good_percentage_list[age_index]

        # Persons of this sex within the age band. A plain local variable is
        # used instead of dynamically named globals(), which kept every
        # intermediate frame alive in the kernel namespace.
        group = df_persons_NE.loc[in_age_band & (df_persons_NE['Sex'] == gender)]

        # int() truncates, so the number drawn never exceeds the target share.
        good_health_list.append(group.sample(int(len(group) * proba)))

# One frame with every person in good health. The original row index is
# discarded (ignore_index=True); persons stay identifiable through their
# other columns.
df_persons_NE_good_health = pd.concat(good_health_list, axis=0, ignore_index=True)

# Health_status is initialised with the integer 0 elsewhere in the notebook,
# so store an integer code here too rather than the string "2".
df_persons_NE_good_health["Health_status"] = 2
    

In [None]:
# Number of persons that were assigned good / very good health.
df_persons_NE_good_health.shape[0]

In [None]:
# Keep only the persons that were NOT assigned good health, i.e. the pool
# from which fair and bad health are drawn next.
#
# This is an anti-join on the person key. The previous concat +
# drop_duplicates(keep=False) approach also threw away every person whose key
# happened to be duplicated in the source table, even if none of them had been
# sampled, and it copied the whole table just to delete rows from it.
# NOTE(review): 'PID_AreaMSOA' is assumed to identify a person uniquely -
# confirm against the input data.
assigned_good = df_persons_NE['PID_AreaMSOA'].isin(df_persons_NE_good_health['PID_AreaMSOA'])
df_persons_NE_plus = df_persons_NE.loc[~assigned_good]

# Number of persons still without a health status.
len(df_persons_NE_plus)

In [None]:
# Assign "fair" health (code 1).
# For every age band and sex, draw the persons in fair health among those who
# were not already assigned good health (df_persons_NE_plus).
fair_health_list = []

for age_index, (age_min, age_max) in enumerate(age_range):
    for gender in (1, 2):
        # Sex code 1 reads the men's table, any other code the women's table.
        if gender == 1:
            proba = men_fair_percentage_list[age_index]
        else:
            proba = women_fair_percentage_list[age_index]

        # Size of the WHOLE age/sex group in the full population. The census
        # shares (good + fair + bad ~ 1) refer to the whole group, so the
        # number of "fair" persons must be derived from this size and not from
        # what is left after removing the good-health persons; otherwise fair
        # health is strongly under-assigned and bad health over-assigned.
        group_size = int((
            (df_persons_NE['Age'] >= age_min)
            & (df_persons_NE['Age'] <= age_max)
            & (df_persons_NE['Sex'] == gender)
        ).sum())

        # Persons of this age band and sex that are still unassigned.
        candidates = df_persons_NE_plus.loc[
            (df_persons_NE_plus['Age'] >= age_min)
            & (df_persons_NE_plus['Age'] <= age_max)
            & (df_persons_NE_plus['Sex'] == gender)
        ]

        # int() truncates; the min() guards against asking for more persons
        # than remain in the pool, which would make sample() raise.
        n_fair = min(len(candidates), int(group_size * proba))
        fair_health_list.append(candidates.sample(n_fair))

# One frame with every person in fair health (original row index discarded).
df_persons_NE_fair_health = pd.concat(fair_health_list, axis=0, ignore_index=True)

# Health_status is initialised with the integer 0 elsewhere in the notebook,
# so store an integer code here too rather than the string "1".
df_persons_NE_fair_health["Health_status"] = 1

In [None]:
# Number of persons that were assigned fair health.
df_persons_NE_fair_health.shape[0]

In [None]:
# Whoever is left after removing the fair-health persons from the unassigned
# pool gets bad / very bad health; their Health_status is still the initial 0.
#
# This is an anti-join on the person key. The previous concat +
# drop_duplicates(keep=False) approach also threw away every person whose key
# happened to be duplicated in the pool, even if none of them had been sampled.
# NOTE(review): 'PID_AreaMSOA' is assumed to identify a person uniquely -
# confirm against the input data.
assigned_fair = df_persons_NE_plus['PID_AreaMSOA'].isin(df_persons_NE_fair_health['PID_AreaMSOA'])
df_persons_NE_bad = df_persons_NE_plus.loc[~assigned_fair]

# Number of persons left with bad / very bad health.
len(df_persons_NE_bad)

In [None]:
# Sanity check: count the remaining (bad health) persons whose Health_status
# differs from the initial value 0. The expected result is 0.
has_other_status = df_persons_NE_bad["Health_status"] != 0
int(has_other_status.sum())

In [None]:
# Stack the three health groups (good, fair, bad) back into a single table
# and export it next to the input file.
df_persons_NE_health = pd.concat(
    [df_persons_NE_good_health, df_persons_NE_fair_health, df_persons_NE_bad]
)

# The index argument is left at its default, so the (non-unique) row index of
# the stacked table is written as the first column of the CSV.
df_persons_NE_health.to_csv(
    '/home/manon/Documents/Leeds/RAIM/code/RAIM/data/westmidlands/synthetic_pop_SPENSER/'+year+'/NE_only/df_persons_NE_sociodemographics_health.csv',
    header=True,
    encoding='utf-8',
)