In [2]:
import pandas as pd
from helpers.dslabs_functions import get_variable_types
import numpy as np

In [3]:
df = pd.read_csv('../../datasets/class_pos_covid.csv')
df.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,...,,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,...,1.6,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,...,1.65,63.5,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
3,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,...,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Male,Poor,1.0,0.0,Within past year (anytime less than 12 months ...,No,7.0,,Yes,...,1.8,84.82,26.08,No,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [4]:
variable_types = get_variable_types(df)

variable_types["symbolic"]

['State',
 'GeneralHealth',
 'LastCheckupTime',
 'RemovedTeeth',
 'HadDiabetes',
 'SmokerStatus',
 'ECigaretteUsage',
 'RaceEthnicityCategory',
 'AgeCategory',
 'TetanusLast10Tdap']

In [5]:
df["AgeCategory"].value_counts()

AgeCategory
Age 65 to 69       41071
Age 70 to 74       38192
Age 60 to 64       38166
Age 80 or older    31864
Age 55 to 59       31423
Age 75 to 79       28661
Age 50 to 54       28448
Age 40 to 44       25065
Age 45 to 49       24036
Age 35 to 39       23980
Age 18 to 24       23156
Age 30 to 34       21668
Age 25 to 29       18854
Name: count, dtype: int64

In [6]:
df["AgeCategory"].unique()

array(['Age 80 or older', nan, 'Age 40 to 44', 'Age 75 to 79',
       'Age 70 to 74', 'Age 55 to 59', 'Age 65 to 69', 'Age 60 to 64',
       'Age 50 to 54', 'Age 45 to 49', 'Age 35 to 39', 'Age 30 to 34',
       'Age 25 to 29', 'Age 18 to 24'], dtype=object)

In [7]:
yes_no: dict[str, int] = {"no": 0, "No": 0, "yes": 1, "Yes": 1}
sex_values: dict[str, int] = {"Female": 0, "Male": 1}

variable_types = get_variable_types(df)

encoding = {variable: yes_no for variable in variable_types["binary"][1:]}

def encode_binaries(df, encoding):
    encoding["Sex"] = sex_values
    df = df.replace(encoding, inplace=False)
    return df
encode_binaries(df, encoding).head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,0,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,0.0,8.0,,0.0,...,,,,0.0,0.0,1.0,0.0,"Yes, received tetanus shot but not sure what type",0.0,0
1,Alabama,0,Excellent,0.0,0.0,,0.0,6.0,,0.0,...,1.6,68.04,26.57,0.0,0.0,0.0,0.0,"No, did not receive any tetanus shot in the pa...",0.0,0
2,Alabama,0,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,1.0,7.0,,0.0,...,1.65,63.5,23.3,0.0,0.0,1.0,1.0,"No, did not receive any tetanus shot in the pa...",0.0,0
3,Alabama,0,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,1.0,9.0,,0.0,...,1.57,53.98,21.77,1.0,0.0,0.0,1.0,"No, did not receive any tetanus shot in the pa...",0.0,0
4,Alabama,1,Poor,1.0,0.0,Within past year (anytime less than 12 months ...,0.0,7.0,,1.0,...,1.8,84.82,26.08,0.0,0.0,0.0,1.0,"No, did not receive any tetanus shot in the pa...",0.0,0


In [8]:
variable_types = get_variable_types(df)
variable_types["symbolic"]

['State',
 'GeneralHealth',
 'LastCheckupTime',
 'RemovedTeeth',
 'HadDiabetes',
 'SmokerStatus',
 'ECigaretteUsage',
 'RaceEthnicityCategory',
 'AgeCategory',
 'TetanusLast10Tdap']

In [9]:
df["RaceEthnicityCategory"].value_counts()

RaceEthnicityCategory
White only, Non-Hispanic         277070
Hispanic                          36482
Black only, Non-Hispanic          29403
Other race only, Non-Hispanic     18858
Multiracial, Non-Hispanic          8339
Name: count, dtype: int64

In [10]:
df["RaceEthnicityCategory"].unique()

array(['White only, Non-Hispanic', 'Black only, Non-Hispanic',
       'Multiracial, Non-Hispanic', nan, 'Hispanic',
       'Other race only, Non-Hispanic'], dtype=object)

In [11]:
general_health_encoding = {'Very good':3, 'Excellent':4, 'Fair':1, 'Poor':0, 'Good':1, 'nan':np.nan}
last_checkup_time_encoding = {'Within past year (anytime less than 12 months ago)':0.5, 'nan':np.nan,
       'Within past 2 years (1 year but less than 2 years ago)':1.5,
       'Within past 5 years (2 years but less than 5 years ago)':3.5,
       '5 or more years ago':7}
removed_teeth_encoding = {"nan":np.nan, 'None of them':0, '1 to 5':2, '6 or more, but not all':13, 'All':32}
had_diabetes_encoding = {'Yes':2, 'No':0, 'No, pre-diabetes or borderline diabetes':1, "nan":np.nan, 'Yes, but only during pregnancy (female)':1}
smoker_status_encoding = {'Never smoked':0, 'Current smoker - now smokes some days':2,
       'Former smoker':1, "nan":np.nan, 'Current smoker - now smokes every day':3}
ECiggarette_usage_encoding = {'Not at all (right now)':1,
       'Never used e-cigarettes in my entire life':0, 'Use them every day':3,
       'Use them some days':2, "nan":np.nan}
age_category_encoding = {'Age 80 or older':13, nan, 'Age 40 to 44':5, 'Age 75 to 79':12,
       'Age 70 to 74':11, 'Age 55 to 59':8, 'Age 65 to 69':10, 'Age 60 to 64':9,
       'Age 50 to 54':7, 'Age 45 to 49':6, 'Age 35 to 39':4, 'Age 30 to 34':3,
       'Age 25 to 29':2, 'Age 18 to 24':0}


encoded_race_ethnicity = pd.get_dummies(df, columns=['RaceEthnicityCategory'], dummy_na=False)
encoded_race_ethnicity

SyntaxError: ':' expected after dictionary key (542133067.py, line 13)

In [None]:
variable_types = get_variable_types(df)
variable_types["symbolic"]

['State',
 'GeneralHealth',
 'LastCheckupTime',
 'RemovedTeeth',
 'HadDiabetes',
 'SmokerStatus',
 'ECigaretteUsage',
 'RaceEthnicityCategory',
 'AgeCategory',
 'TetanusLast10Tdap']

In [None]:
def calulate_mean_age(age_categories):
    mean_ages = []
    for value in age_categories:
        split = value.split()
        age1, age2 = int(split[1]), split[3]
        if age2.isdigit():
            mean_ages.append((age1+int(age2))/2)
        else:
            mean_ages.append(85)
    return mean_ages

calulate_mean_age(df["AgeCategory"].value_counts().index)

age_category_value_counts = df["AgeCategory"].value_counts()
mean_ages = calulate_mean_age(age_category_value_counts.index)
age_category_encoding = {age_category_value_counts.index[i]:mean_ages[i] for i in range(len(mean_ages))}
age_category_encoding["nan"] = np.nan
age_category_encoding

{'Age 65 to 69': 67.0,
 'Age 70 to 74': 72.0,
 'Age 60 to 64': 62.0,
 'Age 80 or older': 85,
 'Age 55 to 59': 57.0,
 'Age 75 to 79': 77.0,
 'Age 50 to 54': 52.0,
 'Age 40 to 44': 42.0,
 'Age 45 to 49': 47.0,
 'Age 35 to 39': 37.0,
 'Age 18 to 24': 21.0,
 'Age 30 to 34': 32.0,
 'Age 25 to 29': 27.0,
 'nan': nan}

In [None]:
def yes_no_mapping(answer):
    print(answer)
    if pd.isna(answer):
        return np.nan
    elif answer.startswith('Yes'):
        return 1
    elif answer.startswith('No'):
        return 0
    else:
        return np.nan  # or a specific code for answers that do not start with Yes/No

# Create the mapping dictionary
tetanus_encoding = {answer: yes_no_mapping(answer) for answer in df['TetanusLast10Tdap'].unique()}

tetanus_encoding

Yes, received tetanus shot but not sure what type
No, did not receive any tetanus shot in the past 10 years
nan
Yes, received Tdap
Yes, received tetanus shot, but not Tdap


{'Yes, received tetanus shot but not sure what type': 1,
 'No, did not receive any tetanus shot in the past 10 years': 0,
 nan: nan,
 'Yes, received Tdap': 1,
 'Yes, received tetanus shot, but not Tdap': 1}

In [None]:
def parse_location_file(file_path):
    location_mapping = {}

    with open(file_path, 'r') as file:
        next(file)  # Skip the header line
        for line in file:
            # Remove quotation marks and split the line
            parts = line.strip().replace('"', '').split(':')
            state = parts[0].strip()
            coords = parts[1].strip().strip('[]').split(',')
            longitude = float(coords[0].strip())
            latitude = float(coords[1].strip())

            location_mapping[state] = {'Latitude': latitude, 'Longitude': longitude}

    return location_mapping

# Assuming the file path is 'locations.txt'
file_path = "../datasets/State, LON, LAT.txt"
location_mapping = parse_location_file(file_path)
# location_mapping


In [None]:
def add_coordinates(df, mapping):
    # Extract latitude and longitude from the mapping
    df['Latitude'] = df['State'].map(lambda x: mapping[x]['Latitude'] if x in mapping else None)
    df['Longitude'] = df['State'].map(lambda x: mapping[x]['Longitude'] if x in mapping else None)
    return df

def encode_symbolic(df):
    encoding = {}
    encoding["GeneralHealth"] = general_health_encoding
    encoding["LastCheckupTime"] = last_checkup_time_encoding
    encoding["RemovedTeeth"] = removed_teeth_encoding
    encoding["HadDiabetes"] = had_diabetes_encoding
    encoding["SmokerStatus"] = smoker_status_encoding
    encoding["ECigaretteUsage"] = ECiggarette_usage_encoding
    encoding["AgeCategory"] = age_category_encoding
    encoding["TetanusLast10Tdap"] = tetanus_encoding
    
    df = pd.get_dummies(df, columns=['RaceEthnicityCategory'], dummy_na=False)
    file_path = "../datasets/State, LON, LAT.txt"
    state_encoding = parse_location_file(file_path)
    df = add_coordinates(df, state_encoding)
    # df = df.drop(columns=["RaceEthnicityCategory", "State"])
    df = df.replace(encoding, inplace=False)
    return df


df = encode_symbolic(df)
df = encode_binaries(df, encoding)
df

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,TetanusLast10Tdap,HighRiskLastYear,CovidPos,"RaceEthnicityCategory_Black only, Non-Hispanic",RaceEthnicityCategory_Hispanic,"RaceEthnicityCategory_Multiracial, Non-Hispanic","RaceEthnicityCategory_Other race only, Non-Hispanic","RaceEthnicityCategory_White only, Non-Hispanic",Latitude,Longitude
0,Alabama,0,3.0,0.0,0.0,0.5,0.0,8.0,,0.0,...,1.0,0.0,0,False,False,False,False,True,32.743686,-86.846795
1,Alabama,0,4.0,0.0,0.0,,0.0,6.0,,0.0,...,0.0,0.0,0,False,False,False,False,True,32.743686,-86.846795
2,Alabama,0,4.0,0.0,0.0,0.5,1.0,7.0,,0.0,...,0.0,0.0,0,False,False,False,False,True,32.743686,-86.846795
3,Alabama,0,1.0,2.0,0.0,0.5,1.0,9.0,,0.0,...,0.0,0.0,0,False,False,False,False,True,32.743686,-86.846795
4,Alabama,1,0.0,1.0,0.0,0.5,0.0,7.0,,1.0,...,0.0,0.0,0,False,False,False,False,True,32.743686,-86.846795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380927,Virgin Islands,0,1.0,0.0,7.0,0.5,1.0,7.0,0.0,0.0,...,0.0,0.0,1,True,False,False,False,False,18.059379,-64.838762
380928,Virgin Islands,1,1.0,0.0,15.0,0.5,1.0,7.0,2.0,0.0,...,1.0,0.0,1,False,False,True,False,False,18.059379,-64.838762
380929,Virgin Islands,1,1.0,0.0,0.0,1.5,1.0,8.0,0.0,0.0,...,,0.0,1,False,False,False,False,True,18.059379,-64.838762
380930,Virgin Islands,0,1.0,0.0,3.0,1.5,1.0,6.0,0.0,0.0,...,0.0,0.0,1,True,False,False,False,False,18.059379,-64.838762


In [None]:
encoded_df["RaceEthnicityCategory_Black only, Non-Hispanic"]

KeyError: 'RaceEthnicityCategory_Black only, Non-Hispanic'

In [None]:
bool_cols = df.select_dtypes(include='bool').columns
df[bool_cols] = df[bool_cols].astype(int)
df