In [1]:
from pandas import read_csv, DataFrame

# Short tag and relative location of the raw dataset.
file_tag = "class_pos_covid"
filename = "dataset/class_pos_covid.csv"

# Read the raw CSV; empty strings are treated as missing values.
data: DataFrame = read_csv(filename, na_values="")

**numeric** = ['PhysicalHealthDays', 'MentalHealthDays', 'SleepHours', 'HeightInMeters', 'WeightInKilograms', 'BMI']

**symbolic** = ['State', 'GeneralHealth', 'LastCheckupTime', 'RemovedTeeth', 'HadDiabetes', 'SmokerStatus', 'ECigaretteUsage', 'RaceEthnicityCategory', 'AgeCategory', 'TetanusLast10Tdap']

**binary** = ['Sex', 'PhysicalActivities', 'HadHeartAttack', 'HadAngina', 'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty', 'DifficultyConcentrating', 'DifficultyWalking', 'DifficultyDressingBathing', 'DifficultyErrands', 'ChestScan', 'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'HighRiskLastYear', 'CovidPos']

In [3]:
# Binary variables - every listed column is mapped to 0/1
binary = [
    'Sex',
    'PhysicalActivities',
    'HadHeartAttack',
    'HadAngina',
    'HadStroke',
    'HadAsthma',
    'HadSkinCancer',
    'HadCOPD',
    'HadDepressiveDisorder',
    'HadKidneyDisease',
    'HadArthritis',
    'DeafOrHardOfHearing',
    'BlindOrVisionDifficulty',
    'DifficultyConcentrating',
    'DifficultyWalking',
    'DifficultyDressingBathing',
    'DifficultyErrands',
    'ChestScan',
    'AlcoholDrinkers',
    'HIVTesting',
    'FluVaxLast12',
    'PneumoVaxEver',
    'HighRiskLastYear',
    'CovidPos',
]

# One shared lookup: negative answers and "Male" become 0, positive answers and "Female" become 1.
yes_no: dict[str, int] = {
    "no": 0, "No": 0, "Male": 0,
    "yes": 1, "Yes": 1, "Female": 1,
}

# Every binary column points at the same lookup table.
encoding: dict[str, dict[str, int]] = dict.fromkeys(binary, yes_no)

# Work on a copy so the raw `data` frame stays untouched.
df: DataFrame = data.replace(encoding)
df.shape

(380932, 40)

In [3]:
# Inspect the distinct raw labels of TetanusLast10Tdap in the unencoded frame
# (the displayed result includes NaN for missing answers).
data['TetanusLast10Tdap'].unique()

array(['Yes, received tetanus shot but not sure what type',
       'No, did not receive any tetanus shot in the past 10 years', nan,
       'Yes, received Tdap', 'Yes, received tetanus shot, but not Tdap'],
      dtype=object)

In [4]:
# GeneralHealth - ordinal encoding: levels are listed from worst to best
# and each level is encoded as its position in the list.
generalHealth_levels = ["Poor", "Fair", "Good", "Very good", "Excellent"]
generalHealth_type_values = {level: rank for rank, level in enumerate(generalHealth_levels)}
encoding: dict[str, dict[str, int]] = dict(GeneralHealth=generalHealth_type_values)

df.replace(encoding, inplace=True)
df.shape

(380932, 40)

In [5]:
# LastCheckupTime - ordinal encoding: labels are listed from the oldest checkup
# to the most recent one and each label is encoded as its position in the list.
lastCheckup_levels = [
    "5 or more years ago",
    "Within past 5 years (2 years but less than 5 years ago)",
    "Within past 2 years (1 year but less than 2 years ago)",
    "Within past year (anytime less than 12 months ago)",
]
lastCheckupValue_type_values = dict(zip(lastCheckup_levels, range(len(lastCheckup_levels))))

encoding: dict[str, dict[str, int]] = dict(LastCheckupTime=lastCheckupValue_type_values)
df.replace(encoding, inplace=True)
df.shape

(380932, 40)

In [6]:
# SmokerStatus - ordinal encoding: from never smoked (0) up to daily smoker (3)
smoker_status_levels = [
    'Never smoked',
    'Former smoker',
    'Current smoker - now smokes some days',
    'Current smoker - now smokes every day',
]
smoker_status_dict = {level: rank for rank, level in enumerate(smoker_status_levels)}
encoding: dict[str, dict[str, int]] = dict(SmokerStatus=smoker_status_dict)
df.replace(encoding, inplace=True)
df.shape

(380932, 40)

In [7]:
# ECigaretteUsage - ordinal encoding: from never used (0) up to daily use (3)
e_cigarette_usage_levels = [
    'Never used e-cigarettes in my entire life',
    'Not at all (right now)',
    'Use them some days',
    'Use them every day',
]
e_cigarette_usage_dict = {level: rank for rank, level in enumerate(e_cigarette_usage_levels)}
encoding: dict[str, dict[str, int]] = dict(ECigaretteUsage=e_cigarette_usage_dict)
df.replace(encoding, inplace=True)
df.shape

(380932, 40)

In [8]:
# AgeCategory - ordinal encoding: age bands are listed from youngest to oldest
# and each band is encoded as its position in the list (0..12).
ageCategory_levels = [
    "Age 18 to 24",
    "Age 25 to 29",
    "Age 30 to 34",
    "Age 35 to 39",
    "Age 40 to 44",
    "Age 45 to 49",
    "Age 50 to 54",
    "Age 55 to 59",
    "Age 60 to 64",
    "Age 65 to 69",
    "Age 70 to 74",
    "Age 75 to 79",
    "Age 80 or older",
]
ageCategory_type_values = {band: rank for rank, band in enumerate(ageCategory_levels)}

encoding: dict[str, dict[str, int]] = dict(AgeCategory=ageCategory_type_values)
df.replace(encoding, inplace=True)
df.shape

(380932, 40)

In [None]:
# TetanusLast10Tdap - integer encoding of the four raw answers
# (0 = no shot in the past 10 years, 1..3 = the three "Yes, ..." variants).
TetanusLast10Tdap_dic = dict([
    ('No, did not receive any tetanus shot in the past 10 years', 0),
    ('Yes, received Tdap', 1),
    ('Yes, received tetanus shot but not sure what type', 2),
    ('Yes, received tetanus shot, but not Tdap', 3),
])
encoding: dict[str, dict[str, int]] = dict(TetanusLast10Tdap=TetanusLast10Tdap_dic)
df.replace(encoding, inplace=True)
df.shape

In [5]:
# RaceEthnicityCategory - nominal (label) encoding.
# Keys must match the raw labels exactly. 'White only, Non-Hispanic' is ONE
# category; listing it as two keys ('White only' and 'Non-Hispanic') matched
# nothing, so that category stayed a string while the others became integers.
# Codes are contiguous (0..4) and carry no ordering.
RaceEthnicityCategory_dic = {
       'White only, Non-Hispanic': 0,
       'Black only, Non-Hispanic': 1,
       'Multiracial, Non-Hispanic': 2,
       'Hispanic': 3,
       'Other race only, Non-Hispanic': 4
}
encoding: dict[str, dict[str, int]] = {"RaceEthnicityCategory": RaceEthnicityCategory_dic}
df.replace(encoding, inplace=True)
df.shape

array(['White only, Non-Hispanic', 'Black only, Non-Hispanic',
       'Multiracial, Non-Hispanic', nan, 'Hispanic',
       'Other race only, Non-Hispanic'], dtype=object)

In [10]:
# RemovedTeeth - ordinal encoding: from no teeth removed (0) up to all removed (3)
RemovedTeeth_levels = [
    "None of them",
    "1 to 5",
    "6 or more, but not all",
    "All",
]
RemovedTeeth_type_values = {level: rank for rank, level in enumerate(RemovedTeeth_levels)}
encoding: dict[str, dict[str, int]] = dict(RemovedTeeth=RemovedTeeth_type_values)
df.replace(encoding, inplace=True)
df.shape

(380932, 49)

In [6]:
# HadDiabetes - integer label encoding of the four raw answers.
# NOTE(review): this cell was labelled "Dummify", but the code assigns integer
# codes rather than creating one-hot columns - confirm which one is intended.
HadDiabetes_type_values = dict([
    ("No", 0),
    ("Yes", 1),
    ('No, pre-diabetes or borderline diabetes', 2),
    ('Yes, but only during pregnancy (female)', 3),
])
encoding: dict[str, dict[str, int]] = dict(HadDiabetes=HadDiabetes_type_values)
df.replace(encoding, inplace=True)
df.shape


array(['Yes', 'No', 'No, pre-diabetes or borderline diabetes', nan,
       'Yes, but only during pregnancy (female)'], dtype=object)

In [12]:
# State
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt, atan2

# Lookup table: state / territory name -> [latitude, longitude] in degrees.
# The names are used as join keys against the 'State' column further down, so
# they must match the dataset's spelling exactly.
# NOTE(review): values look like one representative point per state
# (presumably an approximate centroid) - source not recorded here, confirm.
state_coordinates = {
    'Alabama': [32.318231, -86.902298],
    'Alaska': [63.588753, -154.493062],
    'Arizona': [34.048928, -111.093731],
    'Arkansas': [35.20105, -91.831833],
    'California': [36.778261, -119.417932],
    'Colorado': [39.550051, -105.782067],
    'Connecticut': [41.603221, -73.087749],
    'Delaware': [38.910832, -75.52767],
    'District of Columbia': [38.895, -77.03667],
    'Florida': [27.664827, -81.515754],
    'Georgia': [32.157435, -82.907123],
    'Hawaii': [19.898682, -155.665857],
    'Idaho': [44.068202, -114.742041],
    'Illinois': [40.633125, -89.398528],
    'Indiana': [40.551217, -85.602364],
    'Iowa': [41.878003, -93.097702],
    'Kansas': [39.011902, -98.484246],
    'Kentucky': [37.839333, -84.270018],
    'Louisiana': [31.244823, -92.145024],
    'Maine': [45.253783, -69.445469],
    'Maryland': [39.045755, -76.641271],
    'Massachusetts': [42.407211, -71.382437],
    'Michigan': [44.314844, -85.602364],
    'Minnesota': [46.729553, -94.6859],
    'Mississippi': [32.354668, -89.398528],
    'Missouri': [37.964253, -91.831833],
    'Montana': [46.879682, -110.362566],
    'Nebraska': [41.492537, -99.901813],
    'Nevada': [38.80261, -116.419389],
    'New Hampshire': [43.193852, -71.572395],
    'New Jersey': [40.058324, -74.405661],
    'New Mexico': [34.97273, -105.032363],
    'New York': [43.299428, -74.217933],
    'North Carolina': [35.759573, -79.0193],
    'North Dakota': [47.551493, -101.002012],
    'Ohio': [40.417287, -82.907123],
    'Oklahoma': [35.007752, -97.092877],
    'Oregon': [43.804133, -120.554201],
    'Pennsylvania': [41.203322, -77.194525],
    'Rhode Island': [41.580095, -71.477429],
    'South Carolina': [33.836081, -81.163725],
    'South Dakota': [43.969515, -99.901813],
    'Tennessee': [35.517491, -86.580447],
    'Texas': [31.968599, -99.901813],
    'Utah': [39.32098, -111.093731],
    'Vermont': [44.558803, -72.577841],
    'Virginia': [37.431573, -78.656894],
    'Washington': [47.751074, -120.740139],
    'West Virginia': [38.597626, -80.454903],
    'Wisconsin': [43.78444, -88.787868],
    'Wyoming': [43.075968, -107.290284],
    'Guam': [13.444304, 144.793732],
    'Puerto Rico': [18.220833, -66.590149],
    'Virgin Islands': [18.3434, -64.8672]
}

# Build the per-state lookup frame (index = state name) from the coordinates
# dictionary. The frame is created once; a second identical construction was removed.
df_state = pd.DataFrame.from_dict(state_coordinates, orient='index', columns=['latitude', 'longitude'])

# The median latitude/longitude over the listed states splits them into four quadrants.
median_latitude = df_state['latitude'].median()
median_longitude = df_state['longitude'].median()

# Boolean masks for each half-plane; a state sitting exactly on a median falls
# into the "<=" side.
above_lat = df_state['latitude'] > median_latitude
below_lat = df_state['latitude'] <= median_latitude
above_lon = df_state['longitude'] > median_longitude
below_lon = df_state['longitude'] <= median_longitude

# Label each state with its quadrant; 'Unknown' is only reachable if a coordinate is NaN.
df_state['quadrant'] = np.select(
    [
        above_lat & above_lon,
        below_lat & above_lon,
        above_lat & below_lon,
        below_lat & below_lon,
    ],
    ['Q1', 'Q2', 'Q3', 'Q4'],
    default='Unknown'
)

# Replace the quadrant labels by integer codes 0..3 (any other label becomes NaN).
df_state['quadrant'] = df_state['quadrant'].map({'Q1': 0, 'Q2': 1, 'Q3': 2, 'Q4': 3})

# Reference point [latitude, longitude] per quadrant label: the median point
# and its sign-flipped variants.
# NOTE(review): negating a coordinate mirrors the point across the equator /
# prime meridian, so these points do not follow the median-based quadrant
# split used for the 'quadrant' column - confirm this is intended.
quadrant_coordinates = dict(
    Q1=[median_latitude, median_longitude],
    Q2=[median_latitude, -median_longitude],
    Q3=[-median_latitude, -median_longitude],
    Q4=[-median_latitude, median_longitude],
)

def haversine(lat1, lon1, lat2, lon2, radius=6371.0):
    """Great-circle distance between two points given in decimal degrees.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.
    radius : float, default 6371.0
        Sphere radius; the result is expressed in the same unit. The default
        is the Earth radius in kilometers (use 3956 for miles).

    Returns
    -------
    float
        Distance along the sphere surface between the two points.
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2

    # Rounding can push `a` marginally above 1 for (near-)antipodal points,
    # which would make sqrt(1 - a) raise a math domain error. The comparison
    # is False for NaN, so NaN inputs still propagate as NaN.
    if a > 1.0:
        a = 1.0

    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return radius * c

# State - replace the raw state name by geographic features.
# The distance to each quadrant reference point depends only on the state, so
# it is computed once per state here instead of once per respondent row.
state_features = df_state.copy()
for quadrant, (lat, lon) in quadrant_coordinates.items():
    state_features[f'distance_to_{quadrant}'] = [
        haversine(state_lat, state_lon, lat, lon)
        for state_lat, state_lon in zip(state_features['latitude'], state_features['longitude'])
    ]

# Left join on the state name: a respondent whose State is missing or not in
# the lookup table keeps its row (with NaN features) instead of being dropped.
df = pd.merge(df, state_features, how='left', left_on='State', right_index=True)

# The raw state name and the helper coordinates are no longer needed.
df = df.drop(columns=['State', 'latitude', 'longitude'])

# Number of quadrant codes (0..3). Taken from the quadrant definition rather
# than from the data, so the angles below stay distinct even if a quadrant has
# no respondents.
n_categories = len(quadrant_coordinates)

# Cyclic (sine/cosine) encoding of the quadrant code, then drop the raw code.
df['quadrant_sin'] = np.sin(2 * np.pi * df['quadrant'] / n_categories)
df['quadrant_cos'] = np.cos(2 * np.pi * df['quadrant'] / n_categories)
df = df.drop(columns=['quadrant'])
df.shape

(380932, 58)

In [14]:
# Persist the encoded dataset next to the notebook, without the row index.
df.to_csv(path_or_buf="./DataEncoding.csv", index=False)