In [2]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
import sklearn
from sklearn import tree
from sklearn.model_selection import train_test_split


In [3]:
# Read the training set (pandas opens the zipped CSV directly).
df_train = pd.read_csv('./train.csv.zip')

# Read the test set.
df_test = pd.read_csv('./test.csv.zip')

# Keep only the training rows whose outcome is known, i.e. death_yn equals 0 or 1.
df_train = df_train.loc[df_train['death_yn'].eq(1) | df_train['death_yn'].eq(0)]

# Show the test dataframe
df_test

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,case_month,res_state,state_fips_code,res_county,county_fips_code,age_group,sex,race,ethnicity,case_positive_specimen_interval,case_onset_interval,process,exposure_yn,labconfirmed_yn,symptomatic_yn,hosp_yn,icu_yn,underlying_conditions_yn
0,2021-10,OR,41.0,MARION,41047.0,65+ years,Female,White,Unknown,,0.0,Missing,,1,1.0,0.0,,
1,2021-10,KS,20.0,SEDGWICK,20173.0,18 to 49 years,Male,White,Non-Hispanic/Latino,0.0,0.0,Missing,,0,1.0,0.0,,
2,2021-10,MD,24.0,CALVERT,24009.0,65+ years,Female,White,,,,Missing,,1,,,,
3,2021-10,KS,20.0,CRAWFORD,20037.0,18 to 49 years,Female,White,Non-Hispanic/Latino,0.0,0.0,Missing,1.0,0,1.0,0.0,,
4,2021-10,NJ,34.0,BURLINGTON,34005.0,0 - 17 years,Male,White,Non-Hispanic/Latino,,,Missing,,1,,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2594407,2021-12,NY,36.0,BROOME,36007.0,18 to 49 years,Male,White,Non-Hispanic/Latino,,,Missing,,1,,,,
2594408,2021-12,ID,16.0,TWIN FALLS,16083.0,18 to 49 years,Male,,,0.0,,Clinical evaluation,,0,,,,
2594409,2021-12,NJ,34.0,ESSEX,34013.0,18 to 49 years,Female,,,,,Missing,,1,,,,
2594410,2021-12,MI,26.0,KALAMAZOO,26077.0,18 to 49 years,Male,White,Non-Hispanic/Latino,,0.0,Missing,,1,1.0,0.0,,


In [4]:
# List the states that occur in the test data
print(np.unique(df_test['res_state']))

# The test set has no missing states but the training set has a very small number of them,
# so those rows are dropped from the training set.
df_train = df_train.loc[df_train['res_state'].notna()]

print(np.unique(df_train['res_state']))


['AK' 'AL' 'AR' 'AZ' 'CA' 'CO' 'CT' 'DC' 'FL' 'GA' 'ID' 'IL' 'IN' 'KS'
 'KY' 'MA' 'MD' 'ME' 'MI' 'MN' 'MO' 'MT' 'NC' 'ND' 'NH' 'NJ' 'NM' 'NV'
 'NY' 'OH' 'OK' 'OR' 'PA' 'PR' 'RI' 'SC' 'TN' 'TX' 'UT' 'VA' 'VI' 'VT'
 'WA' 'WI' 'WY']
['AK' 'AL' 'AR' 'AZ' 'CA' 'CO' 'CT' 'DC' 'DE' 'FL' 'GA' 'GU' 'IA' 'ID'
 'IL' 'IN' 'KS' 'KY' 'LA' 'MA' 'MD' 'ME' 'MI' 'MN' 'MO' 'MS' 'MT' 'NC'
 'ND' 'NH' 'NJ' 'NM' 'NV' 'NY' 'OH' 'OK' 'OR' 'PA' 'PR' 'RI' 'SC' 'TN'
 'TX' 'UT' 'VA' 'VI' 'VT' 'WA' 'WI' 'WY']


In [5]:
# Longitude/latitude of each state, indexed by state code.
# Source: https://developers.google.com/public-data/docs/canonical/states_csv
state_long_lat = pd.read_csv('./states.csv').set_index('state')

state_long_lat

Unnamed: 0_level_0,latitude,longitude,name
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,63.588753,-154.493062,Alaska
AL,32.318231,-86.902298,Alabama
AR,35.20105,-91.831833,Arkansas
AZ,34.048928,-111.093731,Arizona
CA,36.778261,-119.417932,California
CO,39.550051,-105.782067,Colorado
CT,41.603221,-73.087749,Connecticut
DC,38.905985,-77.033418,District of Columbia
DE,38.910832,-75.52767,Delaware
FL,27.664827,-81.515754,Florida


In [6]:
# Longitude/latitude of each county, indexed by county FIPS code.
# Source: https://simplemaps.com/data/us-counties
county_long_lat = pd.read_csv('./uscounties.csv').set_index('county_fips')

county_long_lat

Unnamed: 0_level_0,county,county_ascii,county_full,state_id,state_name,lat,lng,population
county_fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6037,Los Angeles,Los Angeles,Los Angeles County,CA,California,34.3207,-118.2248,10081570
17031,Cook,Cook,Cook County,IL,Illinois,41.8401,-87.8168,5198275
48201,Harris,Harris,Harris County,TX,Texas,29.8577,-95.3936,4646630
4013,Maricopa,Maricopa,Maricopa County,AZ,Arizona,33.3490,-112.4915,4328810
6073,San Diego,San Diego,San Diego County,CA,California,33.0341,-116.7353,3316073
...,...,...,...,...,...,...,...,...
31005,Arthur,Arthur,Arthur County,NE,Nebraska,41.5689,-101.6958,427
31117,McPherson,McPherson,McPherson County,NE,Nebraska,41.5682,-101.0605,395
48269,King,King,King County,TX,Texas,33.6166,-100.2558,237
48301,Loving,Loving,Loving County,TX,Texas,31.8493,-103.5799,98


In [None]:
# Add longitude and latitude to df.
# County coordinates are used when the county is known; otherwise the state's coordinates are used.

def _lookup_coordinate(row, state_column, county_column, state_table=None, county_table=None):
    """Return one coordinate (latitude or longitude) for a single case row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'county_fips_code' and 'res_state'.
    state_column : str
        Column of the state table to read ('latitude' or 'longitude').
    county_column : str
        Column of the county table to read ('lat' or 'lng').
    state_table, county_table : DataFrame, optional
        Lookup tables indexed by state code / county FIPS code. When omitted,
        the notebook-level `state_long_lat` / `county_long_lat` are used.

    Returns
    -------
    The county coordinate when the row has a county FIPS code that exists in
    the county table, otherwise the coordinate of the row's state.

    Raises
    ------
    KeyError
        If the state fallback is needed and the state is not in the state table.
    """
    states = state_long_lat if state_table is None else state_table
    counties = county_long_lat if county_table is None else county_table

    fips = row['county_fips_code']
    if not pd.isna(fips):
        try:
            return counties.loc[fips, county_column]
        except KeyError:
            # County code is not in the lookup table: treat it like a missing
            # county and fall through to the state coordinates instead of crashing.
            pass
    return states.loc[row['res_state'], state_column]


def get_lat(row, state_table=None, county_table=None):
    """Latitude for a case row (county if available, else state)."""
    return _lookup_coordinate(row, 'latitude', 'lat', state_table, county_table)


def get_long(row, state_table=None, county_table=None):
    """Longitude for a case row (county if available, else state)."""
    return _lookup_coordinate(row, 'longitude', 'lng', state_table, county_table)
    

def _attach_lat_long(df):
    """Add 'lat' and 'long' columns to `df` in place.

    Vectorized replacement for a row-wise `DataFrame.apply(axis=1)`: the county
    FIPS code is mapped through `county_long_lat`, and rows that get no county
    coordinate (missing or unlisted county) fall back to the coordinate of
    their state from `state_long_lat`.

    Raises
    ------
    KeyError
        If some row can be resolved through neither table, so the problem is
        reported instead of silently leaving NaN coordinates.
    """
    county_fips = df['county_fips_code']
    state_codes = df['res_state']
    # (output column, county-table column, state-table column)
    column_plan = (('lat', 'lat', 'latitude'), ('long', 'lng', 'longitude'))
    for target, county_col, state_col in column_plan:
        coords = county_fips.map(county_long_lat[county_col])
        coords = coords.fillna(state_codes.map(state_long_lat[state_col]))
        unresolved = coords.isna()
        if unresolved.any():
            missing_states = sorted(state_codes[unresolved].dropna().unique())
            raise KeyError(
                "no coordinates for %d rows; states involved: %s"
                % (unresolved.sum(), missing_states)
            )
        df[target] = coords


_attach_lat_long(df_train)
_attach_lat_long(df_test)

df_train


In [None]:
# Turn icu_yn into a boolean flag: True only where the value equals 1, so NaN becomes False.
df_train['icu_yn'] = df_train['icu_yn'].eq(1)
df_test['icu_yn'] = df_test['icu_yn'].eq(1)

df_test

In [None]:
# Distinct age-group labels in the training and test data

print(pd.unique(df_train['age_group']))

print(pd.unique(df_test['age_group']))

In [None]:
# The median age of an American falls in the '18 to 49 years' group, so missing values are put in that group.
# Instead of using age groups, each group is replaced by a single representative age for its bin.

# Representative age assigned to each age-group label.
AGE_BIN_MIDPOINTS = {
    '0 - 17 years': 9,
    '18 to 49 years': 34,
    '50 to 64 years': 57,
    '65+ years': 80,
}

# Age used when the group is unknown (same value as the '18 to 49 years' bin).
DEFAULT_AGE = 34


def get_age(x):
    """Convert an age-group label into a numeric age.

    Parameters
    ----------
    x : str or missing value
        An age-group label such as '65+ years'; may be NaN/None or 'Missing'.

    Returns
    -------
    int
        The representative age of the bin. NaN, None, 'Missing' and any label
        that is not in AGE_BIN_MIDPOINTS yield DEFAULT_AGE, so the function
        never returns None.
    """
    # Missing values and unlisted labels are simply absent from the table,
    # so a single lookup with a default covers all of them.
    return AGE_BIN_MIDPOINTS.get(x, DEFAULT_AGE)


# Numeric age derived from the age-group label, for both data sets.
df_train['age'] = df_train['age_group'].apply(get_age)
df_test['age'] = df_test['age_group'].apply(get_age)

df_train

In [None]:
# Distinct values of the sex column in the training and test data

print(pd.unique(df_train['sex']))

print(pd.unique(df_test['sex']))

In [None]:
# Fold NaN into the 'Missing' label so that both are treated as the same group.
# All other labels are kept as they are.
# The result will be one-hot encoded.

def get_sex(x):
    """Return 'Missing' for a missing value, otherwise the label unchanged."""
    return 'Missing' if pd.isna(x) else x


# Cleaned sex label (NaN folded into 'Missing'), stored in a new column.
df_train['sex2'] = df_train['sex'].apply(get_sex)
df_test['sex2'] = df_test['sex'].apply(get_sex)

df_train

In [None]:
# Check how many process entries are missing.
# The vectorized Series.sum() counts the True values without iterating the
# column element by element in Python, as the builtin sum() would.
(df_train['process'] == 'Missing').sum()

#since 12011438 out of 13394403 are missing I will ignore process

In [None]:
# Turn hosp_yn into a boolean flag: True only where the value equals 1, so NaN becomes False.
df_train['hosp_yn'] = df_train['hosp_yn'].eq(1)
df_test['hosp_yn'] = df_test['hosp_yn'].eq(1)

In [None]:
# Rows where case_positive_specimen_interval (cpsi) is missing; computed once and reused.
cpsi_missing = df_train['case_positive_specimen_interval'].isna()
deaths = df_train['death_yn']

# Share of all training rows with a missing cpsi.
print( "Fraction cpsi nan: ", cpsi_missing.sum() * 1.0 / len(cpsi_missing) )

# Share of all deaths that fall on rows with a missing cpsi.
print("Fraction of death corresponding to cpsi nan: ", deaths.loc[cpsi_missing].sum() / deaths.sum())


# Author's observation: 70% of the deaths occur on rows where case_positive_specimen_interval is NaN,
# while only 50% of the rows have a NaN there.
# A NaN cpsi therefore goes with a higher probability of death; it is replaced by 200 so that it is
# clearly distinguished from the other values.


def get_cpsi(x):
    """Return 200 for a missing interval, otherwise the interval unchanged."""
    return 200 if pd.isna(x) else x


# cpsi column: the specimen interval with missing values replaced by the sentinel from get_cpsi.
df_train['cpsi'] = df_train['case_positive_specimen_interval'].apply(get_cpsi)
df_test['cpsi'] = df_test['case_positive_specimen_interval'].apply(get_cpsi)

df_train

In [None]:
#This feature is almost all empty or 0, doesn't seem useful - will remove
# Count the rows where case_onset_interval is missing or zero, using the
# vectorized Series.sum() rather than the builtin sum() over the Series.
onset_interval = df_train['case_onset_interval']
(onset_interval.isna() | (onset_interval == 0)).sum()

In [None]:
# Turn underlying_conditions_yn into a boolean flag: True only where the value equals 1, so NaN becomes False.
df_train['underlying_conditions_yn'] = df_train['underlying_conditions_yn'].eq(1)
df_test['underlying_conditions_yn'] = df_test['underlying_conditions_yn'].eq(1)

In [None]:
# Keep only the columns that will be used from here on.
# Both frames share the same feature columns; the training frame also keeps the death_yn label.
feature_columns = [
    'case_month', 'lat', 'long', 'age', 'sex2', 'icu_yn', 'hosp_yn',
    'cpsi', 'underlying_conditions_yn', 'race', 'ethnicity',
]

df_train = df_train.loc[:, feature_columns + ['death_yn']]
df_test = df_test.loc[:, feature_columns]

df_train

In [None]:
# Distinct ethnicity labels in the training data.
# NOTE(review): the following cell starts with this same print, so this cell looks redundant.
print("ethnicities: ", pd.unique(df_train['ethnicity']))

In [None]:
print("ethnicities: ",df_train['ethnicity'].unique())

# Death rate per ethnicity label = deaths in the group / rows in the group.
# Each group is described once as (printed label, row mask); the sums use the
# vectorized Series.sum() instead of the builtin sum() over the Series.
ethnicity = df_train['ethnicity']
deaths = df_train['death_yn']
ethnicity_groups = [
    ("nan death rate: ", ethnicity.isna()),
    ("unknown death rate: ", ethnicity == 'Unknown'),
    ("Hispanic death rate: ", ethnicity == 'Hispanic/Latino'),
    ("Missing death rate: ", ethnicity == 'Missing'),
    ("Non-hispanic death rate: ", ethnicity == 'Non-Hispanic/Latino'),
]
for label, in_group in ethnicity_groups:
    print(label, deaths.loc[in_group].sum() * 1.0 / in_group.sum())

# 'Unknown' and 'Missing' are merged because they seem equivalent.
# NaN is relabelled 'Omitted' because it seems to behave differently.
# Author's observation: Hispanics are more likely to die than Non-Hispanics.

def get_eth(x):
    """Normalise an ethnicity label: 'Missing' -> 'Unknown', missing value -> 'Omitted', else unchanged."""
    if x == 'Missing':
        return 'Unknown'
    return 'Omitted' if pd.isna(x) else x


# Overwrite ethnicity with the normalised labels in both data sets.
df_train['ethnicity'] = df_train['ethnicity'].apply(get_eth)
df_test['ethnicity'] = df_test['ethnicity'].apply(get_eth)

df_train

In [None]:
print("races: ",df_train['race'].unique())

# Death rate per race label = deaths in the group / rows in the group.
# NaN needs its own mask because `== NaN` never matches. The sums use the
# vectorized Series.sum() instead of the builtin sum() over the Series.
deaths = df_train['death_yn']
for race in df_train['race'].unique():
    if pd.isna(race):
        in_group = df_train['race'].isna()
        label = "nan death rate: "
    else:
        in_group = df_train['race'] == race
        label = race + " death rate: "
    print(label, deaths.loc[in_group].sum() * 1.0 / in_group.sum())


# I Will keep all of these

In [None]:
# NaN is relabelled 'Omitted' so that the column can go through a one-hot encoder.

def get_race(x):
    """Return 'Omitted' for a missing value, otherwise the race label unchanged."""
    return 'Omitted' if pd.isna(x) else x


# Overwrite race with the NaN-free labels in both data sets.
df_train['race'] = df_train['race'].apply(get_race)
df_test['race'] = df_test['race'].apply(get_race)

df_train

In [None]:
# Write the prepared training and test frames to CSV files in the working directory.
# NOTE(review): 'trian_mod_final.csv' looks like a typo for 'train_mod_final.csv'. It is left
# unchanged here because other code may read the file under this exact name — confirm before renaming.
# to_csv also writes the row index by default, so the files get an extra unnamed leading column.
df_train.to_csv('./trian_mod_final.csv')
df_test.to_csv('./test_mod_final.csv')