## Data Mining Techniques
### COVID-19 data
Kimberley Boersma (2572145), Neil Mizzi (2674737), Selma Muhammad (Stud no)

In [1]:
# Imports
import os
import pandas as pd
import csv
import kaggle

# other imports
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from copy import copy
# import matplotlib.colors as mcolors
# import random
# import math
# import time
# from sklearn.linear_model import LinearRegression, BayesianRidge
# from sklearn.model_selection import RandomizedSearchCV, train_test_split
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.svm import SVR
# from sklearn.metrics import mean_squared_error, mean_absolute_error
from datetime import date, datetime
from dateutil.parser import parse
# import us
# import operator 
# plt.style.use('fivethirtyeight')
# %matplotlib inline 

# Covid Tracking Dataset (w/ hospitalised data)

Source: https://covidtracking.com/

## Step 1: Load and Clean the Data

In [2]:
all_cases = pd.read_csv('https://covidtracking.com/api/v1/states/daily.csv')

# Delete unecessary rows
for row in ['negative', 'pending', 'hash', 'negativeIncrease', 'totalTestResults', 'totalTestResultsIncrease', 'dateChecked', 'fips', 'inIcuCumulative', 'onVentilatorCumulative', 'total', 'posNeg', 'deathIncrease', 'hospitalizedIncrease', 'positiveIncrease']:
    del all_cases[row]

# TODO missing values
#      Do we get avg or missing values, or predict them?
#      See https://developerzen.com/data-mining-handling-missing-values-the-database-bd2241882e72

for i, row in all_cases.iterrows():
    # Set Dates
    s = str(row['date'])
    all_cases.at[i, 'date'] = date(year=int(s[0:4]), month=int(s[4:6]), day=int(s[6:8]))
    

# Missing death figures means no death reports yet
# These are set to 0
for i, row in all_cases.iterrows():
    if np.isnan(row['death']):
        all_cases.at[i, 'death'] = 0

## Missing values: Retrieving from other datasets or through merging columns (or both)

The following will be done:
- **Active Cases**: Retrieved from JHU dataset and calculating $active = pos-dead-recovered$
- **Beds per State**: Retrieved from External Datasets

In [3]:
# TODO Replace active cases with JHU and/or regression model (Selma)
all_cases['active'] = all_cases['positive'] - all_cases['recovered'] - all_cases['death']
# change location of 'active' column
cols = list(all_cases)
cols.insert(3, cols.pop(cols.index('active')))
all_cases = all_cases.loc[:, cols]

In [4]:
# Load datasets for US population and Hospital beds per 1000
us_population = pd.read_csv('data/us_population.csv')
hosp_beds = pd.read_csv('data/hospital_beds.csv')
state_abbrev = pd.read_csv('data/us_state_names.csv')

# add state abbreviations to us_population and hospital beds dataframe
for state in state_abbrev['State'].tolist():
    # store state abbreviation in variable
    abbrev = state_abbrev.loc[state_abbrev['State'] == state, 'Abbreviation'].tolist()[0]
    # add abbrev to new column 'Abbreviation' in us_population df
    us_population.loc[us_population['State'] == state, 'Abbreviation'] = abbrev
    # add abbrev to new column in hosp_beds df
    hosp_beds.loc[hosp_beds['Location'] == state, 'Abbreviation'] = abbrev
    
# change order of columns of us_population
cols = list(us_population)
cols.insert(2, cols.pop(cols.index('Abbreviation')))
us_population = us_population.loc[:, cols]

# drop unnecessary columns of us_population
us_population = us_population.drop(columns=['rank', 'Growth', 'Pop2018', 'Pop2010', 'growthSince2010', 'Percent', 'density'])

# drop unnecessary columns of hosp_beds
hosp_beds = hosp_beds.drop(columns=['Location', 'State/Local Government', 'Non-Profit', 'For-Profit'])

# change order of columns of hosp_beds
cols = list(hosp_beds)
cols.insert(0, cols.pop(cols.index('Abbreviation')))
hosp_beds = hosp_beds.loc[:, cols]

In [5]:
us_population.head()

Unnamed: 0,State,Abbreviation,Pop
0,Alabama,AL,4908621
1,Alaska,AK,734002
2,Arizona,AZ,7378494
3,Arkansas,AR,3038999
4,California,CA,39937489


In [6]:
hosp_beds.head()

Unnamed: 0,Abbreviation,Total
0,,2.4
1,AL,3.1
2,AK,2.2
3,AZ,1.9
4,AR,3.2


In [7]:
# filter out non-existing states like 'AS'
all_cases = all_cases[all_cases['state'].isin(state_abbrev['Abbreviation'].tolist())]

In [8]:
# see what filtered dataframe looks like
all_cases.head()

Unnamed: 0,date,state,positive,active,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,onVentilatorCurrently,recovered,death,hospitalized
0,2020-05-03,AK,368.0,97.0,12.0,,,,262.0,9.0,
1,2020-05-03,AL,7725.0,,,1035.0,,,,290.0,1035.0
2,2020-05-03,AR,3431.0,1356.0,100.0,427.0,,20.0,1999.0,76.0,427.0
4,2020-05-03,AZ,8640.0,6681.0,732.0,1348.0,282.0,192.0,1597.0,362.0,1348.0
5,2020-05-03,CA,53616.0,,4734.0,,1468.0,,,2215.0,


In [9]:
# check which states have 0 positive cases
all_cases.loc[all_cases['positive'] == 0]

Unnamed: 0,date,state,positive,active,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,onVentilatorCurrently,recovered,death,hospitalized
2686,2020-03-17,WV,0.0,,,,,,,0.0,
2742,2020-03-16,WV,0.0,,,,,,,0.0,
2793,2020-03-15,WV,0.0,,,,,,,0.0,
2844,2020-03-14,WV,0.0,,,,,,,0.0,
2859,2020-03-13,ID,0.0,,,,,,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...
3244,2020-03-05,NE,0.0,,,,,,,0.0,
3249,2020-03-05,OH,0.0,,,,,,,0.0,
3252,2020-03-05,SC,0.0,,,,,,,0.0,
3255,2020-03-05,VA,0.0,,,,,,,0.0,


In [10]:
# Split dataframes by date
df_split_by_date = dict(tuple(all_cases.groupby('date')))

# Split dataframes by state
df_split_by_state = dict(tuple(all_cases.groupby('state')))

In [11]:
# merge dataframes us_population and all_cases
df_merge_uspop = all_cases.merge(us_population, how='left', left_on='state', right_on='Abbreviation')
df_merge_uspop = df_merge_uspop.drop(columns=['Abbreviation'])
df_merge_uspop = df_merge_uspop.rename(columns={'Pop': 'population'})

# change location of 'population' column
cols = list(df_merge_uspop)
cols.insert(2, cols.pop(cols.index('population')))
df_merge_uspop = df_merge_uspop.loc[:, cols]

# merge dataframes hosp_beds and df_merge_uspop
df_merge_hosp = df_merge_uspop.merge(hosp_beds, how='left', left_on='state', right_on='Abbreviation')
df_merge_hosp = df_merge_hosp.drop(columns=['Abbreviation'])
all_cases = df_merge_hosp.rename(columns={'Total': 'bedsPerThousand'})

In [12]:
all_cases.head()

Unnamed: 0,date,state,population,positive,active,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,onVentilatorCurrently,recovered,death,hospitalized,State,bedsPerThousand
0,2020-05-03,AK,734002,368.0,97.0,12.0,,,,262.0,9.0,,Alaska,2.2
1,2020-05-03,AL,4908621,7725.0,,,1035.0,,,,290.0,1035.0,Alabama,3.1
2,2020-05-03,AR,3038999,3431.0,1356.0,100.0,427.0,,20.0,1999.0,76.0,427.0,Arkansas,3.2
3,2020-05-03,AZ,7378494,8640.0,6681.0,732.0,1348.0,282.0,192.0,1597.0,362.0,1348.0,Arizona,1.9
4,2020-05-03,CA,39937489,53616.0,,4734.0,,1468.0,,,2215.0,,California,1.8


In [13]:
# Calculate the total beds, and add the column
all_cases['total_beds'] = all_cases['population'] / 1000 * all_cases['bedsPerThousand']
all_cases

Unnamed: 0,date,state,population,positive,active,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,onVentilatorCurrently,recovered,death,hospitalized,State,bedsPerThousand,total_beds
0,2020-05-03,AK,734002,368.0,97.0,12.0,,,,262.0,9.0,,Alaska,2.2,1614.8044
1,2020-05-03,AL,4908621,7725.0,,,1035.0,,,,290.0,1035.0,Alabama,3.1,15216.7251
2,2020-05-03,AR,3038999,3431.0,1356.0,100.0,427.0,,20.0,1999.0,76.0,427.0,Arkansas,3.2,9724.7968
3,2020-05-03,AZ,7378494,8640.0,6681.0,732.0,1348.0,282.0,192.0,1597.0,362.0,1348.0,Arizona,1.9,14019.1386
4,2020-05-03,CA,39937489,53616.0,,4734.0,,1468.0,,,2215.0,,California,1.8,71887.4802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3071,2020-01-26,WA,7797095,1.0,,,,,,,0.0,,Washington,1.7,13255.0615
3072,2020-01-25,WA,7797095,1.0,,,,,,,0.0,,Washington,1.7,13255.0615
3073,2020-01-24,WA,7797095,1.0,,,,,,,0.0,,Washington,1.7,13255.0615
3074,2020-01-23,WA,7797095,1.0,,,,,,,0.0,,Washington,1.7,13255.0615


In [14]:
# change abbreviations to state names
all_cases = all_cases.rename(columns={'state': 'abbrev'})
all_cases = all_cases.rename(columns={'State': 'state'})

In [15]:
# change location of 'state' column
cols = list(all_cases)
cols.insert(1, cols.pop(cols.index('state')))
all_cases = all_cases.loc[:, cols]

In [16]:
all_cases

Unnamed: 0,date,state,abbrev,population,positive,active,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,onVentilatorCurrently,recovered,death,hospitalized,bedsPerThousand,total_beds
0,2020-05-03,Alaska,AK,734002,368.0,97.0,12.0,,,,262.0,9.0,,2.2,1614.8044
1,2020-05-03,Alabama,AL,4908621,7725.0,,,1035.0,,,,290.0,1035.0,3.1,15216.7251
2,2020-05-03,Arkansas,AR,3038999,3431.0,1356.0,100.0,427.0,,20.0,1999.0,76.0,427.0,3.2,9724.7968
3,2020-05-03,Arizona,AZ,7378494,8640.0,6681.0,732.0,1348.0,282.0,192.0,1597.0,362.0,1348.0,1.9,14019.1386
4,2020-05-03,California,CA,39937489,53616.0,,4734.0,,1468.0,,,2215.0,,1.8,71887.4802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3071,2020-01-26,Washington,WA,7797095,1.0,,,,,,,0.0,,1.7,13255.0615
3072,2020-01-25,Washington,WA,7797095,1.0,,,,,,,0.0,,1.7,13255.0615
3073,2020-01-24,Washington,WA,7797095,1.0,,,,,,,0.0,,1.7,13255.0615
3074,2020-01-23,Washington,WA,7797095,1.0,,,,,,,0.0,,1.7,13255.0615


- Load and clean JHU data
- Merge JHU dataset with main dataset

In [17]:
# This cell takes some time, as it needs to connect to Kaggle Servers to retrieve data
kaggle.api.authenticate()
kaggle.api.dataset_download_files('benhamner/jhucovid19', path='./kaggle/input/jhucovid19/', unzip=True)

In [18]:
# Get Time-Series Data of cases as Pandas DataFrame
dir_jhu = './kaggle/input/jhucovid19/csse_covid_19_data/csse_covid_19_daily_reports'

df_list = []
for dirname, _, files in os.walk(dir_jhu):
    for file in files:
        if 'gitignore' not in file and 'README' not in file:
            full_dir = os.path.join(dirname, file)
            #print(full_dir)
            df_list.append(pd.read_csv(full_dir))
jhu_df = pd.concat(df_list, axis=0, ignore_index=True, sort=True)

# combine Last Update with Last_Update
jhu_df['LastUpdate'] = jhu_df['Last_Update'].combine_first(jhu_df['Last Update'])

# format dates with parser
for i, row in jhu_df.iterrows():
    # update dates
    jhu_df.at[i, 'LastUpdate'] = parse(jhu_df.at[i, 'LastUpdate']).date()

# combine Country/Region with Country_Region
jhu_df['CountryRegion'] = jhu_df['Country/Region'].combine_first(jhu_df['Country_Region'])

# Retrieve only US data
jhu_df = jhu_df[jhu_df['CountryRegion']=='US']

# combine Province/State with Province_State
jhu_df['ProvinceState'] = jhu_df['Province/State'].combine_first(jhu_df['Province_State'])

# drop unnecessary columns
jhu_df = jhu_df.drop(['Admin2', 'Lat', 'Latitude', 'Long_', 'Longitude', 'Combined_Key', 'Country/Region',
                      'Country_Region', 'Province/State', 'Province_State',
                      'Last Update', 'Last_Update', 'FIPS'], axis=1)

# change column order
cols = list(jhu_df)
cols.insert(0, cols.pop(cols.index('CountryRegion')))
cols.insert(1, cols.pop(cols.index('ProvinceState')))
cols.insert(2, cols.pop(cols.index('LastUpdate')))
jhu_df = jhu_df.loc[:, cols]

# Change region to known US states
import us
state_abbrs = []
for state in us.states.STATES:
    state_abbrs.append(state.abbr)

for i, row in jhu_df.iterrows():
    for j in state_abbrs:
        if j in jhu_df.at[i, 'ProvinceState']:
            jhu_df.at[i, 'ProvinceState'] = us.states.lookup(j).name
        if jhu_df.at[i, 'ProvinceState'] == 'Washington, D.C.':
            jhu_df.at[i, 'ProvinceState'] = 'District of Columbia'

# Filter out unknown states
jhu_df = jhu_df[jhu_df['ProvinceState'].isin(list(all_cases.state.unique()))]

# Merge-sum rows with same date and State
jhu_df = jhu_df.groupby(['LastUpdate', 'ProvinceState']).agg(
    {
        'Active': sum,
        'Confirmed': sum,
        'Deaths': sum,
        'Recovered': sum
    }
).reset_index()

jhu_df

Unnamed: 0,LastUpdate,ProvinceState,Active,Confirmed,Deaths,Recovered
0,2020-01-22,Washington,0.0,1.0,0.0,0.0
1,2020-01-23,Washington,0.0,1.0,0.0,0.0
2,2020-01-24,Washington,0.0,1.0,0.0,0.0
3,2020-01-25,Illinois,0.0,1.0,0.0,0.0
4,2020-01-25,Washington,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...
2799,2020-05-03,Virginia,17121.0,17738.0,617.0,0.0
2800,2020-05-03,Washington,14173.0,15003.0,830.0,0.0
2801,2020-05-03,West Virginia,1137.0,1185.0,48.0,0.0
2802,2020-05-03,Wisconsin,7326.0,7660.0,334.0,0.0


In [19]:
# Now that we have the JHU dataset relatively cleaned
# we can go ahead and merge its data with our main dataset

for i, row in all_cases.iterrows():
    last_update = all_cases.at[i, 'date']
    state = all_cases.at[i, 'state']
    matching_row = jhu_df[jhu_df['ProvinceState'] == state]
    matching_row = matching_row[matching_row['LastUpdate'] == last_update].reset_index()

    if len(matching_row.values) > 0:
        #all_cases.at[i, 'positive'] = matching_row['Confirmed'].values[0]
        all_cases.at[i, 'active'] = matching_row['Active'].values[0]
        #all_cases.at[i, 'recovered'] = matching_row['Recovered'].values[0]   --- JHU was inconsistent, therefore removed
        #all_cases.at[i, 'death'] = matching_row['Deaths'].values[0]

    # Replace unknown recovery numbers with 0
    if np.isnan(row['recovered']):
        all_cases.at[i, 'recovered'] = 0

    if all_cases.at[i, 'active'] == 0 or np.isnan(row['active']):
        positive = all_cases.at[i, 'positive']
        recovered = all_cases.at[i, 'recovered']
        dead = all_cases.at[i, 'death']
        all_cases.at[i, 'active'] = positive - recovered - dead

all_cases

Unnamed: 0,date,state,abbrev,population,positive,active,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,onVentilatorCurrently,recovered,death,hospitalized,bedsPerThousand,total_beds
0,2020-05-03,Alaska,AK,734002,368.0,356.0,12.0,,,,262.0,9.0,,2.2,1614.8044
1,2020-05-03,Alabama,AL,4908621,7725.0,7435.0,,1035.0,,,0.0,290.0,1035.0,3.1,15216.7251
2,2020-05-03,Arkansas,AR,3038999,3431.0,3300.0,100.0,427.0,,20.0,1999.0,76.0,427.0,3.2,9724.7968
3,2020-05-03,Arizona,AZ,7378494,8640.0,8034.0,732.0,1348.0,282.0,192.0,1597.0,362.0,1348.0,1.9,14019.1386
4,2020-05-03,California,CA,39937489,53616.0,51401.0,4734.0,,1468.0,,0.0,2215.0,,1.8,71887.4802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3071,2020-01-26,Washington,WA,7797095,1.0,1.0,,,,,0.0,0.0,,1.7,13255.0615
3072,2020-01-25,Washington,WA,7797095,1.0,1.0,,,,,0.0,0.0,,1.7,13255.0615
3073,2020-01-24,Washington,WA,7797095,1.0,1.0,,,,,0.0,0.0,,1.7,13255.0615
3074,2020-01-23,Washington,WA,7797095,1.0,1.0,,,,,0.0,0.0,,1.7,13255.0615


In [20]:
# Save formatted dataset offline in case of disaster
all_cases.to_csv('all_cases.csv')

## Step 2: Some Exploratory Data Analysis (EDA)

In [21]:
# TODO Get some insights on data

## Step 3: Build model for dependent Variable 
- To be used to predict hospitalizedCurrently
- Having more complete variables for inICUCurrently and onVentilatorCurrently will allow us to predict these numbers as well

In [22]:
# We are going to compare three models:
# - Linear Regression
# - Polynomial Regression
# - ElasticNet

# Copy DFs to not mess up original one
# We will use model_df for our regression model
model_df = all_cases.copy()

# Delete redundant rows
for row in ['abbrev', 'bedsPerThousand', 'hospitalized', 
'state', 'hospitalizedCumulative']:
    del model_df[row]

# Drop NaN values for hospitalizedCurrently
model_df = model_df.dropna(subset=['hospitalizedCurrently'])

# Drop Values with low active-hospitalised ratio
# TODO see if we can instead use this as a variable
# TODO see if we can drop by bedsPerThousand instead
model_df['ratio_hospital'] = model_df['hospitalizedCurrently'] / model_df['active']
model_df = model_df[~(model_df['ratio_hospital'] <= model_df.ratio_hospital.quantile(0.95))]

#model_df = model_df[~(model_df['ratio_hospital'] <= model_df['ratio_hospital'].median())]
del model_df['ratio_hospital']

# Get peek of model to use
model_df.head()

Unnamed: 0,date,population,positive,active,hospitalizedCurrently,inIcuCurrently,onVentilatorCurrently,recovered,death,total_beds
510,2020-04-23,734002,337.0,119.0,42.0,,,209.0,9.0,1614.8044
533,2020-04-23,5700671,2942.0,1206.0,268.0,104.0,,1536.0,200.0,14251.6775
546,2020-04-23,3954821,3017.0,954.0,284.0,156.0,,1884.0,179.0,11073.4988
922,2020-04-15,39937489,24424.0,23603.0,5163.0,1175.0,,0.0,821.0,71887.4802
942,2020-04-15,6169270,4895.0,4748.0,1024.0,,,0.0,147.0,19124.737


### Model 1: Linear Regression

In [23]:
import itertools

mse_df = {'param_comb': [],
'score': []}

independent_cols = [x for x in list(model_df.columns) if not(x in ['hospitalizedCurrently', 'inIcuCurrently', 'onVentilatorCurrently', 'date'])]

max_keep = len(independent_cols) - 5

param_comb = [list(x) for x in list(itertools.combinations(independent_cols, max_keep)) 
+ list(itertools.combinations(independent_cols, max_keep+1))
+ list(itertools.combinations(independent_cols, max_keep+2))
+ list(itertools.combinations(independent_cols, max_keep+3))
+ list(itertools.combinations(independent_cols, max_keep+4))
+ [independent_cols]]
combs = []
for sublist in param_comb:
    combs.append(list(sublist))

for comb in combs:
    # Independent vars
    X = model_df[comb]

    # Dependent var
    y = model_df['hospitalizedCurrently']

    # Fit model
    linear_model = LinearRegression()
    linear_model.fit(X, y)

    score=(mean_squared_error(y_true=y,y_pred=linear_model.predict(X)))
    
    mse_df['param_comb'].append(comb)
    mse_df['score'].append(np.sqrt(score))

mse_df = pd.DataFrame(mse_df)
mse_df.sort_values(by='score', ascending=True).head()

Unnamed: 0,param_comb,score
62,"[population, positive, active, recovered, deat...",174.103753
57,"[population, positive, active, recovered, tota...",174.113627
59,"[population, positive, recovered, death, total...",174.169729
45,"[population, positive, recovered, total_beds]",174.172361
60,"[population, active, recovered, death, total_b...",174.769236


### Model 2: ElasticNet Regression

Step 1: Get all combinations of independent params

In [38]:
comb_list = []
len_ind = len(independent_cols)
for i in range(len_ind):
    sub_list = list(itertools.combinations(independent_cols, len_ind-i))
    comb_list += list(list(x) for x in sub_list)

Step 2: Cross-validate every combination (Look for best hyperparams first)

In [44]:
cross_val_results = {
    'columns': [],
    'alpha': [],
    'l1_ratio': []
}

for ind_cols in comb_list:
    X = model_df[ind_cols]
    y = model_df['hospitalizedCurrently']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

    # Seek best params like we seek the COVID vaccine
    elastic = ElasticNet(normalize=True, max_iter=100000000)
    search=GridSearchCV(estimator=elastic,param_grid={
        'alpha':np.logspace(-5, 2, 8),
        'l1_ratio':[.1, .2, .4, .6, .75, .8, .85, .9, .95, 1]},
        scoring='neg_mean_squared_error',
        n_jobs=1,
        refit=True,
        cv=10)

    search.fit(X_train,y_train)
    cross_val_results['columns'].append(ind_cols)
    cross_val_results['alpha'].append(search.best_params_['alpha'])
    cross_val_results['l1_ratio'].append(search.best_params_['l1_ratio'])

cross_val_results = pd.DataFrame(cross_val_results)
cross_val_results

Unnamed: 0,columns,alpha,l1_ratio
0,"[population, positive, active, recovered, deat...",1.0000,1.00
1,"[population, positive, active, recovered, death]",1.0000,1.00
2,"[population, positive, active, recovered, tota...",0.0010,0.60
3,"[population, positive, active, death, total_beds]",1.0000,1.00
4,"[population, positive, recovered, death, total...",1.0000,1.00
...,...,...,...
58,[positive],0.0010,0.60
59,[active],0.0001,0.85
60,[recovered],0.0100,0.40
61,[death],10.0000,1.00


In [45]:
rmse_list = []
for i, row in cross_val_results.iterrows():
    alpha = cross_val_results.at[i, 'alpha']
    l1 = cross_val_results.at[i, 'l1_ratio']

    X = model_df[cross_val_results.at[i, 'columns']]
    y = model_df['hospitalizedCurrently']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    elastic = ElasticNet(normalize=True, alpha=alpha, l1_ratio=l1, max_iter=100000000)
    elastic.fit(X_train,y_train)
    rmse=(np.sqrt(mean_squared_error(y_true=y_test,y_pred=elastic.predict(X_test))))
    rmse_list.append(rmse)

cross_val_results['RMSE'] = rmse_list

cross_val_results.sort_values(by='RMSE', ascending=True).head()

Unnamed: 0,columns,alpha,l1_ratio,RMSE
20,"[positive, recovered, death, total_beds]",1.0,1.0,42.895878
28,"[population, active, total_beds]",0.0001,0.6,46.059021
48,"[positive, recovered]",0.001,0.85,69.25271
1,"[population, positive, active, recovered, death]",1.0,1.0,70.453937
7,"[population, positive, active, recovered]",0.001,0.6,71.739029


## Step 4: Using findings from dataset of hospital beds, conclude research problem