## Data Mining Techniques
### COVID-19 data
Kimberley Boersma (2572145), Neil Mizzi (2674737), Selma Muhammad (Stud no)

In [1]:
# Imports
import os
import pandas as pd
import csv
import kaggle

# other imports
import numpy as np 
# import matplotlib.pyplot as plt 
# import matplotlib.colors as mcolors
# import random
# import math
# import time
# from sklearn.linear_model import LinearRegression, BayesianRidge
# from sklearn.model_selection import RandomizedSearchCV, train_test_split
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.svm import SVR
# from sklearn.metrics import mean_squared_error, mean_absolute_error
from datetime import date, datetime
# import us
# import operator 
# plt.style.use('fivethirtyeight')
# %matplotlib inline 

# Covid Tracking Dataset (w/ hospitalised data)

Source: https://covidtracking.com/

## Step 1: Load and Clean the Data

In [2]:
all_cases = pd.read_csv('https://covidtracking.com/api/v1/states/daily.csv')

# Delete unecessary rows
for row in ['negative', 'pending', 'hash', 'negativeIncrease', 'totalTestResults', 'totalTestResultsIncrease', 'dateChecked', 'fips', 'inIcuCumulative', 'onVentilatorCumulative', 'total', 'posNeg', 'deathIncrease', 'hospitalizedIncrease', 'positiveIncrease']:
    del all_cases[row]

# TODO missing values
#      Do we get avg or missing values, or predict them?
#      See https://developerzen.com/data-mining-handling-missing-values-the-database-bd2241882e72

for i, row in all_cases.iterrows():
    # Set Dates
    s = str(row['date'])
    all_cases.at[i, 'date'] = date(year=int(s[0:4]), month=int(s[4:6]), day=int(s[6:8]))
    

# Missing death figures means no death reports yet
# These are set to 0
for i, row in all_cases.iterrows():
    if np.isnan(row['death']):
        all_cases.at[i, 'death'] = 0

## Missing values: Retrieving from other datasets or through merging columns (or both)

The following will be done:
- **Active Cases**: Retrieved from $active = pos-dead-recovered$
- **Beds per State**: Retrieved from External Datasets

In [3]:
# TODO Replace active cases with JHU and/or regression model (Selma)
all_cases['active'] = all_cases['positive'] - all_cases['recovered'] - all_cases['death']
# change location of 'active' column
cols = list(all_cases)
cols.insert(3, cols.pop(cols.index('active')))
all_cases = all_cases.loc[:, cols]

In [4]:
# Load datasets for US population and Hospital beds per 1000
us_population = pd.read_csv('data/us_population.csv')
hosp_beds = pd.read_csv('data/hospital_beds.csv')
state_abbrev = pd.read_csv('data/us_state_names.csv')

# add state abbreviations to us_population and hospital beds dataframe
for state in state_abbrev['State'].tolist():
    abbrev = state_abbrev.loc[state_abbrev['State'] == state, 'Abbreviation'].tolist()[0]
    us_population.loc[us_population['State'] == state, 'Abbreviation'] = abbrev
    hosp_beds.loc[hosp_beds['Location'] == state, 'Abbreviation'] = abbrev
    
# change order of columns of us_population
cols = list(us_population)
cols.insert(2, cols.pop(cols.index('Abbreviation')))
us_population = us_population.loc[:, cols]

# drop unnecessary columns of us_population
us_population = us_population.drop(columns=['rank', 'State', 'Growth', 'Pop2018', 'Pop2010', 'growthSince2010', 'Percent', 'density'])

# drop unnecessary columns of hosp_beds
hosp_beds = hosp_beds.drop(columns=['Location', 'State/Local Government', 'Non-Profit', 'For-Profit'])

# change order of columns of hosp_beds
cols = list(hosp_beds)
cols.insert(0, cols.pop(cols.index('Abbreviation')))
hosp_beds = hosp_beds.loc[:, cols]

In [5]:
us_population.head()

Unnamed: 0,Abbreviation,Pop
0,AL,4908621
1,AK,734002
2,AZ,7378494
3,AR,3038999
4,CA,39937489


In [6]:
hosp_beds.head()

Unnamed: 0,Abbreviation,Total
0,,2.4
1,AL,3.1
2,AK,2.2
3,AZ,1.9
4,AR,3.2


In [7]:
# filter out non-existing states like 'AS'
all_cases = all_cases[all_cases['state'].isin(state_abbrev['Abbreviation'].tolist())]

In [8]:
# see what filtered dataframe looks like
all_cases.head()

Unnamed: 0,date,state,positive,active,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,onVentilatorCurrently,recovered,death,hospitalized
0,2020-04-26,AK,341.0,115.0,14.0,,,,217.0,9.0,
1,2020-04-26,AL,6270.0,,,845.0,,,,216.0,845.0
2,2020-04-26,AR,2941.0,1907.0,104.0,291.0,,25.0,985.0,49.0,291.0
4,2020-04-26,AZ,6526.0,4868.0,671.0,1037.0,308.0,200.0,1383.0,275.0,1037.0
5,2020-04-26,CA,42164.0,,4928.0,,1473.0,,,1710.0,


In [9]:
# check which states have 0 positive cases
all_cases.loc[all_cases['positive'] == 0]

Unnamed: 0,date,state,positive,active,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,onVentilatorCurrently,recovered,death,hospitalized
2294,2020-03-17,WV,0.0,,,,,,,0.0,
2350,2020-03-16,WV,0.0,,,,,,,0.0,
2401,2020-03-15,WV,0.0,,,,,,,0.0,
2452,2020-03-14,WV,0.0,,,,,,,0.0,
2467,2020-03-13,ID,0.0,,,,,,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...
2859,2020-03-05,NE,0.0,,,,,,,0.0,
2864,2020-03-05,OH,0.0,,,,,,,0.0,
2867,2020-03-05,SC,0.0,,,,,,,0.0,
2870,2020-03-05,VA,0.0,,,,,,,0.0,


In [10]:
# Split dataframes by date
df_split_by_date = dict(tuple(all_cases.groupby('date')))

# Split dataframes by state
df_split_by_state = dict(tuple(all_cases.groupby('state')))

df_split_by_state['NY'].head()

Unnamed: 0,date,state,positive,active,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,onVentilatorCurrently,recovered,death,hospitalized
37,2020-04-26,NY,288045.0,247192.0,12819.0,57103.0,5016.0,,23887.0,16966.0,57103.0
93,2020-04-25,NY,282143.0,241657.0,13524.0,57103.0,5016.0,,23887.0,16599.0,57103.0
149,2020-04-24,NY,271590.0,231541.0,14258.0,57103.0,5016.0,,23887.0,16162.0,57103.0
205,2020-04-23,NY,263460.0,223833.0,15021.0,57103.0,5016.0,,23887.0,15740.0,57103.0
261,2020-04-22,NY,257216.0,218027.0,15599.0,57103.0,5016.0,,23887.0,15302.0,57103.0


In [11]:
# merge dataframes us_population and all_cases

df_merge_uspop = all_cases.merge(us_population, how='left', left_on='state', right_on='Abbreviation')
df_merge_uspop = df_merge_uspop.drop(columns=['Abbreviation'])
df_merge_uspop = df_merge_uspop.rename(columns={'Pop': 'population'})

# change location of 'population' column
cols = list(df_merge_uspop)
cols.insert(2, cols.pop(cols.index('population')))
df_merge_uspop = df_merge_uspop.loc[:, cols]

# merge dataframes hosp_beds and df_merge_uspop
df_merge_hosp = df_merge_uspop.merge(hosp_beds, how='left', left_on='state', right_on='Abbreviation')
df_merge_hosp = df_merge_hosp.drop(columns=['Abbreviation'])
all_cases = df_merge_hosp.rename(columns={'Total': 'bedsPerThousand'})

In [12]:
all_cases.head()

Unnamed: 0,date,state,population,positive,active,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,onVentilatorCurrently,recovered,death,hospitalized,bedsPerThousand
0,2020-04-26,AK,734002,341.0,115.0,14.0,,,,217.0,9.0,,2.2
1,2020-04-26,AL,4908621,6270.0,,,845.0,,,,216.0,845.0,3.1
2,2020-04-26,AR,3038999,2941.0,1907.0,104.0,291.0,,25.0,985.0,49.0,291.0,3.2
3,2020-04-26,AZ,7378494,6526.0,4868.0,671.0,1037.0,308.0,200.0,1383.0,275.0,1037.0,1.9
4,2020-04-26,CA,39937489,42164.0,,4928.0,,1473.0,,,1710.0,,1.8


In [16]:
# Calculate the total beds, and add the column
all_cases['total_beds'] = all_cases['population'] / 1000 * all_cases['bedsPerThousand']
all_cases

Unnamed: 0,date,state,population,positive,active,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,onVentilatorCurrently,recovered,death,hospitalized,bedsPerThousand,total_beds
0,2020-04-26,AK,734002,341.0,115.0,14.0,,,,217.0,9.0,,2.2,1614.8044
1,2020-04-26,AL,4908621,6270.0,,,845.0,,,,216.0,845.0,3.1,15216.7251
2,2020-04-26,AR,3038999,2941.0,1907.0,104.0,291.0,,25.0,985.0,49.0,291.0,3.2,9724.7968
3,2020-04-26,AZ,7378494,6526.0,4868.0,671.0,1037.0,308.0,200.0,1383.0,275.0,1037.0,1.9,14019.1386
4,2020-04-26,CA,39937489,42164.0,,4928.0,,1473.0,,,1710.0,,1.8,71887.4802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2722,2020-01-26,WA,7797095,1.0,,,,,,,0.0,,1.7,13255.0615
2723,2020-01-25,WA,7797095,1.0,,,,,,,0.0,,1.7,13255.0615
2724,2020-01-24,WA,7797095,1.0,,,,,,,0.0,,1.7,13255.0615
2725,2020-01-23,WA,7797095,1.0,,,,,,,0.0,,1.7,13255.0615


## Step 2: Some Exploratory Data Analysis (EDA)

In [13]:
# TODO Get some insights on data

## Step 3: Build model for missing data (maybe? Up for discussion)

In [14]:
## TODO Prepare necessary data for model (filter out with values which have all data)

## Step 4: Using findings from dataset of hospital beds, conclude research problem

## Download datasets from Kaggle

In [15]:
kaggle.api.authenticate()
kaggle.api.dataset_download_files('benhamner/jhucovid19', path='./kaggle/input/jhucovid19/', unzip=True)

## Load JHU data

In [16]:
# Get Time-Series Data of cases as Pandas DataFrame
dir_jhu = './kaggle/input/jhucovid19/csse_covid_19_data/csse_covid_19_daily_reports'

df_list = []
for dirname, _, files in os.walk(dir_jhu):
    for file in files:
        if 'gitignore' not in file and 'README' not in file:
            full_dir = os.path.join(dirname, file)
            #print(full_dir)
            df_list.append(pd.read_csv(full_dir))
jhu_df = pd.concat(df_list, axis=0, ignore_index=True, sort=True)

jhu_df

Unnamed: 0,Active,Admin2,Combined_Key,Confirmed,Country/Region,Country_Region,Deaths,FIPS,Last Update,Last_Update,Lat,Latitude,Long_,Longitude,Province/State,Province_State,Recovered
0,,,,67760.0,Mainland China,,3024.0,,2020-03-10T15:13:05,,,30.9756,,112.2707,Hubei,,47743.0
1,,,,10149.0,Italy,,631.0,,2020-03-10T17:53:02,,,43.0000,,12.0000,,,724.0
2,,,,8042.0,Iran (Islamic Republic of),,291.0,,2020-03-10T19:13:20,,,32.0000,,53.0000,,,2731.0
3,,,,7513.0,Republic of Korea,,54.0,,2020-03-10T19:13:20,,,36.0000,,128.0000,,,247.0
4,,,,1784.0,France,,33.0,,2020-03-10T18:53:02,,,47.0000,,2.0000,,,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114236,,,,7.0,Afghanistan,,0.0,,2020-03-11T20:00:00,,,33.0000,,65.0000,,,0.0
114237,,,,2.0,Monaco,,0.0,,2020-03-11T20:00:00,,,43.7333,,7.4167,,,0.0
114238,,,,1.0,Liechtenstein,,0.0,,2020-03-11T20:00:00,,,47.1400,,9.5500,,,0.0
114239,,,,1.0,Guyana,,1.0,,2020-03-11T20:00:00,,,5.0000,,-58.7500,,,0.0
