In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Previewing data

In [7]:
# Lift pandas' column-display limit so that every column of the frame is
# shown when it is rendered below.
pd.set_option("display.max_columns", None)

# Read the raw CSV into a DataFrame and preview its first five rows.
covid_df = pd.read_csv("us_covid19_daily.csv")
covid_df.head()

Unnamed: 0,date,states,positive,negative,pending,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,onVentilatorCurrently,onVentilatorCumulative,recovered,dateChecked,death,hospitalized,totalTestResults,lastModified,total,posNeg,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease,hash
0,20201206,56,14534035,161986294,13592.0,101487.0,585676.0,20145.0,31946.0,7094.0,3322.0,5624444.0,2020-12-06T24:00:00Z,273374.0,585676.0,204063869,2020-12-06T24:00:00Z,0,0,1138,2256,1172590,176771,1634532,9cf16504f91958e803a2197daf8c2528a4eddc18
1,20201205,56,14357264,160813704,13433.0,101190.0,583420.0,19950.0,31831.0,7005.0,3321.0,5576026.0,2020-12-05T24:00:00Z,272236.0,583420.0,202429337,2020-12-05T24:00:00Z,0,0,2445,3316,1526995,211073,2169756,6249216c5f097c94ce33a811dab011a483a42404
2,20201204,56,14146191,159286709,12714.0,101276.0,580104.0,19858.0,31608.0,6999.0,3305.0,5470389.0,2020-12-04T24:00:00Z,269791.0,580104.0,200259581,2020-12-04T24:00:00Z,0,0,2563,4652,1260657,224831,1854869,ae30ea088584335ba4d57ee927f8dbda6add74db
3,20201203,56,13921360,158026052,15106.0,100755.0,575452.0,19723.0,31276.0,6867.0,3280.0,5404018.0,2020-12-03T24:00:00Z,267228.0,575452.0,198404712,2020-12-03T24:00:00Z,0,0,2706,5331,1238465,210204,1828230,0f253d185ecb336cdd18a4c61996eda1b7eef13b
4,20201202,56,13711156,156787587,14368.0,100322.0,570121.0,19680.0,31038.0,6855.0,3252.0,5322128.0,2020-12-02T24:00:00Z,264522.0,570121.0,196576482,2020-12-02T24:00:00Z,0,0,2733,5028,982032,195796,1459202,477c17b6302d0485195e77ecf8270a974f7a3c82


In [315]:
# Show the column labels, followed by the frame's (rows, columns) shape.
print(covid_df.columns)
print("\nNumber of rows and columns", covid_df.shape, sep="")
# When this cell was last run the dataset had 320 rows and 25 columns.

Index(['date', 'states', 'positive', 'negative', 'pending',
       'hospitalizedCurrently', 'hospitalizedCumulative', 'inIcuCurrently',
       'inIcuCumulative', 'onVentilatorCurrently', 'onVentilatorCumulative',
       'recovered', 'dateChecked', 'death', 'hospitalized', 'totalTestResults',
       'lastModified', 'total', 'posNeg', 'deathIncrease',
       'hospitalizedIncrease', 'negativeIncrease', 'positiveIncrease',
       'totalTestResultsIncrease', 'hash'],
      dtype='object')

Number of rows and columns(320, 25)


# Dropping non-essential columns

In [316]:
# Remove the columns judged non-essential for this analysis, then print the
# remaining column labels and how many are left.
cols_to_drop = [
    'states', 'pending', 'dateChecked', 'lastModified',
    'total', 'posNeg', 'hash',
]
covid_df.drop(labels=cols_to_drop, axis=1, inplace=True)
print(covid_df.columns)
print(covid_df.shape[1])

Index(['date', 'positive', 'negative', 'hospitalizedCurrently',
       'hospitalizedCumulative', 'inIcuCurrently', 'inIcuCumulative',
       'onVentilatorCurrently', 'onVentilatorCumulative', 'recovered', 'death',
       'hospitalized', 'totalTestResults', 'deathIncrease',
       'hospitalizedIncrease', 'negativeIncrease', 'positiveIncrease',
       'totalTestResultsIncrease'],
      dtype='object')
18


In [317]:
# The analysis relates COVID figures to DAILY stock increases, so these
# cumulative columns are removed. The remaining labels and their count are
# printed afterwards.
cum_cols_drop = [
    'hospitalizedCumulative',
    'inIcuCumulative',
    'onVentilatorCumulative',
]
covid_df.drop(labels=cum_cols_drop, axis=1, inplace=True)
print(covid_df.columns)
print(covid_df.shape[1])

Index(['date', 'positive', 'negative', 'hospitalizedCurrently',
       'inIcuCurrently', 'onVentilatorCurrently', 'recovered', 'death',
       'hospitalized', 'totalTestResults', 'deathIncrease',
       'hospitalizedIncrease', 'negativeIncrease', 'positiveIncrease',
       'totalTestResultsIncrease'],
      dtype='object')
15


In [318]:
# Drop variables considered unrelated to stocks, to reduce the risk of
# overfitting. The aim is a generalized picture of the COVID situation.
ntrelated_cols = [
    'inIcuCurrently',
    'onVentilatorCurrently',
]
covid_df.drop(labels=ntrelated_cols, axis=1, inplace=True)
print(covid_df.columns)
print(covid_df.shape[1])

Index(['date', 'positive', 'negative', 'hospitalizedCurrently', 'recovered',
       'death', 'hospitalized', 'totalTestResults', 'deathIncrease',
       'hospitalizedIncrease', 'negativeIncrease', 'positiveIncrease',
       'totalTestResultsIncrease'],
      dtype='object')
13


# Inverting dataset due to reversed date order

In [319]:
# The dataset is listed from the latest date to the oldest, so put the rows
# in chronological order (oldest first) and renumber the index from 0.
#
# Sorting on 'date' is used instead of reversing with [::-1]: for a file that
# is strictly newest-to-oldest both give the same result, but a reversal flips
# the frame back to newest-first every time the cell is re-run, whereas the
# sort leaves an already-ordered frame unchanged. 'mergesort' is a stable
# sort, so the outcome is deterministic even if a date were repeated.
covid_df = covid_df.sort_values('date', kind='mergesort').reset_index(drop=True)
covid_df.head(3)

Unnamed: 0,date,positive,negative,hospitalizedCurrently,recovered,death,hospitalized,totalTestResults,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease
0,20200122,0,0,,,,,1,0,0,0,0,0
1,20200123,0,0,,,,,2,0,0,0,0,1
2,20200124,0,0,,,,,2,0,0,0,0,0


# Converting date to datetime format

In [320]:
# Parse the YYYYMMDD values in 'date' into pandas datetimes, then display the
# first 50 rows to inspect the result.
covid_df["date"] = pd.to_datetime(covid_df["date"], format="%Y%m%d")
covid_df.head(n=50)

Unnamed: 0,date,positive,negative,hospitalizedCurrently,recovered,death,hospitalized,totalTestResults,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease
0,2020-01-22,0,0,,,,,1,0,0,0,0,0
1,2020-01-23,0,0,,,,,2,0,0,0,0,1
2,2020-01-24,0,0,,,,,2,0,0,0,0,0
3,2020-01-25,0,0,,,,,2,0,0,0,0,0
4,2020-01-26,0,0,,,,,2,0,0,0,0,0
5,2020-01-27,0,0,,,,,3,0,0,0,0,1
6,2020-01-28,0,0,,,,,3,0,0,0,0,0
7,2020-01-29,0,0,,,,,5,0,0,0,0,2
8,2020-01-30,0,0,,,,,5,0,0,0,0,0
9,2020-01-31,0,0,,,,,8,0,0,0,0,3


# Dropping rows with NaN values

In [321]:
# Discard every row that contains at least one NaN and renumber the index.
# The NaN values come from the beginning of the COVID period, when several
# columns had no figures yet.
covid_df = covid_df.dropna(axis=0, how="any").reset_index(drop=True)
covid_df.head(n=4)

Unnamed: 0,date,positive,negative,hospitalizedCurrently,recovered,death,hospitalized,totalTestResults,deathIncrease,hospitalizedIncrease,negativeIncrease,positiveIncrease,totalTestResultsIncrease
0,2020-03-25,75242,395632,5140.0,146.0,1058.0,7073.0,524577,238,1949,70882,12891,89096
1,2020-03-26,92976,482148,7805.0,97.0,1374.0,9551.0,632999,316,2478,86516,17734,108422
2,2020-03-27,112220,559423,10978.0,2418.0,1782.0,12123.0,735114,408,2572,77275,19244,102115
3,2020-03-28,131826,657185,12409.0,3145.0,2333.0,14532.0,835570,551,2409,97762,19606,100456


Notice that the data now starts on 25 March 2020 instead of the original first date (22 January 2020), because the earlier rows contained NaN values and were dropped. The remaining rows reflect normal COVID conditions, in which people are admitted to hospital, recover, etc.

In [322]:
# Sanity check: count the missing values left in each column.
covid_df.isna().sum()

date                        0
positive                    0
negative                    0
hospitalizedCurrently       0
recovered                   0
death                       0
hospitalized                0
totalTestResults            0
deathIncrease               0
hospitalizedIncrease        0
negativeIncrease            0
positiveIncrease            0
totalTestResultsIncrease    0
dtype: int64

In [323]:
# Persist the preprocessed frame to CSV; the index is not written out.
covid_df.to_csv(path_or_buf='covid_df_preprocessed.csv', index=False)