# Covid-19 Virus 

Covid-19 tracking data are sourced from John Hopkins site: 

https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports


In [1]:
import os
import pandas as pd

In [2]:
# Path 
path = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/"

In [4]:
# Join various path components  
file_path = os.path.join(path, "03-13-2020"+".csv")
file_path
df1 = pd.read_csv(file_path, parse_dates = ['Last Update'])
df1.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude
0,Hubei,China,2020-03-13 11:09:03,67786,3062,51553,30.9756,112.2707
1,Guangdong,China,2020-03-13 11:09:03,1356,8,1296,23.3417,113.4244
2,Henan,China,2020-03-11 08:13:09,1273,22,1249,33.882,113.614
3,Zhejiang,China,2020-03-12 01:33:02,1215,1,1197,29.1832,120.0934
4,Hunan,China,2020-03-13 11:09:03,1018,4,1005,27.6104,111.7088


In [5]:
type(df1)

pandas.core.frame.DataFrame

In [3]:
import datetime as dt

In [92]:
start = dt.datetime(2020, 1, 22)
end = dt.datetime(2020, 3, 22)
rng = pd.date_range(start, end, freq='D')

dt_list = []
for date in rng:
    date_str = date.strftime("%m-%d-%Y.csv")
    dt_list.append(date_str)


dfList = []
for file in dt_list:
    df = pd.read_csv(os.path.join(path, file), parse_dates = ['Last Update'])
    dfList.append(df)


data_df = (pd.concat(dfList,sort=False)
             .fillna(0)
            )
data_df.head()

In [97]:
data_df2 = data_df.copy()

In [117]:
def df_clean(df):
    # new data frame with split value columns 
    new = df["Country/Region"].str.split(",", n = 1, expand = True) 

    # making separate first name column from new data frame 
    df["State/Province"]= new[1] 

    # making separate last name column from new data frame 
    df["Country"]= new[0] 

    df['State/Province'] = df['State/Province'].where(df['State/Province']=="None", df['Province/State'], axis='index')

    data_df = (df.drop(columns = ['Province/State', 'Country/Region'])
                 .rename(columns = {'Last Update': 'Last_Update_DateTime'}))

    return data_df

In [99]:
data_df = df_clean(data_df2)

In [100]:
data_df.head()

Unnamed: 0,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,State/Province,Country
0,2020-01-22 17:00:00,1.0,0.0,0.0,0.0,0.0,Anhui,Mainland China
1,2020-01-22 17:00:00,14.0,0.0,0.0,0.0,0.0,Beijing,Mainland China
2,2020-01-22 17:00:00,6.0,0.0,0.0,0.0,0.0,Chongqing,Mainland China
3,2020-01-22 17:00:00,1.0,0.0,0.0,0.0,0.0,Fujian,Mainland China
4,2020-01-22 17:00:00,0.0,0.0,0.0,0.0,0.0,Gansu,Mainland China


In [21]:
data_df.to_pickle("Data/covid19_data_df.pkl")

In [70]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7926 entries, 0 to 308
Data columns (total 8 columns):
Last Update       7926 non-null datetime64[ns]
Confirmed         7926 non-null float64
Deaths            7926 non-null float64
Recovered         7926 non-null float64
Latitude          7926 non-null float64
Longitude         7926 non-null float64
State/Province    7926 non-null object
Country           7926 non-null object
dtypes: datetime64[ns](1), float64(5), object(2)
memory usage: 557.3+ KB


In [69]:
data_df = pd.read_pickle("Data/covid19_data_df.pkl")

In [71]:
dt_index = pd.DatetimeIndex(data_df['Last Update'].dt.date)

In [72]:
data_df = data_df.set_index(dt_index)

In [73]:
data_df.index

DatetimeIndex(['2020-01-22', '2020-01-22', '2020-01-22', '2020-01-22',
               '2020-01-22', '2020-01-22', '2020-01-22', '2020-01-22',
               '2020-01-22', '2020-01-22',
               ...
               '2020-03-18', '2020-03-18', '2020-03-19', '2020-03-22',
               '2020-03-17', '2020-03-17', '2020-03-22', '2020-03-17',
               '2020-03-19', '2020-03-18'],
              dtype='datetime64[ns]', name='Last Update', length=7926, freq=None)

In [79]:
data_df['2020-03-22']

Unnamed: 0_level_0,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,State/Province,Country
Last Update,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-03-22,2020-03-22 09:43:06,67800.0,3144.0,59433.0,30.9756,112.2707,Hubei,China
2020-03-22,2020-03-22 18:13:20,59138.0,5476.0,7024.0,41.8719,12.5674,0,Italy
2020-03-22,2020-03-22 23:13:18,28768.0,1772.0,2575.0,40.4637,-3.7492,0,Spain
2020-03-22,2020-03-22 23:43:02,24873.0,94.0,266.0,51.1657,10.4515,0,Germany
2020-03-22,2020-03-22 14:13:06,21638.0,1685.0,7931.0,32.4279,53.6880,0,Iran
...,...,...,...,...,...,...,...,...
2020-03-22,2020-03-22 01:43:03,1.0,0.0,0.0,1.3733,32.2903,0,Uganda
2020-03-22,2020-03-22 11:43:03,0.0,0.0,0.0,15.1111,-23.6167,0,Cape Verde
2020-03-22,2020-03-22 11:13:23,0.0,0.0,0.0,-8.5500,125.5600,0,East Timor
2020-03-22,2020-03-22 08:13:35,0.0,1.0,0.0,13.4443,144.7937,0,Guam


In [120]:
data_df.to_csv("Data/covid19_data.csv", index=False)

In [86]:
data_df = (data_df.reset_index()
            )
data_df

Unnamed: 0,Last Update,Last_Update_DateTime,Confirmed,Deaths,Recovered,Latitude,Longitude,State/Province,Country
0,2020-01-22,2020-01-22 17:00:00,1.0,0.0,0.0,0.0000,0.000,Anhui,Mainland China
1,2020-01-22,2020-01-22 17:00:00,14.0,0.0,0.0,0.0000,0.000,Beijing,Mainland China
2,2020-01-22,2020-01-22 17:00:00,6.0,0.0,0.0,0.0000,0.000,Chongqing,Mainland China
3,2020-01-22,2020-01-22 17:00:00,1.0,0.0,0.0,0.0000,0.000,Fujian,Mainland China
4,2020-01-22,2020-01-22 17:00:00,0.0,0.0,0.0,0.0000,0.000,Gansu,Mainland China
...,...,...,...,...,...,...,...,...,...
7921,2020-03-17,2020-03-17 18:33:03,0.0,0.0,0.0,49.1900,-2.110,0,Jersey
7922,2020-03-22,2020-03-22 22:43:02,0.0,1.0,0.0,18.2000,-66.500,0,Puerto Rico
7923,2020-03-17,2020-03-17 21:33:03,0.0,0.0,0.0,-1.4400,15.556,0,Republic of the Congo
7924,2020-03-19,2020-03-19 12:13:38,0.0,0.0,0.0,24.2500,-76.000,0,The Bahamas


In [88]:
data_df.columns

Index(['Last Update', 'Last_Update_DateTime', 'Confirmed', 'Deaths',
       'Recovered', 'Latitude', 'Longitude', 'State/Province', 'Country'],
      dtype='object')

In [113]:
data_df = pd.read_csv("Data/covid19_data.csv")

### Update today's data

In [102]:
today_str = date.today().strftime("%m-%d-%Y.csv")
today_str

'03-23-2020.csv'

In [118]:
file_path = os.path.join(path, today_str)
df_today = (pd.read_csv(file_path, parse_dates = ['Last_Update'])
            .rename(columns = {'Last_Update': 'Last Update',
                              'Country_Region': 'Country/Region',
                              'Province_State': 'Province/State',
                              'Admin2': 'County',
                              'Lat':'Longitude',
                              'Long_':'Latitude'})
           )
df_today = df_clean(df_today)
df_today.head()

Unnamed: 0,FIPS,County,Last_Update_DateTime,Longitude,Latitude,Confirmed,Deaths,Recovered,Active,Combined_Key,State/Province,Country
0,45001.0,Abbeville,2020-03-23 23:19:34,34.223334,-82.461707,1,0,0,0,"Abbeville, South Carolina, US",South Carolina,US
1,22001.0,Acadia,2020-03-23 23:19:34,30.295065,-92.414197,1,0,0,0,"Acadia, Louisiana, US",Louisiana,US
2,51001.0,Accomack,2020-03-23 23:19:34,37.767072,-75.632346,1,0,0,0,"Accomack, Virginia, US",Virginia,US
3,16001.0,Ada,2020-03-23 23:19:34,43.452658,-116.241552,13,0,0,0,"Ada, Idaho, US",Idaho,US
4,19001.0,Adair,2020-03-23 23:19:34,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US",Iowa,US


In [119]:
df_today = df_today[['Last_Update_DateTime', 'Confirmed', 'Deaths',
       'Recovered', 'Latitude', 'Longitude', 'County', 'State/Province', 'Country']]

In [121]:
df_today.to_csv('Data/covid19_data_03232020.csv', index=False)

In [9]:
start1 = dt.datetime(2020, 3, 21)
end1 = dt.datetime(2020, 3, 21)

In [12]:
def data_import1(start, end):
    rng = pd.date_range(start, end, freq='D')

    dt_list = []
    for date in rng:
        date_str = date.strftime("%m-%d-%Y.csv")
        dt_list.append(date_str)


    dfList = []
    for file in dt_list:
        df = pd.read_csv(os.path.join(path, file), parse_dates = ['Last Update'])
        dfList.append(df)


    data_df = (pd.concat(dfList,sort=False)
                 .fillna(0)
                )
    return data_df


In [13]:
data_df = data_import1(start1, end1)

In [14]:
start2 = dt.datetime(2020, 3, 22)
end2 = dt.datetime(2020, 3, 26)

In [15]:
def data_import2(start, end):
    rng = pd.date_range(start, end, freq='D')

    dt_list = []
    for date in rng:
        date_str = date.strftime("%m-%d-%Y.csv")
        dt_list.append(date_str)


    dfList = []
    for file in dt_list:
        df = pd.read_csv(os.path.join(path, file), parse_dates = ['Last_Update'])
        dfList.append(df)


    data_df = (pd.concat(dfList,sort=False)
                 .fillna(0)
                )
    return data_df


In [17]:
data_df2 = data_import2(start2, end2)

In [18]:
data_df.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude
0,Hubei,China,2020-03-21 10:13:08,67800,3139,58946,30.9756,112.2707
1,0,Italy,2020-03-21 17:43:03,53578,4825,6072,41.8719,12.5674
2,0,Spain,2020-03-21 13:13:30,25374,1375,2125,40.4637,-3.7492
3,0,Germany,2020-03-21 20:43:02,22213,84,233,51.1657,10.4515
4,0,Iran,2020-03-21 11:13:12,20610,1556,7635,32.4279,53.688


In [19]:
data_df2.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,36061.0,New York City,New York,US,2020-03-22 23:45:00,40.767273,-73.971526,9654,63,0,0,"New York City, New York, US"
1,36059.0,Nassau,New York,US,2020-03-22 23:45:00,40.740665,-73.589419,1900,4,0,0,"Nassau, New York, US"
2,36119.0,Westchester,New York,US,2020-03-22 23:45:00,41.162784,-73.757417,1873,0,0,0,"Westchester, New York, US"
3,36103.0,Suffolk,New York,US,2020-03-22 23:45:00,40.883201,-72.801217,1034,9,0,0,"Suffolk, New York, US"
4,36087.0,Rockland,New York,US,2020-03-22 23:45:00,41.150279,-74.025605,455,1,0,0,"Rockland, New York, US"
