In [1]:
import pandas as pd
import numpy as np
import math
from datetime import datetime
import matplotlib.pyplot as plt

# Retriving Dataset

## JHU Dataset

In [2]:
df_confirmed = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
df_deaths = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
df_recovered = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')
df_covid19 = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/web-data/data/cases_country.csv")

In [3]:
dates = df_confirmed.columns.values.tolist()[4:]
df_confirmed = pd.melt(df_confirmed, id_vars=['Province/State','Country/Region','Lat','Long'], value_vars= dates)
df_deaths = pd.melt(df_deaths, id_vars=['Province/State','Country/Region','Lat','Long'], value_vars= dates)
df_recovered = pd.melt(df_recovered, id_vars=['Province/State','Country/Region','Lat','Long'], value_vars= dates)

In [4]:
df_confirmed = df_confirmed.rename(columns={'Country/Region':'Country_Region','Province/State':'Province_State','variable': 'Date','value':'Confirmed'})
df_deaths = df_deaths.rename(columns={'Country/Region':'Country_Region','Province/State':'Province_State','variable': 'Date','value':'Deaths'})
df_recovered = df_recovered.rename(columns={'Country/Region':'Country_Region','Province/State':'Province_State','variable': 'Date','value':'Recovered'})

## US State Level data

In [5]:
US_State = pd.DataFrame()
for month in range(1,4):
    for day in range(1,31):
        month = str(month).zfill(2)
        day = str(day).zfill(2)
        try:
            US_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/' + month + '-' + day +'-2020.csv'
            df_bin = pd.read_csv(US_url)
            US_State = pd.concat([US_State, df_bin], ignore_index = True)
        except:
            continue

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


In [6]:
US_State_copy = US_State.copy()

In [7]:
US_State_copy = US_State_copy[(US_State_copy['Country/Region'] == 'US') | (US_State_copy['Country_Region'] == 'US')]
US_State_copy = US_State_copy.reset_index(drop = True)

# Data Cleaning

In [8]:
US_State_copy.head()

Unnamed: 0,Active,Admin2,Combined_Key,Confirmed,Country/Region,Country_Region,Deaths,FIPS,Last Update,Last_Update,Lat,Latitude,Long_,Longitude,Province/State,Province_State,Recovered
0,,,,1.0,US,,,,1/22/2020 17:00,,,,,,Washington,,
1,,,,1.0,US,,,,1/23/20 17:00,,,,,,Washington,,
2,,,,1.0,US,,,,1/24/20 17:00,,,,,,Washington,,
3,,,,1.0,US,,,,1/24/20 17:00,,,,,,Chicago,,
4,,,,1.0,US,,,,1/25/20 17:00,,,,,,Washington,,


In [9]:
US_State_copy.columns

Index(['Active', 'Admin2', 'Combined_Key', 'Confirmed', 'Country/Region',
       'Country_Region', 'Deaths', 'FIPS', 'Last Update', 'Last_Update', 'Lat',
       'Latitude', 'Long_', 'Longitude', 'Province/State', 'Province_State',
       'Recovered'],
      dtype='object')

In [10]:
US_State_copy['Active']

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
        ... 
26978    0.0
26979    0.0
26980    0.0
26981    0.0
26982    0.0
Name: Active, Length: 26983, dtype: float64

In [11]:
US_State_copy = US_State_copy.drop(['Active','Admin2','Combined_Key','FIPS'],1)

## Time

In [12]:
# for column Last Update
def strip_specific_time_1(date_column):
    for date in range(len(date_column)):
        if type(date_column[date]) == str:
            if len(date_column[date]) == 19:
                dt = str(date_column[date]).split('T')[0]
                date_column[date] = dt
            elif str(date_column[date]).split(' ')[0].split('/')[2] == '2020':
                dt = str(date_column[date]).split(' ')[0].split('/')
                date_column[date] = str(dt[2])+'-'+str(dt[0].zfill(2))+'-'+str(dt[1].zfill(2))
            else:
                dt = str(date_column[date]).split(' ')[0].split('/')
                date_column[date] = str(dt[2])+'20-'+str(dt[0].zfill(2))+'-'+str(dt[1].zfill(2))
        else:
            continue 

In [13]:
# for column Last_Update
def strip_specific_time_2(date_column):
    for date in range(len(date_column)):
        if type(date_column[date]) == str:
            try: 
                dt = datetime.strptime(str(date_column[date]), '%Y-%m-%d %H:%M:%S')
                date_column[date] = str(dt.year)+'-'+str(dt.month).zfill(2)+'-'+str(dt.day).zfill(2)
            except:
                dt = str(date_column[date]).split(' ')[0].split('/')
                date_column[date] = str(dt[2])+'20-'+str(dt[0].zfill(2))+'-'+str(dt[1].zfill(2))
        else:
            continue

In [14]:
strip_specific_time_2(US_State_copy['Last_Update'])
strip_specific_time_1(US_State_copy['Last Update'])
# combine the two columns
US_State_copy['Last_Update'][:1555] = US_State_copy['Last Update'][:1555]
US_State_copy = US_State_copy.drop('Last Update',1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [15]:
US_State_copy['Last_Update'].unique()

array(['2020-01-22', '2020-01-23', '2020-01-24', '2020-01-25',
       '2020-01-26', '2020-01-27', '2020-01-28', '2020-01-29',
       '2020-01-30', '2020-02-01', '2020-02-03', '2020-02-05',
       '2020-02-09', '2020-02-11', '2020-02-13', '2020-02-21',
       '2020-02-22', '2020-02-24', '2020-02-25', '2020-02-26',
       '2020-02-27', '2020-02-28', '2020-02-29', '2020-03-01',
       '2020-03-02', '2020-03-03', '2020-03-04', '2020-03-05',
       '2020-03-06', '2020-03-07', '2020-03-08', '2020-03-09',
       '2020-03-10', '2020-03-11', '2020-03-12', '2020-03-14',
       '2020-03-13', '2020-03-15', '2020-03-16', '2020-03-17',
       '2020-03-18', '2020-03-19', '2020-03-20', '2020-03-21', nan,
       '2020-03-22', '2020-03-23', '2020-03-24', '2020-03-25',
       '2020-03-26', '2020-03-27', '2020-03-28', '2020-03-29'],
      dtype=object)

In [16]:
US_State_copy[US_State_copy['Last_Update'].isnull() == False]

Unnamed: 0,Confirmed,Country/Region,Country_Region,Deaths,Last_Update,Lat,Latitude,Long_,Longitude,Province/State,Province_State,Recovered
0,1.0,US,,,2020-01-22,,,,,Washington,,
1,1.0,US,,,2020-01-23,,,,,Washington,,
2,1.0,US,,,2020-01-24,,,,,Washington,,
3,1.0,US,,,2020-01-24,,,,,Chicago,,
4,1.0,US,,,2020-01-25,,,,,Washington,,
...,...,...,...,...,...,...,...,...,...,...,...,...
26978,56.0,,US,1.0,2020-03-29,13.4443,,144.7937,,,Guam,0.0
26979,0.0,,US,0.0,2020-03-29,15.0979,,145.6739,,,Northern Mariana Islands,0.0
26980,127.0,,US,3.0,2020-03-29,18.2208,,-66.5901,,,Puerto Rico,0.0
26981,0.0,,US,0.0,2020-03-29,0.0000,,0.0000,,,Recovered,2665.0


In [17]:
US_State_copy = US_State_copy[US_State_copy['Last_Update'].isnull() == False]
US_State_copy = US_State_copy.reset_index(drop = True)

## Country/Region, Province/State, Latitude, Longitude

In [18]:
US_State_copy[US_State_copy['Country_Region'].isnull()]

Unnamed: 0,Confirmed,Country/Region,Country_Region,Deaths,Last_Update,Lat,Latitude,Long_,Longitude,Province/State,Province_State,Recovered
0,1.0,US,,,2020-01-22,,,,,Washington,,
1,1.0,US,,,2020-01-23,,,,,Washington,,
2,1.0,US,,,2020-01-24,,,,,Washington,,
3,1.0,US,,,2020-01-24,,,,,Chicago,,
4,1.0,US,,,2020-01-25,,,,,Washington,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1550,15.0,US,,0.0,2020-03-21,,61.3707,,-152.4044,Alaska,,0.0
1551,15.0,US,,0.0,2020-03-21,,13.4443,,144.7937,Guam,,0.0
1552,14.0,US,,1.0,2020-03-20,,44.2998,,-99.4388,South Dakota,,0.0
1553,8.0,US,,0.0,2020-03-21,,38.4912,,-80.9545,West Virginia,,0.0


In [19]:
US_State_copy['Country_Region'][:1555] = US_State_copy['Country/Region'][:1555]
US_State_copy = US_State_copy.drop('Country/Region',1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
# The rest nulls are data for one country, so don't have province/state
US_State_copy[US_State_copy['Province_State'].isnull()]

Unnamed: 0,Confirmed,Country_Region,Deaths,Last_Update,Lat,Latitude,Long_,Longitude,Province/State,Province_State,Recovered
0,1.0,US,,2020-01-22,,,,,Washington,,
1,1.0,US,,2020-01-23,,,,,Washington,,
2,1.0,US,,2020-01-24,,,,,Washington,,
3,1.0,US,,2020-01-24,,,,,Chicago,,
4,1.0,US,,2020-01-25,,,,,Washington,,
...,...,...,...,...,...,...,...,...,...,...,...
1550,15.0,US,0.0,2020-03-21,,61.3707,,-152.4044,Alaska,,0.0
1551,15.0,US,0.0,2020-03-21,,13.4443,,144.7937,Guam,,0.0
1552,14.0,US,1.0,2020-03-20,,44.2998,,-99.4388,South Dakota,,0.0
1553,8.0,US,0.0,2020-03-21,,38.4912,,-80.9545,West Virginia,,0.0


In [21]:
def combine_same_columns(col1, col2):
    for i in range(len(col1)):
        if type(col1[i]) == str:
            col2[i] = col1[i]

In [22]:
combine_same_columns(US_State_copy['Province/State'],US_State_copy['Province_State'])
combine_same_columns(US_State_copy['Lat'],US_State_copy['Latitude'])
combine_same_columns(US_State_copy['Long_'],US_State_copy['Longitude'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [23]:
# Since we don't need Latitude and Longitude in this analysis, we directly delete these two columns
US_State_copy = US_State_copy.drop(['Province/State','Lat','Long_','Latitude','Longitude'], 1)

In [24]:
US_State_copy.isnull().any()

Confirmed         False
Country_Region    False
Deaths             True
Last_Update       False
Province_State    False
Recovered          True
dtype: bool

In [25]:
US_State_copy = US_State_copy[US_State_copy['Country_Region'].isnull() == False]

In [26]:
US_State_copy.isnull().any()

Confirmed         False
Country_Region    False
Deaths             True
Last_Update       False
Province_State    False
Recovered          True
dtype: bool

In [27]:
# Now all Nan we have is for Deaths and recovered, so we can fill using 0
US_State_copy = US_State_copy.fillna(0)

In [43]:
US_State_copy[(US_State_copy["Last_Update"] == "2020-03-11")&((US_State_copy["Province_State"]=="New York")| (US_State_copy["Province_State"].str.endswith("NY")))]


Unnamed: 0,Confirmed,Country_Region,Deaths,Last_Update,Province_State,Recovered
948,220.0,US,0.0,2020-03-11,New York,0.0
1085,421.0,US,0.0,2020-03-11,New York,0.0


# Country-level Analysis

## Prepare dataset
### China

In [29]:
def agg_China(df,column):    
    agg_China = pd.DataFrame(df[df['Country_Region'] == 'China'].groupby('Date')[column].sum(),columns = [column])
    agg_China = agg_China.reset_index(drop = False)
    df_China = df[df['Country_Region'] == 'China']
    df_China = df_China.merge(agg_China, left_on = 'Date', right_on = 'Date')
    return df_China

In [30]:
df_confirmed_China = agg_China(df_confirmed,'Confirmed')
df_deaths_China = agg_China(df_deaths,'Deaths')
df_recovered_China = agg_China(df_recovered,'Recovered')

In [31]:
df_confirmed_China = df_confirmed_China[['Country_Region','Date','Confirmed_y']]
df_confirmed_China = df_confirmed_China.drop_duplicates()
df_confirmed_China = df_confirmed_China.rename(columns = {'Confirmed_y':'Confirmed'}).reset_index(drop = True)

df_deaths_China = df_deaths_China[['Country_Region','Date','Deaths_y']]
df_deaths_China = df_deaths_China.drop_duplicates()
df_deaths_China = df_deaths_China.rename(columns = {'Deaths_y':'Deaths'}).reset_index(drop = True)

df_recovered_China = df_recovered_China[['Country_Region','Date','Recovered_y']]
df_recovered_China = df_recovered_China.drop_duplicates()
df_recovered_China = df_recovered_China.rename(columns = {'Recovered_y':'Recovered'}).reset_index(drop = True)

In [32]:
df_deaths_China = pd.DataFrame(df_deaths_China['Deaths'], columns = ['Deaths'])
df_recovered_China = pd.DataFrame(df_recovered_China['Recovered'], columns = ['Recovered'])
df_China = df_confirmed_China.merge(df_deaths_China, left_index = True, right_index = True)
df_China = df_China.merge(df_recovered_China, left_index = True, right_index = True)

### US

In [33]:
df_confirmed_US = df_confirmed[df_confirmed['Country_Region'] == 'US']
df_confirmed_US = df_confirmed_US[['Country_Region','Date','Confirmed']].reset_index(drop = True)

df_deaths_US = df_deaths[df_confirmed['Country_Region'] == 'US']
df_deaths_US = df_deaths_US[['Country_Region','Date','Deaths']].reset_index(drop = True)

df_recovered_US = df_recovered[df_recovered['Country_Region'] == 'US']
df_recovered_US = df_recovered_US[['Country_Region','Date','Recovered']].reset_index(drop = True)

In [34]:
df_deaths_US = pd.DataFrame(df_deaths_US['Deaths'], columns = ['Deaths'])
df_recovered_US = pd.DataFrame(df_recovered_US['Recovered'], columns = ['Recovered'])
df_US = df_confirmed_US.merge(df_deaths_US, left_index = True, right_index = True)
df_US = df_US.merge(df_recovered_US, left_index = True, right_index = True)

### Italy

In [35]:
df_confirmed_Italy = df_confirmed[df_confirmed['Country_Region'] == 'Italy']
df_confirmed_Italy = df_confirmed_Italy[['Country_Region','Date','Confirmed']].reset_index(drop = True)

df_deaths_Italy = df_deaths[df_confirmed['Country_Region'] == 'Italy']
df_deaths_Italy = df_deaths_Italy[['Country_Region','Date','Deaths']].reset_index(drop = True)

df_recovered_Italy = df_recovered[df_recovered['Country_Region'] == 'Italy']
df_recovered_Italy = df_recovered_Italy[['Country_Region','Date','Recovered']].reset_index(drop = True)

In [36]:
df_deaths_Italy = pd.DataFrame(df_deaths_Italy['Deaths'], columns = ['Deaths'])
df_recovered_Italy = pd.DataFrame(df_recovered_Italy['Recovered'], columns = ['Recovered'])
df_Italy = df_confirmed_Italy.merge(df_deaths_Italy, left_index = True, right_index = True)
df_Italy = df_Italy.merge(df_recovered_Italy, left_index = True, right_index = True)

In [37]:
# Add the number of new cases for each day
df_China["Confirmed_new"] = df_China["Confirmed"].diff()
df_China["Deaths_new"] = df_China["Deaths"].diff()
df_China["Recovered_new"] = df_China["Recovered"].diff()

df_Italy["Confirmed_new"] = df_Italy["Confirmed"].diff()
df_Italy["Deaths_new"] = df_Italy["Deaths"].diff()
df_Italy["Recovered_new"] = df_Italy["Recovered"].diff()

df_US["Confirmed_new"] = df_US["Confirmed"].diff()
df_US["Deaths_new"] = df_US["Deaths"].diff()
df_US["Recovered_new"] = df_US["Recovered"].diff()

In [38]:
# Add the Incremental ratio for new cases for each day
df_China["Confirmed_Incremental_Rate"] = df_China["Confirmed"].diff()/df_China["Confirmed"]
df_China["Deaths_Incremental_Rate"] = df_China["Deaths"].diff()/df_China["Deaths"]
df_China["Recovered_Incremental_Rate"] = df_China["Recovered"].diff()/df_China["Recovered"]

df_Italy["Confirmed_Incremental_Rate"] = df_Italy["Confirmed"].diff()/df_Italy["Confirmed"]
df_Italy["Deaths_Incremental_Rate"] = df_Italy["Deaths"].diff()/df_Italy["Deaths"]
df_Italy["Recovered_Incremental_Rate"] = df_Italy["Recovered"].diff()/df_Italy["Recovered"]

df_US["Confirmed_Incremental_Rate"] = df_US["Confirmed"].diff()/df_US["Confirmed"]
df_US["Deaths_Incremental_Rate"] = df_US["Deaths"].diff()/df_US["Deaths"]
df_US["Recovered_Incremental_Rate"] = df_US["Recovered"].diff()/df_US["Recovered"]

In [39]:
df_country = pd.concat([df_China,df_US,df_Italy], axis=0) 

In [40]:
df_country.to_csv('df_country.csv')

In [41]:
df_country.head()

Unnamed: 0,Country_Region,Date,Confirmed,Deaths,Recovered,Confirmed_new,Deaths_new,Recovered_new,Confirmed_Incremental_Rate,Deaths_Incremental_Rate,Recovered_Incremental_Rate
0,China,1/22/20,548,17,28,,,,,,
1,China,1/23/20,643,18,30,95.0,1.0,2.0,0.147745,0.055556,0.066667
2,China,1/24/20,920,26,36,277.0,8.0,6.0,0.301087,0.307692,0.166667
3,China,1/25/20,1406,42,39,486.0,16.0,3.0,0.345661,0.380952,0.076923
4,China,1/26/20,2075,56,49,669.0,14.0,10.0,0.32241,0.25,0.204082
