In [89]:
import pandas as pd
from sqlalchemy import create_engine
import datetime as dt

In [90]:
#Bring in the CSV files / data

csv_path = "Resources/covid_19_data.csv"
csv_path2 = "Resources/H1N1_2009.csv"

covid = pd.read_csv(csv_path, parse_dates=["ObservationDate"])
h1n1 = pd.read_csv(csv_path2, parse_dates=["Update Time"], encoding = 'unicode_escape')


In [91]:
covid
h1n1

Unnamed: 0,Country,Cases,Deaths,Update Time
0,Algeria,5,0.0,2009-07-06 09:00:00
1,Antigua and Barbuda,2,0.0,2009-07-06 09:00:00
2,Argentina,2485,60.0,2009-07-06 09:00:00
3,Australia,5298,10.0,2009-07-06 09:00:00
4,Austria,19,0.0,2009-07-06 09:00:00
...,...,...,...,...
1817,Thailand,2,0.0,2009-05-23 08:00:00
1818,Turkey,2,0.0,2009-05-23 08:00:00
1819,United Kingdom,117,0.0,2009-05-23 08:00:00
1820,United States of America,6552,9.0,2009-05-23 08:00:00


In [92]:
covid = covid.loc[:,['ObservationDate', 'Province/State', 'Country/Region', 'Confirmed', 'Deaths', 'Recovered']]

#Rename Columns
covid = covid.rename(columns={"ObservationDate": "Date"})
covid

Unnamed: 0,Date,Province/State,Country/Region,Confirmed,Deaths,Recovered
0,2020-01-22,Anhui,Mainland China,1.0,0.0,0.0
1,2020-01-22,Beijing,Mainland China,14.0,0.0,0.0
2,2020-01-22,Chongqing,Mainland China,6.0,0.0,0.0
3,2020-01-22,Fujian,Mainland China,1.0,0.0,0.0
4,2020-01-22,Gansu,Mainland China,0.0,0.0,0.0
...,...,...,...,...,...,...
10041,2020-03-29,Wyoming,US,86.0,0.0,0.0
10042,2020-03-29,Xinjiang,Mainland China,76.0,3.0,73.0
10043,2020-03-29,Yukon,Canada,4.0,0.0,0.0
10044,2020-03-29,Yunnan,Mainland China,180.0,2.0,172.0


In [93]:
covid = covid[['Province/State', 'Country/Region', 'Date', 'Confirmed', 'Deaths', 'Recovered']]
covid

Unnamed: 0,Province/State,Country/Region,Date,Confirmed,Deaths,Recovered
0,Anhui,Mainland China,2020-01-22,1.0,0.0,0.0
1,Beijing,Mainland China,2020-01-22,14.0,0.0,0.0
2,Chongqing,Mainland China,2020-01-22,6.0,0.0,0.0
3,Fujian,Mainland China,2020-01-22,1.0,0.0,0.0
4,Gansu,Mainland China,2020-01-22,0.0,0.0,0.0
...,...,...,...,...,...,...
10041,Wyoming,US,2020-03-29,86.0,0.0,0.0
10042,Xinjiang,Mainland China,2020-03-29,76.0,3.0,73.0
10043,Yukon,Canada,2020-03-29,4.0,0.0,0.0
10044,Yunnan,Mainland China,2020-03-29,180.0,2.0,172.0


In [94]:
#Group Provinces and take largest cumulative confirmed and death number
province_df = covid.groupby(by='Province/State').agg('max').reset_index(drop=False)

#Group all provinces into their countries and add confirmed and death numbers
province_df = province_df.groupby(by='Country/Region').agg('sum').reset_index(drop=False)

province_df

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered
0,Australia,3984.0,16.0,244.0
1,Canada,6304.0,63.0,14.0
2,Denmark,1495.0,13.0,73.0
3,France,34665.0,1427.0,2236.0
4,Germany,5.0,0.0,0.0
5,Hong Kong,641.0,4.0,112.0
6,Israel,8.0,0.0,0.0
7,Lebanon,2.0,0.0,0.0
8,Macau,37.0,0.0,10.0
9,Mainland China,81445.0,3300.0,75460.0


In [95]:
#Remove countries that are in province_df dataset
remove_list = province_df['Country/Region']
global_data = covid[~covid['Country/Region'].isin(remove_list)]

#province_df
global_data = global_data.loc[:,['Country/Region', 'Date', 'Confirmed', 'Deaths', 'Recovered']]
global_data = global_data.groupby(by='Country/Region').agg('sum').reset_index(drop=False)
global_data

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered
0,Azerbaijan,1.0,0.0,0.0
1,"('St. Martin',)",2.0,0.0,0.0
2,Afghanistan,881.0,21.0,19.0
3,Albania,1766.0,65.0,145.0
4,Algeria,3528.0,245.0,511.0
...,...,...,...,...
192,Vietnam,2515.0,0.0,705.0
193,West Bank and Gaza,382.0,4.0,70.0
194,Zambia,124.0,0.0,0.0
195,Zimbabwe,38.0,7.0,0.0


In [96]:
#Merge province and country data
global_data = pd.concat([global_data, province_df], ignore_index=True)

In [97]:
#global_data consists of every country with affected persons - total number of; Confirmed, Deaths, Recovered
global_data

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered
0,Azerbaijan,1.0,0.0,0.0
1,"('St. Martin',)",2.0,0.0,0.0
2,Afghanistan,881.0,21.0,19.0
3,Albania,1766.0,65.0,145.0
4,Algeria,3528.0,245.0,511.0
...,...,...,...,...
207,Netherlands,3695.0,137.0,5.0
208,Others,61.0,0.0,0.0
209,Taiwan,47.0,1.0,17.0
210,UK,5732.0,244.0,99.0


In [103]:
h1n1 = h1n1.groupby(by='Country').agg('max').reset_index(drop=False)
h1n1

Unnamed: 0,Country,Cases,Deaths,Update Time
0,Algeria,5,0.0,2009-07-06 09:00:00
1,Antigua and Barbuda,2,0.0,2009-07-06 09:00:00
2,Argentina,2485,60.0,2009-07-06 09:00:00
3,Australia,5298,10.0,2009-07-06 09:00:00
4,Austria,19,0.0,2009-07-06 09:00:00
...,...,...,...,...
147,"Netherlands, Aruba",5,0.0,2009-07-06 09:00:00
148,"New Caledonia, FOC",12,0.0,2009-07-06 09:00:00
149,Puerto Rico,18,0.0,2009-07-06 09:00:00
150,"Saint Martin, FOC",1,0.0,2009-07-06 09:00:00
