In [1]:
import pandas as pd
from sqlalchemy import create_engine
import datetime as dt

In [2]:
#Bring in the CSV files / data

csv_path = "Resources/covid_19_data.csv"
csv_path2 = "Resources/H1N1_2009.csv"

covid = pd.read_csv(csv_path, parse_dates=["ObservationDate"])
h1n1 = pd.read_csv(csv_path2, parse_dates=["Update Time"], encoding = 'unicode_escape')


In [3]:
covid
h1n1

Unnamed: 0,Country,Cases,Deaths,Update Time
0,Algeria,5,0.0,2009-07-06 09:00:00
1,Antigua and Barbuda,2,0.0,2009-07-06 09:00:00
2,Argentina,2485,60.0,2009-07-06 09:00:00
3,Australia,5298,10.0,2009-07-06 09:00:00
4,Austria,19,0.0,2009-07-06 09:00:00
...,...,...,...,...
1817,Thailand,2,0.0,2009-05-23 08:00:00
1818,Turkey,2,0.0,2009-05-23 08:00:00
1819,United Kingdom,117,0.0,2009-05-23 08:00:00
1820,United States of America,6552,9.0,2009-05-23 08:00:00


In [4]:
covid = covid.loc[:,['ObservationDate', 'Province/State', 'Country/Region', 'Confirmed', 'Deaths', 'Recovered']]

#Rename Columns
covid = covid.rename(columns={"ObservationDate": "Date"})
covid

Unnamed: 0,Date,Province/State,Country/Region,Confirmed,Deaths,Recovered
0,2020-01-22,Anhui,Mainland China,1.0,0.0,0.0
1,2020-01-22,Beijing,Mainland China,14.0,0.0,0.0
2,2020-01-22,Chongqing,Mainland China,6.0,0.0,0.0
3,2020-01-22,Fujian,Mainland China,1.0,0.0,0.0
4,2020-01-22,Gansu,Mainland China,0.0,0.0,0.0
...,...,...,...,...,...,...
10353,2020-03-30,Wyoming,US,94.0,0.0,0.0
10354,2020-03-30,Xinjiang,Mainland China,76.0,3.0,73.0
10355,2020-03-30,Yukon,Canada,4.0,0.0,0.0
10356,2020-03-30,Yunnan,Mainland China,180.0,2.0,172.0


In [5]:
covid = covid[['Province/State', 'Country/Region', 'Date', 'Confirmed', 'Deaths', 'Recovered']]
covid

Unnamed: 0,Province/State,Country/Region,Date,Confirmed,Deaths,Recovered
0,Anhui,Mainland China,2020-01-22,1.0,0.0,0.0
1,Beijing,Mainland China,2020-01-22,14.0,0.0,0.0
2,Chongqing,Mainland China,2020-01-22,6.0,0.0,0.0
3,Fujian,Mainland China,2020-01-22,1.0,0.0,0.0
4,Gansu,Mainland China,2020-01-22,0.0,0.0,0.0
...,...,...,...,...,...,...
10353,Wyoming,US,2020-03-30,94.0,0.0,0.0
10354,Xinjiang,Mainland China,2020-03-30,76.0,3.0,73.0
10355,Yukon,Canada,2020-03-30,4.0,0.0,0.0
10356,Yunnan,Mainland China,2020-03-30,180.0,2.0,172.0


In [6]:
#Group Provinces and take largest cumulative confirmed and death number
province_df = covid.groupby(by='Province/State').agg('max').reset_index(drop=False)
as
#Group all provinces into their countries and add confirmed and death numbers
province_df = province_df.groupby(by='Country/Region').agg('sum').reset_index(drop=False)

province_df

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered
0,Australia,4361.0,17.0,257.0
1,Canada,7422.0,79.0,14.0
2,Denmark,1504.0,13.0,73.0
3,France,34745.0,1428.0,2249.0
4,Germany,5.0,0.0,0.0
5,Hong Kong,682.0,4.0,123.0
6,Israel,8.0,0.0,0.0
7,Lebanon,2.0,0.0,0.0
8,Macau,38.0,0.0,10.0
9,Mainland China,81479.0,3304.0,75790.0


In [7]:
#Remove countries that are in province_df dataset
remove_list = province_df['Country/Region']
global_data = covid[~covid['Country/Region'].isin(remove_list)]

#province_df
global_data = global_data.loc[:,['Country/Region', 'Date', 'Confirmed', 'Deaths', 'Recovered']]
global_data = global_data.groupby(by='Country/Region').agg('sum').reset_index(drop=False)
global_data

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered
0,Azerbaijan,1.0,0.0,0.0
1,"('St. Martin',)",2.0,0.0,0.0
2,Afghanistan,1051.0,25.0,21.0
3,Albania,1989.0,76.0,189.0
4,Algeria,4112.0,280.0,548.0
...,...,...,...,...
193,Vietnam,2718.0,0.0,760.0
194,West Bank and Gaza,498.0,5.0,88.0
195,Zambia,159.0,0.0,0.0
196,Zimbabwe,45.0,8.0,0.0


In [8]:
#Merge province and country data
global_data = pd.concat([global_data, province_df], ignore_index=True)

In [9]:
#global_data consists of every country with affected persons - total number of; Confirmed, Deaths, Recovered
global_data

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered
0,Azerbaijan,1.0,0.0,0.0
1,"('St. Martin',)",2.0,0.0,0.0
2,Afghanistan,1051.0,25.0,21.0
3,Albania,1989.0,76.0,189.0
4,Algeria,4112.0,280.0,548.0
...,...,...,...,...
208,Netherlands,3698.0,137.0,5.0
209,Others,61.0,0.0,0.0
210,Taiwan,47.0,1.0,17.0
211,UK,5786.0,244.0,119.0


In [10]:
h1n1 = h1n1.groupby(by='Country').agg('max').reset_index(drop=False)
h1n1

Unnamed: 0,Country,Cases,Deaths,Update Time
0,Algeria,5,0.0,2009-07-06 09:00:00
1,Antigua and Barbuda,2,0.0,2009-07-06 09:00:00
2,Argentina,2485,60.0,2009-07-06 09:00:00
3,Australia,5298,10.0,2009-07-06 09:00:00
4,Austria,19,0.0,2009-07-06 09:00:00
...,...,...,...,...
147,"Netherlands, Aruba",5,0.0,2009-07-06 09:00:00
148,"New Caledonia, FOC",12,0.0,2009-07-06 09:00:00
149,Puerto Rico,18,0.0,2009-07-06 09:00:00
150,"Saint Martin, FOC",1,0.0,2009-07-06 09:00:00
