In [1]:
import pandas as pd
from sqlalchemy import create_engine
import datetime as dt
import uuid

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.ext.declarative import declarative_base 
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

from sqlalchemy import Column, Integer, String, Float


In [2]:
#Bring in the CSV files / data

csv_path = "Resources/covid_19_data.csv"
csv_path2 = "Resources/H1N1_2009.csv"
csv_path3 = "Resources/global_h1n1.csv"

covid = pd.read_csv(csv_path, parse_dates=["ObservationDate"])
h1n1 = pd.read_csv(csv_path2, parse_dates=["Update Time"],encoding = 'unicode_escape')
global_h1n1_data = pd.read_csv(csv_path3)


In [3]:
covid
h1n1
covid
global_h1n1_data

Unnamed: 0,Country,Confirmed,Deaths
0,Algeria,5,0.0
1,Antigua and Barbuda,2,0.0
2,Argentina,2485,60.0
3,Australia,5298,10.0
4,Austria,19,0.0
...,...,...,...
140,Venezuela,206,0.0
141,Vietnam,181,0.0
142,Virgin Islands,1,0.0
143,West Bank and Gaza,60,0.0


# COVID-19 Data Tables

In [4]:
covid = covid.loc[:,['ObservationDate', 'Province/State', 'Country/Region', 'Confirmed', 'Deaths', 'Recovered']]

#Rename Columns
covid = covid.rename(columns={"ObservationDate": "Date", "Province/State" : "Province", "Country/Region" : "Country"})

#Replace Values for country naming consistency
replace_values = {"(St. Martin)" : "St. Martin", "('St. Martin',)": "St. Martin", 
                  'Republic of Ireland' : "Ireland", 'Cabo Verde' : "Cape Verde" } 

covid = covid.replace({"Country": replace_values})

covid["Country"].unique()

array(['Mainland China', 'Hong Kong', 'Macau', 'Taiwan', 'US', 'Japan',
       'Thailand', 'South Korea', 'Singapore', 'Philippines', 'Malaysia',
       'Vietnam', 'Australia', 'Mexico', 'Brazil', 'Colombia', 'France',
       'Nepal', 'Canada', 'Cambodia', 'Sri Lanka', 'Ivory Coast',
       'Germany', 'Finland', 'United Arab Emirates', 'India', 'Italy',
       'UK', 'Russia', 'Sweden', 'Spain', 'Belgium', 'Others', 'Egypt',
       'Iran', 'Israel', 'Lebanon', 'Iraq', 'Oman', 'Afghanistan',
       'Bahrain', 'Kuwait', 'Austria', 'Algeria', 'Croatia',
       'Switzerland', 'Pakistan', 'Georgia', 'Greece', 'North Macedonia',
       'Norway', 'Romania', 'Denmark', 'Estonia', 'Netherlands',
       'San Marino', ' Azerbaijan', 'Belarus', 'Iceland', 'Lithuania',
       'New Zealand', 'Nigeria', 'North Ireland', 'Ireland', 'Luxembourg',
       'Monaco', 'Qatar', 'Ecuador', 'Azerbaijan', 'Czech Republic',
       'Armenia', 'Dominican Republic', 'Indonesia', 'Portugal',
       'Andorra', 'Latvia

In [5]:
covid = covid[['Country', 'Province', 'Date', 'Confirmed', 'Deaths', 'Recovered']]
covid.loc[covid.Country == "Taiwan", "Province"] = "Taiwan"
covid

Unnamed: 0,Country,Province,Date,Confirmed,Deaths,Recovered
0,Mainland China,Anhui,2020-01-22,1.0,0.0,0.0
1,Mainland China,Beijing,2020-01-22,14.0,0.0,0.0
2,Mainland China,Chongqing,2020-01-22,6.0,0.0,0.0
3,Mainland China,Fujian,2020-01-22,1.0,0.0,0.0
4,Mainland China,Gansu,2020-01-22,0.0,0.0,0.0
...,...,...,...,...,...,...
13524,US,Wyoming,2020-04-09,230.0,0.0,0.0
13525,Mainland China,Xinjiang,2020-04-09,76.0,3.0,73.0
13526,Canada,Yukon,2020-04-09,7.0,0.0,0.0
13527,Mainland China,Yunnan,2020-04-09,184.0,2.0,173.0


In [6]:
#Group Provinces and take largest cumulative confirmed and death number
province_df = covid.groupby(by='Province').agg('max').reset_index(drop=False)

#Group all provinces into their countries and add confirmed and death numbers
province_df = province_df.groupby(by='Country').agg('sum').reset_index(drop=False)

province_df

Unnamed: 0,Country,Confirmed,Deaths,Recovered
0,Australia,6108.0,51.0,1472.0
1,Canada,20678.0,503.0,14.0
2,Denmark,1521.0,13.0,148.0
3,France,35142.0,1440.0,2419.0
4,Germany,5.0,0.0,0.0
5,Hong Kong,973.0,4.0,293.0
6,Israel,8.0,0.0,0.0
7,Lebanon,2.0,0.0,0.0
8,Macau,45.0,0.0,10.0
9,Mainland China,81866.0,3335.0,77382.0


In [7]:
#Remove countries that are in province_df dataset
remove_list = province_df['Country']
global_covid_data = covid[~covid['Country'].isin(remove_list)]

#province_df
global_covid_data = global_covid_data.loc[:,['Country', 'Date', 'Confirmed', 'Deaths', 'Recovered']]
global_covid_data = global_covid_data.groupby(by='Country').agg('sum').reset_index(drop=False)


global_covid_data

Unnamed: 0,Country,Confirmed,Deaths,Recovered
0,Azerbaijan,1.0,0.0,0.0
1,Afghanistan,4382.0,113.0,173.0
2,Albania,5335.0,267.0,1242.0
3,Algeria,16532.0,1661.0,1745.0
4,Andorra,7228.0,210.0,300.0
...,...,...,...,...
196,West Bank and Gaza,2601.0,15.0,363.0
197,Western Sahara,20.0,0.0,0.0
198,Zambia,542.0,8.0,50.0
199,Zimbabwe,140.0,23.0,0.0


In [8]:
#Merge province and country data
global_covid_data = pd.concat([global_covid_data, province_df], ignore_index=True)

In [9]:
#global_covid_data consists of every country with affected persons - total number of; Confirmed, Deaths, Recovered

global_covid_data

Unnamed: 0,Country,Confirmed,Deaths,Recovered
0,Azerbaijan,1.0,0.0,0.0
1,Afghanistan,4382.0,113.0,173.0
2,Albania,5335.0,267.0,1242.0
3,Algeria,16532.0,1661.0,1745.0
4,Andorra,7228.0,210.0,300.0
...,...,...,...,...
211,Netherlands,3772.0,143.0,35.0
212,Others,61.0,0.0,0.0
213,Taiwan,380.0,5.0,67.0
214,UK,6271.0,256.0,307.0


# H1N1 Data Tables

In [10]:
# global_h1n1_data = h1n1.groupby(by='Country').agg('max').reset_index(drop=False)

# #Rename Columns
# global_h1n1_data = global_h1n1_data.rename(columns={"Cases": "Confirmed", "Update Time": "Date"})

global_h1n1_data


Unnamed: 0,Country,Confirmed,Deaths
0,Algeria,5,0.0
1,Antigua and Barbuda,2,0.0
2,Argentina,2485,60.0
3,Australia,5298,10.0
4,Austria,19,0.0
...,...,...,...
140,Venezuela,206,0.0
141,Vietnam,181,0.0
142,Virgin Islands,1,0.0
143,West Bank and Gaza,60,0.0


In [11]:
h1n1 = h1n1.rename(columns={"Cases": "Confirmed", "Update Time": "Date"})

h1n1

Unnamed: 0,Country,Confirmed,Deaths,Date
0,"Guernsey, Crown Dependency",5,0.0,2009-07-06 09:00:00
1,"Guernsey, Crown Dependency",5,0.0,2009-07-03 09:00:00
2,"Guernsey, Crown Dependency",5,0.0,2009-07-01 09:00:00
3,"Guernsey, Crown Dependency",1,0.0,2009-06-29 09:00:00
4,"Guernsey, Crown Dependency",1,0.0,2009-06-26 07:00:00
...,...,...,...,...
1817,Yemen,6,0.0,2009-06-26 07:00:00
1818,Yemen,6,0.0,2009-06-24 07:00:00
1819,Yemen,5,0.0,2009-06-22 07:00:00
1820,Yemen,4,0.0,2009-06-19 07:00:00


In [12]:
#Create lists of all countries
country_covid = global_covid_data['Country']
country_h1n1 = global_h1n1_data['Country']

#Combine country lists together
country_df = pd.concat([country_covid, country_h1n1], ignore_index=True)

#Put countries into a DataFrame
country_df = pd.DataFrame(country_df)

#Drop Duplicate Countries
country_df = country_df.drop_duplicates("Country")

#Reset Index and make new index as a column
country_df = country_df.reset_index(drop=True)
country_df = country_df.reset_index(level=0)

#Rename index column to Country ID
country_df = country_df.rename(columns={"index": "Country_ID"})

country_df = country_df[['Country', 'Country_ID']]

country_df



Unnamed: 0,Country,Country_ID
0,Azerbaijan,0
1,Afghanistan,1
2,Albania,2
3,Algeria,3
4,Andorra,4
...,...,...
237,"United Kingdom, Isle of Man, Crown Dependency",237
238,"United Kingdom, Jersey, Crown Dependency",238
239,Vanuatu,239
240,Virgin Islands,240


In [13]:
#Merge on global_covid_data
global_covid_data = pd.merge(global_covid_data, country_df, how='inner', on='Country')
global_covid_data = global_covid_data[['Country_ID', 'Country', 'Confirmed', 'Deaths', 'Recovered']]

In [14]:
# #Merge on global_h1n1_data
# # global_h1n1_data = pd.merge(global_h1n1_data, country_df, how='inner', on='Country')
# global_h1n1_data = global_h1n1_data[['Country', 'Confirmed', 'Deaths']]
# global_h1n1_data.to_csv("Resources/global_h1n1.csv", index=False)

In [15]:
global_covid_data

Unnamed: 0,Country_ID,Country,Confirmed,Deaths,Recovered
0,0,Azerbaijan,1.0,0.0,0.0
1,1,Afghanistan,4382.0,113.0,173.0
2,2,Albania,5335.0,267.0,1242.0
3,3,Algeria,16532.0,1661.0,1745.0
4,4,Andorra,7228.0,210.0,300.0
...,...,...,...,...,...
211,211,Netherlands,3772.0,143.0,35.0
212,212,Others,61.0,0.0,0.0
213,213,Taiwan,380.0,5.0,67.0
214,214,UK,6271.0,256.0,307.0


In [16]:
country_df.head(20)

Unnamed: 0,Country,Country_ID
0,Azerbaijan,0
1,Afghanistan,1
2,Albania,2
3,Algeria,3
4,Andorra,4
5,Angola,5
6,Antigua and Barbuda,6
7,Argentina,7
8,Armenia,8
9,Aruba,9


In [17]:
#Merge on covid
covid = pd.merge(covid, country_df, how='inner', on='Country')
covid = covid[['Country_ID', 'Country', 'Province', 'Date', 'Confirmed', 'Deaths', 'Recovered']]
covid

Unnamed: 0,Country_ID,Country,Province,Date,Confirmed,Deaths,Recovered
0,210,Mainland China,Anhui,2020-01-22,1.0,0.0,0.0
1,210,Mainland China,Beijing,2020-01-22,14.0,0.0,0.0
2,210,Mainland China,Chongqing,2020-01-22,6.0,0.0,0.0
3,210,Mainland China,Fujian,2020-01-22,1.0,0.0,0.0
4,210,Mainland China,Gansu,2020-01-22,0.0,0.0,0.0
...,...,...,...,...,...,...,...
13524,197,Western Sahara,,2020-04-09,4.0,0.0,0.0
13525,158,Sao Tome and Principe,,2020-04-06,4.0,0.0,0.0
13526,158,Sao Tome and Principe,,2020-04-07,4.0,0.0,0.0
13527,158,Sao Tome and Principe,,2020-04-08,4.0,0.0,0.0


In [18]:
#Merge on h1n1
h1n1 = pd.merge(h1n1, country_df, how='inner', on='Country')
h1n1 = h1n1[['Country_ID', 'Country', 'Date', 'Confirmed', 'Deaths']]
h1n1

Unnamed: 0,Country_ID,Country,Date,Confirmed,Deaths
0,226,"Guernsey, Crown Dependency",2009-07-06 09:00:00,5,0.0
1,226,"Guernsey, Crown Dependency",2009-07-03 09:00:00,5,0.0
2,226,"Guernsey, Crown Dependency",2009-07-01 09:00:00,5,0.0
3,226,"Guernsey, Crown Dependency",2009-06-29 09:00:00,1,0.0
4,226,"Guernsey, Crown Dependency",2009-06-26 07:00:00,1,0.0
...,...,...,...,...,...
1817,241,Yemen,2009-06-26 07:00:00,6,0.0
1818,241,Yemen,2009-06-24 07:00:00,6,0.0
1819,241,Yemen,2009-06-22 07:00:00,5,0.0
1820,241,Yemen,2009-06-19 07:00:00,4,0.0


In [19]:
#Set Index for all dataframes
# country_df.set_index("Country ID", inplace=True)
# global_covid_data.set_index("Country ID", inplace=True)
# global_h1n1_data.set_index("Country ID", inplace=True)
# covid.set_index("Country ID", inplace=True)
# h1n1.set_index("Country ID", inplace=True)


In [20]:
global_covid_data

Unnamed: 0,Country_ID,Country,Confirmed,Deaths,Recovered
0,0,Azerbaijan,1.0,0.0,0.0
1,1,Afghanistan,4382.0,113.0,173.0
2,2,Albania,5335.0,267.0,1242.0
3,3,Algeria,16532.0,1661.0,1745.0
4,4,Andorra,7228.0,210.0,300.0
...,...,...,...,...,...
211,211,Netherlands,3772.0,143.0,35.0
212,212,Others,61.0,0.0,0.0
213,213,Taiwan,380.0,5.0,67.0
214,214,UK,6271.0,256.0,307.0


In [21]:
global_h1n1_data

Unnamed: 0,Country,Confirmed,Deaths
0,Algeria,5,0.0
1,Antigua and Barbuda,2,0.0
2,Argentina,2485,60.0
3,Australia,5298,10.0
4,Austria,19,0.0
...,...,...,...
140,Venezuela,206,0.0
141,Vietnam,181,0.0
142,Virgin Islands,1,0.0
143,West Bank and Gaza,60,0.0


In [22]:
covid

Unnamed: 0,Country_ID,Country,Province,Date,Confirmed,Deaths,Recovered
0,210,Mainland China,Anhui,2020-01-22,1.0,0.0,0.0
1,210,Mainland China,Beijing,2020-01-22,14.0,0.0,0.0
2,210,Mainland China,Chongqing,2020-01-22,6.0,0.0,0.0
3,210,Mainland China,Fujian,2020-01-22,1.0,0.0,0.0
4,210,Mainland China,Gansu,2020-01-22,0.0,0.0,0.0
...,...,...,...,...,...,...,...
13524,197,Western Sahara,,2020-04-09,4.0,0.0,0.0
13525,158,Sao Tome and Principe,,2020-04-06,4.0,0.0,0.0
13526,158,Sao Tome and Principe,,2020-04-07,4.0,0.0,0.0
13527,158,Sao Tome and Principe,,2020-04-08,4.0,0.0,0.0


In [23]:
country_df[['Country_ID', 'Country']]

Unnamed: 0,Country_ID,Country
0,0,Azerbaijan
1,1,Afghanistan
2,2,Albania
3,3,Algeria
4,4,Andorra
...,...,...
237,237,"United Kingdom, Isle of Man, Crown Dependency"
238,238,"United Kingdom, Jersey, Crown Dependency"
239,239,Vanuatu
240,240,Virgin Islands


   # Load DataFrames into SQlite DB

In [24]:
engine = create_engine("sqlite:///COVID19_vs_H1N1.sqlite")
conn = engine.connect()

In [25]:
country_df.to_sql('country', con=engine, index=False, if_exists='append')

IntegrityError: (sqlite3.IntegrityError) UNIQUE constraint failed: country.Country_ID
[SQL: INSERT INTO country ("Country", "Country_ID") VALUES (?, ?)]
[parameters: ((' Azerbaijan', 0), ('Afghanistan', 1), ('Albania', 2), ('Algeria', 3), ('Andorra', 4), ('Angola', 5), ('Antigua and Barbuda', 6), ('Argentina', 7)  ... displaying 10 of 242 total bound parameter sets ...  ('Virgin Islands', 240), ('Yemen', 241))]
(Background on this error at: http://sqlalche.me/e/gkpj)

In [None]:
global_covid_data.to_sql('global_covid_data', con=engine, index=True, if_exists='append')
#engine.execute("SELECT * FROM global_covid_data").fetchall()

In [None]:
covid.to_sql('covid', con=engine, index=True, if_exists='append')

In [None]:
h1n1.to_sql(name='h1n1', con=engine, index=True, if_exists='append')

In [None]:
global_h1n1_data.to_sql('global_h1n1_data', con=engine, index=True, if_exists='append')

In [None]:
engine.table_names()

# Load DataFrames into database

In [None]:
#country_df.to_sql(name='country', con=engine, index=True, if_exists='replace')

In [None]:
#global_covid_data.to_sql(name='global_covid_data', con=engine, index=True, if_exists='replace')

In [None]:
#global_h1n1_data.to_sql(name='global_h1n1_data', con=engine, index=True, if_exists='replace')

In [None]:
#covid.to_sql(name='covid', con=engine, index=True, if_exists='replace')

In [None]:
#h1n1.to_sql(name='h1n1', con=engine, index=True, if_exists='replace')

In [27]:
tw = covid[covid.Country ==  "Israel"]

In [34]:
tw.iloc[8675,:]

IndexError: single positional indexer is out-of-bounds