## Get Raw Data

In [2]:
import os

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

In [4]:
# Source: Johns Hopkins CSSE global confirmed-cases time series, read straight
# from GitHub so every run picks up the latest published counts.
# Browsable (non-raw) page:
# https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv
# Offline alternative: point pd.read_csv at a local copy instead,
# e.g. "Resources/time_series_19-covid-Confirmed.csv".

url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'

# Sort alphabetically by country and renumber the rows 0..n-1.
raw_covid_df = (
    pd.read_csv(url)
    .sort_values(by='Country/Region')
    .reset_index(drop=True)
)

In [5]:
# Cleaning: strip parentheses from country names.
# regex=False makes the literal match explicit: '(' and ')' are regex
# metacharacters, and the default value of `regex` differs between pandas
# versions, so relying on the default is fragile.
raw_covid_df['Country/Region'] = (
    raw_covid_df['Country/Region']
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
raw_covid_df.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,21,22,22,22,24,24,40,40,74,84
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,51,55,59,64,70,76,89,104,123,146
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,54,60,74,87,90,139,201,230,264,302
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,2,39,39,53,75,88,113,133,164,188
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,0,0,0,0,1,2,2,3,3,3


## Separating Databases

In [6]:
# Locale index table: one row per source row, pairing its coordinates with a
# sequential integer key.

# The key is stored as the first column of raw_covid_df. The insert is guarded
# so this cell can be re-run safely: DataFrame.insert raises ValueError when
# the column already exists.
index = np.arange(len(raw_covid_df))
if 'Locale Index' not in raw_covid_df.columns:
    raw_covid_df.insert(0, 'Locale Index', index)

# Select the three columns and rename them to the lower-case names used by the
# locale table.
locale_index = raw_covid_df[['Lat', 'Long', 'Locale Index']].rename(
    columns={'Lat': 'lat', 'Long': 'long', 'Locale Index': 'index'}
)

In [7]:
# States DB: rows that name a province/state, excluding entries whose
# Province/State contains a comma (those are handled as cities).
states_dirty = raw_covid_df.loc[
    ~raw_covid_df['Province/State'].str.contains(',', na=False)
]
# Country-level rows have no Province/State value; drop them.
states = states_dirty.dropna(subset=['Province/State'])

In [8]:
# Cities DB (only in the US for now): rows whose Province/State contains a
# comma. Missing Province/State values count as "not a city" (na=False).
cities = raw_covid_df.loc[
    raw_covid_df['Province/State'].str.contains(',', na=False)
]

In [10]:
# Countries-only table: one row per country with summed daily counts.

# Remove cities (keeping them would create case redundancies).
covid_countries = raw_covid_df[~raw_covid_df["Province/State"].str.contains(',', na=False)]

# Drop the text and coordinate columns *before* aggregating so that sum() only
# ever sees numeric columns; relying on groupby to discard the non-numeric
# 'Province/State' column is version-dependent in pandas.
# NOTE(review): 'Locale Index' is still summed along with the case counts, so
# for countries spanning several rows it is not a valid row key -- confirm
# whether it should be dropped from this table.
covid_countries = (
    covid_countries
    .drop(columns=['Province/State', 'Lat', 'Long'])
    .groupby('Country/Region')
    .sum()
)

covid_countries.head()

Unnamed: 0_level_0,Locale Index,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,...,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0,0,0,0,0,0,0,0,0,0,...,21,22,22,22,24,24,40,40,74,84
Albania,1,0,0,0,0,0,0,0,0,0,...,51,55,59,64,70,76,89,104,123,146
Algeria,2,0,0,0,0,0,0,0,0,0,...,54,60,74,87,90,139,201,230,264,302
Andorra,3,0,0,0,0,0,0,0,0,0,...,2,39,39,53,75,88,113,133,164,188
Angola,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,2,2,3,3,3


## Begin SQL Connection

In [11]:
# Connection settings for the Postgres database.
# The "user:password@host:port/database" part can be supplied through the
# COVID_DB_CONNECTION environment variable so real credentials never have to
# be committed with the notebook; when it is unset, the local development
# default below is used.
rds_connection_string = os.environ.get(
    'COVID_DB_CONNECTION',
    'postgres:postgres@localhost:5432/covid_db',
)
engine = create_engine(f'postgresql://{rds_connection_string}')

In [12]:
# Write the per-country table to the database; index=True stores the
# Country/Region index as a regular column.
# Note: if constraints are defined on the table in SQL, if_exists='replace'
# would not work.
covid_countries.to_sql(
    con=engine,
    name='covid_country',
    index=True,
    if_exists='replace',
)

In [13]:
# List the tables in the database to confirm the write succeeded.
# Engine.table_names() is deprecated and removed in SQLAlchemy 2.0; the
# Inspector API is the supported replacement and returns the same list of names.
inspect(engine).get_table_names()

['covid_country']

In [18]:
# Read the table back from the database and preview the first rows to confirm
# the round trip.
pd.read_sql_query(
    sql='select * from covid_country',
    con=engine,
).head()

SyntaxError: invalid syntax (<ipython-input-18-618dceba0c2d>, line 1)

In [None]:
# Example SQL query (the column name must be double-quoted because it contains a slash and capital letters):
# select * from covid_country where "Country/Region" = 'US' or "Country/Region" = 'Italy' or "Country/Region" = 'China';