In [1]:
# import dependencies
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, inspect

from config import (username, password)

### Extract CSVs into DataFrames

In [2]:
# read the source CSV into a DataFrame and preview its first rows
covid_data_file = "Resources/owid-covid-data.csv"
covid_data_df = pd.read_csv(filepath_or_buffer=covid_data_file)
covid_data_df.head(n=5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
4,AFG,Asia,Afghanistan,2020-02-28,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511


In [3]:
# inspect the DataFrame: prints each column with its non-null count and dtype
covid_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84747 entries, 0 to 84746
Data columns (total 59 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   iso_code                               84747 non-null  object 
 1   continent                              80654 non-null  object 
 2   location                               84747 non-null  object 
 3   date                                   84747 non-null  object 
 4   total_cases                            82674 non-null  float64
 5   new_cases                              82672 non-null  float64
 6   new_cases_smoothed                     81671 non-null  float64
 7   total_deaths                           73026 non-null  float64
 8   new_deaths                             73184 non-null  float64
 9   new_deaths_smoothed                    81671 non-null  float64
 10  total_cases_per_million                82227 non-null  float64
 11  ne

In [4]:
# (rows, columns) of the raw data
covid_data_df.shape

(84747, 59)

In [5]:
# list the distinct values found in the 'location' column
covid_data_df["location"].unique()

array(['Afghanistan', 'Africa', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Anguilla', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Asia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bermuda', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso',
       'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde',
       'Cayman Islands', 'Central African Republic', 'Chad', 'Chile',
       'China', 'Colombia', 'Comoros', 'Congo', 'Costa Rica',
       "Cote d'Ivoire", 'Croatia', 'Cuba', 'Curacao', 'Cyprus', 'Czechia',
       'Democratic Republic of Congo', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia',
       'Europe', 'European Union', 'Faeroe Islands', 'Falkland Islands',
       'Fij

In [6]:
# discard every row whose location is 'World' (dropped by index label)
covid_data_df = covid_data_df.drop(
    index=covid_data_df.index[covid_data_df["location"] == 'World']
)

In [7]:
# rename 'Micronesia (country)' to 'Micronesia'
# The replaced Series is assigned back to the column. Calling
# replace(inplace=True) on the attribute-accessed Series is a chained in-place
# update: it does not reliably write through to the parent DataFrame and has
# no effect at all when pandas Copy-on-Write is active.
covid_data_df["location"] = covid_data_df["location"].replace(
    'Micronesia (country)', 'Micronesia'
)

In [8]:
# the 'location' column is named 'country' from this point on
covid_data_df = covid_data_df.rename({"location": "country"}, axis="columns")

### Create COVID Cases DataFrame

In [9]:
# build the cases table: country/date plus the three case metrics,
# with row labels renumbered from 0
cases_cols = ["country", "date", "total_cases", "new_cases", "new_cases_smoothed"]
covidcases_df = covid_data_df.loc[:, cases_cols].reset_index(drop=True)

covidcases_df.head()

Unnamed: 0,country,date,total_cases,new_cases,new_cases_smoothed
0,Afghanistan,2020-02-24,1.0,1.0,
1,Afghanistan,2020-02-25,1.0,0.0,
2,Afghanistan,2020-02-26,1.0,0.0,
3,Afghanistan,2020-02-27,1.0,0.0,
4,Afghanistan,2020-02-28,1.0,0.0,


In [10]:
# inspect the DataFrame: (rows, columns) of the cases table
covidcases_df.shape

(84284, 5)

In [11]:
# summary statistics for the numeric case columns
covidcases_df.describe()

Unnamed: 0,total_cases,new_cases,new_cases_smoothed
count,82211.0,82209.0,81214.0
mean,563339.9,3991.503424,3976.174554
std,2927381.0,20186.92394,19604.646671
min,1.0,-74347.0,-6223.0
25%,906.0,2.0,6.714
50%,10053.0,68.0,82.857
75%,105560.0,735.0,765.2145
max,44575560.0,506677.0,479167.857


In [12]:
# count the rows in which all three case metrics (column positions 2-4) are missing
covidcases_df.iloc[:, 2:5].isna().all(axis=1).sum()

2067

In [13]:
# get the index labels of all rows with no data for cases
covidcases_null_rows = covidcases_df.index[covidcases_df.iloc[:, 2:5].isnull().all(axis=1)]

# drop empty rows
# The labels are handed to drop() directly. Re-indexing `covidcases_df.index`
# with them would interpret the labels as positions, which only selects the
# right rows while the index is an unbroken 0..n-1 range.
covidcases_df = covidcases_df.drop(index=covidcases_null_rows)

In [14]:
# verify that no row with all case metrics missing remains
covidcases_df.iloc[:, 2:5].isna().all(axis=1).sum()

0

In [15]:
# check if there are any duplicated rows (rows identical to an earlier row in every column)
covidcases_df.duplicated().sum()

0

In [16]:
# sum the case metrics per country
# numeric_only=True keeps the string 'date' column out of the aggregation;
# pandas >= 2.0 no longer drops non-numeric columns silently and would
# concatenate the date strings instead.
# NOTE(review): 'total_cases' looks cumulative, so its sum over dates is not a case count - confirm intent.
covidcases_group_country = covidcases_df.groupby("country").sum(numeric_only=True)
covidcases_group_country

Unnamed: 0_level_0,total_cases,new_cases,new_cases_smoothed
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,15057770.0,59370.0,58861.708
Africa,746508547.0,4532581.0,4505018.434
Albania,15682275.0,130859.0,130492.998
Algeria,23807619.0,121580.0,120944.853
Andorra,1962038.0,13148.0,13054.301
...,...,...,...
Venezuela,29344820.0,194959.0,191509.286
Vietnam,484934.0,2865.0,2842.014
Yemen,759751.0,6263.0,6139.846
Zambia,10625006.0,91484.0,91340.567


In [17]:
# sum the case metrics per date across all locations
# numeric_only=True keeps the string 'country' column out of the aggregation;
# pandas >= 2.0 no longer drops non-numeric columns silently and would
# concatenate the country names instead.
covidcases_group_date = covidcases_df.groupby("date").sum(numeric_only=True)
covidcases_group_date

Unnamed: 0_level_0,total_cases,new_cases,new_cases_smoothed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-22,1114.0,0.0,0.000
2020-01-23,1310.0,196.0,0.000
2020-01-24,1884.0,574.0,0.000
2020-01-25,2869.0,985.0,0.000
2020-01-26,4239.0,1370.0,0.000
...,...,...,...
2021-04-24,323056737.0,1747423.0,1763785.705
2021-04-25,324598332.0,1541595.0,1775848.569
2021-04-26,326028248.0,1429916.0,1770521.856
2021-04-27,327840870.0,1812622.0,1764169.004


In [18]:
# make 'country' the row index of the cases table
covidcases_df = covidcases_df.set_index("country")
covidcases_df.head()

Unnamed: 0_level_0,date,total_cases,new_cases,new_cases_smoothed
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,2020-02-24,1.0,1.0,
Afghanistan,2020-02-25,1.0,0.0,
Afghanistan,2020-02-26,1.0,0.0,
Afghanistan,2020-02-27,1.0,0.0,
Afghanistan,2020-02-28,1.0,0.0,


### Create COVID Deaths DataFrame

In [19]:
# build the deaths table: country/date plus the three death metrics,
# with row labels renumbered from 0
deaths_cols = ["country", "date", "total_deaths", "new_deaths", "new_deaths_smoothed"]
coviddeaths_df = covid_data_df.loc[:, deaths_cols].reset_index(drop=True)

coviddeaths_df.head()

Unnamed: 0,country,date,total_deaths,new_deaths,new_deaths_smoothed
0,Afghanistan,2020-02-24,,,
1,Afghanistan,2020-02-25,,,
2,Afghanistan,2020-02-26,,,
3,Afghanistan,2020-02-27,,,
4,Afghanistan,2020-02-28,,,


In [20]:
# inspect the DataFrame: (rows, columns) of the deaths table
coviddeaths_df.shape

(84284, 5)

In [21]:
# summary statistics for the numeric death columns
coviddeaths_df.describe()

Unnamed: 0,total_deaths,new_deaths,new_deaths_smoothed
count,72563.0,72721.0,81214.0
mean,15823.03,96.062485,84.945859
std,69305.25,421.441072,381.062946
min,1.0,-1918.0,-232.143
25%,44.0,0.0,0.0
50%,305.0,2.0,1.143
75%,2909.0,17.0,12.96425
max,1010531.0,7554.0,5608.286


In [22]:
# count the rows in which all three death metrics (column positions 2-4) are missing
coviddeaths_df.iloc[:, 2:5].isna().all(axis=1).sum()

2978

In [23]:
# get the index labels of all rows with no data for deaths
coviddeaths_null_rows = coviddeaths_df.index[coviddeaths_df.iloc[:, 2:5].isnull().all(axis=1)]

# drop empty rows
# The labels are handed to drop() directly. Re-indexing `coviddeaths_df.index`
# with them would interpret the labels as positions, which only selects the
# right rows while the index is an unbroken 0..n-1 range.
coviddeaths_df = coviddeaths_df.drop(index=coviddeaths_null_rows)

In [24]:
# verify that no row with all death metrics missing remains
coviddeaths_df.iloc[:, 2:5].isna().all(axis=1).sum()

0

In [25]:
# check if there are any duplicated rows (rows identical to an earlier row in every column)
coviddeaths_df.duplicated().sum()

0

In [26]:
# sum the death metrics per country
# numeric_only=True keeps the string 'date' column out of the aggregation;
# pandas >= 2.0 no longer drops non-numeric columns silently and would
# concatenate the date strings instead.
# NOTE(review): 'total_deaths' looks cumulative, so its sum over dates is not a death count - confirm intent.
coviddeaths_group_country = coviddeaths_df.groupby("country").sum(numeric_only=True)
coviddeaths_group_country

Unnamed: 0_level_0,total_deaths,new_deaths,new_deaths_smoothed
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,584309.0,2611.0,2582.994
Africa,18821611.0,121133.0,120165.285
Albania,300806.0,2386.0,2375.164
Algeria,730353.0,3234.0,3207.428
Andorra,28169.0,125.0,123.880
...,...,...,...
Venezuela,273397.0,2099.0,2045.294
Vietnam,9038.0,35.0,35.002
Yemen,197937.0,1216.0,1187.453
Zambia,166889.0,1249.0,1246.010


In [27]:
# sum the death metrics per date across all locations
# numeric_only=True keeps the string 'country' column out of the aggregation;
# pandas >= 2.0 no longer drops non-numeric columns silently and would
# concatenate the country names instead.
coviddeaths_group_date = coviddeaths_df.groupby("date").sum(numeric_only=True)
coviddeaths_group_date

Unnamed: 0_level_0,total_deaths,new_deaths,new_deaths_smoothed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-22,34.0,0.0,0.000
2020-01-23,36.0,2.0,0.000
2020-01-24,52.0,16.0,0.000
2020-01-25,84.0,32.0,0.000
2020-01-26,112.0,28.0,0.000
...,...,...,...
2021-04-24,6875299.0,28549.0,27660.004
2021-04-25,6896121.0,20822.0,27974.006
2021-04-26,6920430.0,24309.0,28126.436
2021-04-27,6951911.0,31481.0,28175.141


In [28]:
# make 'country' the row index of the deaths table
coviddeaths_df = coviddeaths_df.set_index("country")
coviddeaths_df.head()

Unnamed: 0_level_0,date,total_deaths,new_deaths,new_deaths_smoothed
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,2020-02-29,,,0.0
Afghanistan,2020-03-01,,,0.0
Afghanistan,2020-03-02,,,0.0
Afghanistan,2020-03-03,,,0.0
Afghanistan,2020-03-04,,,0.0


### Create DataFrame with additional health info for each country

In [29]:
# build the health table: country/date plus the five health indicators,
# with row labels renumbered from 0
health_cols = ["country", "date", "cardiovasc_death_rate", "diabetes_prevalence", "female_smokers", "male_smokers", "life_expectancy"]
health_df = covid_data_df.loc[:, health_cols].reset_index(drop=True)

health_df.head()

Unnamed: 0,country,date,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,life_expectancy
0,Afghanistan,2020-02-24,597.029,9.59,,,64.83
1,Afghanistan,2020-02-25,597.029,9.59,,,64.83
2,Afghanistan,2020-02-26,597.029,9.59,,,64.83
3,Afghanistan,2020-02-27,597.029,9.59,,,64.83
4,Afghanistan,2020-02-28,597.029,9.59,,,64.83


In [30]:
# inspect the DataFrame: (rows, columns) of the health table
health_df.shape

(84284, 7)

In [31]:
# summary statistics for the numeric health indicator columns
health_df.describe()

Unnamed: 0,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,life_expectancy
count,76799.0,77924.0,60072.0,59179.0,79968.0
mean,257.904808,7.81471,10.548806,32.640879,73.169969
std,119.097565,3.984979,10.432268,13.527264,7.570537
min,79.37,0.99,0.1,7.7,53.28
25%,167.295,5.29,1.9,21.4,67.88
50%,243.811,7.11,6.2,31.4,74.62
75%,329.635,10.08,19.3,41.1,78.73
max,724.417,30.53,44.0,78.1,86.75


In [32]:
# count the rows in which all five health indicators (column positions 2-6) are missing
health_df.iloc[:, 2:7].isna().all(axis=1).sum()

4316

In [33]:
# get the index labels of all rows with no data
health_null_rows = health_df.index[health_df.iloc[:, 2:7].isnull().all(axis=1)]

# drop empty rows
# The labels are handed to drop() directly. Re-indexing `health_df.index`
# with them would interpret the labels as positions, which only selects the
# right rows while the index is an unbroken 0..n-1 range.
health_df = health_df.drop(index=health_null_rows)

In [34]:
# verify that no row with all health indicators missing remains
health_df.iloc[:, 2:7].isna().all(axis=1).sum()

0

In [35]:
# count rows that repeat an earlier row once the 'date' column is ignored
health_df.drop(columns="date").duplicated().sum()

79762

In [36]:
# keep only the first row of each distinct country + indicator combination ('date' is ignored)
health_df = health_df.drop_duplicates(
    subset=[
        "country",
        "cardiovasc_death_rate",
        "diabetes_prevalence",
        "female_smokers",
        "male_smokers",
        "life_expectancy",
    ]
)

In [37]:
# average the health indicators per country
# numeric_only=True keeps the string 'date' column out of the aggregation;
# pandas >= 2.0 no longer drops non-numeric columns silently and raises a
# TypeError when asked for the mean of strings.
health_group_country = health_df.groupby("country").mean(numeric_only=True)
health_group_country

Unnamed: 0_level_0,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,life_expectancy
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,597.029,9.59,,,64.83
Albania,304.195,10.08,7.1,51.2,78.57
Algeria,278.364,6.73,0.7,30.4,76.88
Andorra,109.135,7.97,29.0,37.8,83.73
Angola,276.045,3.94,,,61.15
...,...,...,...,...,...
Venezuela,204.850,6.47,,,72.06
Vietnam,245.465,6.00,1.0,45.9,75.40
Yemen,495.003,5.35,7.6,29.2,66.12
Zambia,234.499,3.94,3.1,24.7,63.89


In [38]:
# (rows, columns) after removing the duplicated rows
health_df.shape

(206, 7)

In [39]:
# remove the 'date' column: further investigation showed the health indicators are one-off values per country
health_df = health_df.drop(columns="date")

In [40]:
# (rows, columns) after removing the 'date' column
health_df.shape

(206, 6)

In [41]:
# make 'country' the row index of the health table
health_df = health_df.set_index("country")
health_df.head()

Unnamed: 0_level_0,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,life_expectancy
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,597.029,9.59,,,64.83
Albania,304.195,10.08,7.1,51.2,78.57
Algeria,278.364,6.73,0.7,30.4,76.88
Andorra,109.135,7.97,29.0,37.8,83.73
Angola,276.045,3.94,,,61.15


### Create DataFrame with additional medical facilities info for each country

In [42]:
# build the medical facilities table: country/date plus the two facility indicators,
# with row labels renumbered from 0
medical_cols = ["country", "date", "handwashing_facilities", "hospital_beds_per_thousand"]
medical_facilities_df = covid_data_df.loc[:, medical_cols].reset_index(drop=True)

medical_facilities_df.head()

Unnamed: 0,country,date,handwashing_facilities,hospital_beds_per_thousand
0,Afghanistan,2020-02-24,37.746,0.5
1,Afghanistan,2020-02-25,37.746,0.5
2,Afghanistan,2020-02-26,37.746,0.5
3,Afghanistan,2020-02-27,37.746,0.5
4,Afghanistan,2020-02-28,37.746,0.5


In [43]:
# inspect the DataFrame: (rows, columns) of the medical facilities table
medical_facilities_df.shape

(84284, 4)

In [44]:
# summary statistics for the numeric medical facility columns
medical_facilities_df.describe()

Unnamed: 0,handwashing_facilities,hospital_beds_per_thousand
count,38354.0,70043.0
mean,50.808191,3.031649
std,31.938333,2.471673
min,1.188,0.1
25%,19.351,1.3
50%,49.839,2.397
75%,83.241,4.0
max,98.999,13.8


In [45]:
# count the rows in which both facility indicators (column positions 2-3) are missing
medical_facilities_df.iloc[:, 2:4].isna().all(axis=1).sum()

8375

In [46]:
# get the index labels of all rows with no data
medical_null_rows = medical_facilities_df.index[medical_facilities_df.iloc[:, 2:4].isnull().all(axis=1)]

# drop empty rows
# The labels are handed to drop() directly. Re-indexing
# `medical_facilities_df.index` with them would interpret the labels as
# positions, which only selects the right rows while the index is an unbroken
# 0..n-1 range.
medical_facilities_df = medical_facilities_df.drop(index=medical_null_rows)

In [47]:
# verify that no row with both facility indicators missing remains
medical_facilities_df.iloc[:, 2:4].isna().all(axis=1).sum()

0

In [48]:
# count rows that repeat an earlier row once the 'date' column is ignored
medical_facilities_df.drop(columns="date").duplicated().sum()

75726

In [49]:
# keep only the first row of each distinct country + indicator combination ('date' is ignored)
medical_facilities_df = medical_facilities_df.drop_duplicates(
    subset=[
        "country",
        "handwashing_facilities",
        "hospital_beds_per_thousand",
    ]
)

In [50]:
# average the facility indicators per country
# numeric_only=True keeps the string 'date' column out of the aggregation;
# pandas >= 2.0 no longer drops non-numeric columns silently and raises a
# TypeError when asked for the mean of strings.
medical_facilities_group_country = medical_facilities_df.groupby("country").mean(numeric_only=True)
medical_facilities_group_country

Unnamed: 0_level_0,handwashing_facilities,hospital_beds_per_thousand
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,37.746,0.50
Albania,,2.89
Algeria,83.741,1.90
Angola,26.664,
Antigua and Barbuda,,3.80
...,...,...
Venezuela,,0.80
Vietnam,85.847,2.60
Yemen,49.542,0.70
Zambia,13.938,2.00


In [51]:
# (rows, columns) after removing the duplicated rows
medical_facilities_df.shape

(183, 4)

In [52]:
# remove the 'date' column: further investigation showed the medical facilities indicators are one-off values per country
medical_facilities_df = medical_facilities_df.drop(columns="date")

In [53]:
# (rows, columns) after removing the 'date' column
medical_facilities_df.shape

(183, 3)

In [54]:
# make 'country' the row index of the medical facilities table
medical_facilities_df = medical_facilities_df.set_index("country")
medical_facilities_df.head()

Unnamed: 0_level_0,handwashing_facilities,hospital_beds_per_thousand
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,37.746,0.5
Albania,,2.89
Algeria,83.741,1.9
Angola,26.664,
Antigua and Barbuda,,3.8


### Create database connection

In [55]:
# create an engine for the `country_db` database
# The credentials are percent-encoded before being placed in the URL so that
# characters such as '@', ':' or '/' in the username or password cannot be
# mistaken for URL delimiters.
engine_path = (
    f"postgresql://{quote_plus(username)}:{quote_plus(password)}@localhost:5432/country_db"
)
engine = create_engine(engine_path)

In [56]:
# confirm tables
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the inspector returns the same list of table names.
inspect(engine).get_table_names()

['covidcases', 'coviddeaths', 'health', 'medical_facilities']

### Load DataFrames into database

In [57]:
# append the cases rows to the 'covidcases' table, writing the index as a column
# (if_exists="append" adds the rows again on every run of this cell)
covidcases_df.to_sql("covidcases", engine, index=True, if_exists="append")

In [58]:
# append the deaths rows to the 'coviddeaths' table, writing the index as a column
# (if_exists="append" adds the rows again on every run of this cell)
coviddeaths_df.to_sql("coviddeaths", engine, index=True, if_exists="append")

In [59]:
# append the health rows to the 'health' table, writing the index as a column
# (if_exists="append" adds the rows again on every run of this cell)
health_df.to_sql("health", engine, index=True, if_exists="append")

In [60]:
# append the medical facilities rows to the 'medical_facilities' table, writing the index as a column
# (if_exists="append" adds the rows again on every run of this cell)
medical_facilities_df.to_sql("medical_facilities", engine, index=True, if_exists="append")