Import pandas, numpy and os.

In [1]:
import pandas as pd
import numpy as np
import os

Read in the OWID Covid tracking dataset, which is a composite of a variety of datasets; the sources can be found [here](https://ourworldindata.org/coronavirus-source-data).

In [2]:
# Load the raw OWID export from the notebook's working directory.
owid_df = pd.read_csv(filepath_or_buffer='owid-covid-data.csv')

In [3]:
# Dimensions of the raw frame as (rows, columns).
owid_df.shape

(118134, 62)

In [4]:
# Preview the first five rows of the raw frame.
owid_df.head(n=5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,597.029,9.59,,,37.746,0.5,64.83,0.511,


In [5]:
# List every column label available in the raw frame.
owid_df.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'new_tests', 'total_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

Next, I'll drop the columns that I'm not including in my analysis by loading only the ones I need into a new dataframe. I'll leave the original intact so that I can reference it later for controls.


In [6]:
# Keep only the fields used in the analysis. The .copy() makes df an
# independent frame, so later changes to df do not touch owid_df.
df = owid_df.loc[
    :,
    [
        'iso_code', 'continent', 'location', 'date',
        'total_cases', 'new_cases',
        'total_deaths', 'new_deaths',
        'total_cases_per_million', 'new_cases_per_million',
        'total_deaths_per_million', 'new_deaths_per_million',
        'stringency_index', 'excess_mortality',
    ],
].copy()



In [7]:
# Shape of the column-subset frame as (rows, columns).
df.shape

(118134, 14)

In [8]:
# Preview the first five rows of the column-subset frame.
df.head(n=5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,new_cases_per_million,total_deaths_per_million,new_deaths_per_million,stringency_index,excess_mortality
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,0.126,0.126,,,8.33,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,0.126,0.0,,,8.33,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,0.126,0.0,,,8.33,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,0.126,0.0,,,8.33,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,0.126,0.0,,,8.33,


One of the key statistics I'll be using in my analysis is excess mortality, as it's generally regarded as the best way of cutting through noise to get to actual Covid outcomes in a given area. However, many countries don't keep, and historically haven't kept, good enough data on their mortality levels, so I'll be limiting my analysis to countries that have excess mortality data available.

In [9]:
# Rows that carry an excess_mortality value (missing entries dropped).
temp = df.dropna(subset=['excess_mortality'])

This is the list of countries that have at least some information on excess mortality, so I'll constrain most of my analysis to these.

In [10]:
# Print the distinct locations present in temp.
# .unique() already removes duplicates, so wrapping it in set() added nothing
# except an iteration order that can change between kernel restarts (string
# hashing is randomised per process). Sorting keeps the printed list stable.
# dropna() guards sorted() against a missing location label.
print(sorted(temp['location'].dropna().unique()))

{'Slovenia', 'Moldova', 'Brazil', 'Cuba', 'Taiwan', 'Belarus', 'Seychelles', 'Mauritius', 'France', 'Kazakhstan', 'Romania', 'El Salvador', 'Faeroe Islands', 'Estonia', 'Netherlands', 'Uzbekistan', 'Ireland', 'Switzerland', 'Denmark', 'Slovakia', 'Portugal', 'United States', 'Antigua and Barbuda', 'Jamaica', 'New Zealand', 'Costa Rica', 'Canada', 'Albania', 'Poland', 'Finland', 'Oman', 'Iran', 'Germany', 'Tunisia', 'Bosnia and Herzegovina', 'Mexico', 'Japan', 'Lebanon', 'Norway', 'Peru', 'Lithuania', 'Paraguay', 'Panama', 'Iceland', 'Nicaragua', 'Qatar', 'Serbia', 'Hong Kong', 'Philippines', 'Mongolia', 'Montenegro', 'Italy', 'Argentina', 'Russia', 'Belgium', 'Liechtenstein', 'Chile', 'Greece', 'Kosovo', 'Colombia', 'Luxembourg', 'North Macedonia', 'Ukraine', 'Armenia', 'Andorra', 'Guatemala', 'Ecuador', 'Cyprus', 'Singapore', 'Hungary', 'Sweden', 'Latvia', 'Australia', 'Gibraltar', 'South Korea', 'Malaysia', 'Bulgaria', 'Monaco', 'Kyrgyzstan', 'Thailand', 'Bolivia', 'Uruguay', 'Israel

In [11]:
# Number of distinct locations in temp (a missing label would count once).
print(temp['location'].nunique(dropna=False))

94


In [12]:
# Locations to keep in the analysis frame (94 entries).
# NOTE(review): this looks hand-pasted from the printed set of locations in
# `temp`; confirm, and consider deriving it from temp['location'].unique()
# so it cannot drift from the data.
my_list = [
    "Azerbaijan", "Ukraine", "Canada", "Panama", "Latvia", "Mauritius",
    "Taiwan", "Macao", "Moldova", "Portugal", "Spain", "Bolivia",
    "Monaco", "Uruguay", "Singapore", "Denmark", "Japan", "Belgium",
    "Austria", "Colombia", "Hungary", "Bosnia and Herzegovina", "Kyrgyzstan", "Chile",
    "Romania", "Bulgaria", "Germany", "El Salvador", "Paraguay", "Oman",
    "Czechia", "Egypt", "Armenia", "Qatar", "Finland", "Argentina",
    "Ecuador", "Norway", "Serbia", "Croatia", "Mexico", "Guatemala",
    "Kazakhstan", "Sweden", "Montenegro", "New Zealand", "Philippines", "San Marino",
    "Costa Rica", "South Korea", "Mongolia", "Peru", "France", "Estonia",
    "Faeroe Islands", "Seychelles", "Poland", "Gibraltar", "Tunisia", "United Kingdom",
    "Ireland", "Switzerland", "Jamaica", "Australia", "Iran", "Malta",
    "Cyprus", "Kosovo", "United States", "Slovakia", "Andorra", "Israel",
    "Lithuania", "Slovenia", "Georgia", "Netherlands", "Belarus", "Thailand",
    "Albania", "Liechtenstein", "Malaysia", "Hong Kong", "Luxembourg", "Iceland",
    "Nicaragua", "Russia", "Italy", "Lebanon", "Uzbekistan", "Antigua and Barbuda",
    "Cuba", "Greece", "Brazil", "North Macedonia",
]

In [13]:
# Restrict df to rows whose location appears in my_list.
in_my_list = df['location'].isin(my_list)
df = df.loc[in_my_list]

In [14]:
# Shape as (rows, columns) after restricting df to the locations in my_list.
df.shape

(53407, 14)

Now we'll filter down our data frame to include only countries that contain information on their stringency as well.

In [15]:
# Rows that carry a stringency_index value (missing entries dropped).
temp = df.dropna(subset=['stringency_index'])

In [16]:
# Print the distinct locations present in temp.
# .unique() already removes duplicates, so wrapping it in set() added nothing
# except an iteration order that can change between kernel restarts (string
# hashing is randomised per process). Sorting keeps the printed list stable.
# dropna() guards sorted() against a missing location label.
print(sorted(temp['location'].dropna().unique()))

{'Slovenia', 'Moldova', 'Brazil', 'Cuba', 'Taiwan', 'Belarus', 'Seychelles', 'Mauritius', 'France', 'Kazakhstan', 'Romania', 'El Salvador', 'Faeroe Islands', 'Estonia', 'Netherlands', 'Uzbekistan', 'Ireland', 'Switzerland', 'Denmark', 'Slovakia', 'Portugal', 'United States', 'Jamaica', 'New Zealand', 'Costa Rica', 'Canada', 'Albania', 'Poland', 'Finland', 'Oman', 'Iran', 'Germany', 'Tunisia', 'Bosnia and Herzegovina', 'Mexico', 'Japan', 'Lebanon', 'Norway', 'Peru', 'Lithuania', 'Paraguay', 'Panama', 'Iceland', 'Nicaragua', 'Qatar', 'Serbia', 'Hong Kong', 'Philippines', 'Mongolia', 'Italy', 'Argentina', 'Russia', 'Belgium', 'Liechtenstein', 'Chile', 'Greece', 'Kosovo', 'Colombia', 'Luxembourg', 'Ukraine', 'Andorra', 'Guatemala', 'Ecuador', 'Cyprus', 'Singapore', 'Hungary', 'Sweden', 'Latvia', 'Australia', 'South Korea', 'Malaysia', 'Bulgaria', 'Monaco', 'Kyrgyzstan', 'Thailand', 'Bolivia', 'Uruguay', 'Israel', 'Malta', 'Macao', 'Czechia', 'Egypt', 'San Marino', 'Georgia', 'Spain', 'Aust

In [17]:
# Locations to keep in the analysis frame (89 entries).
# NOTE(review): this looks hand-pasted from the printed set of locations in
# `temp`; confirm, and consider deriving it from temp['location'].unique()
# so it cannot drift from the data.
my_list = [
    "Australia", "Latvia", "San Marino", "Hungary", "Denmark", "Portugal",
    "Kazakhstan", "Oman", "Belgium", "Estonia", "Albania", "Hong Kong",
    "Slovenia", "Sweden", "Tunisia", "Austria", "Chile", "Germany",
    "Costa Rica", "France", "Switzerland", "Brazil", "Italy", "Iceland",
    "Lebanon", "Romania", "Bosnia and Herzegovina", "Uzbekistan", "Mongolia", "Iran",
    "Mauritius", "Bulgaria", "Netherlands", "Guatemala", "United Kingdom", "Slovakia",
    "Greece", "Georgia", "Andorra", "El Salvador", "Poland", "Liechtenstein",
    "Seychelles", "Belarus", "Israel", "Japan", "Kosovo", "Panama",
    "New Zealand", "Egypt", "Serbia", "Kyrgyzstan", "Faeroe Islands", "Ukraine",
    "Czechia", "Canada", "Argentina", "Luxembourg", "Mexico", "Bolivia",
    "Lithuania", "Macao", "Singapore", "Azerbaijan", "Malaysia", "Norway",
    "Monaco", "United States", "Cyprus", "Jamaica", "Taiwan", "Finland",
    "Malta", "Peru", "Croatia", "Philippines", "South Korea", "Ecuador",
    "Moldova", "Colombia", "Uruguay", "Cuba", "Spain", "Russia",
    "Thailand", "Nicaragua", "Paraguay", "Ireland", "Qatar",
]

In [18]:
# Restrict df to rows whose location appears in my_list.
in_my_list = df['location'].isin(my_list)
df = df.loc[in_my_list]

In [19]:
# Shape as (rows, columns) after restricting df to the locations in my_list.
df.shape

(50903, 14)

In [20]:
# Preview the first five rows of the filtered frame.
df.head(n=5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,new_cases_per_million,total_deaths_per_million,new_deaths_per_million,stringency_index,excess_mortality
1161,ALB,Europe,Albania,2020-02-25,,,,,,,,,8.33,
1162,ALB,Europe,Albania,2020-02-26,,,,,,,,,8.33,
1163,ALB,Europe,Albania,2020-02-27,,,,,,,,,8.33,
1164,ALB,Europe,Albania,2020-02-28,,,,,,,,,8.33,
1165,ALB,Europe,Albania,2020-02-29,,,,,,,,,8.33,2.17


In [21]:
# Write the filtered frame to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='data_cleaning_df.csv', index=False)