Import pandas, numpy and os.

In [1]:
import pandas as pd
import numpy as np
import os

Read in the OWID Covid tracking dataset, which is a composite of a variety of datasets; their sources can be found [here](https://ourworldindata.org/coronavirus-source-data).

In [2]:
# Load the OWID export (path is relative to the notebook's working directory).
# owid_df is never reassigned in the cells shown here, so it stays available as the raw reference frame.
owid_df = pd.read_csv('owid-covid-data-2.csv')

In [3]:
# (rows, columns) of the raw frame.
owid_df.shape

(152004, 67)

In [4]:
# Preview the first rows of the raw frame to eyeball its layout.
owid_df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


In [5]:
# Every column label in the raw frame, to pick the analysis columns from.
owid_df.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'new_tests', 'total_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

Now to remove the columns that I'm not including in my analysis and load the remaining ones into a new dataframe. I'll leave the original intact so that I can reference it later for controls.


In [6]:
# Columns carried into the analysis frame; everything else stays only in owid_df.
analysis_columns = [
    'iso_code', 'continent', 'location', 'date',
    'total_cases', 'new_cases', 'total_deaths', 'new_deaths',
    'total_cases_per_million', 'new_cases_per_million',
    'total_deaths_per_million', 'new_deaths_per_million',
    'stringency_index', 'excess_mortality',
]
# Take a copy so that later changes to df never write through to owid_df.
df = owid_df.loc[:, analysis_columns].copy()



In [7]:
# (rows, columns) after the column selection.
df.shape

(152004, 14)

In [8]:
# Preview the trimmed frame.
df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,new_cases_per_million,total_deaths_per_million,new_deaths_per_million,stringency_index,excess_mortality
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,0.126,0.126,,,8.33,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,0.126,0.0,,,8.33,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,0.126,0.0,,,8.33,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,0.126,0.0,,,8.33,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,0.126,0.0,,,8.33,


One of the key statistics I'll be using in my analysis is excess mortality, as it's generally regarded as the best way of cutting through noise to get to actual Covid outcomes in a given area. However, many countries don't keep, and historically haven't kept, good enough data on their mortality levels, so I'll be limiting my analysis to countries that have excess mortality data available.

In [9]:
# Rows that actually report an excess-mortality figure.
has_excess_mortality = df['excess_mortality'].notna()
temp = df.loc[has_excess_mortality]

This is the list of countries that have at least some information on excess mortality, so I'll constrain most of my analysis to these.

In [10]:
# Show each distinct location in temp once; going through set() means the printed order is arbitrary.
print(set(temp['location'].unique()))

{'Bolivia', 'Romania', 'Iran', 'Australia', 'Peru', 'Jamaica', 'Dominican Republic', 'Montenegro', 'Spain', 'Tunisia', 'Canada', 'Bosnia and Herzegovina', 'Barbados', 'Nicaragua', 'Belarus', 'Chile', 'Israel', 'Russia', 'Belgium', 'Latvia', 'Germany', 'Greenland', 'Faeroe Islands', 'New Caledonia', 'Norway', 'Egypt', 'Georgia', 'Qatar', 'Gibraltar', 'Lebanon', 'Costa Rica', 'Hong Kong', 'Japan', 'El Salvador', 'Brazil', 'France', 'United States', 'Panama', 'North Macedonia', 'Kazakhstan', 'Malta', 'Uzbekistan', 'Cyprus', 'San Marino', 'Moldova', 'New Zealand', 'South Africa', 'Azerbaijan', 'Argentina', 'Armenia', 'Mongolia', 'Jordan', 'Mexico', 'Liechtenstein', 'Palestine', 'Taiwan', 'Hungary', 'Portugal', 'United Kingdom', 'Mauritius', 'Denmark', 'French Polynesia', 'Maldives', 'Macao', 'Ecuador', 'Aruba', 'Cuba', 'Andorra', 'Slovenia', 'Luxembourg', 'Singapore', 'Kuwait', 'Lithuania', 'Seychelles', 'Serbia', 'Paraguay', 'Albania', 'South Korea', 'Croatia', 'Kosovo', 'Monaco', 'Irelan

In [11]:
# unique() already de-duplicates, so its length is the number of distinct locations.
print(len(temp['location'].unique()))

108


In [13]:
# Build the allow-list directly from the locations present in temp (the rows
# with a non-null excess_mortality value) instead of pasting the printed set
# back in by hand. A pasted literal silently goes stale when the CSV is
# refreshed; deriving it keeps the list in step with the data.
my_list = temp['location'].unique().tolist()

In [14]:
# Keep only the rows whose location appears in my_list.
on_allow_list = df['location'].isin(my_list)
df = df.loc[on_allow_list]

In [15]:
# (rows, columns) after restricting df to the locations in my_list.
df.shape

(73574, 14)

Now we'll filter down our data frame to include only countries that contain information on their stringency as well.

In [16]:
# Rows that carry a stringency_index value.
has_stringency = df['stringency_index'].notna()
temp = df.loc[has_stringency]

In [17]:
# Show each distinct location in temp once; going through set() means the printed order is arbitrary.
print(set(temp['location'].unique()))

{'Bolivia', 'Romania', 'Iran', 'Australia', 'Peru', 'Jamaica', 'Dominican Republic', 'Spain', 'Tunisia', 'Canada', 'Bosnia and Herzegovina', 'Barbados', 'Nicaragua', 'Belarus', 'Chile', 'Israel', 'Russia', 'Belgium', 'Latvia', 'Germany', 'Greenland', 'Faeroe Islands', 'Norway', 'Egypt', 'Georgia', 'Qatar', 'Lebanon', 'Costa Rica', 'Hong Kong', 'Japan', 'El Salvador', 'Brazil', 'France', 'United States', 'Kazakhstan', 'Malta', 'Uzbekistan', 'Cyprus', 'San Marino', 'Moldova', 'New Zealand', 'South Africa', 'Azerbaijan', 'Argentina', 'Italy', 'Mongolia', 'Jordan', 'Palestine', 'Mexico', 'Liechtenstein', 'Taiwan', 'Hungary', 'Portugal', 'United Kingdom', 'Mauritius', 'Denmark', 'Macao', 'Ecuador', 'Aruba', 'Cuba', 'Andorra', 'Slovenia', 'Luxembourg', 'Singapore', 'Kuwait', 'Lithuania', 'Seychelles', 'Serbia', 'Paraguay', 'Albania', 'South Korea', 'Croatia', 'Kosovo', 'Monaco', 'Ireland', 'Kyrgyzstan', 'Bermuda', 'Austria', 'Malaysia', 'Czechia', 'Brunei', 'Belize', 'Poland', 'Thailand', 'S

In [19]:
# Take the allow-list from the locations present in temp (the rows with a
# non-null stringency_index) rather than a hand-copied literal, so the list
# cannot drift from the dataset when the source CSV changes.
my_list = temp['location'].unique().tolist()

In [20]:
# Drop every row whose location is not in my_list.
location_selected = df['location'].isin(my_list)
df = df.loc[location_selected]

In [21]:
# (rows, columns) after the second location filter.
df.shape

(68246, 14)

In [22]:
# Preview the filtered frame; the row labels are the ones inherited from owid_df, not renumbered.
df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,new_cases_per_million,total_deaths_per_million,new_deaths_per_million,stringency_index,excess_mortality
1371,ALB,Europe,Albania,2020-02-25,,,,,,,,,8.33,
1372,ALB,Europe,Albania,2020-02-26,,,,,,,,,8.33,
1373,ALB,Europe,Albania,2020-02-27,,,,,,,,,8.33,
1374,ALB,Europe,Albania,2020-02-28,,,,,,,,,8.33,
1375,ALB,Europe,Albania,2020-02-29,,,,,,,,,8.33,2.88


In [23]:
# Persist the filtered frame; index=False leaves the row labels out of the CSV.
df.to_csv('data_cleaning_df_2.csv', index=False)