In [154]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# render every column of a DataFrame instead of truncating wide frames
pd.options.display.max_columns = None
# default size (width, height) applied to every matplotlib figure
plt.rcParams.update({"figure.figsize": (24, 12)})

In [155]:
# Fetch the currently published OWID COVID-19 table.
# The URL points at the repository's master branch, so the rows returned depend on when this cell runs.
OWID_COVID_CSV_URL = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"

# parse the "date" column into datetimes while reading
df = pd.read_csv(OWID_COVID_CSV_URL, parse_dates=["date"])
df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,0.126,0.126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,39835428.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,0.126,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,39835428.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,0.126,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,39835428.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,0.126,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,39835428.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,0.126,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,39835428.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,,,,


In [156]:
# Columns kept for the analysis; everything else in the raw table is discarded.
# The order below is the column order of the resulting dataframe.
attributes = [
    # record identifiers
    "iso_code", "continent", "location", "date",
    # cases, deaths and spread
    "total_cases", "new_cases", "total_deaths", "new_deaths", "reproduction_rate",
    # hospital load
    "icu_patients", "hosp_patients", "weekly_icu_admissions", "weekly_hosp_admissions",
    # testing
    "new_tests", "total_tests", "positive_rate",
    # vaccination
    "total_vaccinations", "people_vaccinated", "people_fully_vaccinated", "new_vaccinations",
    # policy response
    "stringency_index",
    # population and economy
    "population", "population_density", "median_age", "aged_65_older", "aged_70_older",
    "gdp_per_capita", "extreme_poverty",
    # health and development indicators
    "cardiovasc_death_rate", "diabetes_prevalence", "female_smokers", "male_smokers",
    "handwashing_facilities", "hospital_beds_per_thousand", "life_expectancy",
    "human_development_index",
    # excess mortality
    "excess_mortality",
]

# restrict the dataframe to the selected columns
df = df[attributes]

In [164]:
# per column: True if it holds at least one missing (NaN) value
df.isnull().any(axis=0)

pandas.core.series.Series

In [158]:
# continent has NaNs while location has none, which is unexpected -> look at the first few affected rows
df.loc[df["continent"].isna()].head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,reproduction_rate,icu_patients,hosp_patients,weekly_icu_admissions,weekly_hosp_admissions,new_tests,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality
617,OWID_AFR,,Africa,2020-02-13,,0.0,,0.0,,,,,,,,,,,,,,1373486000.0,,,,,,,,,,,,,,,
618,OWID_AFR,,Africa,2020-02-14,1.0,1.0,,0.0,,,,,,,,,,,,,,1373486000.0,,,,,,,,,,,,,,,
619,OWID_AFR,,Africa,2020-02-15,1.0,0.0,,0.0,,,,,,,,,,,,,,1373486000.0,,,,,,,,,,,,,,,
620,OWID_AFR,,Africa,2020-02-16,1.0,0.0,,0.0,,,,,,,,,,,,,,1373486000.0,,,,,,,,,,,,,,,
621,OWID_AFR,,Africa,2020-02-17,1.0,0.0,,0.0,,,,,,,,,,,,,,1373486000.0,,,,,,,,,,,,,,,


In [159]:
# Rows without a continent combine the data of a whole continent/region (see the printed list below).
# Store them in a separate dataframe so they cannot cause mistakes or confusion later on.
# NOTE: this cell is not idempotent - once df has been filtered, re-running it leaves continent_df empty;
# re-run from the data loading cell instead.
is_aggregate_row = df["continent"].isna()  # computed once and reused for the check and both splits

# before we split we should be sure that really only these combined rows have a NaN continent
print("Unique locations where continent is NaN", df.loc[is_aggregate_row, "location"].unique())

# looking good, so let's create the new dataframe;
# .copy() makes it an independent frame, so assigning columns to it later does not raise SettingWithCopyWarning
continent_df = df.loc[is_aggregate_row].copy()

# drop these rows from the original dataframe (again as an independent copy)
df = df.loc[~is_aggregate_row].copy()

Unique locations where continent is NaN ['Africa' 'Asia' 'Europe' 'European Union' 'International' 'North America'
 'Oceania' 'South America' 'World']


In [160]:
# new_cases is the most significant attribute, so we expect it to be (almost) complete - check that per location
rows_by_location = df.groupby("location")
print("Percentage of non-missing values for each country in the column new_cases")
# non-NaN new_cases entries divided by all rows of the location -> a fraction between 0 and 1
rows_by_location["new_cases"].count() / rows_by_location.size()

Percentage of non-missing values for each country in the column new_cases


location
Afghanistan          1.000000
Albania              0.978896
Algeria              1.000000
Andorra              1.000000
Angola               1.000000
                       ...   
Vietnam              1.000000
Wallis and Futuna    0.000000
Yemen                1.000000
Zambia               1.000000
Zimbabwe             1.000000
Length: 224, dtype: float64

In [161]:
# Some countries have no new_cases values at all (e.g. Wallis and Futuna) or simply too few of them;
# those are useless to us.
# Keep only countries where at least half of the new_cases entries are present
# (with at least half of the values available, the gaps can be fixed with interpolation later).
MIN_NEW_CASES_COVERAGE = 0.5

rows_by_location = df.groupby("location")
# fraction of non-missing new_cases entries per location
new_cases_coverage = rows_by_location["new_cases"].count() / rows_by_location.size()
locations_to_keep = new_cases_coverage[new_cases_coverage >= MIN_NEW_CASES_COVERAGE].index.tolist()

# drop all rows of locations that do not fulfil the criterion above
df = df[df["location"].isin(locations_to_keep)]