In [348]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option("display.max_columns", None)
plt.rcParams["figure.figsize"] = (24, 12)

In [349]:
# get currently available data
df = pd.read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv", parse_dates=["date"])
df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,new_tests,total_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,0.126,0.126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,39835428.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,0.126,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,39835428.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,0.126,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,39835428.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,0.126,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,39835428.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,0.126,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,39835428.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,,,,


In [350]:
# select attributes that we find interesting
attributes = [
    "iso_code", "continent", "location", "date", "total_cases", "new_cases", "total_deaths", "new_deaths", "reproduction_rate", "icu_patients", "hosp_patients",
    "weekly_icu_admissions", "weekly_hosp_admissions", "new_tests", "total_tests", "positive_rate", "total_vaccinations", "people_vaccinated", "people_fully_vaccinated",
    "new_vaccinations", "stringency_index", "population", "population_density", "median_age", "aged_65_older", "aged_70_older", "gdp_per_capita", "extreme_poverty",
    "cardiovasc_death_rate", "diabetes_prevalence", "female_smokers", "male_smokers", "handwashing_facilities", "hospital_beds_per_thousand", "life_expectancy",
    "human_development_index", "excess_mortality"
]

df = df[attributes]

In [351]:
# now check which columns contain NaN values
df.isna().any().to_frame().T

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,reproduction_rate,icu_patients,hosp_patients,weekly_icu_admissions,weekly_hosp_admissions,new_tests,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality
0,False,True,False,False,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [352]:
# it's weird that continent contains NaNs and location doesn't, so let's take a look at that
print("First 5 rows that have NaN for the column continent")
df[df["continent"].isna()].head()

First 5 rows that have NaN for the column continent


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,reproduction_rate,icu_patients,hosp_patients,weekly_icu_admissions,weekly_hosp_admissions,new_tests,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality
620,OWID_AFR,,Africa,2020-02-13,,0.0,,0.0,,,,,,,,,,,,,,1373486000.0,,,,,,,,,,,,,,,
621,OWID_AFR,,Africa,2020-02-14,1.0,1.0,,0.0,,,,,,,,,,,,,,1373486000.0,,,,,,,,,,,,,,,
622,OWID_AFR,,Africa,2020-02-15,1.0,0.0,,0.0,,,,,,,,,,,,,,1373486000.0,,,,,,,,,,,,,,,
623,OWID_AFR,,Africa,2020-02-16,1.0,0.0,,0.0,,,,,,,,,,,,,,1373486000.0,,,,,,,,,,,,,,,
624,OWID_AFR,,Africa,2020-02-17,1.0,0.0,,0.0,,,,,,,,,,,,,,1373486000.0,,,,,,,,,,,,,,,


In [353]:
# let's store these rows which combine data of a continent in a seperate dataframe, so it doesn't give us any weird mistakes later/confuse us
# before we do this we should be sure that really only these combined rows have NaNs
print("Unique locations where continent is NaN", df[df["continent"].isna()]["location"].unique())
# looking good, so lets create a new dataframe
continent_df = df[df["continent"].isna()]
# drop these rows from the original dataframe
df = df[~df["continent"].isna()]

Unique locations where continent is NaN ['Africa' 'Asia' 'Europe' 'European Union' 'High income' 'International'
 'Low income' 'Lower middle income' 'North America' 'Oceania'
 'South America' 'Upper middle income' 'World']


In [354]:
# we would expect that new_cases should be almost complete (because it's the most important attribute), so let's take a look at that
print("Percentage of non-missing values for each country in the column new_cases:")
(df[["location", "new_cases"]].groupby("location").count()["new_cases"] / df[["location", "new_cases"]].groupby("location").size()).to_frame().T

Percentage of non-missing values for each country in the column new_cases:


location,Afghanistan,Albania,Algeria,Andorra,Angola,Anguilla,Antigua and Barbuda,Argentina,Armenia,Aruba,Australia,Austria,Azerbaijan,Bahamas,Bahrain,Bangladesh,Barbados,Belarus,Belgium,Belize,Benin,Bermuda,Bhutan,Bolivia,Bonaire Sint Eustatius and Saba,Bosnia and Herzegovina,Botswana,Brazil,British Virgin Islands,Brunei,Bulgaria,Burkina Faso,Burundi,Cambodia,Cameroon,Canada,Cape Verde,Cayman Islands,Central African Republic,Chad,Chile,China,Colombia,Comoros,Congo,Cook Islands,Costa Rica,Cote d'Ivoire,Croatia,Cuba,Curacao,Cyprus,Czechia,Democratic Republic of Congo,Denmark,Djibouti,Dominica,Dominican Republic,Ecuador,Egypt,El Salvador,Equatorial Guinea,Eritrea,Estonia,Eswatini,Ethiopia,Faeroe Islands,Falkland Islands,Fiji,Finland,France,French Polynesia,Gabon,Gambia,Georgia,Germany,Ghana,Gibraltar,Greece,Greenland,Grenada,Guatemala,Guernsey,Guinea,Guinea-Bissau,Guyana,Haiti,Honduras,Hong Kong,Hungary,Iceland,India,Indonesia,Iran,Iraq,Ireland,Isle of Man,Israel,Italy,Jamaica,Japan,Jersey,Jordan,Kazakhstan,Kenya,Kiribati,Kosovo,Kuwait,Kyrgyzstan,Laos,Latvia,Lebanon,Lesotho,Liberia,Libya,Liechtenstein,Lithuania,Luxembourg,Macao,Madagascar,Malawi,Malaysia,Maldives,Mali,Malta,Marshall Islands,Mauritania,Mauritius,Mexico,Micronesia (country),Moldova,Monaco,Mongolia,Montenegro,Montserrat,Morocco,Mozambique,Myanmar,Namibia,Nauru,Nepal,Netherlands,New Caledonia,New Zealand,Nicaragua,Niger,Nigeria,Niue,North Macedonia,Northern Cyprus,Norway,Oman,Pakistan,Palau,Palestine,Panama,Papua New Guinea,Paraguay,Peru,Philippines,Pitcairn,Poland,Portugal,Qatar,Romania,Russia,Rwanda,Saint Helena,Saint Kitts and Nevis,Saint Lucia,Saint Vincent and the Grenadines,Samoa,San Marino,Sao Tome and Principe,Saudi Arabia,Senegal,Serbia,Seychelles,Sierra Leone,Singapore,Sint Maarten (Dutch part),Slovakia,Slovenia,Solomon Islands,Somalia,South Africa,South Korea,South Sudan,Spain,Sri Lanka,Sudan,Suriname,Sweden,Switzerland,Syria,Taiwan,Tajikistan,Tanzania,Thailand,Timor,Togo,Tokelau,Tonga,Trinidad and Tobago,Tunisia,Turkey,Turkmenistan,Turks and Caicos Islands,Tuvalu,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Vanuatu,Vatican,Venezuela,Vietnam,Wallis and Futuna,Yemen,Zambia,Zimbabwe
0,1.0,0.978998,1.0,1.0,1.0,0.0,1.0,0.908012,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.99183,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.998382,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.998469,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.996705,0.988728,1.0,0.961059,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.962559,1.0,1.0,0.0,0.0,0.924031,1.0,0.998464,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.9888,0.0,1.0,0.958533,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.998366,0.996764,1.0,1.0,1.0,1.0,1.0,0.0,0.998397,1.0,1.0,0.998469,0.0,1.0,1.0,0.988506,1.0,1.0,1.0,1.0,1.0,0.985531,1.0,1.0,1.0,0.967267,1.0,1.0,0.991935,0.0,1.0,0.993174,0.998464,1.0,1.0,0.996721,1.0,1.0,1.0,0.913947,1.0,1.0,1.0,1.0,1.0,0.0,0.962323,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.99835,1.0,0.998355,0.903561,1.0,0.0,0.987076,0.9808,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.99513,0.985437,1.0,1.0,1.0,0.0,1.0,0.950156,1.0,1.0,0.957614,0.996942,1.0,1.0,1.0,0.996683,1.0,1.0,1.0,1.0,0.989378,1.0,1.0,0.971684,1.0,0.996727,0.0,0.035176,1.0,1.0,0.998344,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.998469,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0


In [355]:
# it seems that there are countries that don't have any values for new_cases or simply not enough values, these are obviously useless to us
# lets drop all countries that have missing values for more than half of their entries for the column new_cases 
# (if they have more than half of the values, then we can fix the missing values with interpolation later)
temp = ((df[["location", "new_cases"]].groupby("location").count()["new_cases"] / df[["location", "new_cases"]].groupby("location").size()) >= 0.5)
temp = temp[temp == True].index.tolist()
# drop all rows that don't fulfill the above defined criteria
df = df[df["location"].isin(temp)]
# reset index, so that it is correct again (we dropped rows)
df.reset_index(inplace = True, drop = True)

In [356]:
# number of trailing NaNs for column new_cases
temp = df.copy()
temp.reset_index(inplace = True)
last_index = temp.groupby("location").apply(lambda x: x.iloc[-1]["index"])
last_valid_index = temp.groupby("location").apply(lambda x: x["new_cases"].last_valid_index())
(last_index - last_valid_index).to_frame().T

location,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahamas,Bahrain,Bangladesh,Barbados,Belarus,Belgium,Belize,Benin,Bhutan,Bolivia,Bosnia and Herzegovina,Botswana,Brazil,Brunei,Bulgaria,Burkina Faso,Burundi,Cambodia,Cameroon,Canada,Cape Verde,Central African Republic,Chad,Chile,China,Colombia,Comoros,Congo,Costa Rica,Cote d'Ivoire,Croatia,Cuba,Cyprus,Czechia,Democratic Republic of Congo,Denmark,Djibouti,Dominica,Dominican Republic,Ecuador,Egypt,El Salvador,Equatorial Guinea,Eritrea,Estonia,Eswatini,Ethiopia,Fiji,Finland,France,Gabon,Gambia,Georgia,Germany,Ghana,Greece,Grenada,Guatemala,Guinea,Guinea-Bissau,Guyana,Haiti,Honduras,Hong Kong,Hungary,Iceland,India,Indonesia,Iran,Iraq,Ireland,Israel,Italy,Jamaica,Japan,Jordan,Kazakhstan,Kenya,Kiribati,Kosovo,Kuwait,Kyrgyzstan,Laos,Latvia,Lebanon,Lesotho,Liberia,Libya,Liechtenstein,Lithuania,Luxembourg,Madagascar,Malawi,Malaysia,Maldives,Mali,Malta,Marshall Islands,Mauritania,Mauritius,Mexico,Micronesia (country),Moldova,Monaco,Mongolia,Montenegro,Morocco,Mozambique,Myanmar,Namibia,Nepal,Netherlands,New Zealand,Nicaragua,Niger,Nigeria,North Macedonia,Norway,Oman,Pakistan,Palau,Palestine,Panama,Papua New Guinea,Paraguay,Peru,Philippines,Poland,Portugal,Qatar,Romania,Russia,Rwanda,Saint Kitts and Nevis,Saint Lucia,Saint Vincent and the Grenadines,Samoa,San Marino,Sao Tome and Principe,Saudi Arabia,Senegal,Serbia,Seychelles,Sierra Leone,Singapore,Slovakia,Slovenia,Solomon Islands,Somalia,South Africa,South Korea,South Sudan,Spain,Sri Lanka,Sudan,Suriname,Sweden,Switzerland,Syria,Taiwan,Tajikistan,Tanzania,Thailand,Timor,Togo,Trinidad and Tobago,Tunisia,Turkey,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Vanuatu,Vatican,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [357]:
# also while looking through the records, we found that some countries have leading NaNs for new_cases
# let's remove these (while we're at it, let's also remove trailing NaNs)
df = df.sort_values(by = ["location", "date"]).reset_index(drop = True)
# get the first and last valid index
first_valid_index = df.groupby("location").apply(lambda x: x["new_cases"].first_valid_index())
last_valid_index = df.groupby("location").apply(lambda x: x["new_cases"].last_valid_index())
# create list of indices that we want to keep
valid_indices = [np.arange(first, last+1) for first, last in zip(first_valid_index, last_valid_index)]
# flatten it to be a 1D array instead of 2D
valid_indices = [elem for sublist in valid_indices for elem in sublist]
df = df[df.index.isin(valid_indices)]
# we removed rows, so we need to reset the index
df.reset_index(drop = True, inplace = True)

In [358]:
# let's look at what percentage of values is still NaN for each column
print("Percentage of missing values for each column")
(df.isna().sum() / len(df)).to_frame().T

Percentage of missing values for each column


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,reproduction_rate,icu_patients,hosp_patients,weekly_icu_admissions,weekly_hosp_admissions,new_tests,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality
0,0.0,0.0,0.0,0.0,0.0,2.6e-05,0.093829,0.093855,0.11007,0.863739,0.840195,0.989124,0.981708,0.530691,0.529229,0.464966,0.764882,0.776667,0.800635,0.804728,0.09466,0.0,0.021052,0.045816,0.056588,0.051085,0.042355,0.352309,0.037795,0.021242,0.248737,0.259232,0.51503,0.123421,0.0052,0.031937,0.961029


In [359]:
# for some reason total_cases is complete, but new_cases isn't so let's fix that real quick
miss_indices = df[df["new_cases"].isna()].index
df.loc[miss_indices, "new_cases"] = list(df.iloc[miss_indices+1]["total_cases"] - np.array(df.iloc[miss_indices]["total_cases"]))

In [360]:
# also it seems that there are some columns where we simply have too many missing values for them to be useful, let's remove these
cols_to_drop = ["icu_patients", "hosp_patients", "weekly_icu_admissions", "weekly_hosp_admissions", "excess_mortality"]
df.drop(columns = cols_to_drop, inplace = True)

In [361]:
# let's try and fill the missing values for the remaining columns
# vaccinations numbers are very interesting to us so let's take a look at it
# for each country get the percentage of values that are not NaN for total_vaccinations
print("Percentage of non-NaN values for each country for the column total_vaccinations")
(df.groupby("location").count()["total_vaccinations"] / df.groupby("location").size()).to_frame().T

Percentage of non-NaN values for each country for the column total_vaccinations


location,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahamas,Bahrain,Bangladesh,Barbados,Belarus,Belgium,Belize,Benin,Bhutan,Bolivia,Bosnia and Herzegovina,Botswana,Brazil,Brunei,Bulgaria,Burkina Faso,Burundi,Cambodia,Cameroon,Canada,Cape Verde,Central African Republic,Chad,Chile,China,Colombia,Comoros,Congo,Costa Rica,Cote d'Ivoire,Croatia,Cuba,Cyprus,Czechia,Democratic Republic of Congo,Denmark,Djibouti,Dominica,Dominican Republic,Ecuador,Egypt,El Salvador,Equatorial Guinea,Eritrea,Estonia,Eswatini,Ethiopia,Fiji,Finland,France,Gabon,Gambia,Georgia,Germany,Ghana,Greece,Grenada,Guatemala,Guinea,Guinea-Bissau,Guyana,Haiti,Honduras,Hong Kong,Hungary,Iceland,India,Indonesia,Iran,Iraq,Ireland,Israel,Italy,Jamaica,Japan,Jordan,Kazakhstan,Kenya,Kiribati,Kosovo,Kuwait,Kyrgyzstan,Laos,Latvia,Lebanon,Lesotho,Liberia,Libya,Liechtenstein,Lithuania,Luxembourg,Madagascar,Malawi,Malaysia,Maldives,Mali,Malta,Marshall Islands,Mauritania,Mauritius,Mexico,Micronesia (country),Moldova,Monaco,Mongolia,Montenegro,Morocco,Mozambique,Myanmar,Namibia,Nepal,Netherlands,New Zealand,Nicaragua,Niger,Nigeria,North Macedonia,Norway,Oman,Pakistan,Palau,Palestine,Panama,Papua New Guinea,Paraguay,Peru,Philippines,Poland,Portugal,Qatar,Romania,Russia,Rwanda,Saint Kitts and Nevis,Saint Lucia,Saint Vincent and the Grenadines,Samoa,San Marino,Sao Tome and Principe,Saudi Arabia,Senegal,Serbia,Seychelles,Sierra Leone,Singapore,Slovakia,Slovenia,Solomon Islands,Somalia,South Africa,South Korea,South Sudan,Spain,Sri Lanka,Sudan,Suriname,Sweden,Switzerland,Syria,Taiwan,Tajikistan,Tanzania,Thailand,Timor,Togo,Trinidad and Tobago,Tunisia,Turkey,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Vanuatu,Vatican,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
0,0.046774,0.328383,0.017771,0.052202,0.065546,0.116279,0.468954,0.039088,0.38829,0.504039,0.348534,0.058431,0.427419,0.298188,0.399666,0.042208,0.485938,0.172297,0.025042,0.154351,0.266556,0.029508,0.080342,0.385113,0.207921,0.479407,0.024793,0.010274,0.404321,0.064039,0.502311,0.060504,0.028333,0.043624,0.507246,0.368098,0.343186,0.037906,0.038333,0.077176,0.324503,0.365105,0.252073,0.201653,0.509772,0.048013,0.47812,0.023451,0.082631,0.346906,0.42671,0.074603,0.221477,0.093333,0.0,0.505673,0.064892,0.289037,0.050336,0.390093,0.479263,0.064892,0.043478,0.224919,0.481481,0.039933,0.47411,0.075885,0.384359,0.17608,0.023729,0.112769,0.067227,0.077815,0.392638,0.225859,0.175325,0.43876,0.411093,0.0704,0.048387,0.502439,0.515249,0.486025,0.10596,0.322086,0.232026,0.313953,0.270764,0.046784,0.186356,0.019355,0.180905,0.118443,0.531811,0.390048,0.020333,0.0301,0.062606,0.517185,0.507317,0.422764,0.026891,0.271478,0.390769,0.410214,0.047458,0.460526,0.0,0.063228,0.045226,0.465909,0.0,0.250412,0.04065,0.310744,0.382943,0.319739,0.042159,0.028912,0.224626,0.221538,0.366288,0.420455,0.028523,0.026891,0.126623,0.224919,0.545307,0.120968,0.163166,0.0,0.245902,0.38843,0.036975,0.332784,0.441708,0.24031,0.418985,0.425775,0.357724,0.491909,0.35559,0.093178,0.088136,0.191348,0.088186,0.068182,0.160976,0.050173,0.422512,0.161501,0.28243,0.116667,0.041096,0.190184,0.500821,0.511475,0.041131,0.066778,0.290164,0.386503,0.053541,0.335925,0.362654,0.036606,0.292845,0.076205,0.510501,0.023609,0.288344,0.068716,0.010017,0.315951,0.045531,0.029557,0.34609,0.309329,0.486755,0.178451,0.415033,0.427245,0.462733,0.501534,0.416944,0.116667,0.05,0.0,0.02995,0.328221,0.022648,0.19598,0.398319


In [362]:
# maybe the high number of missing values comes from leading NaNs? let's check that (those are not the number of leading NaNs!!!!!!!)
temp = df.copy()
temp.reset_index(inplace = True)
first_index = temp.groupby("location").apply(lambda x: x.iloc[0]["index"])
first_valid_index = temp.groupby("location").apply(lambda x: x["total_vaccinations"].first_valid_index())
print("Number of leading NaNs for each country for the column total_vaccinations")
(first_valid_index - first_index).to_frame().T

Number of leading NaNs for each country for the column total_vaccinations


location,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahamas,Bahrain,Bangladesh,Barbados,Belarus,Belgium,Belize,Benin,Bhutan,Bolivia,Bosnia and Herzegovina,Botswana,Brazil,Brunei,Bulgaria,Burkina Faso,Burundi,Cambodia,Cameroon,Canada,Cape Verde,Central African Republic,Chad,Chile,China,Colombia,Comoros,Congo,Costa Rica,Cote d'Ivoire,Croatia,Cuba,Cyprus,Czechia,Democratic Republic of Congo,Denmark,Djibouti,Dominica,Dominican Republic,Ecuador,Egypt,El Salvador,Equatorial Guinea,Eritrea,Estonia,Eswatini,Ethiopia,Fiji,Finland,France,Gabon,Gambia,Georgia,Germany,Ghana,Greece,Grenada,Guatemala,Guinea,Guinea-Bissau,Guyana,Haiti,Honduras,Hong Kong,Hungary,Iceland,India,Indonesia,Iran,Iraq,Ireland,Israel,Italy,Jamaica,Japan,Jordan,Kazakhstan,Kenya,Kiribati,Kosovo,Kuwait,Kyrgyzstan,Laos,Latvia,Lebanon,Lesotho,Liberia,Libya,Liechtenstein,Lithuania,Luxembourg,Madagascar,Malawi,Malaysia,Maldives,Mali,Malta,Marshall Islands,Mauritania,Mauritius,Mexico,Micronesia (country),Moldova,Monaco,Mongolia,Montenegro,Morocco,Mozambique,Myanmar,Namibia,Nepal,Netherlands,New Zealand,Nicaragua,Niger,Nigeria,North Macedonia,Norway,Oman,Pakistan,Palau,Palestine,Panama,Papua New Guinea,Paraguay,Peru,Philippines,Poland,Portugal,Qatar,Romania,Russia,Rwanda,Saint Kitts and Nevis,Saint Lucia,Saint Vincent and the Grenadines,Samoa,San Marino,Sao Tome and Principe,Saudi Arabia,Senegal,Serbia,Seychelles,Sierra Leone,Singapore,Slovakia,Slovenia,Solomon Islands,Somalia,South Africa,South Korea,South Sudan,Spain,Sri Lanka,Sudan,Suriname,Sweden,Switzerland,Syria,Taiwan,Tajikistan,Tanzania,Thailand,Timor,Togo,Trinidad and Tobago,Tunisia,Turkey,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Vanuatu,Vatican,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
0,364.0,307.0,339.0,329.0,346.0,340.0,301.0,395.0,392.0,306.0,322.0,362.0,303.0,324.0,335.0,304.0,328.0,342.0,422.0,385.0,323.0,343.0,360.0,325.0,389.0,296.0,449.0,567.0,379.0,401.0,323.0,363.0,423.0,448.0,305.0,327.0,348.0,356.0,395.0,293.0,355.0,309.0,445.0,302.0,301.0,403.0,278.0,395.0,326.0,351.0,325.0,345.0,335.0,366.0,,304.0,381.0,391.0,363.0,337.0,338.0,373.0,357.0,382.0,335.0,352.0,306.0,326.0,348.0,383.0,413.0,336.0,483.0,354.0,396.0,299.0,306.0,351.0,316.0,355.0,371.0,303.0,302.0,331.0,363.0,391.0,315.0,324.0,356.0,14.0,379.0,308.0,375.0,358.0,277.0,358.0,300.0,421.0,389.0,292.0,302.0,303.0,418.0,349.0,396.0,331.0,370.0,316.0,,377.0,313.0,300.0,,362.0,305.0,349.0,340.0,332.0,350.0,305.0,369.0,367.0,314.0,356.0,347.0,373.0,370.0,356.0,280.0,308.0,343.0,,390.0,316.0,375.0,351.0,339.0,395.0,299.0,300.0,297.0,305.0,319.0,338.0,334.0,339.0,368.0,167.0,364.0,343.0,310.0,357.0,308.0,300.0,348.0,354.0,304.0,297.0,162.0,397.0,348.0,400.0,365.0,338.0,367.0,361.0,345.0,330.0,300.0,344.0,423.0,371.0,510.0,401.0,388.0,368.0,338.0,373.0,308.0,353.0,358.0,342.0,345.0,325.0,351.0,381.0,203.0,,340.0,409.0,394.0,392.0,335.0


In [363]:
# so it seems that there are a lot of leading NaNs, however for some countries we don't have any vaccination numbers, lets remove these countries
temp = (first_valid_index - first_index)
print("Number of countries for which we don't have any vaccination numbers: {}".format(len(temp[temp.isna()])))
# also after removing these countries, we need to reset the index (because we dropped some rows)
df = df[~df["location"].isin(temp[temp.isna()].index)].reset_index(drop = True)

Number of countries for which we don't have any vaccination numbers: 5


In [364]:
# let's look at the first value of total_vaccinations for each country that isn't NaN
first_valid_index = df.groupby("location").apply(lambda x: x["total_vaccinations"].first_valid_index())
print("First non-NaN value for each country for the column total_vaccinations")
df.iloc[list(first_valid_index)][["total_vaccinations", "location"]].set_index("location").T

First non-NaN value for each country for the column total_vaccinations


location,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahamas,Bahrain,Bangladesh,Barbados,Belarus,Belgium,Belize,Benin,Bhutan,Bolivia,Bosnia and Herzegovina,Botswana,Brazil,Brunei,Bulgaria,Burkina Faso,Burundi,Cambodia,Cameroon,Canada,Cape Verde,Central African Republic,Chad,Chile,China,Colombia,Comoros,Congo,Costa Rica,Cote d'Ivoire,Croatia,Cuba,Cyprus,Czechia,Democratic Republic of Congo,Denmark,Djibouti,Dominica,Dominican Republic,Ecuador,Egypt,El Salvador,Equatorial Guinea,Estonia,Eswatini,Ethiopia,Fiji,Finland,France,Gabon,Gambia,Georgia,Germany,Ghana,Greece,Grenada,Guatemala,Guinea,Guinea-Bissau,Guyana,Haiti,Honduras,Hong Kong,Hungary,Iceland,India,Indonesia,Iran,Iraq,Ireland,Israel,Italy,Jamaica,Japan,Jordan,Kazakhstan,Kenya,Kiribati,Kosovo,Kuwait,Kyrgyzstan,Laos,Latvia,Lebanon,Lesotho,Liberia,Libya,Liechtenstein,Lithuania,Luxembourg,Madagascar,Malawi,Malaysia,Maldives,Mali,Malta,Mauritania,Mauritius,Mexico,Moldova,Monaco,Mongolia,Montenegro,Morocco,Mozambique,Myanmar,Namibia,Nepal,Netherlands,New Zealand,Nicaragua,Niger,Nigeria,North Macedonia,Norway,Oman,Pakistan,Palestine,Panama,Papua New Guinea,Paraguay,Peru,Philippines,Poland,Portugal,Qatar,Romania,Russia,Rwanda,Saint Kitts and Nevis,Saint Lucia,Saint Vincent and the Grenadines,Samoa,San Marino,Sao Tome and Principe,Saudi Arabia,Senegal,Serbia,Seychelles,Sierra Leone,Singapore,Slovakia,Slovenia,Solomon Islands,Somalia,South Africa,South Korea,South Sudan,Spain,Sri Lanka,Sudan,Suriname,Sweden,Switzerland,Syria,Taiwan,Tajikistan,Tanzania,Thailand,Timor,Togo,Trinidad and Tobago,Tunisia,Turkey,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
total_vaccinations,0.0,0.0,0.0,576.0,0.0,0.0,700.0,565.0,20.0,753.0,0.0,0.0,38965.0,0.0,4490.0,0.0,329.0,0.0,10683.0,0.0,0.0,0.0,0.0,0.0,0.0,1719.0,200.0,81.0,0.0,0.0,5.0,0.0,667.0,5324.0,420.0,1500000.0,18.0,13440.0,14297.0,55.0,0.0,7864.0,2037745.0,3901.0,1280.0,0.0,1.0,10246.0,0.0,0.0,17.0,0.0,0.0,6565.0,191.0,0.0,430000.0,0.0,1767.0,432.0,0.0,0.0,0.0,24355.0,0.0,447.0,0.0,0.0,62645.0,5889.0,0.0,38.0,2684.0,47.0,1094.0,4875.0,0.0,0.0,0.0,0.0,85.0,64.0,7336.0,0.0,125.0,0.0,0.0,0.0,0.0,0.0,2500.0,0.0,40732.0,1.0,0.0,0.0,36404.0,750.0,0.0,2417.0,452.0,609.0,5417.0,60.0,880.0,0.0,11705.0,0.0,0.0,2924.0,3849.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3836.0,1.0,0.0,0.0,0.0,0.0,0.0,1717.0,0.0,81942.0,641.0,0.0,200.0,5.0,0.0,2000.0,4963.0,0.0,965.0,28500.0,0.0,0.0,0.0,8952.0,7411.0,35.0,0.0,137862.0,0.0,7000.0,0.0,0.0,3400.0,480.0,6190.0,0.0,117567.0,0.0,29872.0,0.0,82834.0,0.0,0.0,0.0,2471.0,1.0,0.0,0.0,69229.0,105745.0,0.0,2629.0,0.0,0.0,0.0,0.0,0.0,159.0,826301.0,2677971.0,20013.0,372.0,0.0,0.0,0.0,0.0,18555.0,0.0,39.0


In [365]:
# so unfortunately these aren't always zero, however just interpolating these leading NaNs would temper too much with the given data, so let's just set them to zero
df.reset_index(inplace=True)
first_index = df.groupby("location").apply(lambda x: x.iloc[0]["index"])
df.drop(columns = ["index"], inplace = True)
# create list of indices that we want to change
valid_indices = [np.arange(first, last-1) for first, last in zip(first_index, first_valid_index)]
# flatten it to be a 1D array instead of 2D
valid_indices = [elem for sublist in valid_indices for elem in sublist]
# also set the people_vaccinated, people_fully_vaccinated, new_vaccinations to 0 for these rows
df.loc[valid_indices, ["total_vaccinations", "people_vaccinated", "people_fully_vaccinated", "new_vaccinations"]] = 0

In [366]:
# let's look at the percentage of missing values for columns again
print("Percentage of missing values for each column")
(df.isna().sum() / len(df)).to_frame().T

Percentage of missing values for each column


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,reproduction_rate,new_tests,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,0.0,0.0,0.0,0.0,0.0,0.0,0.081165,0.081191,0.099329,0.522683,0.521195,0.455836,0.190596,0.202705,0.226959,0.231121,0.084421,0.0,0.016052,0.037296,0.048252,0.042655,0.037718,0.343791,0.03242,0.016245,0.241805,0.252479,0.510037,0.117633,0.005289,0.027122


In [367]:
# it's looking a lot better now, but the ~50% missing values for new_tests/total_tests are really annoying because these are such interesting columns
# lets's check for leading NaNs
temp = df.copy()
temp.reset_index(inplace=True)
# get the first index for each country
first_index = temp.groupby("location").apply(lambda x: x.iloc[0]["index"])
# get the first valid index for each country for the column total_tests
first_valid_index = temp.groupby("location").apply(lambda x: x["total_tests"].first_valid_index())
print("Number of leading NaNs for each country for the column total_tests")
(first_valid_index - first_index).to_frame().T

Number of leading NaNs for each country for the column total_tests


location,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahamas,Bahrain,Bangladesh,Barbados,Belarus,Belgium,Belize,Benin,Bhutan,Bolivia,Bosnia and Herzegovina,Botswana,Brazil,Brunei,Bulgaria,Burkina Faso,Burundi,Cambodia,Cameroon,Canada,Cape Verde,Central African Republic,Chad,Chile,China,Colombia,Comoros,Congo,Costa Rica,Cote d'Ivoire,Croatia,Cuba,Cyprus,Czechia,Democratic Republic of Congo,Denmark,Djibouti,Dominica,Dominican Republic,Ecuador,Egypt,El Salvador,Equatorial Guinea,Estonia,Eswatini,Ethiopia,Fiji,Finland,France,Gabon,Gambia,Georgia,Germany,Ghana,Greece,Grenada,Guatemala,Guinea,Guinea-Bissau,Guyana,Haiti,Honduras,Hong Kong,Hungary,Iceland,India,Indonesia,Iran,Iraq,Ireland,Israel,Italy,Jamaica,Japan,Jordan,Kazakhstan,Kenya,Kiribati,Kosovo,Kuwait,Kyrgyzstan,Laos,Latvia,Lebanon,Lesotho,Liberia,Libya,Liechtenstein,Lithuania,Luxembourg,Madagascar,Malawi,Malaysia,Maldives,Mali,Malta,Mauritania,Mauritius,Mexico,Moldova,Monaco,Mongolia,Montenegro,Morocco,Mozambique,Myanmar,Namibia,Nepal,Netherlands,New Zealand,Nicaragua,Niger,Nigeria,North Macedonia,Norway,Oman,Pakistan,Palestine,Panama,Papua New Guinea,Paraguay,Peru,Philippines,Poland,Portugal,Qatar,Romania,Russia,Rwanda,Saint Kitts and Nevis,Saint Lucia,Saint Vincent and the Grenadines,Samoa,San Marino,Sao Tome and Principe,Saudi Arabia,Senegal,Serbia,Seychelles,Sierra Leone,Singapore,Slovakia,Slovenia,Solomon Islands,Somalia,South Africa,South Korea,South Sudan,Spain,Sri Lanka,Sudan,Suriname,Sweden,Switzerland,Syria,Taiwan,Tajikistan,Tanzania,Thailand,Timor,Togo,Trinidad and Tobago,Tunisia,Turkey,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
0,,0.0,,245.0,,487.0,0.0,8.0,56.0,36.0,423.0,554.0,10.0,0.0,,4.0,26.0,422.0,429.0,17.0,3.0,27.0,4.0,41.0,,34.0,,,478.0,,5.0,,,,31.0,153.0,88.0,,,0.0,33.0,7.0,10.0,18.0,,,0.0,,,17.0,17.0,,17.0,484.0,0.0,,0.0,0.0,29.0,,486.0,495.0,401.0,48.0,3.0,6.0,,0.0,,,,,,8.0,2.0,0.0,43.0,14.0,46.0,43.0,18.0,0.0,24.0,13.0,13.0,59.0,0.0,4.0,,438.0,79.0,,421.0,0.0,441.0,,,,80.0,20.0,0.0,39.0,0.0,0.0,8.0,,135.0,18.0,,0.0,435.0,,246.0,,0.0,6.0,8.0,18.0,3.0,17.0,2.0,,,40.0,33.0,35.0,,15.0,,0.0,338.0,2.0,0.0,64.0,55.0,0.0,12.0,15.0,33.0,24.0,474.0,,565.0,,,,0.0,1.0,0.0,,,75.0,0.0,0.0,,,1.0,1.0,144.0,72.0,22.0,,,,88.0,,0.0,,,0.0,478.0,1.0,52.0,393.0,16.0,11.0,37.0,0.0,60.0,38.0,15.0,,,,21.0,,1.0,47.0


In [368]:
# unfortunately this is not so easy to solve
# maybe looking at the percentage of missing values will help us somehow
print("Percentage of non-NaN values for each country for the column total_tests")
(df.groupby("location").count()["total_tests"] / df.groupby("location").size()).to_frame().T

Percentage of non-NaN values for each country for the column total_tests


location,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahamas,Bahrain,Bangladesh,Barbados,Belarus,Belgium,Belize,Benin,Bhutan,Bolivia,Bosnia and Herzegovina,Botswana,Brazil,Brunei,Bulgaria,Burkina Faso,Burundi,Cambodia,Cameroon,Canada,Cape Verde,Central African Republic,Chad,Chile,China,Colombia,Comoros,Congo,Costa Rica,Cote d'Ivoire,Croatia,Cuba,Cyprus,Czechia,Democratic Republic of Congo,Denmark,Djibouti,Dominica,Dominican Republic,Ecuador,Egypt,El Salvador,Equatorial Guinea,Estonia,Eswatini,Ethiopia,Fiji,Finland,France,Gabon,Gambia,Georgia,Germany,Ghana,Greece,Grenada,Guatemala,Guinea,Guinea-Bissau,Guyana,Haiti,Honduras,Hong Kong,Hungary,Iceland,India,Indonesia,Iran,Iraq,Ireland,Israel,Italy,Jamaica,Japan,Jordan,Kazakhstan,Kenya,Kiribati,Kosovo,Kuwait,Kyrgyzstan,Laos,Latvia,Lebanon,Lesotho,Liberia,Libya,Liechtenstein,Lithuania,Luxembourg,Madagascar,Malawi,Malaysia,Maldives,Mali,Malta,Mauritania,Mauritius,Mexico,Moldova,Monaco,Mongolia,Montenegro,Morocco,Mozambique,Myanmar,Namibia,Nepal,Netherlands,New Zealand,Nicaragua,Niger,Nigeria,North Macedonia,Norway,Oman,Pakistan,Palestine,Panama,Papua New Guinea,Paraguay,Peru,Philippines,Poland,Portugal,Qatar,Romania,Russia,Rwanda,Saint Kitts and Nevis,Saint Lucia,Saint Vincent and the Grenadines,Samoa,San Marino,Sao Tome and Principe,Saudi Arabia,Senegal,Serbia,Seychelles,Sierra Leone,Singapore,Slovakia,Slovenia,Solomon Islands,Somalia,South Africa,South Korea,South Sudan,Spain,Sri Lanka,Sudan,Suriname,Sweden,Switzerland,Syria,Taiwan,Tajikistan,Tanzania,Thailand,Timor,Togo,Trinidad and Tobago,Tunisia,Turkey,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
0,0.0,0.755776,0.0,0.075041,0.0,0.001661,0.986928,0.64658,0.634823,0.927302,0.239414,0.013356,0.835484,0.940692,0.0,0.337662,0.948438,0.055743,0.001669,0.768473,0.938742,0.722951,0.288889,0.027508,0.0,0.728171,0.0,0.0,0.07716,0.0,0.946071,0.0,0.0,0.0,0.943639,0.003067,0.847291,0.0,0.0,0.986864,0.897351,0.872375,0.779436,0.963636,0.0,0.0,0.993517,0.0,0.0,0.684039,0.925081,0.0,0.630872,0.06,0.993517,0.0,0.920266,0.744966,0.947368,0.0,0.053245,0.080268,0.127832,0.131173,0.793677,0.894822,0.0,0.990017,0.0,0.0,0.0,0.0,0.0,0.03681,0.919804,0.993506,0.900775,0.748777,0.5664,0.791935,0.95935,0.993579,0.954969,0.65894,0.967791,0.436275,0.742525,0.36711,0.0,0.148087,0.86129,0.0,0.111675,0.993475,0.041734,0.0,0.0,0.0,0.860884,0.956098,0.985366,0.443697,0.439863,0.990769,0.948929,0.0,0.773026,0.207987,0.0,0.991883,0.082372,0.0,0.449587,0.0,0.967374,0.924115,0.82483,0.873544,0.936923,0.137763,0.845779,0.0,0.0,0.413961,0.700647,0.933657,0.0,0.893376,0.0,0.97686,0.011765,0.988468,0.981938,0.886822,0.895254,0.990212,0.956098,0.613269,0.77795,0.770383,0.110169,0.0,0.001664,0.0,0.0,0.0,0.993475,0.985318,0.993432,0.0,0.0,0.122699,0.986864,0.993443,0.0,0.0,0.968852,0.881902,0.243523,0.124417,0.95216,0.0,0.0,0.0,0.849758,0.0,0.992331,0.0,0.0,0.981595,0.040472,0.98358,0.838602,0.207856,0.966887,0.752525,0.767974,0.947368,0.895963,0.930982,0.755814,0.0,0.0,0.0,0.289877,0.0,0.949749,0.907563


In [369]:
# not really sure what to do with these
# we'll do some interpolation later for these (between first and last valid value for each country) and see to what extent that fixes it
# let's look at other columns that have a high percentage of missing values
print("Percentage of non-NaN values for each country for the column positive_rate")
(df.groupby("location").count()["positive_rate"] / df.groupby("location").size()).to_frame().T

Percentage of non-NaN values for each country for the column positive_rate


location,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahamas,Bahrain,Bangladesh,Barbados,Belarus,Belgium,Belize,Benin,Bhutan,Bolivia,Bosnia and Herzegovina,Botswana,Brazil,Brunei,Bulgaria,Burkina Faso,Burundi,Cambodia,Cameroon,Canada,Cape Verde,Central African Republic,Chad,Chile,China,Colombia,Comoros,Congo,Costa Rica,Cote d'Ivoire,Croatia,Cuba,Cyprus,Czechia,Democratic Republic of Congo,Denmark,Djibouti,Dominica,Dominican Republic,Ecuador,Egypt,El Salvador,Equatorial Guinea,Estonia,Eswatini,Ethiopia,Fiji,Finland,France,Gabon,Gambia,Georgia,Germany,Ghana,Greece,Grenada,Guatemala,Guinea,Guinea-Bissau,Guyana,Haiti,Honduras,Hong Kong,Hungary,Iceland,India,Indonesia,Iran,Iraq,Ireland,Israel,Italy,Jamaica,Japan,Jordan,Kazakhstan,Kenya,Kiribati,Kosovo,Kuwait,Kyrgyzstan,Laos,Latvia,Lebanon,Lesotho,Liberia,Libya,Liechtenstein,Lithuania,Luxembourg,Madagascar,Malawi,Malaysia,Maldives,Mali,Malta,Mauritania,Mauritius,Mexico,Moldova,Monaco,Mongolia,Montenegro,Morocco,Mozambique,Myanmar,Namibia,Nepal,Netherlands,New Zealand,Nicaragua,Niger,Nigeria,North Macedonia,Norway,Oman,Pakistan,Palestine,Panama,Papua New Guinea,Paraguay,Peru,Philippines,Poland,Portugal,Qatar,Romania,Russia,Rwanda,Saint Kitts and Nevis,Saint Lucia,Saint Vincent and the Grenadines,Samoa,San Marino,Sao Tome and Principe,Saudi Arabia,Senegal,Serbia,Seychelles,Sierra Leone,Singapore,Slovakia,Slovenia,Solomon Islands,Somalia,South Africa,South Korea,South Sudan,Spain,Sri Lanka,Sudan,Suriname,Sweden,Switzerland,Syria,Taiwan,Tajikistan,Tanzania,Thailand,Timor,Togo,Trinidad and Tobago,Tunisia,Turkey,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
0,0.0,0.754125,0.0,0.482871,0.0,0.0,0.986928,0.969055,0.890601,0.441034,0.29316,0.008347,0.966129,0.937397,0.0,0.977273,0.939063,0.261824,0.0,0.781609,0.975166,0.939344,0.899145,0.0,0.0,0.927512,0.0,0.0,0.074074,0.0,0.972265,0.270588,0.0,0.0,0.932367,0.0,0.835796,0.0,0.0,0.977011,0.925497,0.972536,0.767828,0.952066,0.684039,0.627483,0.985413,0.0,0.0,0.946254,0.664495,0.0,0.630872,0.176667,0.985413,0.0,0.928571,0.82047,0.936533,0.81106,0.164725,0.145485,0.331715,0.131173,0.965058,0.972492,0.0,0.981697,0.0,0.0,0.0,0.0,0.0,0.035276,0.98036,0.0,0.916279,0.902121,0.9104,0.912903,0.947967,0.983949,0.945652,0.958609,0.961656,0.514706,0.740864,0.611296,0.0,0.251248,0.854839,0.0,0.174281,0.985318,0.0,0.0,0.0,0.28088,0.860884,0.946341,0.969106,0.816807,0.682131,0.981538,0.968699,0.0,0.761513,0.0,0.0,0.991883,0.255354,0.0,0.489256,0.0,0.985318,0.947723,0.835034,0.950083,0.98,0.137763,0.980519,0.0,0.0,0.917208,0.867314,0.923948,0.082258,0.959612,0.601639,0.983471,0.084034,0.983526,0.981938,0.875969,0.893617,0.980424,0.0,0.953074,0.925466,0.943428,0.179661,0.0,0.0,0.0,0.0,0.0,0.982055,0.980424,0.985222,0.0,0.0,0.858896,0.977011,0.993443,0.0,0.0,0.985246,0.983129,0.459413,0.844479,0.947531,0.0,0.0,0.720062,0.849758,0.0,0.984663,0.0,0.0,0.973926,0.15683,0.985222,0.853577,0.338789,0.955298,0.821549,0.923203,0.98452,0.885093,0.920245,0.940199,0.0,0.0,0.0,0.527607,0.0,0.981575,0.90084


In [370]:
# this looks similar to total_tests (either countries have a high number of non-NaN values (> 80%) or a low number (< 20%))
# also not sure what to do with these (we'll also interpolate it later)
# let's take a look at another column with a high percentage of NaN-values
print("Percentage of non-NaN values for each country for the column extreme_poverty")
(df.groupby("location").count()["extreme_poverty"] / df.groupby("location").size()).to_frame().T

Percentage of non-NaN values for each country for the column extreme_poverty


location,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahamas,Bahrain,Bangladesh,Barbados,Belarus,Belgium,Belize,Benin,Bhutan,Bolivia,Bosnia and Herzegovina,Botswana,Brazil,Brunei,Bulgaria,Burkina Faso,Burundi,Cambodia,Cameroon,Canada,Cape Verde,Central African Republic,Chad,Chile,China,Colombia,Comoros,Congo,Costa Rica,Cote d'Ivoire,Croatia,Cuba,Cyprus,Czechia,Democratic Republic of Congo,Denmark,Djibouti,Dominica,Dominican Republic,Ecuador,Egypt,El Salvador,Equatorial Guinea,Estonia,Eswatini,Ethiopia,Fiji,Finland,France,Gabon,Gambia,Georgia,Germany,Ghana,Greece,Grenada,Guatemala,Guinea,Guinea-Bissau,Guyana,Haiti,Honduras,Hong Kong,Hungary,Iceland,India,Indonesia,Iran,Iraq,Ireland,Israel,Italy,Jamaica,Japan,Jordan,Kazakhstan,Kenya,Kiribati,Kosovo,Kuwait,Kyrgyzstan,Laos,Latvia,Lebanon,Lesotho,Liberia,Libya,Liechtenstein,Lithuania,Luxembourg,Madagascar,Malawi,Malaysia,Maldives,Mali,Malta,Mauritania,Mauritius,Mexico,Moldova,Monaco,Mongolia,Montenegro,Morocco,Mozambique,Myanmar,Namibia,Nepal,Netherlands,New Zealand,Nicaragua,Niger,Nigeria,North Macedonia,Norway,Oman,Pakistan,Palestine,Panama,Papua New Guinea,Paraguay,Peru,Philippines,Poland,Portugal,Qatar,Romania,Russia,Rwanda,Saint Kitts and Nevis,Saint Lucia,Saint Vincent and the Grenadines,Samoa,San Marino,Sao Tome and Principe,Saudi Arabia,Senegal,Serbia,Seychelles,Sierra Leone,Singapore,Slovakia,Slovenia,Solomon Islands,Somalia,South Africa,South Korea,South Sudan,Spain,Sri Lanka,Sudan,Suriname,Sweden,Switzerland,Syria,Taiwan,Tajikistan,Tanzania,Thailand,Timor,Togo,Trinidad and Tobago,Tunisia,Turkey,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0


In [371]:
# ok this is even more extreme now it's either 100% or 0% now, we can't really do anything here
# let's take a look at another column with a high percentage of NaN-values
print("Percentage of non-NaN values for each country for the column handwashing_facilities")
(df.groupby("location").count()["handwashing_facilities"] / df.groupby("location").size()).to_frame().T

Percentage of non-NaN values for each country for the column handwashing_facilities


location,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,Azerbaijan,Bahamas,Bahrain,Bangladesh,Barbados,Belarus,Belgium,Belize,Benin,Bhutan,Bolivia,Bosnia and Herzegovina,Botswana,Brazil,Brunei,Bulgaria,Burkina Faso,Burundi,Cambodia,Cameroon,Canada,Cape Verde,Central African Republic,Chad,Chile,China,Colombia,Comoros,Congo,Costa Rica,Cote d'Ivoire,Croatia,Cuba,Cyprus,Czechia,Democratic Republic of Congo,Denmark,Djibouti,Dominica,Dominican Republic,Ecuador,Egypt,El Salvador,Equatorial Guinea,Estonia,Eswatini,Ethiopia,Fiji,Finland,France,Gabon,Gambia,Georgia,Germany,Ghana,Greece,Grenada,Guatemala,Guinea,Guinea-Bissau,Guyana,Haiti,Honduras,Hong Kong,Hungary,Iceland,India,Indonesia,Iran,Iraq,Ireland,Israel,Italy,Jamaica,Japan,Jordan,Kazakhstan,Kenya,Kiribati,Kosovo,Kuwait,Kyrgyzstan,Laos,Latvia,Lebanon,Lesotho,Liberia,Libya,Liechtenstein,Lithuania,Luxembourg,Madagascar,Malawi,Malaysia,Maldives,Mali,Malta,Mauritania,Mauritius,Mexico,Moldova,Monaco,Mongolia,Montenegro,Morocco,Mozambique,Myanmar,Namibia,Nepal,Netherlands,New Zealand,Nicaragua,Niger,Nigeria,North Macedonia,Norway,Oman,Pakistan,Palestine,Panama,Papua New Guinea,Paraguay,Peru,Philippines,Poland,Portugal,Qatar,Romania,Russia,Rwanda,Saint Kitts and Nevis,Saint Lucia,Saint Vincent and the Grenadines,Samoa,San Marino,Sao Tome and Principe,Saudi Arabia,Senegal,Serbia,Seychelles,Sierra Leone,Singapore,Slovakia,Slovenia,Solomon Islands,Somalia,South Africa,South Korea,South Sudan,Spain,Sri Lanka,Sudan,Suriname,Sweden,Switzerland,Syria,Taiwan,Tajikistan,Tanzania,Thailand,Timor,Togo,Trinidad and Tobago,Tunisia,Turkey,Uganda,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0


In [372]:
# we can't do anything here as well..
# so let's do some interpolation (we'll only interpolate between the first valid value and the last, because it would probably temper too much with the data)
# first we'll look at the percentage of missing values for each column again
print("Percentage of missing values for each column")
cols = df.columns
(df.isna().sum() / len(df)).to_frame().T

Percentage of missing values for each column


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,reproduction_rate,new_tests,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,0.0,0.0,0.0,0.0,0.0,0.0,0.081165,0.081191,0.099329,0.522683,0.521195,0.455836,0.190596,0.202705,0.226959,0.231121,0.084421,0.0,0.016052,0.037296,0.048252,0.042655,0.037718,0.343791,0.03242,0.016245,0.241805,0.252479,0.510037,0.117633,0.005289,0.027122


In [373]:
# since we're only gonna interpolate between the first and last valid value for each country, we can basically put each column in here and see to what extent it fixes something
# we'll only interpolate the total_columns and add the missing values later for the new_columns
cols_to_interpolate = [
    "total_deaths", "reproduction_rate", "total_tests", "positive_rate", "total_vaccinations","people_vaccinated", "people_fully_vaccinated",
    "stringency_index", "population_density", "median_age", "aged_65_older", "aged_70_older","gdp_per_capita", "extreme_poverty",
    "cardiovasc_death_rate", "diabetes_prevalence", "female_smokers", "male_smokers", "handwashing_facilities", "hospital_beds_per_thousand",
    "life_expectancy", "human_development_index"
]
df = df.groupby("location").apply(lambda x: x[df.columns.difference(cols_to_interpolate)].join(x[cols_to_interpolate].interpolate(method = "linear", axis = 0, limit_area = "inside")))[cols]
# let's change new_deaths, new_tests and new_vaccinations accordingly now
df[["new_deaths", "new_tests", "new_vaccinations"]] = df.groupby("location").apply(lambda x: x[["total_deaths", "total_tests", "total_vaccinations"]].diff()).to_numpy()

In [374]:
# let's see how that affected the percentage of missing values for each column
print("Percentage of missing values for each column")
(df.isna().sum() / len(df)).to_frame().T

Percentage of missing values for each column


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,reproduction_rate,new_tests,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,0.0,0.0,0.0,0.0,0.0,0.0,0.081165,0.082793,0.099329,0.42617,0.425017,0.431926,0.00675,0.007762,0.007691,0.008404,0.08369,0.0,0.016052,0.037296,0.048252,0.042655,0.037718,0.343791,0.03242,0.016245,0.241805,0.252479,0.510037,0.117633,0.005289,0.027122


In [375]:
print("Percentage of countries that only have NaN-values for a given column")
(df.groupby("location").apply(lambda x: x.isna().all()).sum(axis = 0) / len(df["location"].unique())).to_frame().T

Percentage of countries that only have NaN-values for a given column


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,reproduction_rate,new_tests,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,0.0,0.0,0.0,0.0,0.0,0.0,0.015957,0.015957,0.026596,0.319149,0.303191,0.308511,0.0,0.0,0.0,0.0,0.074468,0.0,0.015957,0.037234,0.047872,0.042553,0.037234,0.345745,0.031915,0.015957,0.244681,0.255319,0.505319,0.12234,0.005319,0.026596


In [376]:
# it might be fine to use the average value of the continent for a country for a specific column if it's completly missing 
# (this only applies to some columns (columns that describe local factors and that we won't expect to change much over the time interval))
# however we should probably first look at the standard deviation and compare it with the mean and if the std is too big, we can't do it for that column
cols_to_consider = [
    "location", "population_density", "median_age", "aged_65_older", "aged_70_older", "extreme_poverty", 
    "cardiovasc_death_rate", "diabetes_prevalence", "female_smokers", "male_smokers", "handwashing_facilities",
    "hospital_beds_per_thousand", "life_expectancy", "human_development_index"
]
# get the continent for each country
continents = df.groupby("location").apply(lambda x: x.iloc[0]["continent"])
# get the mean value for the considered columns for each country
temp = df[cols_to_consider].groupby("location").mean()
# add continent as column
temp["continent"] = continents
# now get the relative size of std to mean for each continent and then average that out for all continents
means = temp.groupby("continent").mean()
means_all = (temp.groupby("continent").std() / temp.groupby("continent").mean()).mean()
print("Relative magnitude of std compared to mean (= 1 -> mean and std are the same)")
means_all.to_frame().T

Relative magnitude of std compared to mean (= 1 -> mean and std are the same)


Unnamed: 0,population_density,median_age,aged_65_older,aged_70_older,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,1.89741,0.181294,0.468467,0.510914,1.134216,0.418043,0.399842,0.797155,0.422045,0.341071,0.626664,0.06479,0.136037


In [377]:
# for some columns the std is quite big compared to the mean value, for these columns it's probably not a good idea to just use the mean
# now we just need to define a threshhold at which we want to use the mean of the continent for missing values
threshhold = 0.5
cols_to_use_mean = means_all <= threshhold
cols_to_use_mean = cols_to_use_mean[cols_to_use_mean].index
print("Columns to use average value of the continent for missing values with threshhold = {}:".format(threshhold))
cols_to_use_mean.to_list()

Columns to use average value of the continent for missing values with threshhold = 0.5:


['median_age',
 'aged_65_older',
 'cardiovasc_death_rate',
 'diabetes_prevalence',
 'male_smokers',
 'handwashing_facilities',
 'life_expectancy',
 'human_development_index']

In [378]:
print("Mean values that will be used for NaN cells")
means[cols_to_use_mean]

Mean values that will be used for NaN cells


Unnamed: 0_level_0,median_age,aged_65_older,cardiovasc_death_rate,diabetes_prevalence,male_smokers,handwashing_facilities,life_expectancy,human_development_index
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Africa,21.111321,3.670453,286.848132,5.621509,27.882857,25.155273,64.072264,0.562692
Asia,30.470213,6.584711,290.287261,9.329111,39.697368,71.17292,74.596809,0.739891
Europe,42.07,17.719625,230.159415,6.558409,35.5925,93.954,79.824545,0.879881
North America,31.661905,8.828762,200.113045,10.733043,22.308333,77.541308,75.411739,0.755957
Oceania,27.0125,7.2825,375.077,13.4825,35.542857,30.5495,72.85375,0.71175
South America,30.216667,8.27725,187.323917,7.810833,24.6625,65.990667,75.090833,0.763167


In [379]:
for col in cols_to_use_mean:
    # get all rows that are NaN for this column
    nan_indices = df[col].isna()
    # set it to the mean value of the continent of that country for that column
    df.loc[nan_indices, col] = means.loc[df[nan_indices]["continent"], col].to_numpy()

In [380]:
# let's look at the missing values for each column again
print("Percentage of missing values for each column")
(df.isna().sum() / len(df)).to_frame().T

Percentage of missing values for each column


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,total_deaths,new_deaths,reproduction_rate,new_tests,total_tests,positive_rate,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,0.0,0.0,0.0,0.0,0.0,0.0,0.081165,0.082793,0.099329,0.42617,0.425017,0.431926,0.00675,0.007762,0.007691,0.008404,0.08369,0.0,0.016052,0.0,0.0,0.042655,0.037718,0.343791,0.0,0.0,0.241805,0.0,0.0,0.117633,0.0,0.0


In [381]:
# except a few columns it's looking decent now, not sure what to do further