# Using Python to Debunk COVID Myths: ‘Death Statistic Inflation’

What is required from Python:
 - Download the most recent death and population data from Eurostat
 - Format the data and keep only the rows whose NUTS3 region is in the UK
 - Interpolate weekly population numbers
 - Age standardise
 - 

In [38]:
import datetime as dt
import gzip
import io
import numpy as np
import pandas as pd
import requests
import sys
import warnings

%config Completer.use_jedi = False
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

## 1. Population Data

### 1a. Import, Clean and Munge Raw Data

In [39]:
# Download the gzipped, tab-separated Eurostat table demo_r_pjangrp3.
# The timeout stops the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() makes an HTTP error fail here instead of
# surfacing later as a confusing gzip/parse error.
r = requests.get(
    'https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/demo_r_pjangrp3.tsv.gz',
    timeout=60,
)
r.raise_for_status()

# Decompress in memory and parse; the context manager closes the gzip handle
# as soon as pandas has finished reading from it.
with gzip.open(io.BytesIO(r.content)) as mlz:
    df_pop = pd.read_csv(mlz, sep='\t')

In [41]:
# The first column packs four comma-separated identifiers into one string;
# give it a short name before taking it apart.
df_pop = df_pop.rename(columns={"sex,unit,age,geo\\time": "Headings"})
# Split every identifier string and spread the four parts over their own
# columns, in the order given by the original header (sex, unit, age, geo).
split_rows = [heading.split(',') for heading in df_pop["Headings"]]
df_pop[['Sex', 'Unit', 'Age', 'Code']] = pd.DataFrame(split_rows, index=df_pop.index)
# Neither the packed column nor the unit is used from here on.
df_pop = df_pop.drop(columns=['Headings', 'Unit'])

In [42]:
# Keep only rows with Sex == 'T' (presumably the both-sexes total - confirm
# against the Eurostat code list) and discard the 'TOTAL' and 'UNK' age rows.
# Sex is constant after the filter, so the column is dropped.
keep_rows = df_pop['Sex'].eq('T') & ~df_pop['Age'].isin(['TOTAL', 'UNK'])
df_pop = df_pop.loc[keep_rows].drop(columns=['Sex'])

In [43]:
# Reshape from one column per year to one row per (Age, Code, Year).
# The year labels keep the trailing space they are selected by here.
year_cols = ['2014 ', '2015 ', '2016 ', '2017 ', '2018 ', '2019 ']
# var_name is documented as a scalar; passing a list for ordinary
# (non-MultiIndex) columns is rejected by recent pandas releases.
df_pop = pd.melt(
    df_pop,
    id_vars=['Age', 'Code'],
    value_vars=year_cols,
    var_name='Year',
    value_name='Pop',
)

In [44]:
# Strip the annotation characters that accompany the counts (e.g. "p" marks a
# provisional figure), plus ":" and padding spaces, in one pass by matching
# any of them with a single character class.
num_iregs = [":", "b", "p", "e", " "]
flag_pattern = "[" + "".join(num_iregs) + "]"
df_pop['Pop'] = df_pop['Pop'].str.replace(flag_pattern, "", regex=True)

# Convert the cleaned count and the year label to numbers.
num_cols = ['Pop', 'Year']
df_pop[num_cols] = df_pop[num_cols].apply(pd.to_numeric)

n_obs = len(df_pop)
print('We have {:,.0f} observations for annual data by NUTS3 and age group breakdown'.format(n_obs))
df_pop.head()

We have 239,352 observations for annual data by NUTS3 and age group breakdown


Unnamed: 0,Age,Code,Year,Pop
0,Y10-14,AL,2014,215892.0
1,Y10-14,AL0,2014,215892.0
2,Y10-14,AL01,2014,64493.0
3,Y10-14,AL011,2014,10707.0
4,Y10-14,AL012,2014,20163.0


In [45]:
# The first two characters of the region code serve as a country code,
# which helps with chunking the data later.
df_pop['Country_Code'] = df_pop['Code'].str.slice(0, 2)

In [47]:
# Left-join the df_nuts lookup (built in another cell) on the region code so
# that every population row is kept even when the lookup has no match.
df_pop = df_pop.merge(df_nuts, how='left', on='Code')

In [48]:
# Restrict to United Kingdom rows; the Country column presumably comes from
# the df_nuts merge (confirm against the cell that builds df_nuts).
df_pop = df_pop.loc[df_pop['Country'].eq('United Kingdom')]

### 1b. Create Linear Interp for 2020 and 2021

In [8]:
# Build one placeholder row per distinct (Age, Code, Country_Code) combination
# with an empty population; the gap is filled in a later step.
df_pop_new = df_pop[['Age', 'Code', 'Country_Code']].drop_duplicates()
df_pop_new['Pop'] = np.nan

# Append a stamped copy of the placeholders for each year beyond the data.
for extra_year in (2020, 2021):
    df_pop_new['Year'] = extra_year
    df_pop = pd.concat([df_pop, df_pop_new])

In [9]:
# Sanity check: every year should list the same number of region codes.
df_pop.groupby('Year')[['Code']].count()

Unnamed: 0_level_0,Code
Year,Unnamed: 1_level_1
2014,39892
2015,39892
2016,39892
2017,39892
2018,39892
2019,39892
2020,39892
2021,39892


In [10]:
# Carry the latest known population forward within each (Code, Age) series so
# the empty 2020 and 2021 rows take the most recent earlier figure.
# NOTE: this is a forward fill (flat extrapolation), not a linear
# interpolation.
df_pop = df_pop.sort_values(['Code', 'Age', 'Year'])
df_pop = df_pop.reset_index(drop=True)
# Fill per group: an ungrouped ffill would copy the previous region/age
# group's figure into any series whose leading years are missing.
df_pop['Pop'] = df_pop.groupby(['Code', 'Age'])['Pop'].ffill()