In [2]:
# Raw GitHub location of the weekly mortality CSV; it is loaded with pd.read_csv further down.
mortality_url = 'https://raw.githubusercontent.com/mariobecerra/mda_project/main/data/mortality_data_2000-2019.csv'
# Raw GitHub location of the population workbook; its 'Population en <year>' sheets are read with pd.read_excel further down.
population_url = 'https://raw.githubusercontent.com/mariobecerra/mda_project/main/data/Population_par_commune.xlsx'

In [2]:
import pandas as pd

#################                weekly mortality data divided by municipality (2000-2019)
# Built as one chain, in this order:
#   1. WEEK      <- everything from the 7th character of YEAR_WEEK, as an integer
#   2. N_MASK is renamed to N_DEATHS
#   3. YEAR_WEEK is dropped
#   4. rows with COD == 'external' are removed (external causes of death are not needed)
#   5. rows with ARRON == 58000 are removed (Arrondissement de La Louvière is only
#      available for 2019 and is therefore left out of the dataset)
# The original row labels are kept (no index reset).
week_mor_mun = (
    pd.read_csv(mortality_url)
    .assign(WEEK=lambda frame: frame['YEAR_WEEK'].str.slice(start=6).astype('int'))
    .rename(columns={'N_MASK': 'N_DEATHS'})
    .drop(columns='YEAR_WEEK')
    .loc[lambda frame: (frame['COD'] != 'external') & (frame['ARRON'] != 58000)]
)
print(week_mor_mun)
week_mor_mun.info()

       YEAR      COD  ARRON  N_DEATHS  WEEK
17     2000  natural  11000      31.0     1
18     2000  natural  12000      10.0     1
19     2000  natural  13000      14.0     1
20     2000  natural  21000      43.0     1
21     2000  natural  23000      14.0     1
...     ...      ...    ...       ...   ...
82579  2019  natural  84000       6.0    53
82580  2019  natural  85000       5.0    53
82581  2019  natural  91000      10.0    53
82582  2019  natural  92000      19.0    53
82583  2019  natural  93000       7.0    53

[45555 rows x 5 columns]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 45555 entries, 17 to 82583
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   YEAR      45555 non-null  int64  
 1   COD       45555 non-null  object 
 2   ARRON     45555 non-null  int64  
 3   N_DEATHS  45210 non-null  float64
 4   WEEK      45555 non-null  int32  
dtypes: float64(1), int32(1), int64(2), object(1)
memory usage: 

In [3]:
##################               combine data sets from different excel sheets

# workbook with yearly population data divided by municipality (one sheet per year)
xls = pd.ExcelFile(population_url)

# column names given to every sheet (the sheets are read without a header row)
cols = ['municipality_code', 'municipality_name', 'male', 'female', 'total']

# Population is read for 21 years, 2000-2020: the interpolation cell further
# down uses the population of the following year as the end point of each year.
yearly_frames = []
for pop_year in range(2000, 2021):
    sheet = pd.read_excel(xls, 'Population en ' + str(pop_year), header=None, skiprows=4, nrows=646)
    sheet.columns = cols
    # 'Viroinval' marks the last line of the table; each sheet is cut at its own
    # 'Viroinval' row, so the sheets need not all have the same number of rows.
    last_row = sheet.loc[sheet['municipality_name'] == 'Viroinval'].index[0] + 1
    yearly_frames.append(sheet.iloc[:last_row].assign(year=pop_year))

# one concat at the end instead of re-copying the growing frame for every sheet
year_pop_mun = pd.concat(yearly_frames, axis=0, ignore_index=True)

# flat per-row list of years; this cell has always left it defined, so it is kept
years = year_pop_mun['year'].tolist()

# convert population columns to int
convert_dict = {'male': int, 'female': int, 'total': int, 'municipality_code': int}
year_pop_mun = year_pop_mun.astype(convert_dict)

In [4]:
import numpy as np
##################                assign population to weekly death rate data (at week 1)
# For every (year, arrondissement) pair with exactly one population figure in
# year_pop_mun, that figure is stored in 'population_t' on the week-1 mortality row.
# If the pair has no week-1 row, a placeholder row (COD 'natural', N_DEATHS 0,
# WEEK 1) is added to carry the figure.
# Pairs without exactly one population figure are collected in `exceptions`
# as (year offset from 2000, ARRON).
existing_arrons = week_mor_mun['ARRON'].value_counts().index
week_mor_mun['population_t'] = np.nan
exceptions = []
placeholder_rows = []  # collected here and appended once after the loop
for year_offset in range(21):  # 2000-2020
    calendar_year = 2000 + year_offset
    for arron in existing_arrons:
        value = year_pop_mun.loc[
            (year_pop_mun['year'] == calendar_year) & (year_pop_mun['municipality_code'] == arron),
            'total',
        ].tolist()
        if len(value) != 1:  # population value missing (or ambiguous) in year_pop_mun
            exceptions.append((year_offset, arron))
            continue
        condition = (
            (week_mor_mun['ARRON'] == arron)
            & (week_mor_mun['WEEK'] == 1)
            & (week_mor_mun['YEAR'] == calendar_year)
        )
        if condition.any():  # a row with week 1 exists in week_mor_mun
            week_mor_mun.loc[condition, 'population_t'] = value[0]
        else:
            placeholder_rows.append({
                'YEAR': calendar_year,
                'COD': 'natural',
                'ARRON': arron,
                'N_DEATHS': 0,
                'WEEK': 1,
                'population_t': value[0],
            })

# A single concat instead of one per placeholder row: every concat copies the
# whole mortality frame. The index is only renumbered when rows are actually added.
if placeholder_rows:
    week_mor_mun = pd.concat([week_mor_mun, pd.DataFrame(placeholder_rows)], axis=0, ignore_index=True)

In [5]:
# (year offset from 2000, ARRON) pairs for which no single population value was found
exceptions

[(19, 54000), (20, 54000)]

In [6]:
# number of rows per ARRON code in the mortality table at this point
week_mor_mun['ARRON'].value_counts()

64000    1065
12000    1065
62000    1065
92000    1065
24000    1065
73000    1065
35000    1065
63000    1065
46000    1065
93000    1065
32000    1065
42000    1065
25000    1065
72000    1065
55000    1065
38000    1065
21000    1065
85000    1065
45000    1065
13000    1065
61000    1065
44000    1065
11000    1065
41000    1065
56000    1065
71000    1065
52000    1065
33000    1065
31000    1065
83000    1065
34000    1065
91000    1065
57000    1065
23000    1065
53000    1065
36000    1065
84000    1064
37000    1064
43000    1064
51000    1064
81000    1063
82000    1062
54000    1011
Name: ARRON, dtype: int64

In [7]:
# Fill 'population_t' for weeks 2..n of every year by linear interpolation
# between the week-1 population of that year and the week-1 population of the
# following year (values are truncated to integers by dtype=int).

# get number of weeks for each year (highest WEEK value present in that year)
no_weeks = [week_mor_mun.loc[week_mor_mun['YEAR'] == (2000 + i), 'WEEK'].max() for i in range(20)]

is_week1 = week_mor_mun['WEEK'] == 1      # loop-invariant masks, built once
is_later_week = week_mor_mun['WEEK'] >= 2
interpolation_skipped = []                # (year offset, ARRON) pairs left without interpolation

for year in range(20):  # for each year 2000-2019
    in_year = week_mor_mun['YEAR'] == (2000 + year)
    in_next_year = week_mor_mun['YEAR'] == (2000 + year + 1)
    for arron in existing_arrons:  # for each arron
        in_arron = week_mor_mun['ARRON'] == arron
        prev_values = week_mor_mun.loc[in_arron & is_week1 & in_year, 'population_t'].tolist()
        next_values = week_mor_mun.loc[in_arron & is_week1 & in_next_year, 'population_t'].tolist()
        # Skip pairs whose start or end population is missing. This replaces the
        # former hard-coded skip of ARRON 54000 for year offsets 18 and 19
        # (54000 has no population for 2019 and 2020).
        if not prev_values or not next_values or np.isnan(prev_values[0]) or np.isnan(next_values[0]):
            interpolation_skipped.append((year, arron))
            continue
        prev_year_pop, next_year_pop = prev_values[0], next_values[0]
        # weekly_pop[w - 1] is the value for week w; entry 0 (week 1) is never written back
        weekly_pop = np.linspace(prev_year_pop, next_year_pop, no_weeks[year], endpoint=False, dtype=int)
        target = in_arron & in_year & is_later_week
        if target.any():
            # all weeks of this (arron, year) pair are assigned in one step
            week_numbers = week_mor_mun.loc[target, 'WEEK'].to_numpy()
            week_mor_mun.loc[target, 'population_t'] = weekly_pop[week_numbers - 1]

In [8]:
# debugging aid: shows the values the loop variables `arron` and `year` currently hold
print(f'{arron} {year}')

54000 19


In [9]:
# DataFrame.to_csv does not create missing directories, so make sure 'out' exists first
import os
os.makedirs('out', exist_ok=True)
week_mor_mun.to_csv('out/weekly_mortality_with_linear_interpolated_population_data.csv', index=False)

# Mortality Dataframe for Meta-Analysis

In [5]:
import pandas as pd

# Mortality table for the meta-analysis: read from the same CSV, with N_MASK
# renamed to N_DEATHS and a WEEK column added. YEAR_WEEK is kept here, and the
# rows are ordered by YEAR_WEEK, then ARRON, before being written to disk.
import os

df = (
    pd.read_csv(mortality_url)
    .rename(columns={'N_MASK': 'N_DEATHS'})
    .assign(WEEK=lambda frame: frame['YEAR_WEEK'].str.slice(start=6).astype('int'))  # get week number
    .loc[lambda frame: frame['COD'] != 'external']  # do not need data for external causes of death
    .loc[lambda frame: frame['ARRON'] != 58000]  # Arrondissement de La Louvière is only available for 2019 and is therefore removed
    .sort_values(by=['YEAR_WEEK', 'ARRON'])
)

# DataFrame.to_csv does not create missing directories, so make sure 'out' exists first
os.makedirs('out', exist_ok=True)
df.to_csv('out/mortality.csv', index=False)