<a href="https://colab.research.google.com/github/mariobecerra/mda_project/blob/main/code/01_weekly_temperature_belgium.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

URLs of data

In [32]:
arrondissements_url = 'https://raw.githubusercontent.com/mariobecerra/mda_project/main/data/arrondissements_coords.csv'
mortality_url = 'https://raw.githubusercontent.com/mariobecerra/mda_project/main/data/mortality_data_2000-2019.csv'
temperature_url = 'https://raw.githubusercontent.com/mariobecerra/mda_project/main/data/temp_2000_2019.csv'

Read data

In [33]:
import pandas as pd
import numpy as np
import datetime

In [34]:
arrondissements_data = pd.read_csv(arrondissements_url)

arrondissements_data.head(45)

Unnamed: 0,NIS_Code,Nom_arrondissement,lat,lon
0,11000,Antwerpen,51.280681,4.505809
1,63000,Verviers,50.464213,6.02207
2,13000,Turnhout,51.250094,4.950422
3,62000,Liège,50.613433,5.607778
4,64000,Waremme,50.662789,5.206501
5,37000,Tielt,50.999605,3.312998
6,45000,Audenarde,50.832904,3.638004
7,91000,Dinant,50.165781,5.025523
8,52000,Charleroi,50.444787,4.442946
9,35000,Oostende,51.176463,2.952955


In [35]:
mortality_data = pd.read_csv(mortality_url)
mortality_data


Unnamed: 0,YEAR,YEAR_WEEK,COD,ARRON,N_MASK
0,2000,2000-001,external,11000,
1,2000,2000-001,external,12000,
2,2000,2000-001,external,13000,
3,2000,2000-001,external,21000,
4,2000,2000-001,external,23000,
...,...,...,...,...,...
82579,2019,2019-053,natural,84000,6.0
82580,2019,2019-053,natural,85000,5.0
82581,2019,2019-053,natural,91000,10.0
82582,2019,2019-053,natural,92000,19.0


In [36]:
temperature_data = pd.read_csv(temperature_url, sep=";")
temperature_data.describe()

Unnamed: 0,GRID_NO,LATITUDE,LONGITUDE,DAY,TEMPERATURE_MAX,TEMPERATURE_MIN,TEMPERATURE_AVG
count,540570.0,540570.0,540570.0,540570.0,540570.0,540570.0,540570.0
mean,100891.5,50.646314,4.642905,20095660.0,14.522454,7.204934,10.86472
std,2312.18536,0.499724,1.032292,57664.57,7.654351,5.818143,6.484894
min,96095.0,49.59159,2.39172,20000100.0,-13.1,-20.8,-14.9
25%,99096.0,50.27905,3.83158,20041230.0,8.7,3.0,6.1
50%,101093.5,50.68772,4.83266,20091230.0,14.6,7.5,11.1
75%,103090.0,51.06821,5.56253,20141230.0,20.3,11.7,15.9
max,105095.0,51.60877,6.36911,20191230.0,41.6,26.5,32.4


In [37]:
temperature_data.max()

GRID_NO            1.050950e+05
LATITUDE           5.160877e+01
LONGITUDE          6.369110e+00
DAY                2.019123e+07
TEMPERATURE_MAX    4.160000e+01
TEMPERATURE_MIN    2.650000e+01
TEMPERATURE_AVG    3.240000e+01
dtype: float64

Find the closest temperature grid to each arrondissement

In [38]:
grids_temp = temperature_data.drop_duplicates(subset = ['GRID_NO', 'LATITUDE', 'LONGITUDE'])
grids_temp

Unnamed: 0,GRID_NO,LATITUDE,LONGITUDE,DAY,TEMPERATURE_MAX,TEMPERATURE_MIN,TEMPERATURE_AVG
0,96095,49.59159,5.03811,20000101,6.2,2.9,4.5
7305,96096,49.60623,5.38328,20000101,5.8,2.4,4.1
14610,96097,49.61982,5.72863,20000101,4.9,1.1,3.0
21915,96098,49.63236,6.07415,20000101,5.3,1.5,3.4
29220,97095,49.81581,5.01500,20000101,5.9,2.5,4.2
...,...,...,...,...,...,...,...
504045,104094,51.36832,4.48712,20000101,8.5,4.9,6.7
511350,104095,51.38472,4.84537,20000101,8.1,4.5,6.3
518655,104096,51.40002,5.20384,20000101,7.9,4.0,6.0
525960,104097,51.41421,5.56253,20000101,7.9,4.0,6.0


In [39]:
# Not the most efficient, but it's a small dataset so it's okay
distances = pd.DataFrame()

for i in range(arrondissements_data.shape[0]):
  diff_lat = grids_temp['LATITUDE'] - arrondissements_data.loc[i].at['lat']
  diff_lon = grids_temp['LONGITUDE'] - arrondissements_data.loc[i].at['lon']
  distances_i = np.power((diff_lat), 2) + np.power((diff_lon), 2)

  temp = pd.DataFrame(
        {
            'NIS_Code': arrondissements_data.loc[i].at['NIS_Code'],
            'GRID_NO': grids_temp.GRID_NO,
            'Dist': distances_i
        }
    )
  distances = pd.concat([distances, temp])



In [40]:
distances

Unnamed: 0,NIS_Code,GRID_NO,Dist
0,11000,96095,3.136373
7305,11000,96096,3.573742
14610,11000,96097,4.253751
21915,11000,96098,5.176657
29220,11000,97095,2.405123
...,...,...,...
504045,54000,104094,2.017259
511350,54000,104095,3.076978
518655,54000,104096,4.393328
525960,54000,104097,5.966667


Find the temperature grid closest to each arrondissement

In [41]:
arron_grid_mapping = distances.groupby('NIS_Code').apply(lambda x: x[x['Dist'] == x['Dist'].min()])[['NIS_Code', 'GRID_NO']].reset_index(drop = True)

arron_grid_mapping.head(8)

Unnamed: 0,NIS_Code,GRID_NO
0,11000,104094
1,12000,103094
2,13000,103095
3,21000,102094
4,23000,102093
5,24000,102095
6,25000,101094
7,31000,104090


Join arrondissement and temperature datasets to get temperature per day in each arrondissement.

In [42]:
join_1 = pd.merge(arron_grid_mapping, temperature_data, how = "inner")
join_1

Unnamed: 0,NIS_Code,GRID_NO,LATITUDE,LONGITUDE,DAY,TEMPERATURE_MAX,TEMPERATURE_MIN,TEMPERATURE_AVG
0,11000,104094,51.36832,4.48712,20000101,8.5,4.9,6.7
1,11000,104094,51.36832,4.48712,20000102,9.5,5.4,7.4
2,11000,104094,51.36832,4.48712,20000103,9.8,7.6,8.7
3,11000,104094,51.36832,4.48712,20000104,9.8,7.2,8.5
4,11000,104094,51.36832,4.48712,20000105,8.1,3.2,5.7
...,...,...,...,...,...,...,...,...
321415,93000,99094,50.24822,4.61816,20191227,6.7,4.2,5.4
321416,93000,99094,50.24822,4.61816,20191228,4.6,3.8,4.2
321417,93000,99094,50.24822,4.61816,20191229,7.2,-1.8,2.7
321418,93000,99094,50.24822,4.61816,20191230,9.2,1.1,5.2


In [43]:
arron_temp_data = pd.merge(join_1, arrondissements_data, how = 'left')[['DAY', 'Nom_arrondissement', 'NIS_Code', 'GRID_NO', 'TEMPERATURE_MAX', 'TEMPERATURE_MIN', 'TEMPERATURE_AVG']].sort_values(by=['Nom_arrondissement', 'NIS_Code', 'DAY']).assign(YEAR = lambda x: np.floor(x.DAY/10000))
arron_temp_data

Unnamed: 0,DAY,Nom_arrondissement,NIS_Code,GRID_NO,TEMPERATURE_MAX,TEMPERATURE_MIN,TEMPERATURE_AVG,YEAR
116880,20000101,Aalst,41000,102092,8.4,4.7,6.6,2000.0
116881,20000102,Aalst,41000,102092,8.8,5.7,7.3,2000.0
116882,20000103,Aalst,41000,102092,9.5,7.1,8.3,2000.0
116883,20000104,Aalst,41000,102092,10.2,7.1,8.7,2000.0
116884,20000105,Aalst,41000,102092,7.2,2.1,4.7,2000.0
...,...,...,...,...,...,...,...,...
80350,20191227,Ypres,33000,102089,8.0,6.9,7.4,2019.0
80351,20191228,Ypres,33000,102089,7.5,5.7,6.6,2019.0
80352,20191229,Ypres,33000,102089,4.8,0.5,2.7,2019.0
80353,20191230,Ypres,33000,102089,7.7,1.2,4.5,2019.0


In [60]:
arron_temp_data = pd.merge(join_1, arrondissements_data, how = 'left')[['DAY', 'Nom_arrondissement', 'NIS_Code', 'GRID_NO', 'TEMPERATURE_MAX', 'TEMPERATURE_MIN', 'TEMPERATURE_AVG']].sort_values(by=['Nom_arrondissement', 'NIS_Code', 'DAY']).assign(Month = lambda x: (x.DAY//100)%100)
arron_temp_data

# Saving arron_emp_data because it will be used for meta-analysis dataset construction
arron_temp_data.to_csv('../out/arron_temp_data.csv', encoding = 'utf-8-sig')

Get percentiles to define heatwave

In [45]:
def q95(x):
    return x.quantile(0.90)

temp_percetiles_year_arron = arron_temp_data[['Month', 'NIS_Code', 'Nom_arrondissement', 'TEMPERATURE_MAX']].groupby(['Month', 'NIS_Code', 'Nom_arrondissement']).agg(q95).stack(level=0).reset_index().rename(columns={0:"p95"})
temp_percetiles_year_arron[temp_percetiles_year_arron['Month']==6]

Unnamed: 0,Month,NIS_Code,Nom_arrondissement,level_3,p95
220,6,11000,Antwerpen,TEMPERATURE_MAX,27.2
221,6,12000,Mechelen,TEMPERATURE_MAX,27.51
222,6,13000,Turnhout,TEMPERATURE_MAX,27.81
223,6,21000,Bruxelles-Capitale,TEMPERATURE_MAX,27.4
224,6,23000,Hal-Vilvorde,TEMPERATURE_MAX,27.4
225,6,24000,Leuven,TEMPERATURE_MAX,28.0
226,6,25000,Nivelles,TEMPERATURE_MAX,27.4
227,6,31000,Brugge,TEMPERATURE_MAX,23.9
228,6,32000,Diksmuide,TEMPERATURE_MAX,24.7
229,6,33000,Ypres,TEMPERATURE_MAX,26.01


Get temperatures of five consecutive days

In [46]:

temp_lead_1 = arron_temp_data.groupby(['Nom_arrondissement', 'NIS_Code'])['TEMPERATURE_MAX'].shift(-1)

temp_lead_2 = arron_temp_data.groupby(['Nom_arrondissement', 'NIS_Code'])['TEMPERATURE_MAX'].shift(-2)

temp_lead_3 = arron_temp_data.groupby(['Nom_arrondissement', 'NIS_Code'])['TEMPERATURE_MAX'].shift(-3)

temp_lead_4 = arron_temp_data.groupby(['Nom_arrondissement', 'NIS_Code'])['TEMPERATURE_MAX'].shift(-4)

    

In [47]:
arron_temp_data['temp_lead_1'] = temp_lead_1
arron_temp_data['temp_lead_2'] = temp_lead_2
arron_temp_data['temp_lead_3'] = temp_lead_3
arron_temp_data['temp_lead_4'] = temp_lead_4
arron_temp_data

Unnamed: 0,DAY,Nom_arrondissement,NIS_Code,GRID_NO,TEMPERATURE_MAX,TEMPERATURE_MIN,TEMPERATURE_AVG,Month,temp_lead_1,temp_lead_2,temp_lead_3,temp_lead_4
116880,20000101,Aalst,41000,102092,8.4,4.7,6.6,1,8.8,9.5,10.2,7.2
116881,20000102,Aalst,41000,102092,8.8,5.7,7.3,1,9.5,10.2,7.2,9.0
116882,20000103,Aalst,41000,102092,9.5,7.1,8.3,1,10.2,7.2,9.0,6.6
116883,20000104,Aalst,41000,102092,10.2,7.1,8.7,1,7.2,9.0,6.6,8.7
116884,20000105,Aalst,41000,102092,7.2,2.1,4.7,1,9.0,6.6,8.7,6.7
...,...,...,...,...,...,...,...,...,...,...,...,...
80350,20191227,Ypres,33000,102089,8.0,6.9,7.4,12,7.5,4.8,7.7,5.8
80351,20191228,Ypres,33000,102089,7.5,5.7,6.6,12,4.8,7.7,5.8,
80352,20191229,Ypres,33000,102089,4.8,0.5,2.7,12,7.7,5.8,,
80353,20191230,Ypres,33000,102089,7.7,1.2,4.5,12,5.8,,,


Create boolean variable if there are 5 consecutive days in which temperature was higher than the 95-th percentile of temperature each year and each arrondissement

In [49]:
join_percentiles = pd.merge(arron_temp_data, temp_percetiles_year_arron[['Month', 'NIS_Code', 'Nom_arrondissement', 'p95']], how = "left")
join_percentiles

Unnamed: 0,DAY,Nom_arrondissement,NIS_Code,GRID_NO,TEMPERATURE_MAX,TEMPERATURE_MIN,TEMPERATURE_AVG,Month,temp_lead_1,temp_lead_2,temp_lead_3,temp_lead_4,p95
0,20000101,Aalst,41000,102092,8.4,4.7,6.6,1,8.8,9.5,10.2,7.2,10.91
1,20000102,Aalst,41000,102092,8.8,5.7,7.3,1,9.5,10.2,7.2,9.0,10.91
2,20000103,Aalst,41000,102092,9.5,7.1,8.3,1,10.2,7.2,9.0,6.6,10.91
3,20000104,Aalst,41000,102092,10.2,7.1,8.7,1,7.2,9.0,6.6,8.7,10.91
4,20000105,Aalst,41000,102092,7.2,2.1,4.7,1,9.0,6.6,8.7,6.7,10.91
...,...,...,...,...,...,...,...,...,...,...,...,...,...
321415,20191227,Ypres,33000,102089,8.0,6.9,7.4,12,7.5,4.8,7.7,5.8,11.90
321416,20191228,Ypres,33000,102089,7.5,5.7,6.6,12,4.8,7.7,5.8,,11.90
321417,20191229,Ypres,33000,102089,4.8,0.5,2.7,12,7.7,5.8,,,11.90
321418,20191230,Ypres,33000,102089,7.7,1.2,4.5,12,5.8,,,,11.90


In [50]:
bool0 = (join_percentiles['TEMPERATURE_MAX'].to_numpy() >= join_percentiles['p95'].to_numpy()).astype(int)
bool1 = (join_percentiles['temp_lead_1'].to_numpy() >= join_percentiles['p95'].to_numpy()).astype(int)
bool2 = (join_percentiles['temp_lead_2'].to_numpy() >= join_percentiles['p95'].to_numpy()).astype(int)
bool3 = (join_percentiles['temp_lead_3'].to_numpy() >= join_percentiles['p95'].to_numpy()).astype(int)
bool4 = (join_percentiles['temp_lead_4'].to_numpy() >= join_percentiles['p95'].to_numpy()).astype(int)
heatwave_boolean = bool0 * bool1 * bool2 * bool3 * bool4
heatwave_boolean

array([0, 0, 0, ..., 0, 0, 0])

In [51]:
# Not the most efficient, but it works
iso_weeks = ['1990-001']*join_percentiles.shape[0] # Prefill list with correct size
for i in range(len(iso_weeks)):
  day_i = join_percentiles.loc[i].at['DAY']
  iso_week_i = datetime.datetime.strptime(str(day_i), '%Y%m%d').isocalendar()
  iso_weeks[i] = str(iso_week_i[0]) + '-' + str(iso_week_i[1]).zfill(3)



In [52]:
heat_wave_day_def = join_percentiles[['NIS_Code', 'Nom_arrondissement','Month']]
heat_wave_day_def['heatwave_boolean'] = heatwave_boolean
heat_wave_day_def['YEAR_WEEK'] = iso_weeks
heat_wave_day_def

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heat_wave_day_def['heatwave_boolean'] = heatwave_boolean
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heat_wave_day_def['YEAR_WEEK'] = iso_weeks


Unnamed: 0,NIS_Code,Nom_arrondissement,Month,heatwave_boolean,YEAR_WEEK
0,41000,Aalst,1,0,1999-052
1,41000,Aalst,1,0,1999-052
2,41000,Aalst,1,0,2000-001
3,41000,Aalst,1,0,2000-001
4,41000,Aalst,1,0,2000-001
...,...,...,...,...,...
321415,33000,Ypres,12,0,2019-052
321416,33000,Ypres,12,0,2019-052
321417,33000,Ypres,12,0,2019-052
321418,33000,Ypres,12,0,2020-001


In [53]:
heat_wave_week = heat_wave_day_def.groupby(['NIS_Code', 'Nom_arrondissement','YEAR_WEEK'])['heatwave_boolean'].sum().reset_index()
heat_wave_week['heatwave_week_boolean'] = (heat_wave_week['heatwave_boolean'] > 0).astype(int)
heat_wave_week

Unnamed: 0,NIS_Code,Nom_arrondissement,YEAR_WEEK,heatwave_boolean,heatwave_week_boolean
0,11000,Antwerpen,1999-052,0,0
1,11000,Antwerpen,2000-001,0,0
2,11000,Antwerpen,2000-002,0,0
3,11000,Antwerpen,2000-003,0,0
4,11000,Antwerpen,2000-004,0,0
...,...,...,...,...,...
45975,93000,Philippeville,2019-049,0,0
45976,93000,Philippeville,2019-050,0,0
45977,93000,Philippeville,2019-051,0,0
45978,93000,Philippeville,2019-052,0,0


In [54]:
heat_wave_week['heatwave_week_boolean'].sum()

1675

In [55]:
mortality_heat_wave = pd.merge(heat_wave_week[['YEAR_WEEK', 'NIS_Code', 'Nom_arrondissement', 'heatwave_week_boolean']], mortality_data[['ARRON', 'YEAR_WEEK', 'COD', 'N_MASK']], left_on = ['NIS_Code', 'YEAR_WEEK'], right_on = ['ARRON', 'YEAR_WEEK'], how = 'inner')
mortality_heat_wave.count()

YEAR_WEEK                81142
NIS_Code                 81142
Nom_arrondissement       81142
heatwave_week_boolean    81142
ARRON                    81142
COD                      81142
N_MASK                   57772
dtype: int64

Save dataset

In [58]:
mortality_heat_wave.to_csv('../out/mortality_heat_wave.csv', encoding = 'utf-8-sig')