In [136]:
import numpy as np
import pandas as pd

# Introduction

**We know that that heatwave is the extented period of time when extremely high temperatures are observed relative to the climate of the location. In many literatures the severity of the heatwave is represented by the Excess Heat Factor (EHF) [Nairn et al., 2015](https://pubmed.ncbi.nlm.nih.gov/25546282/) and mathematically it is defined to be:**

$$EHF=EHI_{sig}*max(1, EHI_{accl})$$ 



where

$$EHI_{sig}=(T_i+T_{i+1}+T_{i+2}/3-T_{95})$$ 
$$EHI_{accl}=(T_i+T_{i+1}+T_{i+2}/3-T_i+T_{i-1}+...+T_{i-30}/30)$$ 




**$T_{95}$ is calculated by taking the 95th percetile of the DMTs. DMT of the day is found as the mean of the  temperatures recorded of the given day. Of course the measurements must be equally spaced.** 

# Data collection

**Using [Opendata meteo.be](https://opendata.meteo.be) the following temperature data was collected in csv format:**


*Cities: Brussels, Antwerp, Liege*

*Period: 1952-2021* 

*Frequency: Hourly*

# Data cleaning and transformation

In [137]:
Bru=pd.read_csv("./data/Brussels.csv")
Ant=pd.read_csv("./data/Antwerp.csv")
Lie=pd.read_csv("./data/Liege.csv")

In [138]:
Bru.head()

Unnamed: 0,FID,the_geom,code,timestamp,precip_quantity,precip_range,temp,temp_min,temp_max,temp_grass_min,...,wind_speed_unit,wind_direction,wind_peak_speed,humidity_relative,weather_current,pressure,pressure_station_level,sun_duration_24hours,short_wave_from_sky_24hours,cloudiness
0,synop_data.6451.1952-01-01 00:00:00+00,POINT (50.896391 4.526765),6451,1952-01-01T00:00:00,,,3.0,,,,...,,90.0,,,61.0,1005.5,,,,8.0
1,synop_data.6451.1952-01-01 03:00:00+00,POINT (50.896391 4.526765),6451,1952-01-01T03:00:00,,,3.0,,,,...,,,,,50.0,1003.1,,,,8.0
2,synop_data.6451.1952-01-01 06:00:00+00,POINT (50.896391 4.526765),6451,1952-01-01T06:00:00,2.0,2.0,3.0,3.0,,,...,,250.0,,,51.0,1004.0,,,,8.0
3,synop_data.6451.1952-01-01 09:00:00+00,POINT (50.896391 4.526765),6451,1952-01-01T09:00:00,,,3.0,,,,...,,270.0,,,21.0,1006.9,,,,5.0
4,synop_data.6451.1952-01-01 12:00:00+00,POINT (50.896391 4.526765),6451,1952-01-01T12:00:00,,,4.0,,,,...,,260.0,,,25.0,1009.2,,,,6.0


**As we are using only the information on temperature we need to keep only the timestamp and the temperature**

In [139]:
Bru=Bru[["timestamp", "temp"]]
Ant=Ant[["timestamp", "temp"]]
Lie=Lie[["timestamp", "temp"]]

In [140]:
Bru.head()

Unnamed: 0,timestamp,temp
0,1952-01-01T00:00:00,3.0
1,1952-01-01T03:00:00,3.0
2,1952-01-01T06:00:00,3.0
3,1952-01-01T09:00:00,3.0
4,1952-01-01T12:00:00,4.0


In [141]:
cities=[Bru, Ant, Lie]
for city in cities:
    city['timestamp_formatted']=pd.to_datetime(city['timestamp'])

In [142]:
Bru.head()

Unnamed: 0,timestamp,temp,timestamp_formatted
0,1952-01-01T00:00:00,3.0,1952-01-01 00:00:00
1,1952-01-01T03:00:00,3.0,1952-01-01 03:00:00
2,1952-01-01T06:00:00,3.0,1952-01-01 06:00:00
3,1952-01-01T09:00:00,3.0,1952-01-01 09:00:00
4,1952-01-01T12:00:00,4.0,1952-01-01 12:00:00


In [143]:
Bru.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420595 entries, 0 to 420594
Data columns (total 3 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   timestamp            420595 non-null  object        
 1   temp                 420261 non-null  float64       
 2   timestamp_formatted  420595 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 9.6+ MB


In [144]:
for city in cities:
    city['year']=city['timestamp_formatted'].dt.year
    city['month']=city['timestamp_formatted'].dt.month
    city['day']=city['timestamp_formatted'].dt.day
    city['hour']=city['timestamp_formatted'].dt.hour
    city['daytime']=city['hour'].apply(lambda x: 18>=x>=12)
    city['nighttime']=city['hour'].apply(lambda x: 6>=x>=0)
    city.drop(labels=['timestamp','timestamp_formatted'], axis=1, inplace=True)

In [145]:
Bru.head(10)

Unnamed: 0,temp,year,month,day,hour,daytime,nighttime
0,3.0,1952,1,1,0,False,True
1,3.0,1952,1,1,3,False,True
2,3.0,1952,1,1,6,False,True
3,3.0,1952,1,1,9,False,False
4,4.0,1952,1,1,12,True,False
5,5.0,1952,1,1,15,True,False
6,3.0,1952,1,1,18,True,False
7,2.0,1952,1,1,21,False,False
8,3.0,1952,1,2,0,False,True
9,3.0,1952,1,2,3,False,True


In [146]:
Bru.isna().sum()/Bru.shape[0]

temp         0.000794
year         0.000000
month        0.000000
day          0.000000
hour         0.000000
daytime      0.000000
nighttime    0.000000
dtype: float64

In [147]:
Ant.isna().sum()/Ant.shape[0]

temp         0.034075
year         0.000000
month        0.000000
day          0.000000
hour         0.000000
daytime      0.000000
nighttime    0.000000
dtype: float64

In [148]:
Lie.isna().sum()/Lie.shape[0]

temp         0.009356
year         0.000000
month        0.000000
day          0.000000
hour         0.000000
daytime      0.000000
nighttime    0.000000
dtype: float64

**Thus, it needs to be researched if there are summer days when the temperatures were not recorded at all. If is partially recorded, see if it influences our analysis (possibility of smart imputation).**

In [149]:
Bru_summer=Bru[Bru['month'].isin([5,6,7,8,9])]

In [150]:
Ant_summer=Ant[Ant['month'].isin([5,6,7,8,9])]

In [151]:
Lie_summer=Lie[Lie['month'].isin([5,6,7,8,9])]

In [152]:
Bru_count=Bru_summer.set_index(['year', 'month', 'day']).groupby(level=[0, 1, 2]).count()
Ant_count=Ant_summer.set_index(['year', 'month', 'day']).groupby(level=[0, 1, 2]).count()
Lie_count=Lie_summer.set_index(['year', 'month', 'day']).groupby(level=[0, 1, 2]).count()

In [153]:
Bru_count['hour'].unique()

array([ 8, 24,  9, 23, 13, 22, 20, 21])

In [154]:
Ant_count['hour'].unique()

array([ 8,  2,  6, 24, 12, 23, 13, 22, 17, 21])

In [155]:
Lie_count['hour'].unique()

array([ 8, 24, 10, 23, 19, 20, 13, 22, 21, 18])

In [156]:
Bru_count[Bru_count['hour']==24].index.unique(level=0)

Int64Index([1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995,
            1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
            2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,
            2018, 2019, 2020, 2021],
           dtype='int64', name='year')

In [157]:
Ant_count[Ant_count['hour']==24].index.unique(level=0)

Int64Index([1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995,
            1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
            2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,
            2018, 2019, 2020, 2021],
           dtype='int64', name='year')

In [158]:
Lie_count[Lie_count['hour']==24].index.unique(level=0)

Int64Index([1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995,
            1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
            2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,
            2018, 2019, 2020, 2021],
           dtype='int64', name='year')

**At this point we know that until 1985 the measurements were taken 8 times a day, start at 00:00 and ends at 21:00 with 3 hours period. After 1985, it the measurements were taken 24 times a day every hour. However, it seems like there are some exceptions too.**

In [184]:
pd.set_option('display.max_rows', 300)

In [160]:
Bru_na=Bru_summer.set_index(['year', 'month', 'day']).isna().groupby(level=[0, 1, 2]).sum()
Ant_na=Ant_summer.set_index(['year', 'month', 'day']).isna().groupby(level=[0, 1, 2]).sum()
Lie_na=Lie_summer.set_index(['year', 'month', 'day']).isna().groupby(level=[0, 1, 2]).sum()

In [161]:
Bru_na[Bru_na['temp']>0][['temp']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,temp
year,month,day,Unnamed: 3_level_1
1952,9,1,1
1953,6,1,1
1960,5,25,8
1962,5,18,1
1962,5,31,1
1962,9,14,1
1965,5,17,1
1966,7,11,1
1967,7,1,1
1968,9,5,1


In [162]:
Ant_na[Ant_na['temp']>0][['temp']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,temp
year,month,day,Unnamed: 3_level_1
1952,6,30,1
1953,5,29,1
1956,7,1,3
1956,7,2,2
1956,7,3,2
...,...,...,...
2014,5,14,4
2018,5,25,3
2019,9,19,1
2020,9,17,2


In [163]:
Lie_na[Lie_na['temp']>0][['temp']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,temp
year,month,day,Unnamed: 3_level_1
1952,5,1,3
1952,5,2,3
1952,5,3,3
1952,5,4,3
1952,5,5,3
...,...,...,...
2017,9,27,1
2018,5,15,1
2019,5,8,1
2019,6,17,1


**As there are many missing values, we will explore if there are days when no temperature was recorded during daytime or nighttime. That is because we can expect that the daily highest occur **

In [164]:
Bru_na_day=Bru_summer[Bru_summer['daytime']==True].set_index(['year', 'month', 'day']).isna().groupby(level=[0, 1, 2]).sum()
Ant_na_day=Ant_summer[Ant_summer['daytime']==True].set_index(['year', 'month', 'day']).isna().groupby(level=[0, 1, 2]).sum()
Lie_na_day=Lie_summer[Lie_summer['daytime']==True].set_index(['year', 'month', 'day']).isna().groupby(level=[0, 1, 2]).sum()

In [177]:
Bru_na_day['measurements_daytime']=Bru_summer[Bru_summer['daytime']==True].set_index(['year', 'month', 'day']).groupby(level=[0, 1, 2]).sum()['daytime']
Bru_na_day['missing_percent']=Bru_na_day['temp']/Bru_na_day['measurements_daytime']
Bru_na_day[Bru_na_day['missing_percent']==1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,temp,hour,daytime,nighttime,measurements_daytime,missing_percent
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1960,5,25,3,0,0,0,3,1.0


In [178]:
Ant_na_day['measurements_daytime']=Ant_summer[Ant_summer['daytime']==True].set_index(['year', 'month', 'day']).groupby(level=[0, 1, 2]).sum()['daytime']
Ant_na_day['missing_percent']=Bru_na_day['temp']/Bru_na_day['measurements_daytime']
Ant_na_day[Ant_na_day['missing_percent']==1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,temp,hour,daytime,nighttime,measurements_daytime,missing_percent
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1960,5,25,0,0,0,0,3,1.0


In [179]:
Lie_na_day['measurements_daytime']=Lie_summer[Lie_summer['daytime']==True].set_index(['year', 'month', 'day']).groupby(level=[0, 1, 2]).sum()['daytime']
Lie_na_day['missing_percent']=Lie_na_day['temp']/Lie_na_day['measurements_daytime']
Lie_na_day[Lie_na_day['missing_percent']==1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,temp,hour,daytime,nighttime,measurements_daytime,missing_percent
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1976,7,25,3,0,0,0,3,1.0


In [180]:
Bru_na_night=Bru_summer[Bru_summer['nighttime']==True].set_index(['year', 'month', 'day']).isna().groupby(level=[0, 1, 2]).sum()
Ant_na_night=Ant_summer[Ant_summer['nighttime']==True].set_index(['year', 'month', 'day']).isna().groupby(level=[0, 1, 2]).sum()
Lie_na_night=Lie_summer[Lie_summer['nighttime']==True].set_index(['year', 'month', 'day']).isna().groupby(level=[0, 1, 2]).sum()

In [181]:
Bru_na_night['measurements_daytime']=Bru_summer[Bru_summer['daytime']==True].set_index(['year', 'month', 'day']).groupby(level=[0, 1, 2]).sum()['daytime']
Bru_na_night['missing_percent']=Bru_na_night['temp']/Bru_na_night['measurements_daytime']
Bru_na_night[Bru_na_night['missing_percent']==1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,temp,hour,daytime,nighttime,measurements_daytime,missing_percent
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1960,5,25,3,0,0,0,3,1.0


In [185]:
Ant_na_night['measurements_daytime']=Ant_summer[Ant_summer['daytime']==True].set_index(['year', 'month', 'day']).groupby(level=[0, 1, 2]).sum()['daytime']
Ant_na_night['missing_percent']=Ant_na_night['temp']/Ant_na_night['measurements_daytime']
Ant_na_night[Ant_na_night['missing_percent']==1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,temp,hour,daytime,nighttime,measurements_daytime,missing_percent
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1956,7,8,3,0,0,0,3.0,1.0
1956,7,15,3,0,0,0,3.0,1.0
1956,7,22,3,0,0,0,3.0,1.0
1956,7,29,3,0,0,0,3.0,1.0
1956,8,5,3,0,0,0,3.0,1.0
1956,8,12,3,0,0,0,3.0,1.0
1956,8,19,3,0,0,0,3.0,1.0
1956,8,26,3,0,0,0,3.0,1.0
1956,9,2,3,0,0,0,3.0,1.0
1956,9,9,3,0,0,0,3.0,1.0


In [183]:
Lie_na_night['measurements_daytime']=Lie_summer[Lie_summer['daytime']==True].set_index(['year', 'month', 'day']).groupby(level=[0, 1, 2]).sum()['daytime']
Lie_na_night['missing_percent']=Lie_na_night['temp']/Lie_na_night['measurements_daytime']
Lie_na_night[Lie_na_night['missing_percent']==1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,temp,hour,daytime,nighttime,measurements_daytime,missing_percent
year,month,day,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1954,8,5,3,0,0,0,3,1.0
1954,8,6,3,0,0,0,3,1.0
1954,8,7,3,0,0,0,3,1.0
1954,8,8,3,0,0,0,3,1.0
1954,8,9,3,0,0,0,3,1.0
1954,8,10,3,0,0,0,3,1.0
1954,8,14,3,0,0,0,3,1.0
1954,8,15,3,0,0,0,3,1.0
1954,8,19,3,0,0,0,3,1.0
1954,8,22,3,0,0,0,3,1.0


**Brussels seems to be fine. For Antwerp, as large portio of nighttime temperatures are unknown from 1956 to 1966, we will only keep the data from 1970. Similarly, for Liege the year of 1954 will br dropped**

**For the rest, if we have one day with no observation at all, we will be interpolating that by looking at the EHI of the neighboring days.**