In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def transform_data_datetime(df):
    """Return a copy of ``df`` extended with calendar columns derived from ``DATE``.

    ``DATE`` is parsed with the ``%Y%m%d`` format. The returned frame gains,
    in this order: ``datetime`` (parsed timestamp), ``year``, ``month int``
    (month number), ``month`` (abbreviated month name) and ``day``.
    The input frame is left unmodified.
    """
    stamps = pd.to_datetime(df['DATE'], format="%Y%m%d")
    # Dict order fixes the order in which the new columns are appended.
    calendar_parts = {
        'datetime': stamps,
        'year': stamps.dt.year,
        'month int': stamps.dt.month,
        'month': stamps.dt.strftime('%b'),
        'day': stamps.dt.day,
    }
    # `assign` builds a new frame, so the caller's frame is never mutated.
    return df.assign(**calendar_parts)

# Heathrow Precipitation Data

This notebook explores the daily precipitation recorded at the Heathrow weather station, as published by the [European Climate Assessment and Dataset website](https://www.ecad.eu/).

The website allows you to download data from different stations, and the Heathrow station is the best option for the intended analysis. The data used in this notebook are contained in the `RR_SOUID107650.csv` file.

In [3]:
# Load the daily precipitation file from the `non_blended` data directory,
# then show its shape and the first rows as a quick sanity check.
heathrow_daily_data = pd.read_csv('./data/non_blended/RR_SOUID107650.csv')
print(heathrow_daily_data.shape)
heathrow_daily_data.head()

(23862, 5)


Unnamed: 0,STAID,SOUID,DATE,RR,Q_RR
0,1860,107650,19600101,22,0
1,1860,107650,19600102,23,0
2,1860,107650,19600103,7,0
3,1860,107650,19600104,0,0
4,1860,107650,19600105,0,0


In [4]:
# Inspect the column dtypes of the freshly loaded frame.
heathrow_daily_data.dtypes

STAID    int64
SOUID    int64
DATE     int64
RR       int64
Q_RR     int64
dtype: object

In [5]:
# Adding columns for handling dates (datetime, year, month int, month, day)
heathrow_daily_data = transform_data_datetime(heathrow_daily_data)

In [6]:
# Preview the frame with the new date columns.
heathrow_daily_data.head()

Unnamed: 0,STAID,SOUID,DATE,RR,Q_RR,datetime,year,month int,month,day
0,1860,107650,19600101,22,0,1960-01-01,1960,1,Jan,1
1,1860,107650,19600102,23,0,1960-01-02,1960,1,Jan,2
2,1860,107650,19600103,7,0,1960-01-03,1960,1,Jan,3
3,1860,107650,19600104,0,0,1960-01-04,1960,1,Jan,4
4,1860,107650,19600105,0,0,1960-01-05,1960,1,Jan,5


The `RR` unit is 0.1 mm, so we create a column `RR_mm` that contains the same measurement expressed in mm.

In [7]:
# RR is stored in units of 0.1 mm; RR_mm holds the same reading in mm.
heathrow_daily_data['RR_mm'] = heathrow_daily_data['RR'] * 0.1
# Sort chronologically and rebuild a 0..n-1 index. The result must be assigned
# back: calling reset_index(inplace=True) on the temporary returned by
# sort_values() modifies that temporary only and leaves this frame untouched.
heathrow_daily_data = heathrow_daily_data.sort_values('datetime').reset_index(drop=True)
heathrow_daily_data[['RR_mm']].head()

Unnamed: 0,RR_mm
0,2.2
1,2.3
2,0.7
3,0.0
4,0.0


### Handling Missing Reading

The dataframe column `Q_RR` indicates the quality of each reading, and our dataset contains a few missing readings.

In [8]:
# Count, then list, every reading whose Q_RR flag is different from 0.
print(heathrow_daily_data['Q_RR'].ne(0).sum())
heathrow_daily_data.loc[heathrow_daily_data['Q_RR'].ne(0)]

14


Unnamed: 0,STAID,SOUID,DATE,RR,Q_RR,datetime,year,month int,month,day,RR_mm
13300,1860,107650,19960531,-9999,9,1996-05-31,1996,5,May,31,-999.9
13683,1860,107650,19970618,-9999,9,1997-06-18,1997,6,Jun,18,-999.9
13684,1860,107650,19970619,-9999,9,1997-06-19,1997,6,Jun,19,-999.9
13685,1860,107650,19970620,-9999,9,1997-06-20,1997,6,Jun,20,-999.9
13686,1860,107650,19970621,-9999,9,1997-06-21,1997,6,Jun,21,-999.9
13687,1860,107650,19970622,-9999,9,1997-06-22,1997,6,Jun,22,-999.9
13688,1860,107650,19970623,-9999,9,1997-06-23,1997,6,Jun,23,-999.9
13689,1860,107650,19970624,-9999,9,1997-06-24,1997,6,Jun,24,-999.9
13690,1860,107650,19970625,-9999,9,1997-06-25,1997,6,Jun,25,-999.9
13691,1860,107650,19970626,-9999,9,1997-06-26,1997,6,Jun,26,-999.9


We have 14 missing readings: one from the 31st of May 1996 and thirteen from the 18th to the 30th of June 1997. Although our analysis will focus on the last 15 years, it is a good exercise to fill these missing readings. As a first step, we replace the `RR` and `RR_mm` values of the readings with a `Q_RR` code of $9$ by `np.nan`, which is easier to handle.

In [9]:
# Blank out both unit columns wherever the Q_RR flag is non-zero.
heathrow_daily_data.loc[heathrow_daily_data['Q_RR'].ne(0), ['RR_mm', 'RR']] = np.nan

In [10]:
# Rows whose RR_mm value is now NaN.
heathrow_daily_data[heathrow_daily_data['RR_mm'].isna()]

Unnamed: 0,STAID,SOUID,DATE,RR,Q_RR,datetime,year,month int,month,day,RR_mm
13300,1860,107650,19960531,,9,1996-05-31,1996,5,May,31,
13683,1860,107650,19970618,,9,1997-06-18,1997,6,Jun,18,
13684,1860,107650,19970619,,9,1997-06-19,1997,6,Jun,19,
13685,1860,107650,19970620,,9,1997-06-20,1997,6,Jun,20,
13686,1860,107650,19970621,,9,1997-06-21,1997,6,Jun,21,
13687,1860,107650,19970622,,9,1997-06-22,1997,6,Jun,22,
13688,1860,107650,19970623,,9,1997-06-23,1997,6,Jun,23,
13689,1860,107650,19970624,,9,1997-06-24,1997,6,Jun,24,
13690,1860,107650,19970625,,9,1997-06-25,1997,6,Jun,25,
13691,1860,107650,19970626,,9,1997-06-26,1997,6,Jun,26,


In [22]:
## filling single date 1996-05-31

# Locate the row by POSITION: `.iloc` is positional, so feeding it an index
# label is only correct while the frame carries a default 0..n-1 index.
pos = np.flatnonzero((heathrow_daily_data['datetime'] == "1996-05-31").to_numpy())[0]
idx = heathrow_daily_data.index[pos]  # label of the same row, used with `.loc` below

# Two days on each side of the gap; max(..., 0) stops a negative start from
# wrapping around to the end of the frame.
neighbours = heathrow_daily_data.iloc[max(pos - 2, 0):pos + 3]

# `mean` skips NaN, so the missing reading itself does not contribute.
mean_RR_m_value = neighbours['RR_mm'].mean()
mean_RR_value = neighbours['RR'].mean()

heathrow_daily_data.loc[idx, 'RR'] = mean_RR_value
heathrow_daily_data.loc[idx, 'RR_mm'] = mean_RR_m_value

In [23]:
# Check the row for 1996-05-31 after the fill.
heathrow_daily_data[heathrow_daily_data['datetime'] == "1996-05-31"]

Unnamed: 0,STAID,SOUID,DATE,RR,Q_RR,datetime,year,month int,month,day,RR_mm
13300,1860,107650,19960531,0.5,9,1996-05-31,1996,5,May,31,0.05


In [24]:
# Readings with Q_RR == 0 for 18-30 June, across all years in the frame.
heathrow_daily_data.loc[
    heathrow_daily_data['month'].eq('Jun')
    & heathrow_daily_data['day'].between(18, 30)
    & heathrow_daily_data['Q_RR'].eq(0)
]

Unnamed: 0,STAID,SOUID,DATE,RR,Q_RR,datetime,year,month int,month,day,RR_mm
169,1860,107650,19600618,0.0,0,1960-06-18,1960,6,Jun,18,0.0
170,1860,107650,19600619,0.0,0,1960-06-19,1960,6,Jun,19,0.0
171,1860,107650,19600620,0.0,0,1960-06-20,1960,6,Jun,20,0.0
172,1860,107650,19600621,0.0,0,1960-06-21,1960,6,Jun,21,0.0
173,1860,107650,19600622,141.0,0,1960-06-22,1960,6,Jun,22,14.1
...,...,...,...,...,...,...,...,...,...,...,...
23553,1860,107650,20240626,0.0,0,2024-06-26,2024,6,Jun,26,0.0
23554,1860,107650,20240627,0.0,0,2024-06-27,2024,6,Jun,27,0.0
23555,1860,107650,20240628,0.0,0,2024-06-28,2024,6,Jun,28,0.0
23556,1860,107650,20240629,0.0,0,2024-06-29,2024,6,Jun,29,0.0


There are different strategies to fill these missing readings. The best approach would be to find alternative sources containing the missing data, but in our example we will use the available data.

As we can see, there is an isolated day, 1996-05-31, and a series of consecutive days without readings. A sensible approach could be to use a simple mean of the days before and after for the isolated date, and, for the consecutive days, to fill each one with the daily mean computed over the 18th to 30th of June of each year.

In [None]:
# DATE values of the readings whose Q_RR flag is non-zero.
heathrow_daily_data.loc[heathrow_daily_data["Q_RR"].ne(0), 'DATE']

In [None]:
# Re-inspect the column dtypes after the cleaning steps above.
heathrow_daily_data.dtypes

In [None]:
# NOTE(review): unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source.
rainfall_data_se = pd.read_pickle('./data/rainfall_data_se.pkl')
rainfall_data_se

In [None]:
# Keep only the records from 1980 onwards (as an independent copy).
rainfall_data_se = rainfall_data_se.loc[rainfall_data_se['year'].ge(1980)].copy()
rainfall_data_se.head()

In [None]:
# Checking consistency with monthly rainfall data collected for South East England region
# NOTE(review): this cell only lists the flagged (Q_RR != 0) daily readings; the
# comparison against the monthly data is not performed here.
heathrow_daily_data[heathrow_daily_data['Q_RR'] != 0]

In [None]:
# Readings with Q_RR == 0 for 18-30 June, across all years in the frame.
heathrow_daily_data.loc[
    heathrow_daily_data['month'].eq('Jun')
    & heathrow_daily_data['day'].between(18, 30)
    & heathrow_daily_data['Q_RR'].eq(0)
]

### Acknowledgement

```
EUROPEAN CLIMATE ASSESSMENT & DATASET (ECA&D), file created on 11-06-2025
THESE DATA CAN BE USED FREELY PROVIDED THAT THE FOLLOWING SOURCE IS ACKNOWLEDGED:

Klein Tank, A.M.G. and Coauthors, 2002. Daily dataset of 20th-century surface
air temperature and precipitation series for the European Climate Assessment.
Int. J. of Climatol., 22, 1441-1453.
Data and metadata available at http://www.ecad.eu
```