In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def transform_data_datetime(df):
    """Return a copy of ``df`` with calendar columns derived from ``DATE``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DATE`` column holding dates written as ``YYYYMMDD``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with these columns added:
        ``datetime`` (parsed timestamp), ``year``, ``month int`` (1-12),
        ``month`` (ordered categorical ``Jan`` < ... < ``Dec``) and ``day``.
    """
    month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    df = df.copy()
    df['datetime'] = pd.to_datetime(df['DATE'], format="%Y%m%d")
    df['year'] = df['datetime'].dt.year
    df['month int'] = df['datetime'].dt.month

    # Build the label from the month number rather than strftime('%b'):
    # %b follows the process locale, so on a non-English locale the labels
    # would not match the English categories and every month would become NaN.
    number_to_label = dict(enumerate(month_labels, start=1))
    df['month'] = pd.Categorical(df['month int'].map(number_to_label),
                                 categories=month_labels, ordered=True)

    df['day'] = df['datetime'].dt.day

    return df

# Heathrow Precipitation Data

This notebook provides information about the daily precipitation recorded at the Heathrow weather station, as published by the [European Climate Assessment and Dataset website](https://www.ecad.eu/).

The website allows you to download data from different stations, and Heathrow's is the best option to perform the intended analysis. The data we will use in this notebook are the ones contained in the `RR_SOUID107650.csv` file.

In [3]:
heathrow_daily_data = pd.read_csv('./data/non_blended/RR_SOUID107650.csv')
print(heathrow_daily_data.shape)
heathrow_daily_data.head()

(23862, 5)


Unnamed: 0,STAID,SOUID,DATE,RR,Q_RR
0,1860,107650,19600101,22,0
1,1860,107650,19600102,23,0
2,1860,107650,19600103,7,0
3,1860,107650,19600104,0,0
4,1860,107650,19600105,0,0


In [None]:
heathrow_daily_data.dtypes

In [None]:
# Adding columns for handling dates
heathrow_daily_data = transform_data_datetime(heathrow_daily_data)

In [None]:
heathrow_daily_data.head()

The `RR` unit is 0.1 mm; we create a column `RR_mm` that contains the same measurement expressed in millimetres.

In [None]:
# RR is recorded in tenths of a millimetre; convert it to millimetres.
heathrow_daily_data['RR_mm'] = heathrow_daily_data['RR'] * 0.1
# sort_values() returns a new frame, so the result has to be assigned back.
# Calling reset_index(inplace=True) on that temporary would be discarded and
# leave heathrow_daily_data unsorted.
heathrow_daily_data = heathrow_daily_data.sort_values('datetime').reset_index(drop=True)
heathrow_daily_data[['RR_mm']].head()

### Handling Missing Reading

The dataframe column `Q_RR` indicates the quality of each reading, and our dataset contains a few missing readings.

In [None]:
print(heathrow_daily_data[heathrow_daily_data['Q_RR'] !=0].shape[0])
heathrow_daily_data[heathrow_daily_data['Q_RR'] !=0]

We have 14 missing readings: one from the 31st of May 1996 and thirteen from the 18th to the 30th of June 1997. Although our analysis will focus on the last 15 years, it is a good exercise to fill these missing readings. As a first step, we can replace the `RR` and `RR_mm` values that correspond to a `Q_RR` code of $9$ with the easier-to-handle `np.nan`.

In [None]:
heathrow_daily_data.loc[heathrow_daily_data['Q_RR'] !=0, ['RR_mm', 'RR']] = np.nan 

In [None]:
heathrow_daily_data[heathrow_daily_data['RR_mm'].isna()]

There are different strategies to fill these missing readings. The best approach would be to find alternative sources containing the missing data, but in our example we will use the data already available.

As we can see, there is an isolated day, 1996-05-31, and a series of consecutive days without readings. A sensible approach for the isolated date is to use the simple mean of the days before and after it.

In [None]:
## filling single date 1996-05-31

# idx is the row *label* of the gap; pos is its *position*. The window below is
# positional (.iloc), and the two only coincide for a default RangeIndex, so the
# label is converted explicitly instead of being reused as a position.
idx = heathrow_daily_data[heathrow_daily_data['datetime'] == "1996-05-31"].index[0]
pos = heathrow_daily_data.index.get_loc(idx)

# Two rows either side of the gap (assumes rows are in chronological order).
# The gap row itself is NaN and is skipped by mean(). max() keeps the slice
# start from going negative if the gap is within the first two rows.
window = heathrow_daily_data.iloc[max(pos - 2, 0):pos + 3]
mean_RR_m_value = window['RR_mm'].mean()
mean_RR_value = window['RR'].mean()

heathrow_daily_data.loc[idx, 'RR'] = mean_RR_value
heathrow_daily_data.loc[idx, 'RR_mm'] = mean_RR_m_value

In [None]:
heathrow_daily_data[heathrow_daily_data['datetime'] == "1996-05-31"]

As for the missing readings between the 18th and 30th of June 1997, we can calculate the daily mean within that datetime range for each year and fill the missing data. 

In [None]:
# Valid (Q_RR == 0) readings for 18-30 June of every year: these are the
# calendar days that are missing in 1997.
is_june = heathrow_daily_data['month'] == 'Jun'
is_gap_day = heathrow_daily_data['day'].between(18, 30)
is_valid_reading = heathrow_daily_data['Q_RR'] == 0

heathrow_fillin_data = (
    heathrow_daily_data[is_june & is_gap_day & is_valid_reading]
    .reset_index(drop=True)
    .copy()
)
heathrow_fillin_data.head()


In [None]:
# Mean reading for each calendar day of the gap, averaged over the rows of
# heathrow_fillin_data. 'month' is categorical, so observed=True restricts the
# result to months that actually occur; without it pandas can emit an all-NaN
# row for every unused month category. This matches the other groupbys on
# 'month' in this notebook.
heathrow_1997_fillin = (heathrow_fillin_data
                        .groupby(['month', 'day'], observed=True)[['RR', 'RR_mm']]
                        .mean()
                        .reset_index()
                        )

# Tag the rows with the year they are going to be merged into.
heathrow_1997_fillin['year'] = 1997
heathrow_1997_fillin

In [None]:
# Attach the fill-in values to the matching (year, month, day) rows.
# validate='many_to_one' makes pandas raise if the fill-in table ever contains
# duplicate keys, which would otherwise silently duplicate daily rows.
merged = heathrow_daily_data.merge(
    heathrow_1997_fillin,
    on=['year', 'month', 'day'],
    how='left',
    suffixes=('', '_fill'),
    validate='many_to_one',
)

# Rows that are still missing before the fill is applied.
merged[merged['RR_mm'].isna()]

In [None]:
# Fill NaN values in original columns with values from fill-in dataframe
merged['RR'] = merged['RR'].fillna(merged['RR_fill'])
merged['RR_mm'] = merged['RR_mm'].fillna(merged['RR_mm_fill'])

In [None]:
merged[(merged['month'] == 'Jun') &
       ((merged['day'] >=18) & (merged['day'] <= 30)) &
       (merged['year'] == 1997)].reset_index(drop = True).copy()

In [None]:
# Drop the temporary fill columns
heathrow_daily_data = merged.drop(columns=['RR_fill', 'RR_mm_fill'])

Let's now check that we did not introduce an outlier in the data, meaning that the values for June 1997 are reasonably in line with those for June in other years.

In [None]:
june_data = heathrow_daily_data[heathrow_daily_data['month'] == 'Jun'].copy()
june_data

In [None]:
from src.data_plotting import plot_rainfall

test_data = (june_data
             .groupby('year')[['RR', 'RR_mm']]
             .mean()
             .reset_index()
             )

plot_rainfall(test_data, 
              start_year= 1960, 
              latest_year = 2024, 
              feature='RR_mm', 
              special_year=1997, 
              title_label='Monthly Mean precipitation in June', year_shift = False,
              fig_size=(16,5),
              rotation_x_label=75)

In [None]:
# Total rainfall (mm) for every (year, month) pair that occurs in the data.
monthly_totals = heathrow_daily_data.groupby(['year', 'month'], observed=True)['RR_mm'].sum()

heathrow_monthly_sum = (
    monthly_totals
    .reset_index()
    .sort_values(['year', 'month'])
    .reset_index(drop=True)
    .copy()
)
heathrow_monthly_sum.head()

In [None]:
heathrow_daily_data.to_csv('./data/heathrow_daily_data.csv')

In [None]:
rainfall_se = pd.read_pickle('./data/rainfall_se.pkl')
rainfall_data_se = pd.read_pickle('./data/rainfall_data_se.pkl')

rainfall_se.head()

In [None]:
rainfall_data_se.head()

In [None]:
# Monthly SE rainfall from 1960 onwards, ordered by year and month, keeping only
# the columns needed for the comparison and dropping rows with missing values.
se_monthly_data = (
    rainfall_data_se[rainfall_data_se['year'] >= 1960]
    .sort_values(['year', 'month'])
    .reset_index(drop=True)
    .loc[:, ['year', 'month', 'rainfall']]
    .copy()
    .dropna(axis=0)
)

In [None]:
# Drop the final row of the SE series.
# NOTE(review): presumably the last month is incomplete — confirm the reason.
# This cell is not idempotent: every re-run removes one more row, so re-run the
# cell that builds se_monthly_data before running this one again.
se_monthly_data = se_monthly_data[:-1].copy()
se_monthly_data.shape

In [None]:
se_monthly_data.head()

In [None]:
heathrow_monthly_sum.head()

In [None]:
# Compare Heathrow monthly totals with the SE monthly rainfall for one year.
year = 1997
heathrow_test_year = heathrow_monthly_sum[heathrow_monthly_sum['year'] == year]
se_test_year = se_monthly_data[se_monthly_data['year'] == year]

# Create a combined dataframe for easier plotting.
# The two Series are aligned on their row labels, not on (year, month).
# NOTE(review): this pairs the same months only if both frames carry identical
# row labels for identical months — confirm both start at the same month, have
# no gaps, and that 'month' in the SE data sorts chronologically.
combined_df = pd.DataFrame({
    'index': heathrow_test_year.index,
    'RR_mm': heathrow_test_year['RR_mm'],
    'rainfall': se_test_year['rainfall']
})

# Both series are drawn on the same axes against the shared row label.
plt.figure(figsize=(12, 6))
sns.pointplot(data=combined_df, x='index', y='RR_mm', label='Heathrow sum', alpha=0.7)
sns.pointplot(data=combined_df, x='index', y='rainfall', label='SE sum', alpha=0.7)
plt.xlabel('Index (Time sequence)')
plt.ylabel('Rainfall (mm)')
plt.title('Rainfall Comparison Over Time')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Compare Heathrow monthly totals with the SE monthly rainfall over an
# inclusive range of years.
start_year = 1965
end_year = 1970
heathrow_test_year = heathrow_monthly_sum[(heathrow_monthly_sum['year'] >= start_year) &
                                          (heathrow_monthly_sum['year'] <= end_year)]

se_test_year = se_monthly_data[(se_monthly_data['year'] >= start_year) &
                               (se_monthly_data['year'] <= end_year)]

# Create a combined dataframe for easier plotting.
# The two Series are aligned on their row labels, not on (year, month).
# NOTE(review): this pairs the same months only if both frames carry identical
# row labels for identical months — confirm before reading the chart.
combined_df = pd.DataFrame({
    'index': heathrow_test_year.index,
    'RR_mm': heathrow_test_year['RR_mm'],
    'rainfall': se_test_year['rainfall']
})

# Both series are drawn on the same axes against the shared row label.
plt.figure(figsize=(12, 6))
sns.lineplot(data=combined_df, x='index', y='RR_mm', label='Heathrow sum', alpha=0.7)
sns.lineplot(data=combined_df, x='index', y='rainfall', label='SE sum', alpha=0.7)
plt.xlabel('Index (Time sequence)')
plt.ylabel('Rainfall (mm)')
plt.title('Rainfall Comparison Over Time')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

### Aggregate analysis

Aggregate analysis - sum, mean, standard deviation, median, min, max.
- Yearly
- First Month
- Weatherly Years

#### Yearly Daily Aggregate

In [None]:
# Per-year summary statistics of the daily rainfall (mm).
# Output column name -> (source column, aggregation).
daily_stat_spec = {
    'yearly_mm_sum': ('RR_mm', 'sum'),
    'daily_mm_mean': ('RR_mm', 'mean'),
    'daily_mm_std': ('RR_mm', 'std'),
    'daily_mm_median': ('RR_mm', 'median'),
    'daily_mm_min': ('RR_mm', 'min'),
    'daily_mm_max': ('RR_mm', 'max'),
}

yearly_daily_aggregate = (
    heathrow_daily_data
    .groupby('year')
    .agg(**daily_stat_spec)
    .reset_index()
)

In [None]:
plot_rainfall(yearly_daily_aggregate, 1960, 2025, 'daily_mm_mean', special_year=2022,rotation_x_label=90,fig_size=(14,8));

#### Yearly Monthly Aggregate

In [None]:
# Per-year summary statistics of the monthly rainfall totals (mm).
monthly_totals_by_year = heathrow_monthly_sum.groupby('year')

yearly_monthly_aggregate = monthly_totals_by_year.agg(
    yearly_mm_sum=pd.NamedAgg(column='RR_mm', aggfunc='sum'),
    monthly_mm_mean=pd.NamedAgg(column='RR_mm', aggfunc='mean'),
    monthly_mm_std=pd.NamedAgg(column='RR_mm', aggfunc='std'),
    monthly_mm_median=pd.NamedAgg(column='RR_mm', aggfunc='median'),
    monthly_mm_min=pd.NamedAgg(column='RR_mm', aggfunc='min'),
    monthly_mm_max=pd.NamedAgg(column='RR_mm', aggfunc='max'),
).reset_index()

In [None]:
plot_rainfall(yearly_monthly_aggregate, 1960, 2025, 'yearly_mm_sum', special_year=2022,rotation_x_label=90,fig_size=(14,8));

#### First Months Aggregate 

Comparison between the first months of the year. Since for 2025 I only have data for the first 4 months, the resulting mean may be affected by seasonal variance. To compare yearly monthly means fairly, I need to compare the same months in every year.

In [None]:
heathrow_daily_data_jan_apr = heathrow_daily_data[heathrow_daily_data['month int'] <= 4].copy()
heathrow_daily_data_jan_apr['month'].unique()

In [None]:
# Per-year summary statistics of the daily rainfall (mm), January-April only.
# Output column name -> (source column, aggregation).
jan_apr_daily_stat_spec = {
    'yearly_mm_sum': ('RR_mm', 'sum'),
    'daily_mm_mean': ('RR_mm', 'mean'),
    'daily_mm_std': ('RR_mm', 'std'),
    'daily_mm_median': ('RR_mm', 'median'),
    'daily_mm_min': ('RR_mm', 'min'),
    'daily_mm_max': ('RR_mm', 'max'),
}

yearly_daily_aggregate_jan_apr = (
    heathrow_daily_data_jan_apr
    .groupby('year')
    .agg(**jan_apr_daily_stat_spec)
    .reset_index()
)

In [None]:
plot_rainfall(yearly_monthly_aggregate, 2013, 2025, 'monthly_mm_mean', special_year=2022,rotation_x_label=90,fig_size=(14,8));

In [None]:
plot_rainfall(yearly_daily_aggregate_jan_apr, 2013, 2025, 'daily_mm_mean', special_year=2022,rotation_x_label=0,fig_size=(14,8), title_label = "Daily Average Rainfall - First 4 Months of the Year");

In [None]:
# Monthly rainfall totals (mm) for January-April, one row per (year, month)
# that occurs in the data; the total is stored in the 'RR_mm' column.
jan_apr_by_month = heathrow_daily_data_jan_apr.groupby(['year', 'month'], observed=True)

heathrow_monthly_sum_jan_apr = jan_apr_by_month['RR_mm'].sum().reset_index()

In [None]:
heathrow_monthly_sum_jan_apr.head()

In [None]:
# Per-year summary statistics of the January-April monthly totals (mm).
jan_apr_totals_by_year = heathrow_monthly_sum_jan_apr.groupby('year')

yearly_monthly_aggregate_jan_apr = jan_apr_totals_by_year.agg(
    monthly_sum=pd.NamedAgg(column='RR_mm', aggfunc='sum'),
    monthly_avg=pd.NamedAgg(column='RR_mm', aggfunc='mean'),
    monthly_std=pd.NamedAgg(column='RR_mm', aggfunc='std'),
    monthly_median=pd.NamedAgg(column='RR_mm', aggfunc='median'),
    monthly_min=pd.NamedAgg(column='RR_mm', aggfunc='min'),
    monthly_max=pd.NamedAgg(column='RR_mm', aggfunc='max'),
).reset_index()

yearly_monthly_aggregate_jan_apr.head()

In [None]:
plot_rainfall(yearly_monthly_aggregate_jan_apr, 2013, 2025, 'monthly_avg', 
              special_year=2022,rotation_x_label=0,fig_size=(14,8), 
              title_label = "Monthly Average Rainfall - First 4 Months of the Year");


### Daily Plot

In [None]:
heathrow_daily_data.tail(3)

In [None]:
# Daily rainfall (mm) for every day in the inclusive date window.
start_date = "2022-01-01"
end_date = "2022-08-30"

in_window = heathrow_daily_data['datetime'].between(start_date, end_date)

plt.figure(figsize=(12, 6))

sns.scatterplot(data=heathrow_daily_data[in_window], x='datetime', y='RR_mm');

In [None]:
# Years to overlay; extend the list (e.g. [2010, 2015, 2020, 2022, 2025]) to
# compare more of them.
years = [2022, 2025]
# Inclusive month range; months are numbered 1-12.
start_int_month = 1
end_int_month = 4

in_selected_years = heathrow_daily_data['year'].isin(years)
in_month_range = heathrow_daily_data['month int'].between(start_int_month, end_int_month)
test_df = heathrow_daily_data[in_selected_years & in_month_range].copy()

# Zero-padded "MM-DD" label shared by all years. The label is text, so it is
# ordered lexicographically when plotted on a sorted axis; without padding
# '1-10' sorts before '1-2' and the days come out of calendar order.
test_df['month_day'] = test_df['datetime'].dt.strftime('%m-%d')
test_df.head()

In [None]:
plt.figure(figsize=(14, 6))

sns.lineplot(
    data = test_df,
    x = 'month_day',
    y = 'RR_mm',
    hue = 'year'
);

## Number of days (consecutive or not) without precipitation

In [None]:
(heathrow_daily_data['RR']
 .value_counts()
 .reset_index()
).sort_values('RR')

In [None]:
# Share of all daily records with zero precipitation, computed from the data
# rather than from a hard-coded count so it stays correct when the data change.
heathrow_daily_data['RR'].eq(0).mean()

In [None]:
# Flag dry days: 1 when the reading is exactly 0, otherwise 0.
heathrow_daily_data['no_rain'] = (heathrow_daily_data['RR'] == 0).astype('int64')
heathrow_daily_data.head()

In [None]:
## number of days without rain for each year

# Keep complete years only. A year with fewer than 365 daily records (such as
# the year still in progress when the file was downloaded) has an artificially
# low dry-day count and would bias the regression line.
records_per_year = heathrow_daily_data.groupby('year')['no_rain'].transform('size')
complete_years = heathrow_daily_data[records_per_year >= 365]

no_rainy_days_per_year = complete_years[['year', 'no_rain']].value_counts().reset_index().sort_values(['year'])

plt.figure(figsize=(12, 6))

sns.regplot(
    data = no_rainy_days_per_year[no_rainy_days_per_year['no_rain'] == 1],
    x = 'year',
    y = 'count'
);

In [None]:
## number of days without rain for each year (2010 to 2024 inclusive)

recent_years = heathrow_daily_data[heathrow_daily_data['year'].between(2010, 2024)]
no_rainy_days_per_year = (
    recent_years[['year', 'no_rain']]
    .value_counts()
    .reset_index()
    .sort_values(['year'])
)

# Only the dry-day counts (no_rain == 1) are plotted.
dry_day_counts = no_rainy_days_per_year[no_rainy_days_per_year['no_rain'] == 1]

plt.figure(figsize=(12, 6))

sns.regplot(data=dry_day_counts, x='year', y='count');

In [None]:
## number of days without rain for each year (1960 to 1969 inclusive)

sixties = heathrow_daily_data[heathrow_daily_data['year'].between(1960, 1969)]
no_rainy_days_per_year = (
    sixties[['year', 'no_rain']]
    .value_counts()
    .reset_index()
    .sort_values(['year'])
)

# Only the dry-day counts (no_rain == 1) are plotted.
dry_day_counts = no_rainy_days_per_year[no_rainy_days_per_year['no_rain'] == 1]

plt.figure(figsize=(12, 6))

sns.regplot(data=dry_day_counts, x='year', y='count');

In [None]:
# Number of days per year with and without rain, for 1990 to 2000 inclusive.
# NOTE(review): despite the "_2010" suffix, this frame holds 1990-2000 —
# consider renaming once all uses of the variable are known.
no_rainy_days_per_year_2010 = heathrow_daily_data[(heathrow_daily_data['year'] >= 1990) & (heathrow_daily_data['year'] <= 2000)][['year', 'no_rain']].value_counts().reset_index().sort_values(['year'])

plt.figure(figsize=(12, 6))

# Only the dry-day counts (no_rain == 1) are plotted.
sns.regplot(
    data = no_rainy_days_per_year_2010[no_rainy_days_per_year_2010['no_rain'] == 1],
    x = 'year',
    y = 'count'
);

### Number of consecutive days without rain.

In [None]:
# Runs of consecutive days are only meaningful in chronological order.
heathrow_daily_data = heathrow_daily_data.sort_values('datetime').reset_index(drop=True)

# A new run starts whenever the dry/wet flag differs from the previous day's;
# the running total of those changes gives every run its own id.
dry_flag = heathrow_daily_data['no_rain']
rain_change = (dry_flag != dry_flag.shift()).cumsum()

# Position of each day inside its run, counted from 1.
day_in_run = heathrow_daily_data.groupby(rain_change).cumcount() + 1

# Keep the counter on dry days and report 0 on days with rain.
heathrow_daily_data['consecutive_no_rain'] = day_in_run.where(dry_flag == 1, 0)

In [None]:
heathrow_daily_data[heathrow_daily_data['consecutive_no_rain'] > 20]['year'].value_counts()

In [None]:
sns.barplot(
    data = yearly_monthly_aggregate_jan_apr[yearly_monthly_aggregate_jan_apr['year'] >=2013],
    x = 'monthly_avg',
    y = 'year',
    orient='h'
);

### Acknowledge

```
EUROPEAN CLIMATE ASSESSMENT & DATASET (ECA&D), file created on 11-06-2025
THESE DATA CAN BE USED FREELY PROVIDED THAT THE FOLLOWING SOURCE IS ACKNOWLEDGED:

Klein Tank, A.M.G. and Coauthors, 2002. Daily dataset of 20th-century surface
air temperature and precipitation series for the European Climate Assessment.
Int. J. of Climatol., 22, 1441-1453.
Data and metadata available at http://www.ecad.eu
```