In [1]:
from warnings import filterwarnings

import pandas as pd

# Silence every FutureWarning for the rest of the session: plotly and pandas
# currently disagree whenever a figure uses multiple colors.
filterwarnings('ignore', category=FutureWarning)

# Kaggle input path of the aggregated daily sewage-spill CSV.
THAMES = '/kaggle/input/thames-water-daily-sewage-spills/thames_water_aggregated_daily_sewage_spills.csv'

# Parse start_day with an explicit day/month/year format rather than letting
# pandas guess the date layout.
df = pd.read_csv(
    THAMES,
    parse_dates=['start_day'],
    date_format='%d/%m/%Y',
)
df.head()

Unnamed: 0,start_day,chenies_rain,crossness_rain,edenvale_rain,haveringbower_rain,hollandpark_rain,leatherhead_rain,northmymms_rain,num_active_events,num_unique_locs,num_start_events,median_spill_duration,mean_spill_duration,sum_spill_duration
0,2022-12-12,0.0,4.4,1.14,1.38,1.4,0.0,1.6,2,2,2,44.5,44.5,89
1,2022-12-13,0.2,2.0,0.28,0.0,0.6,1.2,0.2,1,1,1,540.0,540.0,540
2,2022-12-14,0.0,0.0,4.86,0.0,0.0,0.0,0.2,0,0,0,0.0,0.0,0
3,2022-12-15,0.2,0.0,1.18,0.69,0.2,0.0,1.6,0,0,0,0.0,0.0,0
4,2022-12-16,1.2,0.2,0.02,0.02,0.0,0.0,1.0,0,0,0,0.0,0.0,0


In [2]:
# Print each column's dtype and non-null count to check the load for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   start_day              366 non-null    datetime64[ns]
 1   chenies_rain           366 non-null    float64       
 2   crossness_rain         366 non-null    float64       
 3   edenvale_rain          366 non-null    float64       
 4   haveringbower_rain     366 non-null    float64       
 5   hollandpark_rain       366 non-null    float64       
 6   leatherhead_rain       366 non-null    float64       
 7   northmymms_rain        366 non-null    float64       
 8   num_active_events      366 non-null    int64         
 9   num_unique_locs        366 non-null    int64         
 10  num_start_events       366 non-null    int64         
 11  median_spill_duration  366 non-null    float64       
 12  mean_spill_duration    366 non-null    float64       
 13  sum_s

In [3]:
from plotly.express import line

# Draw one line trace per column whose name ends in 'rain', against the day.
rain_series = list(filter(lambda name: name.endswith('rain'), df.columns))
line(df, x='start_day', y=rain_series)

This is pretty, but we should probably render it as a scatter plot: each point is a separate daily rainfall total, so the lines joining them suggest a continuity that rainfall does not have.

In [4]:
from plotly.express import scatter

# Same rain columns drawn as markers, with a single OLS trendline fitted
# across all of the traces together (trendline_scope='overall').
rain_series = df.columns[df.columns.str.endswith('rain')].tolist()
rain_fig = scatter(
    df,
    x='start_day',
    y=rain_series,
    trendline='ols',
    trendline_scope='overall',
)
rain_fig

Because the trendline is fitted across all of the rain columns at once, it is hard to interpret in terms of any individual variable, but it appears to say that we get roughly 1.9–2.6 units of rain per day on average throughout the period.

In [5]:
# Compare the spill-duration aggregates (columns ending in 'duration') on a
# log-scaled y axis.
# NOTE(review): days with a duration of 0 cannot be placed on a log axis —
# confirm how plotly renders them.
duration_series = [name for name in df.columns if name.endswith('duration')]
scatter(df, x='start_day', y=duration_series, log_y=True)

We have a choice of target variables. Unfortunately, the spill durations (the output variables) and the rainfall (the input variables) are probably recorded in different units, and we don't know what those units are.

In [6]:
from inspect import signature

from sklearn.manifold import TSNE

duration_columns = [column for column in df.columns if column.endswith('duration')]
rain_columns = [column for column in df.columns if column.endswith('rain')]

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and 1.7
# removed the old name, so pass whichever keyword the installed version
# accepts. The deprecation is presumably a FutureWarning, which this notebook
# silences globally, so it would otherwise go unnoticed until the call fails.
iteration_kwarg = 'max_iter' if 'max_iter' in signature(TSNE).parameters else 'n_iter'
tsne = TSNE(init='pca', verbose=1, random_state=2024, **{iteration_kwarg: 10000})

# Embed each day's rain readings in two dimensions. Reusing df's index keeps
# the column assignments below aligned row-for-row with the source frame.
tsne_df = pd.DataFrame(
    data=tsne.fit_transform(X=df[rain_columns]),
    columns=['tx', 'ty'],
    index=df.index,
)
tsne_df[duration_columns] = df[duration_columns].copy()
tsne_df['start_day'] = df['start_day'].copy()

# One figure per duration statistic, coloring the same embedding by that
# statistic; .show() is required because only a cell's last expression
# is displayed automatically.
for color in duration_columns:
    scatter(data_frame=tsne_df, x='tx', y='ty', color=color, hover_name='start_day').show()

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 366 samples in 0.001s...
[t-SNE] Computed neighbors for 366 samples in 0.007s...
[t-SNE] Computed conditional probabilities for sample 366 / 366
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 50.866669
[t-SNE] KL divergence after 1450 iterations: 0.394350


In [7]:
# Plot both t-SNE coordinates against the date.
embedding_axes = ['tx', 'ty']
scatter(tsne_df, x='start_day', y=embedding_axes)