<a href="https://colab.research.google.com/github/pippahtlin/Extreme-Precipitation/blob/main/orisoi_daily.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
from google.colab import drive

# Mount Google Drive into the Colab runtime's filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from sklearn.preprocessing import MinMaxScaler

In [38]:
# Daily rainfall / SOI table hosted in the project's GitHub repository.
DATA_URL = "https://raw.githubusercontent.com/pippahtlin/Extreme-Precipitation/main/data/precipitation_originsoi"

# Read the CSV with the `date` column as the index, then convert that index
# from strings to datetimes.
df = pd.read_csv(DATA_URL, index_col="date")
df.index = pd.to_datetime(df.index)

# Display the loaded frame.
df

Unnamed: 0_level_0,year,month,day,daily rain,binary rain,soi
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1899-10-11,1899,10,11,3.048000,0,-2.0
1899-10-12,1899,10,12,2.032000,0,-2.0
1899-10-13,1899,10,13,33.020000,1,-2.0
1899-10-14,1899,10,14,0.508000,0,-2.0
1899-10-15,1899,10,15,0.000000,0,-2.0
...,...,...,...,...,...,...
2022-12-27,2022,12,27,0.000000,0,2.6
2022-12-28,2022,12,28,22.859999,0,2.6
2022-12-29,2022,12,29,0.000000,0,2.6
2022-12-30,2022,12,30,1.270000,0,2.6


#### MinMax Scaler:
Here we scale the `daily rain` column and then create the 7 lagged columns from the scaled `daily rain`:
1. First, determine the training portion of the data and fit the scaler on the training data only, to avoid data leakage.
2. Create a new column `tf_daily rain` by transforming the `daily rain` column with the fitted scaler, so that the 7 `rain_lag_i` columns can be created from it.
3. After that, split the data into train, validation, and test sets. Only `daily rain`, which is the target, needs to be transformed.

In [39]:
# Default-configured scaler instance.
# NOTE(review): no visible cell uses `scaler` -- only `sc` below is fitted and
# applied. Confirm nothing later in the notebook needs it before removing.
scaler = MinMaxScaler()

# 1. Select the training part of the target: the first 60% of the rows.
total_rows = df.shape[0]
train_end = int(0.6 * total_rows)

# The scaler is fitted on a 2-D array, so the single column is reshaped to (n, 1).
train_rain = df['daily rain'].iloc[:train_end]
train_rain_2d = train_rain.to_numpy().reshape(-1, 1)

# Fit on the training rows only, so the min/max of later rows cannot leak into
# the scaling. The output range is set to 0-80 instead of the default 0-1.
sc = MinMaxScaler(feature_range=(0, 80))
sc.fit(train_rain_2d)

In [40]:
# 2. Apply the scaler fitted in step 1 to the complete `daily rain` column and
#    store the result as `tf_daily rain`.
scaled_rain = sc.transform(df[['daily rain']].to_numpy())
df['tf_daily rain'] = scaled_rain.ravel()

# Lag features: `rain_lag_k` holds the scaled rainfall from k rows earlier
# (k = 1..7); the first k rows of each lag column are NaN.
tf_rain = df['tf_daily rain']
for lag in range(1, 8):
    df[f'rain_lag_{lag}'] = tf_rain.shift(lag)

# Remove every row containing a NaN. This drops the leading rows that have no
# complete 7-row history, but note that it checks ALL columns, not just the lags.
# NOTE(review): `df` is modified in place, so re-running this cell shifts the
# already-trimmed frame and drops another 7 rows each time.
df.dropna(axis=0, how='any', inplace=True)

df

Unnamed: 0_level_0,year,month,day,daily rain,binary rain,soi,tf_daily rain,rain_lag_1,rain_lag_2,rain_lag_3,rain_lag_4,rain_lag_5,rain_lag_6,rain_lag_7
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1899-10-18,1899,10,18,0.000000,0,-2.0,0.000000,0.000000,0.000000,0.000000,0.230216,14.964029,0.920863,1.381295
1899-10-19,1899,10,19,0.000000,0,-2.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.230216,14.964029,0.920863
1899-10-20,1899,10,20,0.000000,0,-2.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.230216,14.964029
1899-10-21,1899,10,21,11.684000,0,-2.0,5.294964,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.230216
1899-10-22,1899,10,22,0.000000,0,-2.0,0.000000,5.294964,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-27,2022,12,27,0.000000,0,2.6,0.000000,0.000000,0.000000,0.115108,0.000000,0.000000,0.000000,0.000000
2022-12-28,2022,12,28,22.859999,0,2.6,10.359712,0.000000,0.000000,0.000000,0.115108,0.000000,0.000000,0.000000
2022-12-29,2022,12,29,0.000000,0,2.6,0.000000,10.359712,0.000000,0.000000,0.000000,0.115108,0.000000,0.000000
2022-12-30,2022,12,30,1.270000,0,2.6,0.575540,0.000000,10.359712,0.000000,0.000000,0.000000,0.115108,0.000000


In [46]:
# 3. Split by row position: first 60% train, next 20% validation, rest test.
#    The row count is re-read here because step 2 dropped the rows without a
#    full lag history, so `train_end` is recomputed on the shortened frame and
#    is not numerically identical to the value used when fitting the scaler.
total_rows = df.shape[0]
train_end = int(0.6 * total_rows)
validation_end = train_end + int(0.2 * total_rows)

# Model inputs: SOI plus the seven lagged, scaled rainfall columns.
col_feature = ['soi', *(f'rain_lag_{i}' for i in range(1, 8))]

# Target: the scaled rainfall column. Features and target use the same
# positional slices, so they stay aligned row for row.
train_features = df.iloc[:train_end][col_feature]
train_rain = df['tf_daily rain'].iloc[:train_end]

validation_features = df.iloc[train_end:validation_end][col_feature]
validation_rain = df['tf_daily rain'].iloc[train_end:validation_end]

test_features = df.iloc[validation_end:][col_feature]
test_rain = df['tf_daily rain'].iloc[validation_end:]

In [None]:
##########################

In [17]:
# Positional split boundaries: first 60% train, next 20% validation, rest test.
total_rows = len(df)
train_end = int(total_rows * 0.6)
validation_end = train_end + int(total_rows * 0.2)

# Model inputs: SOI plus the seven lagged, scaled rainfall columns.
features = ['soi'] + [f'rain_lag_{i}' for i in range(1, 8)]

# Target column. This cell used to select the raw `daily rain` values, so on a
# top-to-bottom run it silently replaced the scaled targets produced by the
# step-3 split with unscaled ones, while the `rain_lag_*` inputs stayed scaled.
# It now selects `tf_daily rain`, matching step 3.
target_col = 'tf_daily rain'

# Features and target use identical positional slices, so rows stay aligned.
train_features = df[features].iloc[:train_end]
train_rain = df.iloc[:train_end][target_col]

validation_features = df[features].iloc[train_end:validation_end]
validation_rain = df.iloc[train_end:validation_end][target_col]

test_features = df[features].iloc[validation_end:]
test_rain = df.iloc[validation_end:][target_col]

In [None]:
# Scale the lagged features and the train data


In [None]:
T = 90 # Consider seasonality


def _last_rows(frame, n):
    """Return the final `n` rows of `frame` (all rows if it is shorter, none if n <= 0)."""
    # An explicit start index avoids the `iloc[-0:]` pitfall, which returns the
    # whole frame instead of an empty one.
    return frame.iloc[max(len(frame) - n, 0):]


# Number of preceding rows to put in front of the validation/test features
# (presumably so a T-row window is available for their first target -- confirm
# against the windowing code).
context_rows = T - 1

# Recover the un-extended feature blocks first. Features and targets were split
# with identical row slices, so the last len(target) rows are the original
# split. This makes the cell idempotent: re-running it no longer stacks another
# T-1 context rows on top of the ones added by the previous run.
validation_core = _last_rows(validation_features, len(validation_rain))
test_core = _last_rows(test_features, len(test_rain))

# Prepend the last T-1 observations from the training set to the validation features
validation_features = pd.concat(
    [_last_rows(train_features, context_rows), validation_core], axis=0
)

# Similarly, prepend the last T-1 rows of the (already extended) validation
# features to the test features
test_features = pd.concat(
    [_last_rows(validation_features, context_rows), test_core], axis=0
)

# Output the shapes of the datasets to check everything is as expected
train_features.shape, train_rain.shape, validation_features.shape, validation_rain.shape, test_features.shape, test_rain.shape