In [None]:
# Source table hosted in the project's GitHub repository; later cells read its
# 'monthly rain', 'binary rain' and 'soi' columns.
DATA_URL = "https://raw.githubusercontent.com/pippahtlin/Extreme-Precipitation/main/data/precipitation_soi_monthly"

# Key the rows by their 'date' column so later cells can plot against df.index.
df = pd.read_csv(DATA_URL).set_index('date')
df

## 1. EDA

In [None]:
# Line chart of the 'monthly rain' column against the date index.
fig = plt.figure(figsize=(14, 7))
ax = fig.gca()
plt.xticks(rotation=45)  # tilt the x tick labels
ax.xaxis.set_major_locator(MaxNLocator(nbins=30))  # at most 30 tick intervals on the x axis

ax.plot(df.index, df['monthly rain'], label='Monthly Rain')
ax.set_title('Monthly Rain Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Rainfall (mm)')
fig.tight_layout()
plt.show()

In [None]:
# Scatter of the 'binary rain' flag against the date index.
plt.figure(figsize=(20, 7))
plt.xticks(rotation=45)  # tilt the x tick labels
plt.gca().xaxis.set_major_locator(MaxNLocator(nbins=40))  # at most 40 tick intervals on the x axis

plt.plot(df.index, df['binary rain'], 'o', label = 'Extreme Precipitation')
plt.title('Extreme Precipitation over Time')
plt.xlabel('Date')
# The plotted column is the binary flag, not a rainfall amount, so the axis
# must not be labelled in millimetres.
plt.ylabel('Extreme precipitation flag (binary)')
plt.tight_layout()
plt.show()

In [None]:
# Class balance: number of monthly records and the share flagged in 'binary rain'.
months = len(df['binary rain'])
# sum / count is a proportion provided the flag is coded 0/1 -- TODO confirm coding.
pos_prop = df['binary rain'].sum() / months
print(f'Number of months: {months}')
print(f'Proportion of extreme precipitation: {pos_prop}')

<br>

## 2. Split Data
Training set: 70% | Validation set: 15% | Testing set: 15%

In [None]:
# Positional 70% / 15% / remainder split; rows keep their file order (no shuffling).
total_rows = len(df)
train_end = int(0.7 * total_rows)
validation_end = int(0.15 * total_rows) + train_end

# One positional slice per split, shared by the feature and the target column.
train_rows = slice(None, train_end)
validation_rows = slice(train_end, validation_end)
test_rows = slice(validation_end, None)

# Feature: 'soi'. Target: 'binary rain'.
train_features, train_rain = df["soi"].iloc[train_rows], df["binary rain"].iloc[train_rows]
validation_features, validation_rain = df["soi"].iloc[validation_rows], df["binary rain"].iloc[validation_rows]
test_features, test_rain = df["soi"].iloc[test_rows], df["binary rain"].iloc[test_rows]

In [None]:
T = 30  # sequence length in timesteps (original note: "Consider seasonality")
context_len = T - 1  # earlier observations a window needs in front of its target row

# Build the context-extended feature series directly from df and the split
# indices instead of from validation_features / test_features themselves.
# This keeps the cell idempotent: re-running it no longer prepends the
# context a second time. Explicit start indices (rather than iloc[-(T-1):])
# also behave correctly for T == 1, where the negative-zero slice would
# return the whole series.
soi = df["soi"]

# Last T-1 training observations go in front of the validation features.
prepend_features_to_validation = soi.iloc[max(train_end - context_len, 0):train_end]
validation_features = pd.concat([prepend_features_to_validation, soi.iloc[train_end:validation_end]], axis=0)

# Last T-1 observations before the test split go in front of the test features.
prepend_features_to_test = soi.iloc[max(validation_end - context_len, 0):validation_end]
test_features = pd.concat([prepend_features_to_test, soi.iloc[validation_end:]], axis=0)

# Output the shapes of the datasets to check everything is as expected
train_features.shape, train_rain.shape, validation_features.shape, validation_rain.shape, test_features.shape, test_rain.shape

<br>

## 3. Data Preprocessing
Input data for the Keras LSTM layer has 3 dimensions: (M, T, N), where
* M - number of examples (each example is a 2D array of timesteps x features),
* T - sequence length (timesteps) and
* N - number of features (input_dim)

In [None]:
def make_sequences(features, labels, timesteps):
    """Cut a feature series into overlapping windows paired with their labels.

    Window ``i`` is ``features[i:i + timesteps]`` and is paired with the label
    of the window's last row. ``features`` may carry up to ``timesteps - 1``
    extra leading rows (context taken from the preceding split); how many is
    inferred from ``len(features) - len(labels)``.

    Parameters
    ----------
    features : pandas.Series
        Feature values, optionally with leading context rows.
    labels : pandas.Series
        Targets; its last row corresponds to the last row of ``features``.
    timesteps : int
        Window length.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        ``X`` has shape (M, timesteps, 1) and ``y`` has shape (M, 1).

    Raises
    ------
    ValueError
        If ``features`` is shorter than ``labels`` or carries more than
        ``timesteps - 1`` extra rows (e.g. context was prepended twice).
    """
    lead = len(features) - len(labels)
    if not 0 <= lead <= timesteps - 1:
        raise ValueError(
            f'features must have between 0 and {timesteps - 1} more rows than labels, got {lead}'
        )
    values = features.to_numpy()
    n_windows = max(len(values) - timesteps + 1, 0)
    windows = [values[i:i + timesteps] for i in range(n_windows)]
    # reshape (rather than indexing with np.newaxis) also works when there are no windows
    X = np.array(windows).reshape(-1, timesteps, 1)
    # The first window ends at feature row timesteps-1, i.e. label row timesteps-1-lead.
    y = labels.to_numpy()[timesteps - 1 - lead:].reshape(-1, 1)
    return X, y


# Create sequences of T timesteps
X_train, y_train = make_sequences(train_features, train_rain, T)
print(f'Train data dimensions: {X_train.shape}, {y_train.shape}')

# validation_features already carries prepended context rows, so every
# validation label gets its own window, the same alignment the test split uses.
# Previously the label was taken T-1 rows after the window's end.
X_val, y_val = make_sequences(validation_features, validation_rain, T)
print(f'Validation data dimensions: {X_val.shape}, {y_val.shape}')

X_test, y_test = make_sequences(test_features, test_rain, T)

print(f'Test data dimensions: {X_test.shape}, {y_test.shape}')

<br>

## 4. Batch Training and Predictions

In [None]:
# Import Keras
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.regularizers import l2
from time import time

In [None]:
# 1. Modelling constants: dataset sizes read from the prepared arrays, plus
#    hyperparameters marked "tunable" where the notebook expects experimentation.
LAYERS = [30, 30, 30, 1]   # units per layer: hidden layers first, output layer last (tunable)
# Number of examples (first array dimension) in the train / validation / test sets.
M_TRAIN, M_VAL, M_TEST = (len(split) for split in (X_train, X_val, X_test))
N = X_train.shape[-1]      # number of features (last array dimension)
BATCH = M_TRAIN            # batch size, set equal to the number of training examples
EPOCH = 5                  # number of epochs (tunable)
LR = 0.05                  # learning rate (tunable)
LAMBD = 0.03               # lambda for L2 regularization (tunable)
DP = 0.0                   # dropout rate (tunable)
RDP = 0.0                  # recurrent dropout rate (tunable)
print(f'layers={LAYERS}, train_examples={M_TRAIN}, test_examples={M_TEST}')
print(f'batch = {BATCH}, timesteps = {T}, features = {N}, epochs = {EPOCH}')
print(f'lr = {LR}, lambda = {LAMBD}, dropout = {DP}, recurr_dropout = {RDP}')