In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

import torch
import torch.nn as nn

# Seed the random number generators so that weight initialisation and
# DataLoader shuffling are repeatable after "Restart Kernel -> Run All".
# Note: torch.seed() takes no argument (it draws a fresh random seed);
# torch.manual_seed() is the call that pins the seed to a chosen value.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


# Dataset Preprocessing

Split data into train, validation and test:  
80% train, 10% validation, 10% test:

Years selected for validation and test (based on random number generation):
Validation and test years are chosen in pairs that are one year apart.
valid = 2022, 1999, 2008, 1990
test = 2023,  1998, 2009, 1991

Validation years

In [2]:
datafolder = "./data/"

# Collect the data files in a deterministic order. os.listdir() returns
# entries in arbitrary order, which would make the column order of the merged
# frame vary between machines; hidden entries (e.g. ".DS_Store",
# ".ipynb_checkpoints") and sub-directories are skipped because they are not
# readable as CSV.
data_files = sorted(
    name
    for name in os.listdir(datafolder)
    if not name.startswith(".") and os.path.isfile(os.path.join(datafolder, name))
)
if not data_files:
    raise FileNotFoundError(f"No data files found in {datafolder!r}")

# Read the first file, then join every other file onto it via the shared
# "month" column (pd.merge defaults to an inner join).
merge_df = pd.read_csv(os.path.join(datafolder, data_files[0]))
for file in data_files[1:]:
    df = pd.read_csv(os.path.join(datafolder, file))
    merge_df = pd.merge(merge_df, df, on="month")

# "month" values look like "1982-01": split once into year and month parts.
month_parts = merge_df["month"].str.split("-")
merge_df["month_num"] = month_parts.str[1].astype("int64")
year_str = month_parts.str[0]

# Held-out years: group_2 -> validation, group_3 -> test.
group_2 = ['2022', '1999', '2008', '1990']
group_3 = ['2023', '2009', '1998', '1991']

# Filter DataFrames by year; everything not held out is training data.
valid_df = merge_df[year_str.isin(group_2)]
test_df = merge_df[year_str.isin(group_3)]

train_df = merge_df[~year_str.isin(group_2 + group_3)]

In [3]:
# Distinct "month" values that ended up in the training split.
train_df["month"].unique()

array(['1982-01', '1982-02', '1982-03', '1982-04', '1982-05', '1982-06',
       '1982-07', '1982-08', '1982-09', '1982-10', '1982-11', '1982-12',
       '1983-01', '1983-02', '1983-03', '1983-04', '1983-05', '1983-06',
       '1983-07', '1983-08', '1983-09', '1983-10', '1983-11', '1983-12',
       '1984-01', '1984-02', '1984-03', '1984-04', '1984-05', '1984-06',
       '1984-07', '1984-08', '1984-09', '1984-10', '1984-11', '1984-12',
       '1985-01', '1985-02', '1985-03', '1985-04', '1985-05', '1985-06',
       '1985-07', '1985-08', '1985-09', '1985-10', '1985-11', '1985-12',
       '1986-01', '1986-02', '1986-03', '1986-04', '1986-05', '1986-06',
       '1986-07', '1986-08', '1986-09', '1986-10', '1986-11', '1986-12',
       '1987-01', '1987-02', '1987-03', '1987-04', '1987-05', '1987-06',
       '1987-07', '1987-08', '1987-09', '1987-10', '1987-11', '1987-12',
       '1988-01', '1988-02', '1988-03', '1988-04', '1988-05', '1988-06',
       '1988-07', '1988-08', '1988-09', '1988-10', 

In [4]:
# Distinct "month" values that ended up in the validation split.
valid_df["month"].unique()

array(['1990-01', '1990-02', '1990-03', '1990-04', '1990-05', '1990-06',
       '1990-07', '1990-08', '1990-09', '1990-10', '1990-11', '1990-12',
       '1999-01', '1999-02', '1999-03', '1999-04', '1999-05', '1999-06',
       '1999-07', '1999-08', '1999-09', '1999-10', '1999-11', '1999-12',
       '2008-01', '2008-02', '2008-03', '2008-04', '2008-05', '2008-06',
       '2008-07', '2008-08', '2008-09', '2008-10', '2008-11', '2008-12',
       '2022-01', '2022-02', '2022-03', '2022-04', '2022-05', '2022-06',
       '2022-07', '2022-08', '2022-09', '2022-10', '2022-11', '2022-12'],
      dtype=object)

In [5]:
# Distinct "month" values that ended up in the test split.
test_df["month"].unique()

array(['1991-01', '1991-02', '1991-03', '1991-04', '1991-05', '1991-06',
       '1991-07', '1991-08', '1991-09', '1991-10', '1991-11', '1991-12',
       '1998-01', '1998-02', '1998-03', '1998-04', '1998-05', '1998-06',
       '1998-07', '1998-08', '1998-09', '1998-10', '1998-11', '1998-12',
       '2009-01', '2009-02', '2009-03', '2009-04', '2009-05', '2009-06',
       '2009-07', '2009-08', '2009-09', '2009-10', '2009-11', '2009-12',
       '2023-01', '2023-02', '2023-03', '2023-04', '2023-05', '2023-06',
       '2023-07', '2023-08', '2023-09', '2023-10', '2023-11', '2023-12'],
      dtype=object)

In [6]:
def normalised(df, predicting_column="total_rainfall", min_max=True, reference_df=None):
    """Scale the feature columns of ``df`` and move the target column to the end.

    The "month" column (if present) is dropped, every remaining column except
    ``predicting_column`` is rescaled, and ``predicting_column`` is appended
    unscaled as the last column.

    Args:
        df: DataFrame holding the feature columns and ``predicting_column``.
        predicting_column: Name of the target column; it is not rescaled.
        min_max: If True apply min-max scaling, otherwise z-score scaling
            (subtract the mean, divide by the standard deviation).
        reference_df: DataFrame whose column statistics (min/max or mean/std)
            drive the scaling. Defaults to ``df`` itself. Pass the training
            split when scaling validation/test data so that every split is on
            the same scale and no held-out statistics are used.

    Returns:
        A new DataFrame; ``df`` is left unmodified.
    """
    excluded_columns = ["month", predicting_column]
    feature_columns = [col for col in df.columns if col not in excluded_columns]
    stats_source = df if reference_df is None else reference_df

    # Features first, target last; copy so the caller's frame is untouched.
    scaled = df[feature_columns + [predicting_column]].copy()

    # Keeps the min-max division defined when a column is constant (max == min).
    epsilon = 1e-10

    for column in feature_columns:
        reference = stats_source[column]
        if min_max:
            low, high = reference.min(), reference.max()
            scaled[column] = (scaled[column] - low + epsilon) / (high - low + epsilon)
        else:
            std = reference.std()
            if std == 0:
                # Constant column: avoid dividing by zero, result is (x - mean).
                std = 1.0
            scaled[column] = (scaled[column] - reference.mean()) / std
    return scaled

# Validation and test features are scaled with the *training* statistics so the
# three splits share one scale and nothing is learned from held-out data.
normalised_train_df = normalised(train_df)
normalised_valid_df = normalised(valid_df, reference_df=train_df)
normalised_test_df = normalised(test_df, reference_df=train_df)

In [8]:
# Number of rows in the normalised validation split.
normalised_valid_df.shape[0]

48

In [10]:
from torch.utils.data import Dataset, DataLoader
n_inputs = 24 # number of consecutive rows (months) in each input window
n_outputs = 3 # number of following rows (months) of total_rainfall to predict
hidden_size = 32 # hidden-state size passed to the recurrent model

class CustomDataset(Dataset):
    """Sliding-window dataset over the rows of a DataFrame.

    Each sample pairs ``n_input`` consecutive rows (all columns) with the
    ``target_column`` values of the ``n_output`` rows that immediately follow.
    Samples are stored as float32 NumPy arrays so that batches collated by a
    DataLoader match the default float32 dtype of ``torch.nn`` parameters.

    Args:
        df: DataFrame with numeric columns, including ``target_column``.
        n_input: Number of rows in each input window.
        n_output: Number of following rows whose target values form the output.
        target_column: Column the outputs are taken from.

    Raises:
        ValueError: If ``df`` has fewer than ``n_input + n_output`` rows.

    Note:
        Windows are built purely from row positions. If the rows of ``df`` are
        not contiguous in time, a window can straddle a gap.
    """

    def __init__(self, df, n_input, n_output, target_column="total_rainfall"):
        if len(df) < n_input + n_output:
            raise ValueError(f"DataFrame length ({len(df)}) is too short for the given n_input ({n_input}) and n_output ({n_output}).")

        self.df = df

        # Convert once up front instead of slicing the DataFrame per window.
        features = df.to_numpy(dtype="float32")
        targets = df[target_column].to_numpy(dtype="float32")

        # Window starting positions: the first output row of every sample.
        starts = range(n_input, len(df) - n_output + 1)
        self.inputs = [features[i - n_input:i] for i in starts]
        self.outputs = [targets[i:i + n_output] for i in starts]

    def __len__(self):
        """Return the number of (input window, output window) samples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the ``(inputs, outputs)`` pair at position ``index``."""
        return self.inputs[index], self.outputs[index]
    
# The training dataset is deliberately NOT bound to the name `train`: that name
# is used for the training function defined further down, and sharing it makes
# the notebook depend on cell execution order.
train_dataset = CustomDataset(normalised_train_df, n_inputs, n_outputs)
valid = CustomDataset(normalised_valid_df, n_inputs, n_outputs)
test = CustomDataset(normalised_test_df, n_inputs, n_outputs)

# Only the training batches are shuffled; validation/test keep their original
# order so evaluation is deterministic and predictions stay aligned with time.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_dataloader = DataLoader(valid, batch_size=32, shuffle=False)
test_dataloader = DataLoader(test, batch_size=32, shuffle=False)

In [11]:
def train(model, dataloader, num_epochs, learning_rate):
    """Fit ``model`` on ``dataloader`` using Adam and mean-squared-error loss.

    Args:
        model: Module called as ``model(inputs, hidden)`` that returns
            ``(outputs, hidden)`` and provides an ``initHidden()`` method.
        dataloader: Iterable yielding ``(inputs, targets)`` tensor batches.
        num_epochs: Number of passes over ``dataloader``.
        learning_rate: Learning rate handed to the Adam optimiser.

    Returns:
        The same ``model`` object, updated in place.
    """
    model.train()
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Batches collated from NumPy/pandas data arrive as float64, while module
    # parameters default to float32; mixing the two raises a dtype error, so
    # floating-point batches are cast to the parameters' dtype.
    param_dtype = next(model.parameters()).dtype

    for epoch in range(num_epochs):
        total_loss = 0.0
        n_batches = 0
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            if inputs.is_floating_point():
                inputs = inputs.to(param_dtype)
            if targets.is_floating_point():
                targets = targets.to(param_dtype)

            optimizer.zero_grad()
            outputs, _ = model(inputs, model.initHidden())
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            n_batches += 1

        # Report the mean batch loss on every 25th epoch (starting with the first).
        if (epoch%25 == 0):
            avg_loss = total_loss / max(n_batches, 1)  # guard against an empty loader
            print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

    return model

# RNN

In [12]:
# Number of stacked recurrent layers used when building the RNN model.
num_layers = 3

class RNN_model(nn.Module):
    """Stacked ``nn.RNN`` followed by a linear read-out of the last time step.

    Args:
        input_size: Number of features per time step (last input dimension).
        hidden_size: Size of the recurrent hidden state.
        output_size: Number of values predicted for each sequence.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(RNN_model, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def initHidden(self, batch_size=None):
        """Return the initial hidden state for a new sequence.

        With ``batch_size=None`` this returns ``None``, which makes ``nn.RNN``
        create a zero state matching whatever batch size it receives (so the
        smaller final batch of an epoch needs no special handling). With an
        explicit ``batch_size`` a zero tensor of shape
        ``(num_layers, batch_size, hidden_size)`` is returned on the same
        device/dtype as the model parameters.
        """
        if batch_size is None:
            return None
        return self.fc.weight.new_zeros(
            self.rnn.num_layers, batch_size, self.rnn.hidden_size
        )

    def forward(self, x, hidden=None):
        """Run the RNN over ``x`` and predict from the final time step.

        Args:
            x: Input of shape ``(batch, seq_len, input_size)``.
            hidden: Initial hidden state, or ``None`` for zeros.

        Returns:
            ``(out, hidden)`` where ``out`` has shape ``(batch, output_size)``
            and ``hidden`` is the final hidden state from ``nn.RNN``.
        """
        out, hidden = self.rnn(x, hidden)
        # Only the last time step feeds the read-out: one prediction vector per
        # sequence, matching targets of shape (batch, output_size).
        out = self.fc(out[..., -1, :])
        return out, hidden
    

# Each time step fed to the RNN is one row of the normalised frame, so the
# RNN's input_size is the number of columns per row, not the window length
# n_inputs (which is the sequence length).
n_features = normalised_train_df.shape[1]
rnn_model = RNN_model(n_features, hidden_size, n_outputs, num_layers)
rnn_model.to(device)

RNN_model(
  (rnn): RNN(24, 32, num_layers=3, batch_first=True)
  (fc): Linear(in_features=32, out_features=3, bias=True)
)

In [13]:
# Hyper-parameters for the RNN training run.
num_epochs, learning_rate = 1000, 0.001

# Train the model in place, then persist its weights to disk.
train(rnn_model, train_dataloader, num_epochs=num_epochs, learning_rate=learning_rate)
torch.save(rnn_model.state_dict(), "rnn_model.pt")

KeyboardInterrupt: 