In [37]:
import pandas as pd
import os

import numpy as np
import torch
from torch import nn
import math
from torch.nn import TransformerEncoder, TransformerEncoderLayer

In [38]:
# Data preprocessing

def process_file(path):
    """Load one whitespace-delimited data file and keep only its anomaly column.

    The file is expected to hold exactly ten header-less columns. They are
    named ``year``, ``month``, ``temp_anomaly_<loc>`` followed by seven
    error/diagnostic columns, and only the first three are returned.

    ``<loc>`` is built by concatenating the 4th and 5th dot-separated pieces
    of the *file name* (directory components are ignored, so a dot in a
    parent directory cannot shift the indices).

    Args:
        path: Path to the data file.

    Returns:
        A DataFrame with the columns ``year``, ``month`` and
        ``temp_anomaly_<loc>``.

    Raises:
        IndexError: If the file name has fewer than five dot-separated parts.
        ValueError: If the file does not contain exactly ten columns.
    """
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True and parses the file identically.
    df = pd.read_csv(path, sep=r'\s+', header=None)

    name_parts = os.path.basename(path).split('.')
    loc = name_parts[3] + name_parts[4]

    anomaly_col = f'temp_anomaly_{loc}'

    # Naming all ten columns keeps the original length check: pandas raises
    # if the file does not have exactly this many columns.
    df.columns = [
        'year', 'month', anomaly_col, f'total_error_var_{loc}', f'high_freq_error_var_{loc}',
        f'low_freq_error_var_{loc}', f'bias_error_var_{loc}', f'diag_var1_{loc}', f'diag_var2_{loc}', f'diag_var3_{loc}'
    ]

    # Only the date keys and the anomaly itself are used downstream.
    return df[['year', 'month', anomaly_col]]

data_path = "data/"

# Parse every .txt file in data_path and inner-join the results on the shared
# (year, month) keys, so all_data ends up with one anomaly column per file.
all_data = pd.DataFrame()

# os.listdir returns entries in arbitrary order; sorting makes the column
# order of all_data reproducible between runs and machines.
for file in sorted(os.listdir(data_path)):
    # Skip anything that is not a data file. The merge below must only run
    # for files that were actually parsed in this iteration; otherwise a
    # stray entry would re-merge the previous file's frame (or hit an
    # undefined loc_df if it happened to come first).
    if not file.endswith(".txt"):
        continue

    file_path = os.path.join(data_path, file)
    loc_df = process_file(file_path)

    # "No columns yet" identifies the first file. Testing the row count
    # instead would also trigger after an inner merge that produced zero
    # rows and silently discard everything merged so far.
    if len(all_data.columns) == 0:
        all_data = loc_df
    else:
        all_data = pd.merge(all_data, loc_df, on = ["year", "month"])

In [120]:
# Cut all_data into consecutive, non-overlapping 11-year windows. Each window
# supplies 10 years (120 monthly rows) of input and 1 year (12 rows) of target.
segments = []
start_year = all_data['year'].min()
end_year = all_data['year'].max()

months_per_segment = 11 * 12  # 120 input rows + 12 target rows

for year in range(start_year, end_year, 11):
    # Upper bound is exclusive and never runs past the last available year
    segment_end = min(year + 11, end_year + 1)
    segment_df = all_data[(all_data['year'] >= year) & (all_data['year'] < segment_end)]

    # Keep complete windows only. A shorter (trailing) window would yield a
    # truncated input and a short or empty target, so the flattened vectors
    # built below would not all have the same length.
    if len(segment_df) >= months_per_segment:
        segments.append(segment_df)

if not segments:
    raise ValueError("all_data does not contain a complete 11-year (132-month) segment")

# Every element of `segments` covers 11 years, i.e. 132 monthly rows
print("Length of each segment:", segments[0].shape)
print("# of segments:", len(segments))

input_data = []
target_data = []

# All columns except the date keys are model features
feature_columns = all_data.columns.difference(['year', 'month'])

for segment in segments:
    # First 10 years (rows 0-119) form the input
    # NOTE(review): positional slicing assumes rows are in chronological order - confirm
    input_years = segment.iloc[:120]
    input_data.append(input_years)

    # The 11th year (rows 120-131) is the prediction target
    target_year = segment.iloc[120:132]
    target_data.append(target_year)

# input_data and target_data hold one entry per segment, in the same order

print("\nLength of each input vector (120 months / 10 years):", len(input_data[0]))
print("Length of each target vector (12 months / 1 year):", len(target_data[0]))

print(f"\nUse {len(input_data[0])} months to predict {len(target_data[0])} months")

# Flatten each DataFrame's feature values into a 1D array and store in new lists
flattened_input_data = [df[feature_columns].values.flatten() for df in input_data]
flattened_target_data = [df[feature_columns].values.flatten() for df in target_data]

Length of each segment: (132, 17)
# of segments: 13

Length of each input vector (120 months / 10 years): 120
Length of each target vector (12 months / 1 year): 12

Use 120 months to predict 12 months


### Model Components

In [5]:
# Positional Encoding Block

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to a tensor, then apply dropout.

    A table ``pe`` of shape ``(max_len, 1, d_model)`` is precomputed and
    registered as a buffer. Even feature indices hold sines and odd feature
    indices hold cosines of ``position * div_term``.

    Args:
        d_model: Size of the last (feature) dimension. Both even and odd
            values are supported.
        dropout: Dropout probability applied after the addition.
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        # One column per frequency: shape (max_len, ceil(d_model / 2))
        angles = position * div_term

        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine slot than sine slot, so
        # the last frequency is dropped; for even d_model this is a no-op.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``.

        Dimension 0 of ``x`` indexes positions and the last dimension must be
        ``d_model``; ``x.size(0)`` must not exceed ``max_len``.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [7]:
# Transformer Model

class TransformerModel(nn.Module):
    """Embedding -> positional encoding -> Transformer encoder -> linear head.

    Args:
        ntoken: Number of rows in the embedding table and size of the output
            layer.
        d_model: Embedding / encoder feature size.
        nhead: Number of attention heads in each encoder layer.
        d_hid: Feed-forward width inside each encoder layer.
        nlayers: Number of stacked encoder layers.
        dropout: Dropout probability used by the positional encoding and the
            encoder layers.
    """

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int, nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        """Initialise embedding and output weights uniformly in [-0.1, 0.1]; zero the output bias."""
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: torch.Tensor, src_mask: torch.Tensor = None) -> torch.Tensor:
        """Run the model on a batch of index sequences.

        Args:
            src: Integer indices into the embedding table. Dimension 0 is
                treated as the sequence axis (its length sizes the default
                mask).
            src_mask: Optional attention mask passed to the encoder. When
                omitted, a square subsequent (causal) mask of size
                ``len(src)`` is generated.

        Returns:
            The encoder output projected by the linear head; the last
            dimension has size ``ntoken``.
        """
        # Scale embeddings by sqrt(d_model) before adding positional signals
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        if src_mask is None:
            # The generated mask is created on the default device; move it to
            # the input's device so the model also works off-CPU.
            src_mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(src.device)
        output = self.transformer_encoder(src, src_mask)
        output = self.linear(output)
        return output