In [1]:
import pandas as pd
import os

import math
import numpy as np
import torch
from torch import nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer

In [2]:
# Data preprocessing

def process_file(path):
    df = pd.read_csv(path, delim_whitespace=True, header=None)
    loc = path.split('.')[3] + path.split('.')[4]

    df.columns = [
        'year', 'month', f'temp_anomaly_{loc}', f'total_error_var_{loc}', f'high_freq_error_var_{loc}',
        f'low_freq_error_var_{loc}', f'bias_error_var_{loc}', f'diag_var1_{loc}', f'diag_var2_{loc}', f'diag_var3_{loc}'
    ]

    df = df.drop(columns=[f'diag_var1_{loc}', f'diag_var2_{loc}', f'diag_var3_{loc}', f'total_error_var_{loc}', f'high_freq_error_var_{loc}',
        f'low_freq_error_var_{loc}', f'bias_error_var_{loc}'])

    return df

data_path = "data/"

all_data = pd.DataFrame()

for file in os.listdir(data_path):
    if file.endswith(".txt"):
        file_path = os.path.join(data_path, file)
        loc_df = process_file(file_path)


    # print(file, len(loc_df))
    if len(all_data) == 0:
        all_data = loc_df
    else:
        all_data = pd.merge(all_data, loc_df, on = ["year", "month"])

In [3]:
segments = []
start_year = all_data['year'].min()
end_year = all_data['year'].max()

for year in range(start_year, end_year, 11):
    # print("start year: ", year)
    # Get the subset of data for the current 11-year segment
    segment_end = min(year + 11, end_year + 1)  # Ensure we don't go beyond the last year
    # print("end_year: ", segment_end)
    segment_df = all_data[(all_data['year'] >= year) & (all_data['year'] < segment_end)]
    
    if not segment_df.empty:
        segments.append(segment_df)

# Each element (13 elements) in segment is a segment of 11 years or 132 months - which is the dimension of each segment
print("Length of each segment:", segments[0].shape)
print("# of segments:", len(segments))

input_data = []
target_data = []

feature_columns = all_data.columns.difference(['year', 'month'])

for segment in segments:
    # Extract 10 years of data as input
    input_years = segment[:120]
    input_data.append(input_years)

    # Extract the 11th year's data as target
    target_year = segment[120:132]
    target_data.append(target_year)
    
# In input_data, there are 13 elements, corresponding to the 13 segments of 10 years which is the input. 

print("\nLength of each input vector (120 months / 10 years):", len(input_data[0]))
print("Length of each target vector (12 months / 1 year):", len(target_data[0]))

print(f"\nUse {len(input_data[0])} months to predict {len(target_data[0])} months")

# Grab only the values of each dataframe and put them into a 2d array. Each element of the array represents one input vector.
input_data_value = [df[feature_columns].values for df in input_data]
target_data_value = [df[feature_columns].values for df in target_data]

# Make them Pytorch tensors
input_tensors = torch.tensor(input_data_value, dtype=torch.float32)
target_tensors = torch.tensor(target_data_value, dtype=torch.float32)

# Tensor lengths
print("\nTensor Lengths:")

print(f"There are {len(input_tensors)} input tensors and {len(target_tensors)} target tensors corresponding to each of the 13 intervals")
print(f"Input tensors legnth: {len(input_tensors[0])}")
print(f"Target tensors legnth: {len(target_tensors[0])}")

Length of each segment: (132, 17)
# of segments: 13

Length of each input vector (120 months / 10 years): 120
Length of each target vector (12 months / 1 year): 12

Use 120 months to predict 12 months

Tensor Lengths:
There are 13 input tensors and 13 target tensors corresponding to each of the 13 intervals
Input tensors legnth: 120
Target tensors legnth: 12


  input_tensors = torch.tensor(input_data_value, dtype=torch.float32)


In [19]:
# Batch Size and Random Shuffling of the data

def get_random_batch(input_tensors, target_tensors, batch_size):
    # Ensure batch size is not larger than the dataset
    batch_size = min(batch_size, len(input_tensors))

    # Randomly select indices for the batch
    indices = np.random.choice(len(input_tensors), batch_size, replace=False)

    # Extract batches using the selected indices
    input_batch = input_tensors[indices]
    target_batch = target_tensors[indices]

    return input_batch, target_batch

batch_size = 8  # Set your batch size
input_batch, target_batch = get_random_batch(input_tensors, target_tensors, batch_size)

print(input_batch.shape)


torch.Size([8, 120, 15])


### Model Components

In [48]:
# Positional Encoding Block

class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 1) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::1] = torch.sin(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [99]:
# Transformer Model

class TransformerModel(nn.Module):
    def __init__(self, input_dim: int, d_model: int, nhead: int, d_hid: int, nlayers: int, output_dim: int, dropout: float = 0.5):
        super().__init__()
        self.d_model = d_model
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        decoder_layers = nn.TransformerDecoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)

        self.memory = torch.rand(emsize, emsize)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layers, nlayers)
        self.linear1 = nn.Linear(input_dim, d_model)
        self.linear2 = nn.Linear(d_model, output_dim)
        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.linear1.bias.data.zero_()
        self.linear1.weight.data.uniform_(-initrange, initrange)
        self.linear2.bias.data.zero_()
        self.linear2.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: torch.Tensor, src_mask: torch.Tensor = None) -> torch.Tensor:
        src = self.linear1(src) * math.sqrt(self.d_model)
        if src_mask is None:
            src_mask = nn.Transformer.generate_square_subsequent_mask(len(src))
        output = self.transformer_encoder(src, src_mask)
        print("encoding output", output.shape)
        output = self.transformer_decoder(output, self.memory)
        print("decoding output", output.shape)
        output = self.linear2(output)
        return output

input_size = 15
output_size = 15
emsize = 120  # embedding dimension
d_hid = 300  # dimension of the feedforward network model in ``nn.TransformerEncoder``
nlayers = 2  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 12  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability
model = TransformerModel(input_dim = input_size, d_model = emsize, nhead = nhead, d_hid = d_hid, nlayers = nlayers, output_dim = output_size, dropout = dropout)


In [100]:
# Training loop

criterion = nn.MSELoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
batch_size = 8

def train(model: nn.Module, epoch: int):
    model.train()  # turn on train mode
    log_interval = 2

    num_batches = int(len(input_tensors) / batch_size)
    for batch, i in enumerate(range(num_batches)):
        loss = 0.
        inputs, targets = get_random_batch(input_tensors, target_tensors, batch_size)
        print(f"Data shape: {inputs.shape} | Target shape: {targets.shape}")
        for train_input, train_target in zip(inputs, targets):
            flattened_train_input = train_input
            output = model(flattened_train_input)
            print(f"Output shape: {output.shape} | Target shape: {train_target.shape}")
            loss += criterion(output, train_target)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        if batch % log_interval == 0 and batch > 0:
            cur_loss = loss / log_interval
            print(f'| Epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | 'f'Loss: {cur_loss:5.2f}')

def evaluate(model: nn.Module, eval_data):
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    with torch.no_grad():
        num_batches = len(input_tensors) / batch_size
        for i in range(num_batches):
            data, targets = get_random_batch(eval_data, i)
            seq_len = data.size(0)
            output = model(data)
            output_flat = output.view(-1)
            total_loss += seq_len * criterion(output_flat, targets).item()
    return total_loss / (len(eval_data) - 1)

In [101]:
epochs = 3

best_model_params_path = os.path.join("models/", "best_model_params.pt")

for epoch in range(1, epochs + 1):
    train(model, epoch)
    # val_loss = evaluate(model, val_data)
    print('-' * 89)
    print(f'| end of epoch {epoch:3d}')
    print('-' * 89)
    # if val_loss < best_val_loss:
    #     best_val_loss = val_loss
    #     torch.save(model.state_dict(), best_model_params_path)

model.load_state_dict(torch.load(best_model_params_path)) # load best model states



Data shape: torch.Size([8, 120, 15]) | Target shape: torch.Size([8, 12, 15])
encoding output torch.Size([120, 120])
decoding output torch.Size([120, 120])
Output shape: torch.Size([120, 15]) | Target shape: torch.Size([12, 15])


RuntimeError: The size of tensor a (120) must match the size of tensor b (12) at non-singleton dimension 0