In [16]:
import pandas as pd
import os

from model import Transformer # this is the transformer.py file
import torch
from torch import nn
import numpy as np

In [17]:
# Data preprocessing

def process_file(path):
    """Load one whitespace-delimited station file and keep only its temperature anomaly.

    The location tag is built from the 4th and 5th dot-separated parts of the
    *file name* (not the full path), so dots in directory names such as
    ``./data`` cannot shift the indices.

    Args:
        path: Path to a headerless, whitespace-delimited text file with exactly
            ten columns; the first three are year, month and temperature anomaly.

    Returns:
        DataFrame with columns ``year``, ``month`` and ``temp_anomaly_<loc>``.

    Raises:
        ValueError: If the file name has fewer than five dot-separated parts or
            the file does not contain exactly ten columns.
    """
    name_parts = os.path.basename(path).split('.')
    if len(name_parts) < 5:
        raise ValueError(f"Cannot derive a location from file name: {path!r}")
    loc = name_parts[3] + name_parts[4]

    # sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
    df = pd.read_csv(path, sep=r'\s+', header=None)

    # The original column assignment required exactly ten columns; keep that check explicit.
    if df.shape[1] != 10:
        raise ValueError(f"Expected 10 columns in {path!r}, found {df.shape[1]}")

    # Only year, month and the anomaly are used downstream; the error-variance and
    # diagnostic columns (positions 3-9) are discarded.
    df = df.iloc[:, :3].copy()
    df.columns = ['year', 'month', f'temp_anomaly_{loc}']

    return df

data_path = "data/"

# Load every station file. Sorting makes the merge (and column) order deterministic,
# since os.listdir returns entries in arbitrary order.
location_frames = []
for file in sorted(os.listdir(data_path)):
    # Skip anything that is not a raw station file -- in particular the
    # processed_data.csv written at the end of this cell, which lives in the same folder.
    if not file.endswith(".txt"):
        continue
    file_path = os.path.join(data_path, file)
    location_frames.append(process_file(file_path))

if not location_frames:
    raise FileNotFoundError(f"No .txt data files found in {data_path!r}")

# Inner-join all locations on (year, month): only months present in every file survive.
all_data = location_frames[0]
for loc_df in location_frames[1:]:
    all_data = pd.merge(all_data, loc_df, on = ["year", "month"])

all_data.to_csv("data/processed_data.csv")

In [18]:
# Segmentation

SEGMENT_YEARS = 11                             # 10 input years followed by 1 target year
INPUT_MONTHS = 120                             # first 10 years of a segment
TARGET_MONTHS = 12                             # final year of a segment
SEGMENT_MONTHS = INPUT_MONTHS + TARGET_MONTHS  # 132 rows in a complete segment

# The positional slices below rely on chronological row order, so order a copy
# explicitly (all_data itself is left untouched).
ordered_data = all_data.sort_values(['year', 'month'])

segments = []
start_year = int(all_data['year'].min())
end_year = int(all_data['year'].max())

# end_year + 1 so that a window starting exactly on the last year is still visited.
for year in range(start_year, end_year + 1, SEGMENT_YEARS):
    segment_end = year + SEGMENT_YEARS
    segment_df = ordered_data[(ordered_data['year'] >= year) & (ordered_data['year'] < segment_end)]

    # Keep complete windows only: a shorter trailing window would produce ragged
    # arrays that cannot be stacked into a single tensor.
    if len(segment_df) == SEGMENT_MONTHS:
        segments.append(segment_df)

if not segments:
    raise ValueError(f"No complete {SEGMENT_YEARS}-year segment found between {start_year} and {end_year}")

# Each element of `segments` is one 11-year window, i.e. 132 monthly rows.
print("Length of each segment:", segments[0].shape)
print("# of segments:", len(segments))

# Every column except the calendar keys is a model feature.
feature_columns = all_data.columns.difference(['year', 'month'])

# First 10 years of each segment are the model input, the 11th year is the target.
input_data = [segment[:INPUT_MONTHS] for segment in segments]
target_data = [segment[INPUT_MONTHS:SEGMENT_MONTHS] for segment in segments]

print("\nLength of each input vector (120 months / 10 years):", len(input_data[0]))
print("Length of each target vector (12 months / 1 year):", len(target_data[0]))

print(f"\nUse {len(input_data[0])} months to predict {len(target_data[0])} months")

# Grab only the feature values of each dataframe; each list element is one 2-D array (months x features).
input_data_value = [df[feature_columns].values for df in input_data]
target_data_value = [df[feature_columns].values for df in target_data]

# Stack into a single ndarray first: building a tensor directly from a list of
# ndarrays is slow and triggers a PyTorch UserWarning.
input_tensors = torch.tensor(np.stack(input_data_value), dtype=torch.float32)
target_tensors = torch.tensor(np.stack(target_data_value), dtype=torch.float32)

# Tensor lengths
print("\nTensor Lengths:")

print(f"There are {len(input_tensors)} input tensors and {len(target_tensors)} target tensors corresponding to each of the 13 intervals")
print(f"Input tensors legnth: {len(input_tensors[0])}")
print(f"Target tensors legnth: {len(target_tensors[0])}")


Length of each segment: (132, 17)
# of segments: 13

Length of each input vector (120 months / 10 years): 120
Length of each target vector (12 months / 1 year): 12

Use 120 months to predict 12 months

Tensor Lengths:
There are 13 input tensors and 13 target tensors corresponding to each of the 13 intervals
Input tensors legnth: 120
Target tensors legnth: 12


In [19]:
# Batch Size and Random Shuffling of the data

def get_random_batch(input_tensors, target_tensors, batch_size):
    """Draw one random batch of aligned input/target samples.

    Args:
        input_tensors: Indexable collection of inputs (first axis = sample).
        target_tensors: Targets aligned with ``input_tensors`` along the first axis.
        batch_size: Requested number of samples; capped at the dataset size.

    Returns:
        Tuple ``(input_batch, target_batch)`` selected with the same random,
        non-repeating sample indices.
    """
    n_samples = len(input_tensors)

    # A batch can never hold more samples than the dataset provides.
    n_draw = batch_size if batch_size < n_samples else n_samples

    # Distinct sample positions, drawn without replacement from NumPy's global RNG.
    picked = np.random.choice(n_samples, n_draw, replace=False)

    return input_tensors[picked], target_tensors[picked]

batch_size = 8  # number of segments drawn per batch
input_batch, target_batch = get_random_batch(
    input_tensors=input_tensors,
    target_tensors=target_tensors,
    batch_size=batch_size,
)

# Shapes are (samples in batch, months, feature columns):
# the input covers 120 months (10 years), the target 12 months (1 year).
print(input_batch.shape, target_batch.shape, sep="\n")


torch.Size([8, 120, 15])
torch.Size([8, 12, 15])


#### Model

In [20]:
# Hyperparameters passed to the Transformer below; change these to tune the model.

d_model = 15  # equals the last dimension of the sampled batches (15 feature columns) — presumably must match; verify against model.py
batch_size = 8  # re-declares the batch size used when sampling batches earlier
ffn_hidden = 2048  # presumably the feed-forward hidden width — confirm in model.py
num_heads = 5  # NOTE(review): attention head counts usually must divide d_model (15 / 5 = 3) — confirm in model.py
drop_prob = 0.1  # presumably the dropout probability — confirm in model.py
num_layers = 1  # presumably the number of stacked layers — confirm in model.py

# Arguments are passed positionally, so their order has to match Transformer's constructor in model.py.
transformer = Transformer(d_model, ffn_hidden, num_heads, drop_prob, num_layers)

In [21]:
# Smoke test: run one forward pass, calling the model with the sampled input batch and target batch.
test_output = transformer(input_batch, target_batch)
# Cell result: the output size together with the full output tensor.
test_output.size(), test_output

(torch.Size([8, 12, 15]),
 tensor([[[ 1.5888e-01, -5.1789e-01,  3.8739e-01,  ...,  1.0995e+00,
            1.7238e-01,  3.2223e-01],
          [ 5.8304e-01, -8.2627e-01, -1.3868e-02,  ...,  7.6242e-01,
            1.2629e+00,  7.7521e-01],
          [ 5.5063e-01, -5.6164e-01, -3.6300e-01,  ...,  4.3689e-03,
            9.9995e-01,  9.2244e-01],
          ...,
          [ 2.3619e-01, -5.8041e-01, -7.0785e-01,  ...,  6.4826e-02,
            2.6943e-01,  2.5088e-01],
          [ 1.6135e-01, -3.9725e-01, -5.7036e-01,  ..., -8.7249e-03,
           -1.1513e+00, -9.2528e-01],
          [-1.5883e-01, -7.9643e-01, -3.4784e-01,  ..., -2.4040e-01,
            1.1470e-01, -1.7130e-01]],
 
         [[-3.5559e-01, -4.0591e-01, -5.7647e-01,  ...,  2.4523e-01,
            6.5355e-02, -1.8971e-01],
          [-3.9606e-01,  3.8251e-01, -5.1888e-01,  ...,  6.5567e-02,
           -1.1031e+00, -9.7499e-01],
          [-1.8251e-01,  2.7164e-01, -4.2413e-01,  ...,  1.5890e-01,
           -1.1823e+00, -1.0910

In [22]:

# Mean-squared-error loss; the training loop applies it to the model output and the target batch.
criterion = nn.MSELoss()
# AdamW over all parameters of the transformer, with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(params=transformer.parameters(), lr = 1e-3)


In [None]:
epochs = 100

# torch.save does not create missing directories, so make sure the checkpoint folder exists.
checkpoint_dir = 'saved_models'
os.makedirs(checkpoint_dir, exist_ok=True)

# Run at least one batch per epoch: with fewer samples than batch_size the floor
# division is 0, which previously meant no training step and a division by zero
# when averaging. get_random_batch already caps the batch at the dataset size.
batches_per_epoch = max(1, len(input_tensors) // batch_size)

# NOTE(review): the model receives target_batch as its second argument and is also
# scored against the same target_batch. Confirm that model.py shifts/masks the
# decoder input; otherwise the model can learn to copy its input.
for epoch in range(epochs):

    transformer.train()
    epoch_loss = 0

    for _ in range(batches_per_epoch):
        # Each batch is an independent random draw (without replacement inside the batch).
        input_batch, target_batch = get_random_batch(input_tensors, target_tensors, batch_size)

        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()

        logits_batch = transformer(input_batch, target_batch)

        loss = criterion(logits_batch, target_batch)
        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()

    # Calculate average loss for the epoch
    epoch_loss /= batches_per_epoch

    # Print epoch stats
    print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}')

    # One checkpoint per epoch; the batch suffix is the number of batches run in that epoch.
    torch.save(transformer.state_dict(), f'{checkpoint_dir}/transformer_epoch_{epoch+1}_batch_{batches_per_epoch}.pt')

