In [1]:
import pandas as pd
import os

from model import Transformer # this is the transformer.py file
import torch
from torch import nn
import numpy as np

In [22]:
# Data preprocessing

def process_file(path):
    df = pd.read_csv(path, delim_whitespace=True, header=None)
    loc = path.split('.')[3] + path.split('.')[4]

    df.columns = [
        'year', 'month', f'temp_anomaly_{loc}', f'total_error_var_{loc}', f'high_freq_error_var_{loc}',
        f'low_freq_error_var_{loc}', f'bias_error_var_{loc}', f'diag_var1_{loc}', f'diag_var2_{loc}', f'diag_var3_{loc}'
    ]

    df = df.drop(columns=[f'diag_var1_{loc}', f'diag_var2_{loc}', f'diag_var3_{loc}', f'total_error_var_{loc}', f'high_freq_error_var_{loc}',
        f'low_freq_error_var_{loc}', f'bias_error_var_{loc}'])

    return df

data_path = "data/"

all_data = pd.DataFrame()

for file in os.listdir(data_path):
    if file.endswith(".txt"):
        file_path = os.path.join(data_path, file)
        loc_df = process_file(file_path)


    # print(file, len(loc_df))
    if len(all_data) == 0:
        all_data = loc_df
    else:
        all_data = pd.merge(all_data, loc_df, on = ["year", "month"])

# all_data.insert(0, "time", [0] * len(all_data))
# all_data['time'] = all_data.apply(lambda x: f'{x["year"]}_{x["month"]}', axis=1)
# all_data = all_data.drop(columns=['year', 'month'])

all_data.to_csv("data/processed_data.csv")

In [39]:
def create_segments(data, input_years_length, target_years_length):
    input_data = []
    target_data = []
    
    input_months = input_years_length * 12
    target_months = target_years_length * 12
    total_months = input_months + target_months

    start_year = data['year'].min()
    end_year = data['year'].max()

    for year in range(start_year, end_year - input_years_length - target_years_length + 1):
        segment_end_year = year + input_years_length + target_years_length
        segment_df = data[(data['year'] >= year) & (data['year'] < segment_end_year)]

        if not segment_df.empty:
            input_segment = segment_df[:input_months]
            target_segment = segment_df[input_months:total_months]
            input_data.append(input_segment)
            target_data.append(target_segment)

    return input_data, target_data

# Usage example
input_years = 5  # Length of input period in years
target_years = 1   # Length of target period in years

input_segments, target_segments = create_segments(all_data, input_years, target_years)

print("# of segments:", len(input_segments))
print("Length of each input vector (months):", len(input_segments[0]))
print("Length of each target vector (months):", len(target_segments[0]))

# Convert to tensors
feature_columns = all_data.columns.difference(['year', 'month'])
input_tensors = torch.tensor([df[feature_columns].values for df in input_segments], dtype=torch.float32)
target_tensors = torch.tensor([df[feature_columns].values for df in target_segments], dtype=torch.float32)

# Print tensor lengths
print("\nTensor Lengths:")
print(f"There are {len(input_tensors)} input tensors and {len(target_tensors)} target tensors")
print(f"Input tensors length: {len(input_tensors[0])}")
print(f"Target tensors length: {len(target_tensors[0])}")

# of segments: 137
Length of each input vector (months): 60
Length of each target vector (months): 12

Tensor Lengths:
There are 137 input tensors and 137 target tensors
Input tensors length: 60
Target tensors length: 12


In [40]:
from dataset import TimeSeriesDataset
from torch.utils.data import DataLoader, random_split
batch_size = 8

ts_dataset = TimeSeriesDataset(input_tensors, target_tensors)
train_size = int(0.8 * len(ts_dataset))  # e.g., 80% of data for training
val_size = len(ts_dataset) - train_size  # remaining for validation

# Randomly split the dataset into training and validation datasets
train_dataset, val_dataset = random_split(ts_dataset, [train_size, val_size])

# Create DataLoaders for both training and validation sets
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)  # Usually, no need to shuffle the validation set

input_batch, target_batch = next(iter(train_loader))

# Print the shapes
print(f'Input batch shape: {input_batch.shape}')  # e.g., torch.Size([8, 120, 15])
print(f'Target batch shape: {target_batch.shape}')  # e.g., torch.Size([8, 12, 15])

Input batch shape: torch.Size([8, 60, 15])
Target batch shape: torch.Size([8, 12, 15])


#### Model

In [44]:
# Change these parameters

d_model = 15
ffn_hidden = 2048
num_heads = 5
drop_prob = 0.1
num_layers = 10

transformer = Transformer(d_model, ffn_hidden, num_heads, drop_prob, num_layers)

In [45]:
test_output = transformer(input_batch, target_batch)
test_output.size(), test_output

(torch.Size([8, 12, 15]),
 tensor([[[ 3.8440e-01,  9.6435e-01,  3.7524e-01,  ..., -1.3515e-01,
           -4.6454e-01, -4.0719e-01],
          [ 5.8493e-01,  9.3210e-01,  5.3819e-01,  ...,  3.6844e-01,
           -1.0467e+00, -1.6403e-01],
          [ 2.1902e-01,  4.9124e-01, -1.1541e-01,  ...,  7.5895e-01,
            1.4420e-01,  3.9245e-01],
          ...,
          [ 2.5817e-01,  7.4476e-01, -4.3541e-01,  ...,  7.5843e-01,
           -1.0037e+00, -1.7318e-01],
          [ 9.9909e-02,  5.9469e-01,  2.4657e-01,  ...,  1.8029e-01,
           -1.0473e-01, -2.0503e-01],
          [-1.6641e-01,  5.8088e-01,  8.5724e-02,  ..., -2.6673e-01,
            2.9546e-01, -8.3550e-01]],
 
         [[ 3.7830e-01,  5.0361e-01,  2.2965e-01,  ...,  3.7804e-01,
           -1.1652e+00,  2.2134e-01],
          [ 8.2682e-01,  8.1350e-02,  3.2621e-01,  ...,  5.6927e-01,
           -1.2298e+00,  1.0841e+00],
          [ 8.8740e-02,  1.4449e-01,  3.1841e-02,  ...,  1.3954e+00,
           -1.3678e+00,  4.0665

In [46]:

criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(params=transformer.parameters(), lr = 1e-3)


In [29]:
epochs = 100

for epoch in range(epochs):
    
    transformer.train()
    train_loss = 0

    for input_batch, target_batch in train_loader:
        logits_batch = transformer(input_batch, target_batch)

        loss = criterion(logits_batch, target_batch)
        loss.backward()
        train_loss += loss.item()

        optimizer.zero_grad()
        optimizer.step()
        
    # Calculate average loss for the epoch
    train_loss /= (len(input_tensors) // batch_size)

    # transformer.eval()
    # with torch.inference_mode():



    
    # Print epoch stats
    print(f'Epoch {epoch+1}/{epochs}, Loss: {train_loss:.4f}')

    torch.save(transformer.state_dict(), f'saved_models/transformer_epoch_{epoch+1}_batch_{batch+1}.pt')



Epoch 1/100, Loss: 0.3230
Epoch 2/100, Loss: 0.2784
Epoch 3/100, Loss: 0.2487


KeyboardInterrupt: 