In [1]:
import pandas as pd
import os
import pickle

from model import Transformer # this is the transformer.py file
import torch
from torch import nn
import numpy as np

In [2]:
from data_processing import create_segments

# Greenhouse-gas table: read it and lower-case the 'Year' header so both
# sources expose the same 'year' column name.
all_ghg_data = pd.read_csv("data/total-ghg-emissions.csv").rename(columns={'Year': 'year'})

# Temperature table: keep every row except those belonging to the latest year
# found in the file (presumably an incomplete year -- confirm).
all_temp_data = pd.read_csv("data/processed_data.csv")
all_temp_data = all_temp_data.loc[lambda frame: frame['year'] != frame['year'].max()]

# Collapse to one row per year by averaging every remaining column, then
# discard 'Unnamed: 0' (looks like a leftover CSV index -- confirm) and
# 'month', whose yearly mean carries no information.
annual_avg_temp = (
    all_temp_data
    .groupby('year')
    .mean()
    .reset_index()
    .drop(columns=['Unnamed: 0', 'month'])
)


def pad_features(df, max_features):
    """Pad ``df`` in place with zero-filled columns up to ``max_features`` columns.

    New columns are named ``pad_feature_<k>``. Names that already exist in
    ``df`` are skipped, so padding a frame that was padded before really adds
    new columns instead of overwriting the existing pad columns (which would
    leave the frame narrower than requested).

    Args:
        df: DataFrame to widen. It is modified in place.
        max_features: Desired total number of columns.

    Returns:
        The same ``df`` object. It is returned unchanged when it already has
        ``max_features`` columns or more.
    """
    remaining = max_features - df.shape[1]
    suffix = 0
    while remaining > 0:
        column_name = f'pad_feature_{suffix}'
        suffix += 1
        if column_name in df.columns:
            # Assigning to an existing name would not increase the width.
            continue
        df[column_name] = 0
        remaining -= 1
    return df

# Drop 'Entity' and 'Code' *before* measuring widths: the padding target must
# be based on the columns that are actually kept, otherwise the two dropped
# columns inflate max_features whenever the GHG frame is the wider one.
all_ghg_data = all_ghg_data.drop(columns=['Entity', 'Code'])
max_features = max(all_ghg_data.shape[1], annual_avg_temp.shape[1])

# Pad both frames to the common width so they can later be stacked into one
# tensor; the call is a no-op for a frame that is already max_features wide.
all_ghg_data = pad_features(all_ghg_data, max_features)
annual_avg_temp = pad_features(annual_avg_temp, max_features)

print("Padded Greenhouse Gas Data: ", all_ghg_data.shape)
print("Average Annual Temperature Data: ", annual_avg_temp.shape)

Padded Greenhouse Gas Data:  (142, 14)
Average Annual Temperature Data:  (142, 14)


In [17]:
# Windowing configuration shared by both data sources.
input_years = 10    # rows per input segment (see the (10, 14) shapes printed below)
target_years = 1    # rows per target segment (see the (1, 14) shapes printed below)
overlapping = True  # forwarded unchanged to create_segments

# Aliases for the two per-year frames that get segmented.
ghg_data_padded = all_ghg_data
temp_data_padded = annual_avg_temp

# Split each frame into parallel lists of input and target segments.
ghg_input_segments, ghg_target_segments = create_segments(
    ghg_data_padded, input_years, target_years, overlapping
)
temp_input_segments, temp_target_segments = create_segments(
    temp_data_padded, input_years, target_years, overlapping
)

# Sanity check: report the shape of the first segment of every list.
print("GHG Input Segment Shape:", ghg_input_segments[0].shape)
print("Temperature Input Segment Shape:", temp_input_segments[0].shape)
print("GHG Target Segment Shape:", ghg_target_segments[0].shape)
print("Temperature Target Segment Shape:", temp_target_segments[0].shape)

def _stack_segment_pairs(temp_segments, ghg_segments):
    """Turn paired temperature/GHG segments into a list of stacked tensors.

    For every (temperature, GHG) pair the 'year' column is dropped, both
    frames are converted to float32 tensors and stacked along a new leading
    axis, with temperature at index 0 and GHG at index 1.

    Args:
        temp_segments: Iterable of temperature DataFrames with a 'year' column.
        ghg_segments: Iterable of GHG DataFrames with a 'year' column, paired
            positionally with ``temp_segments``.

    Returns:
        A list with one tensor per pair.
    """
    # NOTE(review): pairs are matched purely by position; nothing here checks
    # that both segments cover the same years -- confirm upstream alignment.
    stacked = []
    for temp_segment, ghg_segment in zip(temp_segments, ghg_segments):
        temp_tensor = torch.tensor(temp_segment.drop(columns=['year']).values, dtype=torch.float32)
        ghg_tensor = torch.tensor(ghg_segment.drop(columns=['year']).values, dtype=torch.float32)
        stacked.append(torch.stack((temp_tensor, ghg_tensor), dim=0))
    return stacked


# Inputs and targets go through the same helper instead of two copy-pasted
# loops; this also keeps the per-segment temporaries out of the notebook
# namespace.
input_segments_tensors = _stack_segment_pairs(temp_input_segments, ghg_input_segments)
target_segments_tensors = _stack_segment_pairs(temp_target_segments, ghg_target_segments)

# Convert lists of combined segments into tensors (new leading axis = segment index)
input_tensors = torch.stack(input_segments_tensors, dim=0)
target_tensors = torch.stack(target_segments_tensors, dim=0)

print("\nCombined Input Tensor Shape:", input_tensors.shape)
print("Combined Target Tensor Shape:", target_tensors.shape)

GHG Input Segment Shape: (10, 14)
Temperature Input Segment Shape: (10, 14)
GHG Target Segment Shape: (1, 14)
Temperature Target Segment Shape: (1, 14)

Combined Input Tensor Shape: torch.Size([132, 2, 10, 13])
Combined Target Tensor Shape: torch.Size([132, 2, 1, 13])


In [18]:
from dataset import TimeSeriesDataset
from torch.utils.data import DataLoader, random_split
batch_size = 1

ts_dataset = TimeSeriesDataset(input_tensors, target_tensors)
train_size = int(0.6 * len(ts_dataset))  # 60% of data for training
val_size = int(0.2 * len(ts_dataset))  # 20% of data for validation
test_size = len(ts_dataset) - train_size - val_size  # Remaining for testing

# Randomly split the dataset into training, validation and test subsets.
# A seeded generator makes the split identical on every run; without it each
# kernel restart yields different subsets and results cannot be compared.
# NOTE(review): the segments were built with overlapping=True, so a random
# split may place windows that share years in different subsets (possible
# leakage) -- consider a chronological split instead.
split_generator = torch.Generator().manual_seed(0)
train_dataset, val_dataset, test_dataset = random_split(
    ts_dataset, [train_size, val_size, test_size], generator=split_generator
)

# Create DataLoaders for the three subsets. Only the training loader is
# shuffled; evaluation order does not matter, so validation and test stay
# in a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Train Loader Length: {len(train_loader)}")
print(f"Validation Loader Length: {len(val_loader)}")
print(f"Test Loader Length: {len(test_loader)}")

# Pull one batch to check the tensor layout coming out of the loader.
input_batch, target_batch = next(iter(train_loader))

# Print the shapes
print(f'Input batch shape: {input_batch.shape}')  # e.g., torch.Size([1, 2, 10, 13])
print(f'Target batch shape: {target_batch.shape}')  # e.g., torch.Size([1, 2, 1, 13])

Train Loader Length: 79
Validation Loader Length: 26
Test Loader Length: 27
Input batch shape: torch.Size([1, 2, 10, 13])
Target batch shape: torch.Size([1, 2, 1, 13])


In [4]:
# Synthetic stand-in data for smoke-testing the model.
#
# Every raw array is (1, N, 15) with N = 120 for inputs and N = 12 for
# targets; the two sources are stacked on a new axis 2, giving
# (1, N, 2, 15).
# NOTE(review): the real tensors built from the CSV data are laid out as
# (segments, 2, rows, features), i.e. the source axis sits at a different
# position than here -- confirm which layout the model expects.

# Fix the global NumPy seed so the draws below are reproducible. The four
# draws must stay in this order to reproduce the same values.
np.random.seed(0)
temp_input = np.random.standard_normal((1, 120, 15))
precip_input = np.random.standard_normal((1, 120, 15))
combined_input = np.stack((temp_input, precip_input), axis=2)

# Targets use a shorter second axis (12 instead of 120).
temp_target = np.random.standard_normal((1, 12, 15))
precip_target = np.random.standard_normal((1, 12, 15))
combined_target = np.stack((temp_target, precip_target), axis=2)

# float64 NumPy arrays -> float32 torch tensors.
combined_input_tensor = torch.from_numpy(combined_input).float()
combined_target_tensor = torch.from_numpy(combined_target).float()

combined_input_tensor.shape, combined_target_tensor.shape

(torch.Size([1, 120, 2, 15]), torch.Size([1, 12, 2, 15]))

In [5]:
# Model hyperparameters, grouped by role.
# NOTE(review): the feature sizes are hard-coded to 15, which matches the
# synthetic tensors; the tensors built from the CSV data have 13 features --
# confirm before feeding real batches.
d_temp, d_ghg = 15, 15
d_enc, d_dec, d_data = 15, 30, 30
ffn_hidden, num_heads = 2048, 5
drop_prob, num_layers = 0.1, 1

# Arguments are passed positionally in the order used by the project's Transformer.
transformer = Transformer(
    d_enc, d_dec, d_data,
    d_temp, d_ghg,
    ffn_hidden, num_heads, drop_prob, num_layers,
)

# Single forward pass on the synthetic input/target pair as a shape check.
output = transformer(combined_input_tensor, combined_target_tensor)
print(output.shape)

torch.Size([1, 12, 30])
