In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder


ModuleNotFoundError: No module named 'transformers'

In [4]:
# Load data
data = pd.read_csv('data/filtered_data/P_Bray_mcdowell2023_predictors_he2022_global.csv')


In [6]:
# Drop unnecessary columns
data = data.drop(columns=['x','y','ID', 'EROIDX', 'TCEQ', 'CACO3'])


In [8]:
# One-hot encode categorical variables
categorical_cols = ['BEDROCK', 'SOIL.TYPE', 'DEPTH', 'NPP', 'BIOMES']
data = pd.get_dummies(data, columns=categorical_cols)

In [9]:
# Split data into predictors and target
X = data.drop(columns=['p_avg']).values
y = data['p_avg'].values

In [10]:
# Normalize continuous variables
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [11]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

In [20]:
# Define the transformer model
class TabularTransformer(nn.Module):
    def __init__(self, input_dim, output_dim, nhead, num_layers, dropout=0.5):
        super(TabularTransformer, self).__init__()
        
        encoder_layer = nn.TransformerEncoderLayer(d_model=input_dim, nhead=nhead, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        self.fc = nn.Linear(input_dim, output_dim)
        
    def forward(self, x):
        x = self.transformer_encoder(x)
        x = self.fc(x)
        return x.squeeze(-1)

In [21]:
# Parameters
input_dim = X_train.shape[1]
output_dim = 1
nhead = 2
num_layers = 2

In [22]:
# Models
model = TabularTransformer(input_dim, output_dim, nhead, num_layers)

In [23]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [24]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [1/10], Loss: 1103.0540
Epoch [2/10], Loss: 885.4127
Epoch [3/10], Loss: 828.2811
Epoch [4/10], Loss: 813.1437
Epoch [5/10], Loss: 805.4667
Epoch [6/10], Loss: 798.2857
Epoch [7/10], Loss: 790.2875
Epoch [8/10], Loss: 781.7697
Epoch [9/10], Loss: 773.4894
Epoch [10/10], Loss: 767.5089


In [25]:
# Validation
with torch.no_grad():
    val_outputs = model(X_val_tensor)
    val_loss = criterion(val_outputs, y_val_tensor)
print(f"Validation Loss: {val_loss.item():.4f}")

Validation Loss: 819.3730


In [26]:
# Compute additional evaluation metrics
mae = torch.mean(torch.abs(val_outputs - y_val_tensor))
rmse = torch.sqrt(val_loss)
r2_score = 1 - torch.sum((y_val_tensor - val_outputs) ** 2) / torch.sum((y_val_tensor - torch.mean(y_val_tensor)) ** 2)

print(f"Mean Absolute Error (MAE): {mae.item():.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse.item():.4f}")
print(f"R-squared Value: {r2_score.item():.4f}")

Mean Absolute Error (MAE): 17.3107
Root Mean Squared Error (RMSE): 28.6247
R-squared Value: -0.0905
