In [5]:
%%capture
import sys

IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    print("Running in Colab!")
    from google.colab import drive

    drive.mount('/content/drive', force_remount=False)
    !pip install pytorch_lightning
    !pip install torchmetrics
else:
    print("Not running in Colab.")

CHECKPOINTS_FOLDER = "/checkpoints/tmp_r1"  #stn_2_r2"
import pytorch_lightning as pl
import torch, math, os
from torch.utils.data import Dataset, DataLoader
import numpy as np

seed = 42
pl.seed_everything(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.set_float32_matmul_precision("medium")  # to make lightning happy

def resolve_path_gdrive(relativePath):
    if os.path.exists('/content/drive'):
        return '/content/drive/MyDrive/work/gdrive-workspaces/git/nn_catalyst/' + relativePath
    else:
        from utils import get_project_root
        return get_project_root() + "/" + relativePath

print(f"Root project folder is at {resolve_path_gdrive('.')}")

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.callbacks.lr_monitor import LearningRateMonitor
from sklearn.metrics import r2_score
import random
import numpy as np

class MultiTargetDataset(Dataset):
    def __init__(self, X, y, seed=42):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.seed = seed
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        random.seed(self.seed + idx)
        np.random.seed(self.seed + idx)
        return self.X[idx], self.y[idx]

class SequentialRegressor(pl.LightningModule):
    def __init__(self, input_dim, learning_rate=0.001, seed=42):
        super().__init__()
        self.learning_rate = learning_rate
        self.seed = seed
        
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        
        self.network = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
        
    def forward(self, x):
        return self.network(x)
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.MSELoss()(y_hat, y)
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.MSELoss()(y_hat, y)
        self.log('val_loss', loss)
        return loss
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.MSELoss()(y_hat, y)
        self.log('test_loss', loss)
        return {'y_true': y.detach().cpu(), 'y_pred': y_hat.detach().cpu()}
    
    def test_epoch_end(self, outputs):
        y_true = torch.cat([out['y_true'] for out in outputs])
        y_pred = torch.cat([out['y_pred'] for out in outputs])
        r2 = r2_score(y_true.numpy(), y_pred.numpy())
        self.log('test_r2', r2)
    
    def configure_optimizers(self):
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)
        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler,
            'monitor': 'val_loss'
        }

def train_single_target(X_train, X_val, X_test, y_train, y_val, y_test, input_dim, seed=42):
    train_dataset = MultiTargetDataset(X_train, y_train, seed=seed)
    val_dataset = MultiTargetDataset(X_val, y_val, seed=seed)
    test_dataset = MultiTargetDataset(X_test, y_test, seed=seed)
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)
    test_loader = DataLoader(test_dataset, batch_size=32)
    
    model = SequentialRegressor(input_dim, seed=seed)
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')
    checkpoint_callback = ModelCheckpoint(monitor='val_loss', mode='min', save_top_k=1)
    lr_monitor = LearningRateMonitor(logging_interval='epoch')
    
    trainer = pl.Trainer(
        max_epochs=100,
        callbacks=[early_stopping, checkpoint_callback, lr_monitor],
        enable_progress_bar=True,
        enable_model_summary=True,
        accelerator='auto',
        devices=1,
        fast_dev_run=True  # New parameter
    )
    
    trainer.fit(model, train_loader, val_loader)
    trainer.test(model, test_loader)
    
    return model

def sequential_training(df, num_features=1479, num_targets=29, target_range=None, seed=42):
    random.seed(seed)
    np.random.seed(seed)
    
    X = df.iloc[:, :num_features].values
    y = df.iloc[:, num_features:num_features+num_targets].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed)
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)
    
    y_scaler = StandardScaler()
    y_train = y_scaler.fit_transform(y_train)
    y_val = y_scaler.transform(y_val)
    y_test = y_scaler.transform(y_test)
    
    final_predictions = pd.DataFrame()
    current_features = X_train.copy()
    
    r2_scores = []
    
    if target_range:
        start, end = target_range
    else:
        start, end = 0, num_targets
    
    for target_idx in range(start, end):
        print(f"\nTraining model for target {target_idx + 1}/{end - start}")
        
        current_target = y_train[:, target_idx].reshape(-1, 1)
        
        model = train_single_target(
            current_features, 
            X_val, 
            X_test,
            current_target, 
            y_val[:, target_idx].reshape(-1, 1), 
            y_test[:, target_idx].reshape(-1, 1),
            input_dim=current_features.shape[1],
            seed=seed
        )
        
        model.eval()
        with torch.no_grad():
            predictions = model(torch.tensor(current_features, dtype=torch.float32))
            predictions = y_scaler.inverse_transform(predictions.numpy())
        
        final_predictions[f'target_{target_idx+1}_pred'] = predictions.flatten()
        
        current_features = np.hstack([current_features, predictions])
        
        # Calculate R2 score for current target
        r2 = r2_score(y_test[:, target_idx], predictions.flatten())
        r2_scores.append(r2)
        
    print("\nR2 scores for all targets on test set:")
    for i, r2 in enumerate(r2_scores):
        print(f"Target {i+1}: {r2:.2f}")
    
    # Load best checkpoint and calculate R2 score on full dataset
    model = SequentialRegressor(X.shape[1], seed=seed)
    trainer = pl.Trainer(seed=seed)
    model = trainer.load_from_checkpoint(checkpoint_path=checkpoint_callback.best_model_path)
    
    model.eval()
    with torch.no_grad():
        full_predictions = model(torch.tensor(X, dtype=torch.float32))
        full_predictions = y_scaler.inverse_transform(full_predictions.numpy())
    
    full_r2 = r2_score(y, full_predictions)
    print(f"\nR2 score on full dataset: {full_r2:.2f}")
    
    return final_predictions


In [8]:
import numpy as np
datafile='src/pl/merged_data_last29_reordered_byR2.csv'
max_rows=200
df = pd.read_csv(resolve_path_gdrive(datafile), delimiter=',', skiprows=1, dtype=float, nrows=max_rows)

In [9]:
# Run sequential training
predictions = sequential_training(df)
# Save predictions
predictions.to_csv('predictions.csv', index=False)


Training model for target 1/29


TypeError: Trainer.__init__() got an unexpected keyword argument 'seed'