In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


## Define parameters

In [2]:
# Step size passed to the Adam optimizer in NN.configure_optimizers.
learning_rate = 1e-3
# Feature sizes of the first and last Linear layer of the network.
input_size = 100
output_size = 100
# DataLoader settings shared by the train and test loaders.
batch_size = 100
# Upper bound on training epochs for the first Trainer (max_epochs).
num_epochs = 500
num_workers = 0
# Dataset roots, relative to the current working directory. Every
# subdirectory is read as one sample made of `polar.csv` (input) and
# `coords.csv` (target).
train_dir = os.path.join(os.getcwd(), 'data', 'train')
test_dir = os.path.join(os.getcwd(), 'data', 'test')
# Floating point type used for both the layer weights and the sample tensors.
dtype_to_use = torch.float32

## Define the neural network architecture
Layer options (More info at https://pytorch.org/docs/stable/nn.html):
+ Linear: fully connected layer
+ Conv1d/Conv2d: Convolutional layers
+ BatchNorm2d/LayerNorm/InstanceNorm2d: Normalization layers
+ Dropout: Dropout layer
+ MaxPool2d/AvgPool2d: Pooling layers

In [3]:
class NN(pl.LightningModule):
    """Fully connected regression network trained with an MSE loss.

    Eight ``nn.Linear`` layers (hidden width 100) with a ReLU after every
    layer except the last one. During the final training epoch, and during
    testing once training has finished, every (prediction, target) pair is
    kept so that the per-sample mean and maximum absolute error can be written
    to the experiment logger.

    The layers stay individual ``layer1`` ... ``layer8`` attributes so the
    parameter names remain compatible with checkpoints saved by this class.

    Args:
        learning_rate: Step size handed to the Adam optimizer.
        input_size: Number of features expected by the first layer.
        output_size: Number of features produced by the last layer.
    """

    def __init__(self, learning_rate, input_size, output_size):
        super().__init__()
        self.learning_rate = learning_rate
        self.layer1 = nn.Linear(input_size, 100, dtype=dtype_to_use)
        self.layer2 = nn.Linear(100, 100, dtype=dtype_to_use)
        self.layer3 = nn.Linear(100, 100, dtype=dtype_to_use)
        self.layer4 = nn.Linear(100, 100, dtype=dtype_to_use)
        self.layer5 = nn.Linear(100, 100, dtype=dtype_to_use)
        self.layer6 = nn.Linear(100, 100, dtype=dtype_to_use)
        self.layer7 = nn.Linear(100, 100, dtype=dtype_to_use)
        self.layer8 = nn.Linear(100, output_size, dtype=dtype_to_use)

        self.relu = nn.ReLU()
        self.loss_fn = nn.MSELoss()

        # Per-sample (output, target) pairs collected for distance logging.
        self.training_step_outputs = []
        self.testing_step_outputs = []

    def forward(self, x):
        """Run ``x`` through all layers; no activation after the last one."""
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.relu(self.layer3(x))
        x = self.relu(self.layer4(x))
        x = self.relu(self.layer5(x))
        x = self.relu(self.layer6(x))
        x = self.relu(self.layer7(x))
        x = self.layer8(x)
        return x

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

    def _log_distances(self, stage, output_target_pairs):
        """Write per-sample avg/max distances under ``<stage>/...`` tags.

        The sample's position in ``output_target_pairs`` is used as the
        logging step. Does nothing when the module has no logger attached.
        """
        if self.logger is None:
            return
        avg_distances, max_distances = self.calculate_distances(output_target_pairs)
        for i, (avg_distance, max_distance) in enumerate(zip(avg_distances, max_distances)):
            self.logger.experiment.add_scalar(f"{stage}/avg_distance", avg_distance, i)
            self.logger.experiment.add_scalar(f"{stage}/max_distance", max_distance, i)

    def on_train_epoch_end(self):
        """Log the per-sample distances gathered during the last epoch."""
        if self.current_epoch == self.trainer.max_epochs - 1:
            self._log_distances("train", self.training_step_outputs)
            # The stored tensors are no longer needed once they are logged.
            self.training_step_outputs = []

    def on_test_epoch_end(self):
        """Log the per-sample distances gathered by ``test_step``."""
        # Same `>=` gate as test_step: once fitting has finished the epoch
        # counter may already be past `max_epochs - 1`.
        if self.current_epoch >= self.trainer.max_epochs - 1:
            self._log_distances("test", self.testing_step_outputs)
        # Start from an empty list if the test loop is run again.
        self.testing_step_outputs = []

    def calculate_distances(self, output_target_pairs):
        """Return two lists (avg, max) with one entry per (output, target) pair."""
        avg_distances, max_distances = [], []
        for output, target in output_target_pairs:
            avg_distance, max_distance = self.calculate_distance(output, target)
            avg_distances.append(avg_distance)
            max_distances.append(max_distance)

        return avg_distances, max_distances

    def calculate_distance(self, output, target):
        """Return (mean, max) of ``|output - target|`` as Python floats."""
        abs_diff = torch.abs(output - target)
        avg_distance = torch.mean(abs_diff)
        max_distance = torch.max(abs_diff)
        return (avg_distance.item(), max_distance.item())

    def training_step(self, batch, batch_idx):
        """Compute and log the MSE loss for one training batch.

        Requests an early stop when the loss is NaN. In the last epoch the
        individual (output, target) pairs of the batch are stored for
        ``on_train_epoch_end``; in every other epoch the store is reset.
        """
        inputs, targets = batch
        outputs = self(inputs)
        loss = self.loss_fn(outputs, targets)
        if torch.isnan(loss):
            self.trainer.should_stop = True
        self.log("train_loss", loss)
        if self.current_epoch == self.trainer.max_epochs - 1:
            # Detach so the stored outputs do not keep the autograd graph alive.
            for output, target in zip(outputs.detach(), targets):
                self.training_step_outputs.append((output, target))
        else:
            self.training_step_outputs = []
        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the MSE loss for one test batch.

        Once training has reached its last epoch, the individual
        (output, target) pairs are stored for ``on_test_epoch_end``.
        """
        inputs, targets = batch
        outputs = self(inputs)
        loss = self.loss_fn(outputs, targets)
        self.log("test_loss", loss)
        if self.current_epoch >= self.trainer.max_epochs - 1:
            for output, target in zip(outputs.detach(), targets):
                # Test pairs go to the *testing* list; they were previously
                # appended to the training list, so nothing was ever logged.
                self.testing_step_outputs.append((output, target))
        else:
            self.testing_step_outputs = []
        return loss


## Custom dataset class

In [4]:
class CustomDataset(Dataset):
    """Map-style dataset over a sequence of (input, output) array pairs.

    Each item is converted to a pair of tensors of dtype ``dtype_to_use``
    created directly on ``device``.
    """

    def __init__(self, data, device):
        self.device = device
        self.data = data

    def __len__(self):
        """Number of (input, output) pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``index``-th pair as ``(input_tensor, output_tensor)``."""

        def to_tensor(array):
            return torch.tensor(array, dtype=dtype_to_use, device=self.device)

        # Every entry is expected to be indexable as (input, output).
        pair = self.data[index]
        return to_tensor(pair[0]), to_tensor(pair[1])
    
class CustomDataModule(pl.LightningDataModule):
    """Data module that builds the train and test datasets from CSV folders.

    Every subdirectory of ``train_dir`` / ``test_dir`` is one sample and must
    contain ``polar.csv`` (network input) and ``coords.csv`` (target).

    Args:
        batch_size: Batch size used by both dataloaders.
        num_workers: Worker process count used by both dataloaders.
        device: Device on which ``CustomDataset`` creates the sample tensors.
    """

    def __init__(self, batch_size, num_workers, device):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.device = device
        self.train_ds = None
        self.test_ds = None

    def prepare_data(self):
        """Nothing to download or pre-process."""
        pass

    @staticmethod
    def _load_split(split_dir):
        """Read every sample folder under ``split_dir``.

        Returns a list of ``(polar, coords)`` array pairs. Folders are visited
        in sorted order so that sample indices are reproducible between runs,
        and entries that are not directories (stray files) are skipped.
        """
        samples = []
        for subdir in sorted(os.listdir(split_dir)):
            sample_dir = os.path.join(split_dir, subdir)
            if not os.path.isdir(sample_dir):
                continue
            # Arrays are transposed to make input size 100 instead of 2
            polar = pd.read_csv(os.path.join(sample_dir, 'polar.csv')).values.transpose()
            coords = pd.read_csv(os.path.join(sample_dir, 'coords.csv')).values.transpose()
            samples.append((polar, coords))
        return samples

    def setup(self, stage=None):
        """Load both splits from disk and wrap them in ``CustomDataset``.

        ``stage`` is accepted for compatibility with the Lightning hook but is
        not used: both datasets are (re)built on every call.
        """
        self.train_ds = CustomDataset(self._load_split(train_dir), self.device)
        self.test_ds = CustomDataset(self._load_split(test_dir), self.device)

    def train_dataloader(self):
        """DataLoader over the training dataset (no shuffling)."""
        return DataLoader(
            self.train_ds,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

    def test_dataloader(self):
        """DataLoader over the test dataset."""
        return DataLoader(
            self.test_ds,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
        )

In [5]:
# Expose only GPU 0 to this process, then use CUDA when it is available and
# fall back to the CPU otherwise.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# To force CPU execution, replace the line above with: device = torch.device("cpu")
if device.type != "cuda":
    print("Using: CPU")
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
else:
    print(f"Using: {torch.cuda.get_device_name(device)}")

# Model on the selected device; the data module creates its tensors there too.
lightning_model = NN(
    learning_rate=learning_rate,
    input_size=input_size,
    output_size=output_size,
)
lightning_model = lightning_model.to(device)

dm = CustomDataModule(batch_size, num_workers, device)

Using: Quadro P600


## Train the model and plot the loss over epochs, plus the average and maximum distance for every sample in the last epoch

In [6]:
#! Change this to your name so that each one has their results in a separate folder
tb_logger = TensorBoardLogger("logs", name="results_Marco")
trainer = pl.Trainer(max_epochs=num_epochs, logger=tb_logger)  # Adjust parameters as needed

# Train the model
trainer.fit(lightning_model, dm)
%tensorboard --logdir logs/results_Marco

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type    | Params
------------------------------------
0 | layer1  | Linear  | 10.1 K
1 | layer2  | Linear  | 10.1 K
2 | layer3  | Linear  | 10.1 K
3 | layer4  | Linear  | 10.1 K
4 | layer5  | Linear  | 10.1 K
5 | layer6  | Linear  | 10.1 K
6 | layer7  | Linear  | 10.1 K
7 | layer8  | Linear  | 10.1 K
8 | relu    | ReLU    | 0     
9 | loss_fn | MSELoss | 0     
------------------------------------
80.8 K    Trainable params
0         Non-trainable params
80.8 K    Total params
0.323     Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Epoch 499: 100%|██████████| 13/13 [00:00<00:00, 49.00it/s, v_num=0]

`Trainer.fit` stopped: `max_epochs=500` reached.


Epoch 499: 100%|██████████| 13/13 [00:02<00:00,  6.50it/s, v_num=0]


UsageError: Line magic function `%tensorboard` not found.


In [8]:
# Checkpoint written at the end of the 500-epoch run above. Built with
# os.path.join so the path resolves on any OS, not only with Windows
# backslash separators.
resume_ckpt = os.path.join(
    "logs", "results_Marco", "version_0", "checkpoints", "epoch=499-step=6500.ckpt"
)

trainer = pl.Trainer(max_epochs=2000, logger=tb_logger)  # Adjust parameters as needed

# Continue training the model from the checkpoint until max_epochs is reached
trainer.fit(lightning_model, dm, ckpt_path=resume_ckpt)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
Restoring states from the checkpoint path at .\logs\results_Marco\version_0\checkpoints\epoch=499-step=6500.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type    | Params
------------------------------------
0 | layer1  | Linear  | 10.1 K
1 | layer2  | Linear  | 10.1 K
2 | layer3  | Linear  | 10.1 K
3 | layer4  | Linear  | 10.1 K
4 | layer5  | Linear  | 10.1 K
5 | layer6  | Linear  | 10.1 K
6 | layer7  | Linear  | 10.1 K
7 | layer8  | Linear  | 10.1 K
8 | relu    | ReLU    | 0     
9 | loss_fn | MSELoss | 0     
------------------------------------
80.8 K    Trainable params
0         Non-trainable params
80.8 K    Total params
0.323     Total estimated model params size (MB)
Restored all states from the checkpoint at .\logs\results_Marco\versio

Epoch 1999: 100%|██████████| 13/13 [00:00<00:00, 41.87it/s, v_num=0]

`Trainer.fit` stopped: `max_epochs=2000` reached.


Epoch 1999: 100%|██████████| 13/13 [00:02<00:00,  5.29it/s, v_num=0]


## Evaluate the trained model on the test set


In [9]:
# Run the test loop over the data module's test dataloader; the returned list
# holds the metrics logged in `test_step` (here `test_loss`).
trainer.test(lightning_model, dm)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing DataLoader 0: 100%|██████████| 4/4 [00:00<00:00, 61.70it/s] 


[{'test_loss': 0.00024028671032283455}]