In [1]:
import os
import re

import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.profilers import PyTorchProfiler

from training_scripts.MNIST.model import feedforward
from training_scripts.MNIST.dataset import MnistDataModule
import training_scripts.MNIST.default_config as config

# Global PyTorch numeric settings for the whole notebook: "medium" float32
# matmul precision and float32 as the default dtype for new tensors.
torch.set_float32_matmul_precision("medium")
torch.set_default_dtype(torch.float32)

# Report the two quantities separately: os.cpu_count() is the number of logical
# CPUs on the machine (may be None if undetermined), whereas
# torch.get_num_threads() is the number of threads PyTorch uses to parallelise
# CPU operations. The original cell printed the latter under the CPU label.
print("Number of CPUs available: ", os.cpu_count())
print("Number of PyTorch intra-op threads: ", torch.get_num_threads())

Number of CPUs available:  8


In [2]:
# Additional cell configuration options, handed to the model constructor
# through its `kwargs_dict` argument.
kwargs_dict = dict(Wr_identity=False)

In [3]:
# Per-run overrides applied on top of training_scripts.MNIST.default_config.
# To switch the accelerator to CPU for pixel-by-pixel MNIST, uncomment:
# config.ACCELERATOR = "cpu"
# config.DEVICES = 1
config.HIDDEN_SIZES = [120, 60]
config.LEARNING_RATE = 1e-3

In [4]:
# Where TensorBoard logs (and, under them, checkpoints) are written.
folder_name = config.FOLDER_NAME

# Name of this run's sub-folder inside `folder_name`.
# Other run names used with this notebook: "MNIST", "MNIST_MLP_80".
model_name = "MNIST_multilayer_layer_120_60"
logger = TensorBoardLogger(save_dir=folder_name, name=model_name)

# Set to True to resume training from a saved checkpoint instead of
# starting from scratch (consumed by the training cell).
start_from_checkpoint = False
# profiler = PyTorchProfiler(
#     on_trace_ready=torch.profiler.tensorboard_trace_handler(f"{folder_name}/profiler0"),
#     schedule=torch.profiler.schedule(skip_first=10, wait=1, warmup=1, active=20),
# )
# Data module for MNIST; every setting comes from the config module.
datamodule_settings = dict(
    data_dir=config.DATA_DIR,
    batch_size=config.BATCH_SIZE,
    num_workers=config.NUM_WORKERS,
    permuted=config.PERMUTED,
)
dm = MnistDataModule(**datamodule_settings)

In [5]:
# Collect the model hyperparameters in one place, then build the network.
# Everything except `kwargs_dict` is read from the config module.
model_hparams = dict(
    input_size=config.INPUT_SIZE,
    ff_input_size=config.FF_INPUT_SIZE,
    hidden_sizes=config.HIDDEN_SIZES,
    learning_rate=config.LEARNING_RATE,
    scheduler_change_step=config.SCHEDULER_CHANGE_STEP,
    scheduler_gamma=config.SCHEDULER_GAMMA,
    num_classes=config.NUM_CLASSES,
    kwargs_dict=kwargs_dict,
)
model = feedforward(**model_hparams)
# Callbacks: record the learning rate once per epoch, and write a checkpoint
# every epoch (save_top_k=-1, i.e. no limit on how many are kept).
lr_monitor = LearningRateMonitor(logging_interval="epoch")
checkpoint_every_epoch = ModelCheckpoint(save_top_k=-1, every_n_epochs=1)

# Hardware and precision settings come from the config module; the epoch
# budget (1 to 200) is fixed here and profiling is disabled.
trainer = pl.Trainer(
    accelerator=config.ACCELERATOR,
    devices=config.DEVICES,
    precision=config.PRECISION,
    min_epochs=1,
    max_epochs=200,
    logger=logger,
    callbacks=[lr_monitor, checkpoint_every_epoch],
    profiler=None,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
def _latest_checkpoint(checkpoint_folder):
    """Return the path of the highest-epoch checkpoint in ``checkpoint_folder``.

    Only entries ending in ``.ckpt`` whose name contains ``epoch=<int>`` are
    considered, so unrelated files (for example ``last.ckpt`` or temporary
    files) are skipped instead of breaking the epoch parsing.

    Args:
        checkpoint_folder: Directory to scan for checkpoint files.

    Returns:
        ``checkpoint_folder`` joined with the name of the checkpoint that has
        the largest epoch number; on ties the first one listed wins.

    Raises:
        FileNotFoundError: If the folder does not exist (raised by
            ``os.listdir``) or holds no matching checkpoint file.
    """
    epoch_pattern = re.compile(r'epoch=(\d+)')
    best_epoch, best_file = -1, None
    for file in os.listdir(checkpoint_folder):
        match = epoch_pattern.search(file)
        if match is None or not file.endswith('.ckpt'):
            continue
        epoch = int(match.group(1))
        if epoch > best_epoch:
            best_epoch, best_file = epoch, file
    if best_file is None:
        raise FileNotFoundError(
            f'No "epoch=<N>" checkpoint (*.ckpt) found in {checkpoint_folder}'
        )
    return os.path.join(checkpoint_folder, best_file)


if start_from_checkpoint:
    # Logger version whose checkpoints should be resumed; adjust to the
    # version_<N> folder of the run you want to continue.
    version = 1
    checkpoint_folder = f'{folder_name}/{model_name}/version_{version}/checkpoints/'
    checkpoint_path = _latest_checkpoint(checkpoint_folder)
    trainer.fit(model, dm, ckpt_path=checkpoint_path)
else:
    # Fresh run: train from the model's current (initial) weights.
    trainer.fit(model, dm)
# trainer.validate(model, dm)
# trainer.test(model, dm)

Missing logger folder: ../../tb_logs/MNIST_multilayer_layer_120_60
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                   | Type               | Params
--------------------------------------------------------------
0 | autoencoder            | Autoencoder        | 662 K 
1 | org1                   | ff                 | 52.9 K
2 | org2                   | ff                 | 25.3 K
3 | fc                     | Linear             | 610   
4 | autoencoder_loss_fn    | MSELoss            | 0     
5 | classification_loss_fn | CrossEntropyLoss   | 0     
6 | accuracy               | MulticlassAccuracy | 0     
7 | f1_score               | MulticlassF1Score  | 0     
--------------------------------------------------------------
723 K     Trainable params
18.0 K    Non-trainable params
741 K     Total params
2.964     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [7]:
# Evaluate the model on the data module's test split; keeping the result as
# the cell's last expression displays the returned list of metric dicts.
test_results = trainer.test(model, dm)
test_results

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.9575999975204468
         test_f1            0.9575999975204468
        test_loss           0.2052241712808609
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_acc': 0.9575999975204468,
  'test_f1': 0.9575999975204468,
  'test_loss': 0.2052241712808609}]