In [1]:
# (Re)load the autoreload extension and switch it on. Loading the extension by
# itself does not enable reloading; mode 2 re-imports all modules before each
# cell is executed, so edits to text_recognizer are picked up without a restart.
%reload_ext autoreload
%autoreload 2

import os
import sys
import random

import torch
import numpy as np
import matplotlib.pyplot as plt

# Put the parent directory on the import path (once) so that the
# text_recognizer package can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run the training script in this kernel with the LineCNNTransformer model on the IAMLines data.
# NOTE: with --fast_dev_run True, Lightning reports that it runs each loop on a single batch
# and suppresses logging and checkpointing, so --max_epochs and the --limit_*_batches flags
# presumably have no effect on this particular run -- TODO confirm.
%run ../training/run_experiment.py --model_class LineCNNTransformer --data_class IAMLines \
  --fast_dev_run True --log_every_n_steps 1 --accelerator 'auto' --precision 'bf16' \
  --loss transformer --batch_size 32  --max_epochs 2 \
  --limit_train_batches 0.1 --limit_val_batches 0.1 --limit_test_batches 0.1 

Using bfloat16 Automatic Mixed Precision (AMP)
Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.


Namespace(logger=True, enable_checkpointing=True, default_root_dir=None, gradient_clip_val=None, gradient_clip_algorithm=None, num_nodes=1, num_processes=None, devices=None, gpus=None, auto_select_gpus=False, tpu_cores=None, ipus=None, enable_progress_bar=True, overfit_batches=0.0, track_grad_norm=-1, check_val_every_n_epoch=1, fast_dev_run=True, accumulate_grad_batches=None, max_epochs=2, min_epochs=None, max_steps=-1, min_steps=None, max_time=None, limit_train_batches=0.1, limit_val_batches=0.1, limit_test_batches=0.1, limit_predict_batches=None, val_check_interval=None, log_every_n_steps=1, accelerator='auto', strategy=None, sync_batchnorm=False, precision='bf16', enable_model_summary=True, num_sanity_val_steps=2, resume_from_checkpoint=None, profiler=None, benchmark=None, deterministic=None, reload_dataloaders_every_n_epochs=0, auto_lr_find=False, replace_sampler_ddp=True, detect_anomaly=False, auto_scale_batch_size=False, plugins=None, amp_backend='native', amp_level=None, move_me


   | Name                      | Type               | Params
------------------------------------------------------------------
0  | model                     | LineCNNTransformer | 4.3 M 
1  | model.line_cnn            | LineCNN            | 1.6 M 
2  | model.embedding           | Embedding          | 21.2 K
3  | model.fc                  | Linear             | 21.3 K
4  | model.pos_encoder         | PositionalEncoding | 0     
5  | model.transformer_decoder | TransformerDecoder | 2.6 M 
6  | train_acc                 | MulticlassAccuracy | 0     
7  | val_acc                   | MulticlassAccuracy | 0     
8  | test_acc                  | MulticlassAccuracy | 0     
9  | val_cer                   | CharacterErrorRate | 0     
10 | test_cer                  | CharacterErrorRate | 0     
11 | loss_fn                   | CrossEntropyLoss   | 0     
------------------------------------------------------------------
4.3 M     Trainable params
0         Non-trainable params
4.3 M     Tota

Epoch 0: 100%|██████████| 2/2 [15:27<00:00, 463.75s/it, loss=4.86, v_num=, validation/loss=3.650, validation/cer=2.050]

`Trainer.fit` stopped: `max_steps=1` reached.


Epoch 0: 100%|██████████| 2/2 [15:27<00:00, 463.76s/it, loss=4.86, v_num=, validation/loss=3.650, validation/cer=2.050]
Testing DataLoader 0: 100%|██████████| 1/1 [12:06<00:00, 726.50s/it]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test/cer            1.8999269008636475
        test/loss            3.658341884613037
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


### Local Experiment Tracking with TensorBoard

In [3]:
# The stages of a bash pipeline that prints the latest experiment's directory.
#  (working by hand, you can simply copy and paste the path from the terminal)

list_all_log_files, filter_to_folders, sort_version_descending, take_first = (
    "find training/logs/lightning_logs/",  # find avoids issues ls has with \n in filenames
    "grep '_[0-9]*$'",  # regex anchored at end of line: an underscore, then any run of digits
    "sort -Vr",  # "version" sorting (-V), reversed (-r)
    "head -n 1",  # keep the first n lines, n=1
)

In [4]:
# Run the four stages as one shell pipeline. `!` captures the printed lines as a list,
# and the trailing comma unpacks it, so this raises unless exactly one line comes back
# (for example when no experiment has been logged yet).
latest_log, = ! {list_all_log_files} | {filter_to_folders} | {sort_version_descending} | {take_first}
latest_log

'training/logs/lightning_logs//version_0'

In [5]:
# List the contents of the latest experiment directory with human-readable sizes.
!ls -lh {latest_log}

total 38816
-rw-r--r--  1 niall.turbitt  staff    19M Jan  1 16:39 epoch=0000-validation.loss=0.567.ckpt
-rw-r--r--  1 niall.turbitt  staff   3.9K Jan  1 16:40 events.out.tfevents.1672590723.C02ZLTN5MD6M
-rw-r--r--  1 niall.turbitt  staff   179B Jan  1 16:43 events.out.tfevents.1672591334.C02ZLTN5MD6M
-rw-r--r--  1 niall.turbitt  staff     3B Jan  1 16:32 hparams.yaml


In [6]:
# Load the TensorBoard notebook extension; the cells below use its %tensorboard magic.
%load_ext tensorboard

In [7]:
# Launch TensorBoard on the latest experiment's log directory only.
port = 12511  # pick an open port on your machine
host = "0.0.0.0"  # allow connections from the internet
# because of that host setting, make sure you turn TensorBoard off when you are done
# (see the "KILL ALL TENSORBOARD PROCESSES" cell further down)

%tensorboard --logdir {latest_log} --port {port} --host {host}

In [8]:
# See all results by pointing TensorBoard at the whole lightning_logs directory,
# rather than at just one experiment. This second instance gets the next port up
# and reuses `host` from the cell above, so the two instances always agree on
# which interface they are exposed on.
%tensorboard --logdir training/logs/lightning_logs --port {port + 1} --host {host}

In [None]:
# KILL ALL TENSORBOARD PROCESSES
import os
import shutil
import signal

import tensorboard.manager

# get the process IDs for all tensorboard instances
pids = [tb.pid for tb in tensorboard.manager.get_all()]

# safety switch: set to True and re-run this cell once you are finished with TensorBoard
done_with_tensorboard = False

if done_with_tensorboard:
    # kill processes (SIGTERM is what the `kill` command sends by default)
    for pid in pids:
        try:
            os.kill(pid, signal.SIGTERM)
        except OSError:
            # deliberately best-effort: the process may already have exited
            pass

    # remove the temporary files that sometimes persist, see https://stackoverflow.com/a/59582163
    # NOTE: _get_info_dir is a private TensorBoard helper and may change between releases
    shutil.rmtree(tensorboard.manager._get_info_dir(), ignore_errors=True)

### Weights and Biases