In [1]:
%load_ext autoreload

In [2]:
%load_ext jupyter_black

In [3]:
%autoreload 2

In [4]:
# !black .

In [5]:
import warnings

warnings.filterwarnings("ignore")

In [6]:
# !pip uninstall ipywidgets -y

In [7]:
import os

# Optional debug switches, left disabled.
# os.environ["TORCH_CPP_LOG_LEVEL"] = "INFO"
# os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"

# GPU selection, applied in a single call. This cell sits before the cell
# that imports torch, so both values are already set at that point.
os.environ.update(
    {
        # Order devices by PCI bus id.
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        # The GPU id to use, "0" to "7".
        "CUDA_VISIBLE_DEVICES": "1",
    }
)

In [8]:
import torch
import pytorch_lightning as pl

# Print the framework versions used for this run so results can be tied to
# a specific torch / pytorch_lightning stack.
print(f"Torch={torch.__version__}, Lightning={pl.__version__}")

Torch=2.0.1+cu117, Lightening=2.0.6


In [10]:
from parrotletml.config import config

In [12]:
from parrotletml.datamodule import BilingualDataModule

# Data module for the bilingual corpus. Every argument is read from the
# shared `config` object, so nothing is hard-coded in the notebook.
data_module = BilingualDataModule(
    # Where the corpus and the tokenizer file(s) live.
    dataset_path=config.dataset_path,
    tokenizer_file=config.tokenizer_file,
    # Language pair and sequence length.
    src_lang=config.lang_src,
    tgt_lang=config.lang_tgt,
    seq_len=config.seq_len,
    # Batching / loading options (presumably forwarded to the DataLoaders;
    # confirm in parrotletml.datamodule).
    batch_size=config.batch_size,
    num_workers=config.num_workers,
    pin_memory=config.pin_memory,
)

# data_module.prepare_data()

# data_module.setup()

Max length of source sentence: 150
Max length of target sentence: 159


In [13]:
from parrotletml.bilingualmodule import BilingualModule

# LightningModule under training. It reuses the tokenizers held by
# `data_module`; all remaining values come from `config`.
model = BilingualModule(
    # Source / target tokenizers taken from the data module.
    tokenizer_src=data_module.tokenizer_src,
    tokenizer_tgt=data_module.tokenizer_tgt,
    # Model dimensions.
    seq_len=config.seq_len,
    d_model=config.d_model,
    # Presumably optimizer settings -- confirm in parrotletml.bilingualmodule.
    lr=config.lr,
    weight_decay=config.weight_decay,
    eps=config.eps,
    # Presumably passed to the loss criterion -- confirm likewise.
    label_smoothing=config.label_smoothing,
)

In [14]:
import sys
from pytorch_lightning.callbacks import (
    TQDMProgressBar,
    LearningRateMonitor,
    ModelCheckpoint,
    ModelPruning,
    EarlyStopping,
)
from pytorch_lightning.loggers import TensorBoardLogger

from pytorch_lightning.profilers import PyTorchProfiler


class MyProgressBar(TQDMProgressBar):
    """TQDM progress bar that silences secondary bars on non-TTY stdout.

    The validation and predict bars are created by the parent class and then
    disabled whenever ``sys.stdout`` is not attached to a terminal.
    ``init_test_tqdm`` is not overridden, so the test-stage bar keeps the
    parent's behaviour.
    """

    @staticmethod
    def _mute_unless_tty(bar):
        """Return ``bar``, with ``disable`` set to True if stdout is not a TTY."""
        if sys.stdout.isatty():
            return bar
        bar.disable = True
        return bar

    def init_validation_tqdm(self):
        """Build the validation bar, disabled when stdout is not a TTY."""
        return self._mute_unless_tty(super().init_validation_tqdm())

    def init_predict_tqdm(self):
        """Build the predict bar, disabled when stdout is not a TTY."""
        return self._mute_unless_tty(super().init_predict_tqdm())


# TensorBoard logger rooted at tb_logs/, with runs grouped under "aiayn".
logger = TensorBoardLogger("tb_logs", name="aiayn")

# Trainer configuration: GPU accelerator, 16-bit mixed precision, and
# callbacks that all monitor the logged "train_loss" metric.
trainer = pl.Trainer(
    log_every_n_steps=1,
    callbacks=[
        MyProgressBar(refresh_rate=1),
        LearningRateMonitor(logging_interval="epoch"),
        ModelCheckpoint(
            dirpath="ckpt_logs/aiayn",
            save_top_k=3,
            monitor="train_loss",
            mode="min",
            # Note the dot: ".4f" formats the loss with four decimals,
            # whereas "4f" would only set a minimum field width of 4 and
            # keep the default six decimals.
            filename="model-{epoch:02d}-{train_loss:.4f}",
            save_last=True,
        ),
        # ModelPruning(
        #     pruning_fn="l1_unstructured",
        #     amount=0.1,
        #     use_global_unstructured=True,
        # ),
        # Early stopping on train_loss with stopping_threshold=1.5.
        EarlyStopping(monitor="train_loss", mode="min", stopping_threshold=1.5),
    ],
    logger=logger,
    precision="16-mixed",
    accelerator="gpu",
    devices="auto",
    # strategy="ddp_notebook",
    check_val_every_n_epoch=1,
    # limit_train_batches=5,
    # Validation is capped at two batches per validation run.
    limit_val_batches=2,
    # limit_test_batches=1,
    max_epochs=config.num_epochs,
    # max_epochs=1,
    # profiler=PyTorchProfiler(),
)

# Start training. The dataloaders are obtained from `data_module` rather
# than being passed in explicitly.
trainer.fit(model, datamodule=data_module)

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]
INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name     | Type             | Params
----------------------------------------------
0 | bimodel  | Transformer      | 56.3 M
1 | criteria | CrossEntropyLoss | 0     
----------------------------------------------
56.3 M    Trainable params
0         Non-trainable params
56.3 M    Total params
225.350   Total estimated model params size (MB)


Epoch 0: 100%|██████████| 849/849 [04:48<00:00,  2.95it/s, v_num=66, train_loss_step=4.870]
Epoch 1: 100%|██████████| 849/849 [04:47<00:00,  2.95it/s, v_num=66, train_loss_step=3.950, val_cer=0.597, val_wer=0.923, val_bleu=0.000, train_loss_epoch=5.800]
Epoch 2: 100%|██████████| 849/849 [04:47<00:00,  2.95it/s, v_num=66, train_loss_step=3.440, val_cer=0.540, val_wer=0.731, val_bleu=0.000, train_loss_epoch=4.490]
Epoch 3: 100%|██████████| 849/849 [04:47<00:00,  2.95it/s, v_num=66, train_loss_step=3.120, val_cer=0.604, val_wer=0.808, val_bleu=0.000, train_loss_epoch=3.850]
Epoch 4: 100%|██████████| 849/849 [04:48<00:00,  2.95it/s, v_num=66, train_loss_step=2.910, val_cer=0.561, val_wer=0.769, val_bleu=0.000, train_loss_epoch=3.520]
Epoch 5: 100%|██████████| 849/849 [04:47<00:00,  2.95it/s, v_num=66, train_loss_step=2.660, val_cer=0.568, val_wer=0.885, val_bleu=0.000, train_loss_epoch=3.320]
Epoch 6: 100%|██████████| 849/849 [04:47<00:00,  2.95it/s, v_num=66, train_loss_step=2.460, val_ce

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 849/849 [04:54<00:00,  2.88it/s, v_num=66, train_loss_step=1.580, val_cer=0.640, val_wer=0.962, val_bleu=0.000, train_loss_epoch=1.730]


In [None]:
# session14/ckpt_logs/aiayn/last-v32.ckpt
# session14/tb_logs/aiayn/version_66