# Model Training
- Author: Myles Dunlap


This notebook is used to train a model using a single configuration file. The steps in this notebook are used in the Python script.

# User Inputs

In [None]:
# Directory added to sys.path so the custom `src` package can be imported
path_custom_modules = '../'

# Location of the YAML configuration file: directory plus file name
path_cfg = dict(base_dir='../cfgs', filename='train-0.yaml')

# Import Libraries and Modules

In [None]:
# Libraries
import sys
import os
from pathlib import Path

import torch
import argparse
import inspect

# Append Path to Custom Modules
sys.path.append(path_custom_modules)

# Custom Modules
from src.models import llm_multiclass
from src.utils import (RecursiveNamespace,
                       seed_everything,
                       load_cfg,
                       RunIDs)
from src.dataloading.load_data import LoadData
from src.dataloading.stratify import StratifyData
from src.dataloading.preprocess import PreprocessData
from src.dataloading.load_datasets import (TrainDataset,
                                           CustomTextCollator,
                                           )

# Environment flags read by the HuggingFace libraries:
# enable tokenizer parallelism and silence advisory warnings.
os.environ.update({
    'TOKENIZERS_PARALLELISM': 'True',
    'TRANSFORMERS_NO_ADVISORY_WARNINGS': 'true',
})

In [None]:
# Read the YAML configuration selected in the "User Inputs" cell
CFG = load_cfg(
    base_dir=Path(path_cfg['base_dir']),
    filename=path_cfg['filename'],
)

# Seed everything with the configured seed before any data/model work
seed_everything(seed=CFG.seed)

# Processing device: GPU when CUDA is available, otherwise CPU
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

In [None]:
# Create the run identifiers: one group ID plus one ID per tested fold
run_ids = RunIDs(test_folds=CFG.cv.val_folds, num_folds=CFG.cv.num_folds)
run_ids.generate_run_ids()

# Report the group ID, then each validation fold's own ID and the
# combined "<group id>-<fold id>" identifier
print(f'Group ID: {run_ids.group_id}')
for fold_num in CFG.cv.val_folds:
    # Per-fold entries are stored on `folds_id` as attributes named `fold<N>`
    fold_id = getattr(run_ids.folds_id, f'fold{fold_num}').run_id
    entire_id = f'{run_ids.group_id}-{fold_id}'
    print(f'ID for Testing Fold #{fold_num}: '
          f'{fold_id}\n\tFull/Entire ID: {entire_id}')

In [None]:
# Read the dataset from disk; the debug data file is used when CFG.debug is set
load_data_file = LoadData(base_dir=CFG.paths.data.base_dir)
data = load_data_file.load(
    filename=(CFG.paths.data.debug_data if CFG.debug
              else CFG.paths.data.data),
)


In [None]:
# Stratify the data into CFG.cv.num_folds folds on the target column
data = (StratifyData(technique=CFG.stratify.technique,
                     n_folds=CFG.cv.num_folds,
                     target=CFG.data_info.target)
        .stratify(df=data))

# Columns previewed below: the source fields, the target and the 'fold'
# column of the stratified frame
cols = [*CFG.data_info.source_fields, CFG.data_info.target, 'fold']

# Number of classes for downstream use (distinct target values)
N_CLASSES = data[CFG.data_info.target].nunique()

# Print information
display(data[cols].head(5))
print('Distribution of a Product for Each Fold')
print('Notice how the quantities are evenly distributed across folds')
# Group by the configured target rather than a hard-coded column name so the
# cell keeps working when the target in the YAML file changes.
display(data.groupby(CFG.data_info.target).fold.value_counts()
        .sort_index().head(5))

# Start looping over folds here

In [None]:
from torch.utils.data import DataLoader
# Fold held out for validation.
# NOTE: this notebook only walks through the first configured validation fold.
fold_num = CFG.cv.val_folds[0]

# Split Data into Training and Validation.
# Boolean indexing already returns a new frame and reset_index(drop=True)
# returns another one, so copying the full frame first is unnecessary.
df_train = data[data.fold != fold_num].reset_index(drop=True)
df_val = data[data.fold == fold_num].reset_index(drop=True)
print(f'Train Number of Instances: {len(df_train):,}')
print(f'Validation Number of Instances: {len(df_val):,}')

# Preprocess Target

Convert the text target into a label.

In [None]:
# Build one encoder per configured column.
# `encoders` maps column name -> {'encoder': ..., 'technique': ...}.
encoders = {}
for technique in CFG.preprocessing.apply_techniques:
    # Columns this technique applies to are listed under `<technique>.fields`
    fields = getattr(CFG.preprocessing, technique).fields
    for col in fields:
        # The encoder is created from the training-fold values only
        # (presumably fitted on them -- confirm in PreprocessData).
        enc = PreprocessData(y=df_train[col].values, technique=technique)
        encoders[col] = dict(encoder=enc.encoder, technique=technique)

# Tokenizer and Collator

A collator is an object that forms a batch of data by using a list of dataset elements as inputs. A custom text collator is used here and it has several benefits such as:
- The data does **NOT** have to be tokenized prior to being passed to the PyTorch DataLoader class. This means the text data is tokenized only when a batch of data is processed in the DataLoader class, which allows RAM usage to stay lower. For example, if your dataset exceeds the available RAM size, then tokenizing all the data prior to the DataLoader (e.g., in the Dataset class) will become problematic.
- This enables dynamic padding over the batches. Later the maximum token length for a few batches will be printed to illustrate the dynamic padding. This ultimately leads to faster processing of data as explained in this [HuggingFace YouTube video](https://www.youtube.com/watch?v=7q5NyFT8REg).



In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
# Directory on disk holding the saved model and tokenizer files
model_path = Path(CFG.model_tokenizer.base_dir).joinpath(
    CFG.model_tokenizer.name)

# Tokenizer loaded from that directory (fast implementation requested)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

# Custom collator that receives the tokenizer and its configuration;
# it is passed to the DataLoaders as `collate_fn`
collator = CustomTextCollator(
    tokenizer=tokenizer,
    tokenizer_cfg=CFG.tokenizer,
)

## Combine all mix-data type fields

In this [blog post by Chris McCormick](https://mccormickml.com/2021/06/29/combining-categorical-numerical-features-with-bert/) an interesting approach was taken to combine mixed data types, which was to convert all categorical and numerical features into text that is fed into the LLM. This repository will try this technique by combining all fields into a single string that will be processed by the LLM.

Another approach is to pass only the unstructured text into the LLM, take its last layer output, and combine it with the other mixed data types in a dense layer. This type of approach seems more common and an example can be found on [Google Colab here](https://colab.research.google.com/drive/1F7COnwHqcLDPg_SS-oFgW3c2GPDWnS5Y#scrollTo=BAQFbN-wBpoz).

In [None]:
# Train Dataset and Dataloader
# The training loader is shuffled so batches differ between epochs.
train_dataset = TrainDataset(df=df_train,
                             tok=tokenizer,
                             tok_cfg=CFG.tokenizer,
                             X_cols=CFG.data_info.source_fields,
                             label=CFG.data_info.target,
                             encoder=encoders[CFG.data_info.target]['encoder'])
train_dataloader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              collate_fn=collator,
                              shuffle=True,
                              num_workers=CFG.num_workers,
                              pin_memory=True,
                              )

# Validation Dataset and Dataloader
# The same target encoder (built from the training fold) is reused here.
val_dataset = TrainDataset(df=df_val,
                           tok=tokenizer,
                           tok_cfg=CFG.tokenizer,
                           X_cols=CFG.data_info.source_fields,
                           label=CFG.data_info.target,
                           encoder=encoders[CFG.data_info.target]['encoder'])
# Validation data is not shuffled: ordering has no benefit for evaluation and
# keeping it fixed lets predictions be aligned with the rows of df_val.
val_dataloader = DataLoader(val_dataset,
                            batch_size=CFG.batch_size,
                            collate_fn=collator,
                            shuffle=False,
                            num_workers=CFG.num_workers,
                            pin_memory=True,
                            )

print(f'# of Training Samples: {len(df_train):,}')
print(f'# of Validation Samples: {len(df_val):,}')
print(f'Batch Size: {CFG.batch_size}')
# Samples divided by batch size gives the iteration count (the last batch may
# be partial). The original used '\ ', which is an invalid escape sequence.
print(f'{len(df_train):,} / {CFG.batch_size:,} = {len(train_dataloader):,}')
print(f'Train DataLoader # of Iters: {len(train_dataloader):,}')
print(f'Val. DataLoader # of Iters: {len(val_dataloader):,}')

# Dynamic Padding of Maximum Token Lengths
# Print the `input_ids` shape of the first 11 batches to show that the padded
# length varies from batch to batch.
print('\nDynamic Padding\n\tThe shape is [batch size, maximum token length]')
for i, inputs in enumerate(train_dataloader):
    if i > 10:
        break
    print(f'Batch {i + 1} of {len(train_dataloader):,}: '
          f'{inputs["input_ids"].shape}')

# Load the Model

- Generic HF info. on Bert: [HuggingFace Bert](https://huggingface.co/docs/transformers/model_doc/bert)
- HF Model Card for: [bert-base-uncased](https://huggingface.co/bert-base-uncased)

In [None]:
from src.models.llm_multiclass import CustomModel
# Load custom model
model = CustomModel(llm_model_path=model_path,
                    cfg=CFG.model,
                    num_classes=N_CLASSES)

# Set model on device
model.to(DEVICE)
!nvidia-smi


## Optimizer

In [None]:
from src.training.optimizers import get_optimizer
# Build the optimizer for the model from the optimizer section of the config
optimizer = get_optimizer(
    cfg=CFG.optimizer,
    model=model,
)

In [None]:
# Optimizer steps over the whole run: batches per epoch times epochs
# (used as `total_steps` for the learning rate scheduler)
total_steps = len(train_dataloader) * CFG.epochs

# Learning Rate Scheduler

Learning rate schedulers can be used to help an algorithm converge to a more optimal solution. Please refer to these references for more information and a visual representation of various learning rate schedules.
 - [Medium Article](https://towardsdatascience.com/a-visual-guide-to-learning-rate-schedulers-in-pytorch-24bbb262c863)
 - [Kaggle Notebook](https://www.kaggle.com/code/isbhargav/guide-to-pytorch-learning-rate-scheduling)

In [None]:
from torch.optim.lr_scheduler import CosineAnnealingLR, OneCycleLR
import matplotlib.pyplot as plt
import gc
# One-cycle learning rate schedule spanning every training step
scheduler = OneCycleLR(
    optimizer=optimizer,
    total_steps=total_steps,
    max_lr=CFG.optimizer.lr.max,
)

# Dry run of the schedule: step the optimizer/scheduler once per training
# iteration and record the learning rate seen at each step (for plotting).
lr_value, steps = [], []
step_count = 0
for epoch in range(CFG.epochs):
    for batch_count in range(len(train_dataloader)):
        # optimizer.step() comes first so the scheduler is never stepped
        # before the optimizer
        optimizer.step()
        steps.append(step_count)
        lr_value.append(optimizer.param_groups[0]['lr'])
        scheduler.step()
        step_count += 1

# The dry run consumed the schedule, so discard both objects and rebuild
# fresh ones for the real training loop
del optimizer, scheduler
_ = gc.collect()

optimizer = get_optimizer(
    cfg=CFG.optimizer,
    model=model,
)
scheduler = OneCycleLR(
    optimizer=optimizer,
    total_steps=total_steps,
    max_lr=CFG.optimizer.lr.max,
)

# # Plot Learning Rate Schedule
# plt.plot(steps, lr_value)
# plt.xlabel('Steps')
# plt.ylabel('Learning Rate')
# plt.show()

# Training

### Loss Functions References
- [Machine Learning Mastery](https://machinelearningmastery.com/how-to-choose-loss-functions-when-training-deep-learning-neural-networks/)
- [Neptune AI Article](https://neptune.ai/blog/pytorch-loss-functions)
- [PyTorch Loss Functions](https://pytorch.org/docs/stable/nn.html#loss-functions)

Since this is a multi-class classification problem we can use nn.CrossEntropyLoss as the loss function. More elaborate modules can be built for selecting and creating custom loss functions.

### Performance Metrics

[TorchMetrics](https://torchmetrics.readthedocs.io/en/latest/) will be used for implementing performance metrics.
- [F1-Score](https://torchmetrics.readthedocs.io/en/stable/classification/f1_score.html)
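- [Precision](https://torchmetrics.readthedocs.io/en/stable/classification/precision.html)
- [Recall](https://torchmetrics.readthedocs.io/en/stable/classification/recall.html)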

In [None]:
# Debug helper: pull a single batch (index 0) from the training loader so the
# training step can be exercised on one batch
idx = 0
batch = next(iter(train_dataloader))

In [None]:
import time
from src.training.metrics import AverageMeter
from tqdm import tqdm
from torch import nn
from torchmetrics import F1Score, Precision, Recall


# Loss Function
# Multi-class classification: cross entropy between model outputs and labels
loss_fn = nn.CrossEntropyLoss()

# Performance metrics
# Metric objects keep internal state tensors, so they are moved to DEVICE to
# match the predictions/labels passed to them inside the loop.
f1 = F1Score(task='multiclass', num_classes=N_CLASSES).to(DEVICE)
precision = Precision(task='multiclass', num_classes=N_CLASSES).to(DEVICE)
recall = Recall(task='multiclass', num_classes=N_CLASSES).to(DEVICE)

# Training loop over epochs
start_training_time = time.time()
step_count = 0
best_score = 0.0
for epoch in range(CFG.epochs):
    epoch_start_time = time.time()
    print(f'\nStart Epoch {epoch + 1}')
    # Running averages for the epoch.
    # NOTE(review): assumes AverageMeter.update(val, n) keeps a sample-weighted
    # running mean exposed as `.avg` -- confirm in src.training.metrics.
    train_meters = {
        'loss': AverageMeter(),
        'f1': AverageMeter(),
        'precision': AverageMeter(),
        'recall': AverageMeter(),
    }
    model.train()

    # TRAINING
    # Iterate over each batch in an epoch (tqdm provides the progress bar)
    tk0 = tqdm(train_dataloader, total=len(train_dataloader))
    for idx, batch in enumerate(tk0):
        X = {'input_ids': batch['input_ids'].to(DEVICE),
             'attention_mask': batch['attention_mask'].to(DEVICE)}
        y = batch['labels'].to(DEVICE)
        # X is a dict, so the batch size is taken from the label tensor
        n_samples = y.size(0)

        # Model prediction
        y_pred = model(X)
        # Calculate loss
        loss = loss_fn(input=y_pred, target=y)

        # Backward pass, optimizer & scheduler steps
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()

        # Performance metrics for the batch of data (detached: no gradients
        # are needed for reporting)
        y_pred_eval = y_pred.detach()
        f1_score = f1(y_pred_eval, y)
        precision_score = precision(y_pred_eval, y)
        recall_score = recall(y_pred_eval, y)

        # Store loss and performance metrics as plain Python floats so the
        # meters do not hold on to tensors
        train_meters['loss'].update(loss.item(), n=n_samples)
        train_meters['f1'].update(f1_score.item(), n=n_samples)
        train_meters['precision'].update(precision_score.item(), n=n_samples)
        train_meters['recall'].update(recall_score.item(), n=n_samples)

        # Progress bar info.
        tk0.set_postfix(train_loss=train_meters['loss'].avg,
                        train_f1=train_meters['f1'].avg,
                        lr=scheduler.get_last_lr()[0])

        # TODO: experiment-tracker logging (the original called `wandb.log`
        # every 500 steps, but wandb is never imported in this notebook).
        step_count += 1

    # Epoch summary
    print(f'Epoch {epoch + 1:d} / '
          f'trn/loss={train_meters["loss"].avg:.4f}, '
          f'trn/f1={train_meters["f1"].avg:.4f}, '
          f'trn/precision={train_meters["precision"].avg:.4f}, '
          f'trn/recall={train_meters["recall"].avg:.4f}')
    print(f'Epoch {epoch + 1} Training Time: '
          f'{(((time.time() - epoch_start_time) / 60) / 60):.1f} hrs.')