# Model Training
- Author: Myles Dunlap


This notebook is used to train a model using a single configuration file. The steps in this notebook are used in the Python script.

# User Inputs

In [1]:
# Location of the repository root that contains the custom `src` package
path_custom_modules = '../'

# Directory and filename of the YAML configuration to train with
path_cfg = dict(base_dir='../cfgs',
                filename='train-0.yaml')

# Import Libraries and Modules

In [19]:
# Libraries
import sys
import os
from pathlib import Path

import torch
import argparse
import inspect

# Append Path to Custom Modules
sys.path.append(path_custom_modules)

# Custom Modules
from src.models import llm_multiclass
from src.utils import (RecursiveNamespace,
                       seed_everything,
                       load_cfg,
                       RunIDs)
from src.dataloading.load_data import LoadData
from src.dataloading.stratify import StratifyData
from src.dataloading.preprocess import PreprocessData
from src.dataloading.load_datasets import (TrainDataset,
                                           CustomTextCollator,
                                           )

# Environment flags read by the HuggingFace libraries: allow tokenizer
# parallelism and turn off the transformers advisory warnings
os.environ.update({'TOKENIZERS_PARALLELISM': 'True',
                   'TRANSFORMERS_NO_ADVISORY_WARNINGS': 'true'})

In [23]:
# Read the YAML configuration selected in the "User Inputs" cell
cfg_dir = Path(path_cfg['base_dir'])
CFG = load_cfg(base_dir=cfg_dir, filename=path_cfg['filename'])

# Seed everything with the value stored in the configuration
seed_everything(seed=CFG.seed)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

Sun Jul 23 08:32:59 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
|  0%   50C    P8    18W / 350W |    793MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# Build the run-ID container for the configured validation folds and
# populate it (group ID plus one ID per fold)
run_ids = RunIDs(test_folds=CFG.cv.val_folds, num_folds=CFG.cv.num_folds)
run_ids.generate_run_ids()

# Report the group ID, then each fold's own ID and its combined ID
print(f'Group ID: {run_ids.group_id}')
for fold_num in CFG.cv.val_folds:
    # Per-fold entries are stored as attributes named fold<N>
    fold_entry = getattr(run_ids.folds_id, f'fold{fold_num}')
    fold_id = fold_entry.run_id
    entire_id = f'{run_ids.group_id}-{fold_id}'
    print(f'ID for Testing Fold #{fold_num}: {fold_id}\n'
          f'\tFull/Entire ID: {entire_id}')

Group ID: 0d4da237
ID for Testing Fold #1: d569d59d
	Full/Entire ID: 0d4da237-d569d59d
ID for Testing Fold #2: 42866ec3
	Full/Entire ID: 0d4da237-42866ec3
ID for Testing Fold #3: 7e7d2bc8
	Full/Entire ID: 0d4da237-7e7d2bc8
ID for Testing Fold #4: cc2bf649
	Full/Entire ID: 0d4da237-cc2bf649
ID for Testing Fold #5: 7d6d49fc
	Full/Entire ID: 0d4da237-7d6d49fc


In [5]:
# Read the dataset from disk; CFG.debug selects the debug file instead of
# the full dataset
load_data_file = LoadData(base_dir=CFG.paths.data.base_dir)
data_filename = (CFG.paths.data.debug_data if CFG.debug
                 else CFG.paths.data.data)
data = load_data_file.load(filename=data_filename)


In [6]:
# Stratify the Data
# NOTE: `data` is rebound to the stratified frame, so re-running this cell
# stratifies the already stratified frame again.
stratifier = StratifyData(technique=CFG.stratify.technique,
                          n_folds=CFG.cv.num_folds,
                          target=CFG.data_info.target)
data = stratifier.stratify(df=data)

# Columns shown below: the model inputs, the target and the fold assignment
cols = CFG.data_info.source_fields + [CFG.data_info.target, 'fold']

# Number of classes for downstream use
N_CLASSES = data[CFG.data_info.target].nunique()

# Print information
display(data[cols].head(5))
print('Distribution of a Product for Each Fold')
print('Notice how the quantities are evenly distributed across folds')
# Group by the configured target column rather than a hard-coded name so the
# check keeps working when the configuration targets a different field
display(data.groupby(CFG.data_info.target).fold.value_counts()
        .sort_index().head(5))

Unnamed: 0,Consumer complaint narrative,State,Company response to consumer,Product,fold
0,The Summer of XX/XX/2018 I was denied a mortga...,IL,Closed with explanation,"Credit reporting, credit repair services, or o...",1
1,There are many mistakes appear in my report wi...,VA,Closed with explanation,"Credit reporting, credit repair services, or o...",1
2,There are many mistakes appear in my report wi...,TX,Closed with explanation,"Credit reporting, credit repair services, or o...",1
3,There are many mistakes appear in my report wi...,TX,Closed with explanation,"Credit reporting, credit repair services, or o...",1
4,There are many mistakes appear in my report wi...,CA,Closed with explanation,"Credit reporting, credit repair services, or o...",1


Distribution of a Product for Each Fold
Notice how the quantities are evenly distributed across folds


Product                  fold
Bank account or service  1       2953
                         2       2953
                         3       2954
                         4       2953
                         5       2953
Name: count, dtype: int64

# Start looping over folds here

In [7]:
from torch.utils.data import DataLoader
# Train a model for each validation fold
# (only the first configured validation fold is used in this notebook)
fold_num = CFG.cv.val_folds[0]

# Split Data into Training and Validation
# Boolean-mask indexing followed by reset_index already yields new frames,
# so the full DataFrame does not need to be copied before filtering.
df_train = data[data.fold != fold_num].reset_index(drop=True)
df_val = data[data.fold == fold_num].reset_index(drop=True)
print(f'Train Number of Instances: {len(df_train):,}')
print(f'Validation Number of Instances: {len(df_val):,}')

Train Number of Instances: 305,739
Validation Number of Instances: 76,435


# Preprocess Target

Convert the text target into a label.

In [8]:
# Preprocessing Encoders
# One entry per configured field: the encoder built from the training
# split's values for that field and the technique that produced it
encoders = {
    col: {'encoder': PreprocessData(y=df_train[col].values,
                                    technique=technique).encoder,
          'technique': technique}
    for technique in CFG.preprocessing.apply_techniques
    for col in getattr(CFG.preprocessing, technique).fields
}

# Tokenizer and Collator

A collator is an object that forms a batch of data by using a list of dataset elements as inputs. A custom text collator is used here and it has several benefits such as:
- The data does **NOT** have to be tokenized prior to being passed to the PyTorch DataLoader class. Instead, the text is tokenized as each batch is processed in the DataLoader, which keeps RAM usage lower. For example, if your dataset exceeds the available RAM, tokenizing all of the data before the DataLoader (e.g., in the Dataset class) becomes problematic.
- It enables dynamic padding over the batches. Later, the maximum token length for a few batches will be printed to illustrate the dynamic padding. This ultimately leads to faster processing of data, as explained in this [HuggingFace YouTube video](https://www.youtube.com/watch?v=7q5NyFT8REg).



In [9]:
from transformers import AutoTokenizer, DataCollatorWithPadding
# Location on disk of the saved model / tokenizer model card
model_path = Path(CFG.model_tokenizer.base_dir).joinpath(CFG.model_tokenizer.name)

# Tokenizer loaded from that path (fast implementation requested)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

# Custom collator that receives the tokenizer and its configuration
collator = CustomTextCollator(tokenizer_cfg=CFG.tokenizer,
                              tokenizer=tokenizer)

## Combine all mix-data type fields

In this [blog post by Chris McCormick](https://mccormickml.com/2021/06/29/combining-categorical-numerical-features-with-bert/), an interesting approach was taken to combine mixed data types: all categorical and numerical features were converted into text that is fed into the LLM. This repository will try this technique by combining the fields into a single string that will be processed by the LLM.

Another approach is to pass only the unstructured text into the LLM, take its last-layer output, and combine it with the other mixed data types in a dense layer. This type of approach seems more common, and an example can be found on [Google Colab here](https://colab.research.google.com/drive/1F7COnwHqcLDPg_SS-oFgW3c2GPDWnS5Y#scrollTo=BAQFbN-wBpoz).

In [10]:
# Encoder for the target column, built from the training split and shared
# by the training and validation datasets
target_encoder = encoders[CFG.data_info.target]['encoder']

# Train Dataset and Dataloader
train_dataset = TrainDataset(df=df_train,
                             tok=tokenizer,
                             tok_cfg=CFG.tokenizer,
                             X_cols=CFG.data_info.source_fields,
                             label=CFG.data_info.target,
                             encoder=target_encoder)
train_dataloader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              collate_fn=collator,
                              shuffle=True,
                              num_workers=CFG.num_workers,
                              pin_memory=True)

# Validation Dataset and Dataloader
# Validation batches are not shuffled so that outputs stay aligned with the
# row order of df_val; shuffling only matters for training.
val_dataset = TrainDataset(df=df_val,
                           tok=tokenizer,
                           tok_cfg=CFG.tokenizer,
                           X_cols=CFG.data_info.source_fields,
                           label=CFG.data_info.target,
                           encoder=target_encoder)
val_dataloader = DataLoader(val_dataset,
                            batch_size=CFG.batch_size,
                            collate_fn=collator,
                            shuffle=False,
                            num_workers=CFG.num_workers,
                            pin_memory=True)

print(f'# of Training Samples: {len(df_train):,}')
print(f'# of Validation Samples: {len(df_val):,}')
print(f'Batch Size: {CFG.batch_size}')
# The backslash is escaped explicitly: a bare '\ ' is an invalid escape
# sequence. The printed text is unchanged.
print(f'{len(df_train):,} \\ {CFG.batch_size:,} = {len(train_dataloader):,}')
print(f'Train DataLoader # of Iters: {len(train_dataloader):,}')
print(f'Val. DataLoader # of Iters: {len(val_dataloader):,}')

# Dynamic Padding of Maximum Token Lengths
# Show the shape of the first 11 batches; the second dimension varies from
# batch to batch because padding is applied per batch.
print('\nDynamic Padding\n\tThe shape is [batch size, maximum token length]')
for i, inputs in enumerate(train_dataloader):
    if i > 10:
        break
    print(f'Batch {i + 1} of {len(train_dataloader):,}: '
          f'{inputs["input_ids"].shape}')

# of Training Samples: 305,739
# of Validation Samples: 76,435
Batch Size: 4
305,739 \ 4 = 76,435
Train DataLoader # of Iters: 76,435
Val. DataLoader # of Iters: 19,109

Dynamic Padding
	The shape is [batch size, maximum token length]
Batch 1 of 76,435: torch.Size([4, 512])
Batch 2 of 76,435: torch.Size([4, 428])
Batch 3 of 76,435: torch.Size([4, 334])
Batch 4 of 76,435: torch.Size([4, 512])
Batch 5 of 76,435: torch.Size([4, 512])
Batch 6 of 76,435: torch.Size([4, 512])
Batch 7 of 76,435: torch.Size([4, 512])
Batch 8 of 76,435: torch.Size([4, 169])
Batch 9 of 76,435: torch.Size([4, 445])
Batch 10 of 76,435: torch.Size([4, 125])
Batch 11 of 76,435: torch.Size([4, 257])


# Load the Model

- Generic HF info. on Bert: [HuggingFace Bert](https://huggingface.co/docs/transformers/model_doc/bert)
- HF Model Card for: [bert-base-uncased](https://huggingface.co/bert-base-uncased)

In [26]:
from src.models.llm_multiclass import CustomModel
# Load custom model
model = CustomModel(llm_model_path=model_path,
                    cfg=CFG.model,
                    num_classes=N_CLASSES)

# Set model on device
model.to(DEVICE)
!nvidia-smi


Sun Jul 23 08:35:19 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
|  0%   59C    P2    29W / 350W |   1265MiB / 24576MiB |      6%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Optimizer

In [None]:
# Optimizer


In [None]:
# for i in range(18):
#     a, b = next(iter(train_dataset), i)
#     print(a['input_ids'].shape)
#     print(b.shape)

In [None]:
# Take the first item yielded by the training dataset for inspection
first_item_iter = iter(train_dataset)
a = next(first_item_iter)

# # Train a model for each validation fold
# for fold_num in CFG.cv.val_folds:
#     print(f'Starting Training for Fold {fold_num}')
    
#     # Training Module
    
#     # Inference Module
    
#     print(f'\tFinished Training for Fold {fold_num}')
# print('Training and Validation Completed')
