# Model Training
- Author: Myles Dunlap


This notebook trains a model from a single YAML configuration file. The same steps are used in the project's Python script.

# User Inputs

In [1]:
# Path added to sys.path so the custom modules can be imported
path_custom_modules = '../'

# Directory and filename of the YAML configuration file
path_cfg = dict(base_dir='../cfgs', filename='train-0.yaml')

# Import Libraries and Modules

In [2]:
# Libraries
import sys
import os
from pathlib import Path

import argparse
import inspect

# Append Path to Custom Modules
# Guarded so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if path_custom_modules not in sys.path:
    sys.path.append(path_custom_modules)

# Custom Modules
from src.models import llm_multiclass
from src.utils import (RecursiveNamespace,
                       seed_everything,
                       load_cfg,
                       RunIDs)
from src.dataloading.load_data import LoadData
from src.dataloading.stratify import StratifyData

# Turn on parallelism for the Hugging Face (HF) tokenizers
os.environ.update(TOKENIZERS_PARALLELISM='True')

In [3]:
# Read the configuration file named in the user inputs into CFG
CFG = load_cfg(
    filename=path_cfg['filename'],
    base_dir=Path(path_cfg['base_dir']),
)

# Apply the configured random seed everywhere
seed_everything(seed=CFG.seed)

In [4]:
# Build the group ID for this run plus one ID per fold being tested
run_ids = RunIDs(
    num_folds=CFG.cv.num_folds,
    test_folds=CFG.cv.val_folds,
)
run_ids.generate_run_ids()

# Report the group ID, then each fold's own ID and its combined
# "<group>-<fold>" ID
print(f'Group ID: {run_ids.group_id}')
for fold_num in CFG.cv.val_folds:
    # Per-fold entries are stored as attributes named fold<N>
    fold_id = getattr(run_ids.folds_id, f'fold{fold_num}').run_id
    entire_id = f'{run_ids.group_id}-{fold_id}'
    print(f'ID for Testing Fold #{fold_num}: {fold_id}',
          f'Full/Entire ID: {entire_id}',
          sep='\n\t')

Group ID: 5d9bc20b
ID for Testing Fold #1: df40444f
	Full/Entire ID: 5d9bc20b-df40444f
ID for Testing Fold #2: 743468be
	Full/Entire ID: 5d9bc20b-743468be
ID for Testing Fold #3: 6715b467
	Full/Entire ID: 5d9bc20b-6715b467
ID for Testing Fold #4: a12799e1
	Full/Entire ID: 5d9bc20b-a12799e1
ID for Testing Fold #5: 78134789
	Full/Entire ID: 5d9bc20b-78134789


In [5]:
# Read the dataset from disk; the debug file is used when CFG.debug is set,
# otherwise the raw data file
load_data_file = LoadData(base_dir=CFG.paths.data.base_dir)
raw_data = load_data_file.load(
    filename=(CFG.paths.data.debug_data if CFG.debug
              else CFG.paths.data.raw_data))


In [33]:
# Stratify the Data
# NOTE(review): `raw_data` is overwritten with the stratified frame, so
# re-running this cell passes already-stratified data back in -- confirm
# StratifyData tolerates that, or reload the data before re-running.
raw_data = (StratifyData(technique=CFG.stratify.technique,
                         n_folds=CFG.cv.num_folds,
                         target=CFG.data_info.target,
                         shuffle=CFG.stratify.shuffle,
                         seed=CFG.seed)
            .stratify(df=raw_data))

# Preview the source fields, the target, and the assigned fold
cols = CFG.data_info.source_fields + \
       [CFG.data_info.target, 'fold']
display(raw_data[cols].head(5))

print('Distribution of a Product for Each Fold')
print('Notice how the quantities are evenly distributed across folds')
# Group by the configured target column rather than a hard-coded 'Product'
# so this summary stays in sync with the column that was stratified on.
display(raw_data.groupby(CFG.data_info.target)['fold'].value_counts()
        .sort_index().head(5))

Unnamed: 0,Consumer complaint narrative,State,Company response to consumer,Product,fold
0,The Summer of XX/XX/2018 I was denied a mortga...,IL,Closed with explanation,"Credit reporting, credit repair services, or o...",3
1,There are many mistakes appear in my report wi...,VA,Closed with explanation,"Credit reporting, credit repair services, or o...",3
2,There are many mistakes appear in my report wi...,TX,Closed with explanation,"Credit reporting, credit repair services, or o...",2
3,There are many mistakes appear in my report wi...,TX,Closed with explanation,"Credit reporting, credit repair services, or o...",4
4,There are many mistakes appear in my report wi...,CA,Closed with explanation,"Credit reporting, credit repair services, or o...",2


Distribution of a Product for Each Fold
Notice how the quantities are evenly distributed across folds


Product                  fold
Bank account or service  1       2953
                         2       2953
                         3       2954
                         4       2953
                         5       2953
Name: count, dtype: int64

Product                  fold
Bank account or service  1       2953
                         2       2953
                         3       2954
                         4       2953
                         5       2953
Name: count, dtype: int64

In [None]:

# Select the first validation fold to train on.
# The fold list is read from CFG.cv.val_folds, the same field the run-ID
# cell uses; in this notebook the stratify section is only seen supplying
# `technique` and `shuffle`.
fold_num = CFG.cv.val_folds[0]

In [None]:
# TODO: planned per-fold training loop. It is kept commented out because the
# training and inference steps below are still placeholders.
# NOTE(review): the run-ID cell reads the fold list from CFG.cv.val_folds;
# confirm which config section holds `val_folds` before enabling this loop.
# # Train a model for each validation fold
# for fold_num in CFG.stratify.val_folds:
#     print(f'Starting Training for Fold {fold_num}')
    
#     # Training Module
    
#     # Inference Module
    
#     print(f'\tFinished Training for Fold {fold_num}')
# print('Training and Validation Completed')
