In [1]:
#| default_exp run_grid
#| default_cls_lvl 3

In [2]:
#| hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# run_grid

Set up a multi-fold, multi-parameter experimentation grid via `config` and run `train` for every row of the grid.

In [3]:
#| export
import argparse, os
import warnings

import pandas as pd
from itertools import product

from kaggle_comp import config, train_dispatcher, utils, train, create_folds, preprocessing


# Silence the Hugging Face warnings. Note that this filter is process-wide: it
# hides every Python warning, not only the HF ones it is aimed at.
warnings.simplefilter("ignore")
# Switch off parallelism in the Hugging Face `tokenizers` library
# (presumably to avoid its fork-related warning -- TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [4]:
#| hide_input
import pdb
from fastcore.test import *

## Prepare the grid with a run_id for the current date & time

In [5]:
#| export
def _as_grid_axis(value):
    "Return `value` as the list of candidate settings for one grid dimension."
    # A bare string or scalar is one setting. Without this guard `product`
    # would iterate a string character by character, or raise TypeError on a
    # non-iterable such as an int.
    if isinstance(value, (str, bytes)) or not hasattr(value, "__iter__"):
        return [value]
    return list(value)


def prep_grid(CFG=config.CFG):
    """Build the experimentation grid: one row per combination of grid parameters.

    Args:
        CFG: Configuration object. `CFG.grid_params` lists the attribute names
            that span the grid; each named attribute holds the candidate values
            for that dimension (an iterable, or a single string/scalar value,
            which is treated as a one-element dimension).

    Returns:
        A `pandas.DataFrame` with a `grid_id` column (the row number), one
        column per entry of `CFG.grid_params`, and a `run_id` column holding
        the value of `config.get_run_id()`, identical for every row.
    """
    axes = [_as_grid_axis(getattr(CFG, name)) for name in CFG.grid_params]
    grid_df = pd.DataFrame(list(product(*axes)), columns=CFG.grid_params)
    # The run id is fetched once, so all rows of one grid share it.
    grid_df["run_id"] = config.get_run_id()
    # Promote the default 0..n-1 index to an explicit `grid_id` column.
    grid_df = grid_df.rename_axis("grid_id").reset_index()

    return grid_df

In [6]:
# Preview the grid built from the default `config.CFG`.
prep_grid()

Unnamed: 0,grid_id,experiments,fold,run_id
0,0,blurr_deberta_v3_small,0,2022_11_09_21_33


In [7]:
# Keep a grid around for the sanity check in the next cell.
grid_df = prep_grid()

In [8]:
# Confirm that experiment names are stored in the grid as plain `str` values.
type(grid_df.experiments[0])

str

In [9]:
#| export
def run_grid(CFG=config.CFG):
    """Train every row of the experimentation grid described by `CFG`.

    The grid is built with `prep_grid(CFG)` and printed. The training data is
    then produced once by `create_folds.build_folds` and handed unchanged to
    `train.run_fold`, which is called once per grid row.

    Args:
        CFG: Configuration object; `n_fold`, `random_seed`, `subset` and
            `preprocess` are read here, and the whole object is forwarded to
            `train.run_fold`.
    """
    grid = prep_grid(CFG)

    print("This is the experimentation grid:")
    print(grid)
    print("")

    print("Preprocessing and creating folds")
    print("")
    # `strat_feat` is currently not forwarded to the fold builder.
    fold_kwargs = dict(
        n_folds=CFG.n_fold,
        seed=CFG.random_seed,
        subset=CFG.subset,
        preprocess=CFG.preprocess,
        return_file=True,
        save_file=False,
    )
    folds_data = create_folds.build_folds(**fold_kwargs)

    print("Training folds")
    print("")
    for row in grid.itertuples(index=False):
        # Cast to built-in types so `run_fold` never receives numpy scalars.
        train.run_fold(
            CFG=CFG,
            n_fold=int(row.fold),
            experiment_name=str(row.experiments),
            run_id=str(row.run_id),
            grid_id=int(row.grid_id),
            train_data=folds_data,
        )


In [10]:
# manual debugging
#train.run_fold(CFG = config.CFG, n_fold = 0, experiment_name = 'blurr_distilroberta_base', run_id = '2022_05_21_15_56', grid_id = 0)


A custom `CFG` can be supplied at runtime instead of the default `config.CFG`, for example:

Supported values for `augment`: `"none"`, `"rev"`, `"shuffle"`, `"remove_word"`.

In [13]:
class CFG:
    """Example configuration that can be handed to `run_grid` at runtime."""

    # --- grid definition: attributes named in `grid_params` span the grid ---
    grid_params = ["experiments", "fold"]
    experiments = ["blurr_deberta_v3_small"]
    fold = [0, 1]

    # --- fold creation (forwarded to `create_folds.build_folds`) ---
    n_fold = 5
    random_seed = 42
    subset = 1.0
    preprocess = "basic"
    # strat_feat = "section_scores"  (currently disabled)

    # --- remaining settings: not read in this notebook, presumably consumed
    # --- by `train.run_fold` (verify there) ---
    bs = 128
    use_fp16 = True
    n_unfrozen_epochs = 1
    postprocess = "none"
    augment = "none"  # one of "none", "rev", "shuffle", "remove_word"
    train_folds = "train_folds.csv"


In [14]:
# manual debugging
#run_grid(CFG)


This is the experimentation grid:
   grid_id             experiments  fold            run_id
0        0  blurr_deberta_v3_small     0  2022_11_09_21_35
1        1  blurr_deberta_v3_small     1  2022_11_09_21_35

Preprocessing and creating folds

Training folds

Experiment: blurr_deberta_v3_small
Run: 2022_11_09_21_35 | Grid ID: 0 | Fold: 0
Training config: f{'comp_trainer_cls': <class 'kaggle_comp.framework_utils.fastai.FastaiCompTrainer'>, 'model_checkpoint': 'microsoft/deberta-v3-small', 'hf_config_kwargs': {'num_labels': 1, 'cls_dropout': 0.15, 'pooler_dropout': 0.15, 'hidden_dropout_prob': 0.05, 'attention_probs_dropout_prob': 0.1, 'layer_norm_eps': 1e-05}, 'hf_tokenizer_kwargs': {}, 'anchor_col': 'anchor', 'target_col': 'target', 'max_length': 140, 'include_labels': True, 'tok_kwargs': {}, 'batch_size': 128, 'adam_beta1': 0.9, 'adam_beta2': 0.995, 'adam_eps': 1e-06, 'weight_decay': 0.01, 'max_grad_norm': None, 'save_best_model': True, 'use_fp16': True, 'n_frozen_epochs': 0, 'froze

IndexError: tuple index out of range

In [16]:
#| export
if __name__ == "__main__" and utils.run_env != "local_nb":
    # Command-line entry point: parse overrides for the default `config.CFG`,
    # write them back onto it, then run the whole grid. Skipped when
    # `utils.run_env` reports a local notebook session.

    def _csv_list(cast):
        "Build an argparse `type` converter for a comma-separated list of `cast` values."
        def convert(raw):
            # Strip each item so `--experiments "a, b"` yields "b", not " b",
            # and ignore empty items such as the one left by a trailing comma.
            items = [cast(piece.strip()) for piece in raw.split(",") if piece.strip()]
            if not items:
                raise argparse.ArgumentTypeError(f"expected a comma-separated list, got {raw!r}")
            return items
        # argparse puts the converter's name into its "invalid ... value" errors.
        convert.__name__ = f"comma-separated {cast.__name__} list"
        return convert

    # instantiate argparser
    parser = argparse.ArgumentParser()

    # define args; every default comes from the current `config.CFG`
    parser.add_argument("--experiments", help='experiments to run', type=_csv_list(str), default=config.CFG.experiments)
    parser.add_argument("--subset", type=float, default=config.CFG.subset)
    #parser.add_argument("--strat_feat", type=str, default=config.CFG.strat_feat)
    parser.add_argument("--preprocess", type=str, default=config.CFG.preprocess)
    parser.add_argument("--n_folds", type=int, default=config.CFG.n_fold)
    parser.add_argument('--fold', help='folds to use', type=_csv_list(int), default=config.CFG.fold)
    parser.add_argument("--seed", type=int, default=config.CFG.random_seed)

    args = parser.parse_args()

    # Copy the parsed values onto the shared config. Note the name mapping:
    # `--n_folds` -> `CFG.n_fold` and `--seed` -> `CFG.random_seed`.
    config.CFG.experiments = args.experiments
    config.CFG.subset = args.subset
    #config.CFG.strat_feat = args.strat_feat
    config.CFG.preprocess = args.preprocess
    config.CFG.n_fold = args.n_folds
    config.CFG.fold = args.fold
    config.CFG.random_seed = args.seed

    run_grid(config.CFG)


## Export -

In [17]:
#| hide
import nbdev

nbdev.nbdev_export()