In [None]:
#| default_exp train
#| default_cls_lvl 3

In [None]:
#| hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# train

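> Train a single cross-validation fold of an experiment and write its training log and out-of-fold predictions to CSV.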

In [None]:
#| export
import argparse, os
from pathlib import Path
import warnings

import pandas as pd
from transformers import logging

from kaggle_comp import config, train_dispatcher, utils
from kaggle_comp.config import CFG

# Silence noisy output from the Hugging Face stack.
# NOTE: simplefilter("ignore") suppresses *every* Python warning raised in this
# process, not only the ones coming from Hugging Face libraries.
warnings.simplefilter("ignore")
# `logging` here is `transformers.logging` (imported above), not the stdlib module.
logging.set_verbosity_error()
# Turn off the tokenizers library's internal parallelism.
# NOTE(review): presumably to avoid the fork-related warning when data loader
# worker processes are used — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [None]:
#| hide_input
import pdb
from fastcore.test import *

## Train a fold

In [None]:
#| export
def run_fold(
    CFG,
    n_fold: int,
    experiment_name: str,
    run_id: str,
    grid_id: str,
    train_data=config.TRAINING_FILE,
    verbose: bool = True,
):
    """Train one cross-validation fold of an experiment and log the results.

    The trainer class is looked up in ``train_dispatcher.experiments`` under
    ``experiment_name``. Rows whose ``k_fold`` column equals ``n_fold`` are
    flagged as validation rows through a new ``is_valid`` column before the
    frame is handed to the trainer.

    Two CSV files are written under ``config.LOG_OUTPUT``:
    ``<model_name>.csv`` (the training log) and ``oof_<model_name>.csv``
    (the out-of-fold frame), where ``model_name`` is
    ``{experiment_name}_{run_id}_{grid_id}_fold_{n_fold}``.

    Args:
        CFG: Configuration object, forwarded unchanged to the trainer's ``train``.
        n_fold: Fold index; matched against the ``k_fold`` column.
        experiment_name: Key into ``train_dispatcher.experiments``.
        run_id: Run identifier; part of the model name and forwarded to the trainer.
        grid_id: Grid identifier; part of the model name and forwarded to the trainer.
        train_data: Either a ``pandas.DataFrame`` (copied, never mutated) or
            anything ``pandas.read_csv`` accepts. Defaults to ``config.TRAINING_FILE``.
        verbose: When truthy, print progress messages; also forwarded to the trainer.

    Returns:
        The value returned by ``utils.comp_metric_score`` for the out-of-fold
        predictions against the out-of-fold targets.

    Raises:
        KeyError: If ``experiment_name`` is not registered in
            ``train_dispatcher.experiments`` or the training data has no
            ``k_fold`` column.
    """
    try:
        train_config = train_dispatcher.experiments[experiment_name]
    except KeyError as err:
        raise KeyError(
            f"Unknown experiment_name {experiment_name!r}: not found in train_dispatcher.experiments"
        ) from err

    model_name = f"{experiment_name}_{run_id}_{grid_id}_fold_{n_fold}"

    comp_trainer_cls = train_config["comp_trainer_cls"]
    comp_trainer = comp_trainer_cls(
        train_config=train_config,
        model_name=model_name,
        model_output_path=config.MODEL_OUTPUT,
        log_output_path=config.LOG_OUTPUT,
    )

    # Copy a caller-supplied frame so adding `is_valid` never mutates the caller's data.
    if isinstance(train_data, pd.DataFrame):
        train_df = train_data.copy()
    else:
        train_df = pd.read_csv(train_data)

    if "k_fold" not in train_df.columns:
        raise KeyError("training data must contain a 'k_fold' column to select the validation fold")

    train_df["is_valid"] = train_df["k_fold"] == n_fold

    log_df, oof_df = comp_trainer.train(
        CFG,
        train_df,
        n_fold=n_fold,
        run_id=run_id,
        grid_id=grid_id,
        experiment_name=experiment_name,
        verbose=verbose,
    )

    # Prediction / target columns are identified purely by their name prefix.
    pred_cols = [col for col in oof_df.columns if col.startswith("pred")]
    targ_cols = [col for col in oof_df.columns if col.startswith("targ")]
    score = utils.comp_metric_score(oof_df[pred_cols].values, oof_df[targ_cols].values)

    if verbose:
        print("--- logging results ---")

    log_dir = Path(config.LOG_OUTPUT)
    log_df.to_csv(log_dir / f"{model_name}.csv", index=False)
    oof_df.to_csv(log_dir / f"oof_{model_name}.csv", index=False)

    if verbose:
        print(f"Fold={n_fold}, Score={score}")

    # Return the score so callers (e.g. a loop over folds) can aggregate it
    # instead of scraping it from stdout.
    return score


In [None]:
#| export
def parse_bool_flag(value):
    """Convert a command-line token such as ``"False"`` or ``"1"`` into a real bool.

    ``argparse`` with ``type=str`` would keep ``"False"`` as a non-empty (truthy)
    string, so the flag could never be switched off from the terminal.

    Args:
        value: A bool (returned unchanged) or a string token.

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in {"1", "true", "t", "yes", "y", "on"}:
        return True
    if token in {"0", "false", "f", "no", "n", "off"}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


if __name__ == "__main__" and utils.run_env != "local_nb":
    # instantiate argparser
    parser = argparse.ArgumentParser(description="Train a single fold of an experiment.")

    # define args
    # experiment_name has no sensible default: fail at parse time rather than
    # with a KeyError on `None` deep inside run_fold.
    parser.add_argument("--experiment_name", type=str, required=True)
    parser.add_argument("--n_fold", type=int, default=0)
    parser.add_argument("--run_id", type=str, default="dummy")
    parser.add_argument("--grid_id", type=int, default=0)
    # parse real booleans so that `--verbose False` actually disables output
    parser.add_argument("--verbose", type=parse_bool_flag, default=True)

    # read in args from terminal
    args = parser.parse_args()

    run_fold(
        CFG=CFG,
        n_fold=args.n_fold,
        experiment_name=args.experiment_name,
        run_id=args.run_id,
        grid_id=args.grid_id,
        train_data=config.TRAINING_FILE,
        verbose=args.verbose,
    )


In [None]:
class CFG:
    """Notebook-local settings for the manual debugging run of `run_fold`.

    NOTE(review): defining this class rebinds the name ``CFG`` that the import
    cell pulled in from ``kaggle_comp.config``; every later cell in this
    notebook session sees this class instead. The cell carries no
    ``#| export`` directive.

    NOTE(review): the per-field remarks below are inferred from the attribute
    names only — confirm against the trainer that consumes them.
    """

    # --- data selection / splitting (presumed) ---
    subset = 1.0
    n_fold = 5
    fold = [0, 1]
    random_seed = 42

    # --- training loop (presumed) ---
    batch_size = 4
    use_fp16 = False
    n_unfrozen_epochs = 10
    # strat_feat = "section_scores"   (left disabled, as in the original cell)

    # --- processing steps (presumed) ---
    preprocess = "basic"
    postprocess = "none"
    augment = "none"

    # --- inputs and grid definition ---
    train_folds = "train_folds.csv"
    experiments = ["blurr_deberta_v3_small"]
    # Each entry names another attribute of this class (`experiments`, `fold`).
    grid_params = ["experiments", "fold"]

In [None]:
# Manual debugging check: train fold 0 of one experiment end to end.
# To try the smaller model instead, set experiment_name="blurr_deberta_v3_small".
run_fold(
    CFG=CFG,
    n_fold=0,
    experiment_name="blurr_cocolm_large",
    run_id="2022_01_01_01_01",
    grid_id=0,
)

## Export -

In [None]:
#| hide
# Run nbdev's export step. Per nbdev convention this writes the cells marked
# `#| export` to the module named by the `#| default_exp` directive at the top
# of this notebook (`train`).
import nbdev

nbdev.nbdev_export()