### 1. Settings

In [1]:
#####################################
##########  DEPENDECIES ############
#####################################

import os
import pickle
import numpy as np
from tqdm import tqdm # type: ignore
import pandas as pd
import copy

from datasets import load_dataset, DatasetDict, Dataset
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import KFold # type: ignore
import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.sparse import hstack

tqdm.pandas()

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
#####################################
############  CONSTANTS #############
#####################################
RS = 42

MODEL = "CodeT5"
TRAIN_N = 330
BATCH_SIZE = 15
DECODER_LENGTH = 20
ENCODER_LENGTH = 15

FULL_TRAIN_ARGS = {
    "TRAIN_N": TRAIN_N,
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "MODEL": MODEL,
    "SEQ_TRAINER_ARGS": {
        "overwrite_output_dir": True,
        "num_train_epochs": [0, 1, 2, 3, 4, 5, 6, 7],
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 5e-6,
        "warmup_steps": 100,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
    },
}
FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["output_dir"] = f'reports/results'
FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["logging_dir"] = f'reports/logs'

model_name="Salesforce/codet5-base-multi-sum"
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. Conala data. Preprocessing. Sampling as in the paper (further, random sampling)

In [2]:
dataset = pd.read_csv(f"../data/processed/conala/20240327/conala_clustered.csv")
dataset = dataset.drop("time_batch", axis=1)

test_4_examples = dataset[dataset["cluster"]==4].sample(frac=0.85, random_state=RS)
print("Cluster 4 obsevations: ", test_4_examples.shape)
test_non4_examples = dataset[dataset["cluster"]!=4].sample(n=156, random_state=RS)
print("Cluster not 4 obsevations: ", test_non4_examples.shape)

test_dataset = pd.concat([test_4_examples, test_non4_examples])
train_dataset = dataset[~dataset.index.isin(test_dataset.index)]
print("Train Data: ", train_dataset.shape)
print("Test Data: ", test_dataset.shape)

train_dataset = Dataset.from_pandas(train_dataset.sample(frac=1, random_state=RS).reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_dataset.sample(frac=1, random_state=RS).reset_index(drop=True))

# Cross Validation
folds = KFold(n_splits=3, random_state=RS, shuffle=True)
questions_list = np.array(list(set(train_dataset["question_id"])))
splits_obj = folds.split(questions_list)
splits = []
for i, (train_idxs, val_idxs) in enumerate(splits_obj):
    print(f"Fold {i}")
    splits.append([train_idxs, val_idxs])

test_data = pr.preprocess_dataset(test_dataset, tokenizer=tokenizer)
test_df = pd.DataFrame(test_data)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rouge = evaluate.load('rouge')

Cluster 4 obsevations:  (344, 6)
Cluster not 4 obsevations:  (156, 6)
Train Data:  (2379, 6)
Test Data:  (500, 6)
Fold 0
Fold 1
Fold 2


Filter: 100%|██████████| 500/500 [00:00<00:00, 85850.34 examples/s]
Filter: 100%|██████████| 500/500 [00:00<00:00, 53263.71 examples/s]
Map: 100%|██████████| 499/499 [00:00<00:00, 2225.70 examples/s]


In [3]:
train_dataset

Dataset({
    features: ['question_id', 'intent', 'rewritten_intent', 'snippet', 'idx', 'cluster'],
    num_rows: 2379
})

In [4]:
test_dataset


Dataset({
    features: ['question_id', 'intent', 'rewritten_intent', 'snippet', 'idx', 'cluster'],
    num_rows: 500
})

In [5]:
fold_results = {}
for epoch_i, epoch_set in enumerate(sorted(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"])):
    fold_results[epoch_set] = {}
    
for i, (train_idxs, val_idxs) in enumerate(splits):
    latest_run_epoch = 0
    for epoch_i, epoch_set in enumerate(sorted(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"])):
        print(f"TRAINING EPOCH SET {epoch_set}")

        TRAIN_ARGS = copy.deepcopy(FULL_TRAIN_ARGS)
        FOLD_MODEL_PATH = "./tmp/"

        if epoch_set > 1: 
            TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = epoch_set - latest_run_epoch
        else:
            TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = epoch_set
        
        print(f'TRAINING EPOCHS {TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"]}')

        print(f"Fold {i}")
        fold_dataset = DatasetDict({
            "train": train_dataset.filter(lambda q_id: q_id["question_id"] in questions_list[train_idxs]),
            "validation": train_dataset.filter(lambda q_id: q_id["question_id"] in questions_list[val_idxs]),
        })
        fold_train = pr.preprocess_dataset(fold_dataset["train"], tokenizer=tokenizer)
        fold_val = pr.preprocess_dataset(fold_dataset["validation"], tokenizer=tokenizer)
        fold_df = pd.DataFrame(fold_val)

        if epoch_set > 1: 
            model = AutoModelForSeq2SeqLM.from_pretrained(FOLD_MODEL_PATH)
            print(f"LOADING MODEL {FOLD_MODEL_PATH}")
        else: 
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            print(f"LOADING MODEL {model_name}")

        print(device)
        model.to(device)

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        compute_metrics = ev.compute_metric_with_params(tokenizer) 

        if not os.path.exists(f'reports/'): 
            os.mkdir(f'reports/')

        training_args = Seq2SeqTrainingArguments(
                **TRAIN_ARGS["SEQ_TRAINER_ARGS"],
            )
        
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=fold_train,
            eval_dataset=fold_val,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        if epoch_set!=0:
            trainer.train()

        text = fold_val["input_sequence"]
        summaries = infer.generate_summary(text, model, tokenizer, TRAIN_ARGS["ENCODER_LENGTH"], TRAIN_ARGS["DECODER_LENGTH"])
        
        fold_df["prediction"] = summaries[1]
        fold_df["rouge"] = rouge.compute(predictions=fold_df["prediction"], 
                    references=fold_df["output_sequence"],
                    use_stemmer=True, 
                    use_aggregator=False,
                    rouge_types=["rouge1"])["rouge1"]
        
        fold_results[epoch_set][i] = fold_df

        print("FOLDS IN RESULTS ", fold_results[epoch_set].keys())
        
        ########## SAVE FOLD MODEL
        if not os.path.exists(FOLD_MODEL_PATH): 
            os.mkdir(FOLD_MODEL_PATH)

        trainer.save_model(FOLD_MODEL_PATH)

        latest_run_epoch = epoch_set

########## CONVERT TO DATAFRAME

for epoch_i, (epoch_set) in enumerate(fold_results.keys()): 
    
    for i, (k, f_df) in enumerate(fold_results[epoch_set].items()): 
        
        f_df['fold'] = k
        f_df['epoch_set'] = epoch_set

        if (epoch_i==0 and i==0): 
            cv_df = f_df.copy()
        else: 
            cv_df = pd.concat([cv_df, f_df])

########## SAVE THE FILE

with open('cv_df_check_drift.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 65136.00 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 82409.02 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 57240.83 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 59596.40 examples/s]
Map: 100%|██████████| 1580/1580 [00:00<00:00, 1992.34 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 50433.40 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 51639.50 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 1954.66 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET 1
TRAINING EPOCHS 1
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 75436.78 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 94955.88 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 39568.43 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 57455.22 examples/s]
Map: 100%|██████████| 1580/1580 [00:00<00:00, 2128.99 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 33633.57 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 54652.70 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2073.32 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.386,3.830533,0.3669,0.1094,0.3332,0.3328,9.8135,0.031,0.5005,0.591,6061,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET 2
TRAINING EPOCHS 1
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 74747.36 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 98130.95 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 61176.45 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 63489.79 examples/s]
Map: 100%|██████████| 1580/1580 [00:00<00:00, 2441.28 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 57664.82 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 59693.43 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2435.80 examples/s]
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.7159,3.55856,0.4309,0.1777,0.3947,0.3949,13.0451,0.1472,0.8184,0.8331,8544,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET 3
TRAINING EPOCHS 1
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 74479.00 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 92451.98 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 54092.22 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 54030.04 examples/s]
Map: 100%|██████████| 1580/1580 [00:00<00:00, 2265.51 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 53068.07 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 51089.23 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2311.67 examples/s]
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.4666,3.469716,0.4437,0.1916,0.4054,0.4056,13.5119,0.1709,0.8541,0.8638,8859,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET 4
TRAINING EPOCHS 1
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 76899.51 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 90158.11 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 42913.76 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 45334.83 examples/s]
Map: 100%|██████████| 1580/1580 [00:00<00:00, 2043.22 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 53377.44 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 60354.59 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2313.70 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.2957,3.42574,0.4538,0.2002,0.4129,0.4131,13.7572,0.1858,0.8631,0.8717,8940,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET 5
TRAINING EPOCHS 1
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 56694.28 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 91737.99 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 48287.67 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 53210.54 examples/s]
Map: 100%|██████████| 1580/1580 [00:00<00:00, 2026.14 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 34957.27 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 59341.45 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2327.24 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.1539,3.409338,0.4547,0.2005,0.4104,0.4102,14.0438,0.1908,0.8861,0.8922,9150,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET 6
TRAINING EPOCHS 1
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 70022.31 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 83675.05 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 38775.02 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 32231.55 examples/s]
Map: 100%|██████████| 1580/1580 [00:00<00:00, 1846.78 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 33765.39 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 54163.35 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2081.31 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.0252,3.410481,0.454,0.1977,0.4079,0.4079,14.1051,0.1863,0.8871,0.893,9159,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET 7
TRAINING EPOCHS 1
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 76610.79 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 89035.07 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 34444.23 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 41905.64 examples/s]
Map: 100%|██████████| 1580/1580 [00:00<00:00, 2371.70 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 54534.42 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 60358.94 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2374.98 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.9036,3.423048,0.4524,0.1954,0.4069,0.407,14.0826,0.1824,0.8861,0.8922,9150,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET 0
TRAINING EPOCHS 0
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 79753.90 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 99953.41 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 59047.33 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 63355.09 examples/s]
Map: 100%|██████████| 1610/1610 [00:00<00:00, 2384.08 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 33986.85 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 61129.17 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2344.80 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET 1
TRAINING EPOCHS 1
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 74667.93 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 97044.86 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 49439.04 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 60099.41 examples/s]
Map: 100%|██████████| 1610/1610 [00:00<00:00, 1928.12 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 41422.17 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 43855.82 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 1903.24 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.367,3.874389,0.3679,0.1123,0.3364,0.3373,9.5735,0.0321,0.4702,0.5699,5606,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET 2
TRAINING EPOCHS 1
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 73124.83 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 69357.46 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 46791.32 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 47848.63 examples/s]
Map: 100%|██████████| 1610/1610 [00:00<00:00, 1969.77 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 48061.69 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 46895.42 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2026.13 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6916,3.61512,0.4231,0.1709,0.386,0.3868,12.7269,0.1501,0.7921,0.811,7977,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET 3
TRAINING EPOCHS 1
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 75916.02 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 92012.07 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 46326.51 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 50762.09 examples/s]
Map: 100%|██████████| 1610/1610 [00:00<00:00, 1993.42 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 56101.09 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 58716.59 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2296.66 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.4453,3.527614,0.4364,0.1856,0.3981,0.3985,13.1261,0.175,0.8238,0.8376,8239,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET 4
TRAINING EPOCHS 1
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 76342.34 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 86688.23 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 40478.52 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 37688.25 examples/s]
Map: 100%|██████████| 1610/1610 [00:00<00:00, 1867.62 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 51141.13 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 55867.87 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2044.15 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.2743,3.484163,0.4435,0.192,0.4033,0.4035,13.4577,0.1824,0.8516,0.8616,8475,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET 5
TRAINING EPOCHS 1
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 74251.21 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 93669.61 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 51649.25 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 54280.14 examples/s]
Map: 100%|██████████| 1610/1610 [00:00<00:00, 2243.56 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 32913.45 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 55750.06 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2277.03 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.1307,3.467803,0.4463,0.1934,0.4062,0.4065,13.5969,0.1863,0.8606,0.8695,8552,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET 6
TRAINING EPOCHS 1
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 77235.22 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 92763.11 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 37440.21 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 60067.87 examples/s]
Map: 100%|██████████| 1610/1610 [00:00<00:00, 2336.37 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 54749.79 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 61187.16 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2325.02 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.0023,3.469553,0.4471,0.1925,0.4051,0.4053,13.883,0.19,0.8812,0.8878,8732,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET 7
TRAINING EPOCHS 1
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 80265.85 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 98478.64 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 60418.81 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 49122.56 examples/s]
Map: 100%|██████████| 1610/1610 [00:00<00:00, 2388.86 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 57070.91 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 59032.54 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2442.35 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.8787,3.483274,0.4476,0.192,0.4045,0.4045,13.9402,0.1914,0.8877,0.8936,8789,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET 0
TRAINING EPOCHS 0
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 74915.72 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 89312.40 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 47625.61 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 49505.96 examples/s]
Map: 100%|██████████| 1568/1568 [00:00<00:00, 2068.38 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 52058.10 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 33205.59 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2017.15 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET 1
TRAINING EPOCHS 1
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 70926.68 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 88900.22 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 51444.53 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 53090.31 examples/s]
Map: 100%|██████████| 1568/1568 [00:00<00:00, 2205.26 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 50833.59 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 54443.58 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2301.58 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.4071,3.825557,0.3687,0.116,0.3373,0.3369,9.4735,0.0303,0.467,0.5677,5801,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET 2
TRAINING EPOCHS 1
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 73062.19 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 82253.46 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 44845.71 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 48367.10 examples/s]
Map: 100%|██████████| 1568/1568 [00:00<00:00, 1746.33 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 43214.98 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 46977.32 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2257.96 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.7261,3.557242,0.4378,0.187,0.4023,0.4023,12.6831,0.1557,0.7932,0.8119,8296,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET 3
TRAINING EPOCHS 1
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 75777.07 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 90011.72 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 51611.67 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 58349.84 examples/s]
Map: 100%|██████████| 1568/1568 [00:00<00:00, 2291.41 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 55696.96 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 54926.22 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2381.12 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.4779,3.470162,0.4515,0.2052,0.4167,0.4162,13.2873,0.1901,0.8475,0.858,8767,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET 4
TRAINING EPOCHS 1
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 76751.04 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 93772.61 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 53607.45 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 42709.25 examples/s]
Map: 100%|██████████| 1568/1568 [00:00<00:00, 2182.03 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 50312.54 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 53827.59 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2267.37 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.3079,3.427826,0.4588,0.211,0.4234,0.4234,13.5117,0.1981,0.866,0.8742,8933,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET 5
TRAINING EPOCHS 1
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 79195.60 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 97471.44 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 46910.53 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 62430.41 examples/s]
Map: 100%|██████████| 1568/1568 [00:00<00:00, 2324.52 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 54415.71 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 54936.86 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2384.97 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.1646,3.412829,0.4639,0.2154,0.4284,0.4281,13.7053,0.204,0.8797,0.8864,9057,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET 6
TRAINING EPOCHS 1
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 74670.73 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 92063.86 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 48704.88 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 48164.49 examples/s]
Map: 100%|██████████| 1568/1568 [00:00<00:00, 2180.43 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 57900.23 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 63340.61 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2286.60 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.0348,3.41473,0.4631,0.2171,0.4255,0.4254,13.8792,0.2047,0.8908,0.8964,9159,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET 7
TRAINING EPOCHS 1
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 76996.82 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 93557.20 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 49849.68 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 53587.79 examples/s]
Map: 100%|██████████| 1568/1568 [00:00<00:00, 1954.45 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 49341.17 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 56226.33 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2089.85 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.9098,3.430384,0.4624,0.2168,0.4254,0.4255,14.0185,0.2065,0.9033,0.9077,9275,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])


In [6]:
########## LOAD CV RESULTS

import pickle
with open('cv_df_check_drift.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

########## ADD ZERO SHOT

# model_name="Salesforce/codet5-base-multi-sum"
# tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# zs_df = cv_df.copy()
# zs_df = zs_df.drop_duplicates(["question_id", "input_sequence"])
# zs_df["epoch_set"] = 0
# zs_df["prediction"] = infer.generate_summary(list(zs_df["input_sequence"].values), model, tokenizer, FULL_TRAIN_ARGS["ENCODER_LENGTH"], FULL_TRAIN_ARGS["DECODER_LENGTH"])[1]
# zs_df["rouge"] = rouge.compute(predictions=zs_df["prediction"].values, 
#                     references=zs_df["output_sequence"].values,
#                     use_stemmer=True, 
#                     use_aggregator=False,
#                     rouge_types=["rouge1"])["rouge1"]
# cv_df = pd.concat([cv_df, zs_df])

# with open('cv_df_chekc.pickle', 'wb') as handle:
#     pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
########## ROUGE PER SETTING

print("Mean")
print(cv_df.groupby("epoch_set")["rouge"].mean())

print("STD")
print(cv_df.groupby("epoch_set")["rouge"].std())


Mean
epoch_set
0    0.312683
1    0.365668
2    0.429093
3    0.442423
4    0.451241
5    0.454752
6    0.455086
7    0.454211
Name: rouge, dtype: float64
STD
epoch_set
0    0.172770
1    0.186344
2    0.187874
3    0.188987
4    0.188181
5    0.189293
6    0.189902
7    0.189046
Name: rouge, dtype: float64


### Step 2. Learn performance

In [8]:
def step_two(X_train, y_train, model, X_val=None, y_val=None,  save=False): 

    if model=="lr":
        reg = LinearRegression().fit(X_train, y_train)
    elif model =="svm": 
        reg = SVR().fit(X_train, y_train)
    elif model=="rf":
        reg = RandomForestRegressor.fit(X_train, y_train)
    elif model=="lgbm":
        reg = LGBMRegressor()
        reg.fit(X=X_train, y=y_train)
    elif model=="catboost":
        reg = CatBoostRegressor()
        reg.fit(X=X_train, y=y_train)

    if save:
        with open(f'./models/reg_{model}_drift.pkl','wb') as f:
            pickle.dump(reg, f)
        return f'./models/reg_{model}_drift.pkl'
    
    else:
        y_pred = reg.predict(X_val)
        y_pred[y_pred<0] = 0
        mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
        rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
        return {"pred": y_pred, "mae": mae, "rmse": rmse}

In [9]:
cv_df.fold.value_counts()

fold
2    6488
0    6392
1    6152
Name: count, dtype: int64

In [10]:
t_models = ["lr", "svm", "lgbm", "catboost"]

results = {}

cv_df["perf_hat"] = 0


for test_fold in range(cv_df.fold.max()+1):
    print(test_fold)

    # Prepare the input data
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(cv_df.loc[cv_df.fold!=test_fold, "input_sequence"])
    X_train_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold!=test_fold, "epoch_set"], sparse=True).sparse.to_coo().tocsr()
    X_train = hstack([X_train_column_sparse, X_train_tfidf])
    y_train = cv_df.loc[cv_df.fold!=test_fold, "rouge"]
    
    X_val_tfidf = vectorizer.transform(cv_df.loc[cv_df.fold==test_fold, "input_sequence"])
    X_val_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold==test_fold, "epoch_set"], sparse=True).sparse.to_coo().tocsr()
    X_val = hstack([X_val_column_sparse, X_val_tfidf])
    y_val = cv_df.loc[cv_df.fold==test_fold, "rouge"]

    results[test_fold] = {}
    for model in t_models:
        print(model)
        preds_df = step_two(X_train=X_train,
                            y_train=y_train,
                            X_val=X_val,
                            y_val=y_val,
                            model=model)
        cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
        cv_df.loc[cv_df.fold==test_fold, f"perf_hat"] = preds_df["pred"]
        results[test_fold][model] = preds_df

cv_df = cv_df.reset_index(drop=True)

0
lr


  cv_df.loc[cv_df.fold==test_fold, f"perf_hat"] = preds_df["pred"]


svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 3.416754 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5561
[LightGBM] [Info] Number of data points in the train set: 12640, number of used features: 427
[LightGBM] [Info] Start training from score 0.419642
catboost
Learning rate set to 0.06113
0:	learn: 0.1915241	total: 57.8ms	remaining: 57.8s
1:	learn: 0.1906316	total: 63.6ms	remaining: 31.8s
2:	learn: 0.1897726	total: 67.9ms	remaining: 22.6s
3:	learn: 0.1890624	total: 71.5ms	remaining: 17.8s
4:	learn: 0.1883739	total: 74.6ms	remaining: 14.8s
5:	learn: 0.1877911	total: 77.7ms	remaining: 12.9s
6:	learn: 0.1871941	total: 80.6ms	remaining: 11.4s
7:	learn: 0.1867037	total: 83.2ms	remaining: 10.3s
8:	learn: 0.1862342	total: 85.8ms	remaining: 9.45s
9:	learn: 0.1857689	total: 88.3ms	remaining: 8.75s
10:	learn: 0.1853581	total: 91ms	rem

In [15]:
cv_df.shape

(19032, 18)

In [16]:
cv_df.groupby("epoch_set")["catboost_perf_hat"].mean()

epoch_set
0    0.318266
1    0.372663
2    0.438554
3    0.451153
4    0.452206
5    0.452428
6    0.453545
7    0.455152
Name: catboost_perf_hat, dtype: float64

In [13]:
# rearrange the file

model_results = {}

for model in t_models:
    model_results[model]= {}
    model_results[model]["rmse"] = []
    model_results[model]["mae"] = [] 

    for fold in range(3):
    
        model_results[model]["mae"].append(results[fold][model]["mae"])
        model_results[model]["rmse"].append(results[fold][model]["rmse"])
    
    model_results[model]["rmse_avg"] = np.array(model_results[model]["rmse"]).mean()
    model_results[model]["mae_avg"] = np.array(model_results[model]["mae"]).mean()

for model in t_models:
    print(model)
    print("RMSE ", model_results[model]["rmse_avg"])
    print("MAE ",model_results[model]["mae_avg"])
    print("\n")

with open('cd_df_with_predictions_drift.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

lr
RMSE  0.24874840017839772
MAE  0.19132703806159826


svm
RMSE  0.19461252919353958
MAE  0.15449195065671972


lgbm
RMSE  0.19166876192930205
MAE  0.15143726110635428


catboost
RMSE  0.1888002702061641
MAE  0.14960342227533033




In [14]:
with open('cd_df_with_predictions_drift.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

# TRAIN ON ALL PREDICTIONS AT ONCE

t_models = ["lr", "svm", "lgbm", "catboost"]

# Prepare the input data
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(cv_df.loc[:, "input_sequence"])
X_train_column_sparse = pd.get_dummies(cv_df.loc[:, "epoch_set"], sparse=True).sparse.to_coo().tocsr()
X_train = hstack([X_train_column_sparse, X_train_tfidf])
y_train = cv_df.loc[:, "rouge"]
    
with open("./models/vectorizer_drift.pkl", "wb") as file:
    pickle.dump(vectorizer, file, protocol=pickle.HIGHEST_PROTOCOL) 
      
for model in t_models:
    print(model)
    preds_df = step_two(X_train=X_train,
                        y_train=y_train,
                        model=model,
                        save=True)

lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 4.485999 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8212
[LightGBM] [Info] Number of data points in the train set: 19032, number of used features: 564
[LightGBM] [Info] Start training from score 0.420645
catboost
Learning rate set to 0.065214
0:	learn: 0.1918780	total: 11.7ms	remaining: 11.6s
1:	learn: 0.1908530	total: 19.9ms	remaining: 9.92s
2:	learn: 0.1898420	total: 25.9ms	remaining: 8.62s
3:	learn: 0.1890027	total: 30.6ms	remaining: 7.61s
4:	learn: 0.1881557	total: 34.8ms	remaining: 6.93s
5:	learn: 0.1874279	total: 38.8ms	remaining: 6.42s
6:	learn: 0.1867878	total: 42.7ms	remaining: 6.06s
7:	learn: 0.1862615	total: 46.2ms	remaining: 5.73s
8:	learn: 0.1857741	total: 49.7ms	remaining: 5.47s
9:	learn: 0.1852823	total: 53.1ms	remaining: 5.26s
10:	learn: 0.1848760	total: 56.7