### 1. Settings

In [25]:
#####################################
##########  DEPENDECIES ############
#####################################

import os
import pickle
import numpy as np
from tqdm import tqdm # type: ignore
import pandas as pd
import copy

from datasets import load_dataset, DatasetDict, Dataset
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import KFold # type: ignore
import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.sparse import hstack

tqdm.pandas()

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import torch
#####################################
############  CONSTANTS #############
#####################################
RS = 42

MODEL = "CodeT5"
TRAIN_N = 330
BATCH_SIZE = 15
DECODER_LENGTH = 20
ENCODER_LENGTH = 15

FULL_TRAIN_ARGS = {
    "TRAIN_N": TRAIN_N,
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "MODEL": MODEL,
    "SEQ_TRAINER_ARGS": {
        "overwrite_output_dir": True,
        "num_train_epochs": [0, 1, 2, 3, 4, 7, 9],
        "num_train_epochs_cluster": 1,
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 5e-5,
        "warmup_steps": 100,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
    },
}
FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["output_dir"] = f'reports/results'
FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["logging_dir"] = f'reports/logs'

model_name="Salesforce/codet5-base-multi-sum"
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. Conala data. Preprocessing. Sampling as in the paper (further, random sampling)

In [2]:
dataset = pd.read_csv(f"../data/processed/conala/20240327/conala_clustered.csv")
dataset = dataset.drop("time_batch", axis=1)

test_4_examples = dataset[dataset["cluster"]==4].sample(frac=0.85, random_state=RS)
print("Cluster 4 obsevations: ", test_4_examples.shape)
test_non4_examples = dataset[dataset["cluster"]!=4].sample(n=156, random_state=RS)
print("Cluster not 4 obsevations: ", test_non4_examples.shape)

test_dataset = pd.concat([test_4_examples, test_non4_examples])
train_dataset = dataset[~dataset.index.isin(test_dataset.index)]
print("Train Data: ", train_dataset.shape)
print("Test Data: ", test_dataset.shape)

train_dataset = Dataset.from_pandas(train_dataset.sample(frac=1, random_state=RS).reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_dataset.sample(frac=1, random_state=RS).reset_index(drop=True))

# Cross Validation
folds = KFold(n_splits=3, random_state=RS, shuffle=True)
questions_list = np.array(list(set(train_dataset["question_id"])))
splits_obj = folds.split(questions_list)
splits = []
for i, (train_idxs, val_idxs) in enumerate(splits_obj):
    print(f"Fold {i}")
    splits.append([train_idxs, val_idxs])

test_data = pr.preprocess_dataset(test_dataset, tokenizer=tokenizer)
test_df = pd.DataFrame(test_data)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rouge = evaluate.load('rouge')

Cluster 4 obsevations:  (344, 6)
Cluster not 4 obsevations:  (156, 6)
Train Data:  (2379, 6)
Test Data:  (500, 6)
Fold 0
Fold 1
Fold 2


Filter: 100%|██████████| 500/500 [00:00<00:00, 78085.86 examples/s]
Filter: 100%|██████████| 500/500 [00:00<00:00, 48470.01 examples/s]


Map: 100%|██████████| 499/499 [00:00<00:00, 2166.77 examples/s]


In [3]:
train_dataset

Dataset({
    features: ['question_id', 'intent', 'rewritten_intent', 'snippet', 'idx', 'cluster'],
    num_rows: 2379
})

In [4]:
test_dataset


Dataset({
    features: ['question_id', 'intent', 'rewritten_intent', 'snippet', 'idx', 'cluster'],
    num_rows: 500
})

In [5]:
fold_results = {}
for epoch_i, epoch_set in enumerate(sorted(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"])):
    fold_results[epoch_set] = {}
    
for i, (train_idxs, val_idxs) in enumerate(splits):
    latest_run_epoch = 0
    for epoch_i, epoch_set in enumerate(sorted(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"])):
        print(f"TRAINING EPOCH SET {epoch_set}")

        TRAIN_ARGS = copy.deepcopy(FULL_TRAIN_ARGS)
        FOLD_MODEL_PATH = "./tmp/"

        if epoch_set > 1: 
            TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = epoch_set - latest_run_epoch
        else:
            TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = epoch_set
        
        print(f'TRAINING EPOCHS {TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"]}')

        print(f"Fold {i}")
        fold_dataset = DatasetDict({
            "train": train_dataset.filter(lambda q_id: q_id["question_id"] in questions_list[train_idxs]),
            "validation": train_dataset.filter(lambda q_id: q_id["question_id"] in questions_list[val_idxs]),
        })
        fold_train = pr.preprocess_dataset(fold_dataset["train"], tokenizer=tokenizer)
        fold_val = pr.preprocess_dataset(fold_dataset["validation"], tokenizer=tokenizer)
        fold_df = pd.DataFrame(fold_val)

        if epoch_set > 1: 
            model = AutoModelForSeq2SeqLM.from_pretrained(FOLD_MODEL_PATH)
            print(f"LOADING MODEL {FOLD_MODEL_PATH}")
        else: 
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            print(f"LOADING MODEL {model_name}")

        print(device)
        model.to(device)

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        compute_metrics = ev.compute_metric_with_params(tokenizer) 

        if not os.path.exists(f'reports/'): 
            os.mkdir(f'reports/')

        training_args = Seq2SeqTrainingArguments(
                **TRAIN_ARGS["SEQ_TRAINER_ARGS"],
            )
        
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=fold_train,
            eval_dataset=fold_val,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        if epoch_set!=0:
            trainer.train()

        text = fold_val["input_sequence"]
        summaries = infer.generate_summary(text, model, tokenizer, TRAIN_ARGS["ENCODER_LENGTH"], TRAIN_ARGS["DECODER_LENGTH"])
        
        fold_df["prediction"] = summaries[1]
        fold_df["rouge"] = rouge.compute(predictions=fold_df["prediction"], 
                    references=fold_df["output_sequence"],
                    use_stemmer=True, 
                    use_aggregator=False,
                    rouge_types=["rouge1"])["rouge1"]
        
        fold_results[epoch_set][i] = fold_df

        print("FOLDS IN RESULTS ", fold_results[epoch_set].keys())
        
        ########## SAVE FOLD MODEL
        if not os.path.exists(FOLD_MODEL_PATH): 
            os.mkdir(FOLD_MODEL_PATH)

        trainer.save_model(FOLD_MODEL_PATH)

        latest_run_epoch = epoch_set

########## CONVERT TO DATAFRAME

for epoch_i, (epoch_set) in enumerate(fold_results.keys()): 
    
    for i, (k, f_df) in enumerate(fold_results[epoch_set].items()): 
        
        f_df['fold'] = k
        f_df['epoch_set'] = epoch_set

        if (epoch_i==0 and i==0): 
            cv_df = f_df.copy()
        else: 
            cv_df = pd.concat([cv_df, f_df])

########## SAVE THE FILE

with open('cv_df_check_drift.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 82278.55 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 92824.38 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 56999.10 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 57191.43 examples/s]
Map: 100%|██████████| 1580/1580 [00:00<00:00, 2150.49 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 47828.52 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 49310.63 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2192.33 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET 1
TRAINING EPOCHS 1
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 75030.07 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 91230.54 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 52506.90 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 55410.63 examples/s]
Map: 100%|██████████| 1580/1580 [00:00<00:00, 2041.51 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 46767.92 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 37600.40 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 1997.66 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6693,3.236693,0.4755,0.2246,0.4347,0.4352,14.4768,0.2169,0.9338,0.9359,9599,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET 2
TRAINING EPOCHS 1
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 62509.00 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 88190.71 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 41587.96 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 52155.64 examples/s]
Map: 100%|██████████| 1580/1580 [00:00<00:00, 2087.00 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 45696.56 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 50340.97 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2076.50 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.8358,3.216984,0.4819,0.2351,0.4359,0.4364,14.9762,0.2179,0.9687,0.9692,9940,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET 3
TRAINING EPOCHS 1
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 48133.65 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 84889.48 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 40138.34 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 44441.91 examples/s]
Map: 100%|██████████| 1580/1580 [00:00<00:00, 1929.06 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 46367.38 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 48623.09 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2074.92 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.3412,3.302416,0.474,0.2228,0.4274,0.428,14.9549,0.2132,0.9655,0.9661,9908,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET 4
TRAINING EPOCHS 1
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 78888.79 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 98962.10 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 59949.52 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 58242.45 examples/s]
Map: 100%|██████████| 1580/1580 [00:00<00:00, 2358.70 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 57377.52 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 47695.78 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2406.26 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.054,3.412664,0.4716,0.2235,0.427,0.4271,15.0013,0.2122,0.9644,0.965,9897,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET 7
TRAINING EPOCHS 3
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 74946.10 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 91269.76 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 54165.17 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 54152.34 examples/s]
Map: 100%|██████████| 1580/1580 [00:00<00:00, 2251.80 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 54280.91 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 53554.00 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2310.59 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.9062,3.287638,0.4774,0.2248,0.4336,0.4338,14.4068,0.2247,0.9151,0.9185,9420,10256
2,2.6245,3.243054,0.4787,0.2246,0.4292,0.4293,14.9199,0.2207,0.9408,0.9425,9666,10256
3,2.3745,3.298782,0.4759,0.2265,0.4287,0.4289,15.0763,0.2249,0.9556,0.9565,9810,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET 9
TRAINING EPOCHS 2
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 72010.81 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 92170.16 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 48275.36 examples/s]
Filter: 100%|██████████| 1580/1580 [00:00<00:00, 38132.89 examples/s]
Map: 100%|██████████| 1580/1580 [00:00<00:00, 1842.17 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 40427.64 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 49472.23 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2015.74 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.6921,3.473659,0.4636,0.2139,0.4186,0.4189,14.6083,0.2029,0.9256,0.9282,9520,10256
2,1.8856,3.486224,0.4688,0.2202,0.4234,0.4232,15.1026,0.2264,0.9676,0.9681,9929,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET 0
TRAINING EPOCHS 0
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 62501.95 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 94919.75 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 52084.67 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 54246.13 examples/s]
Map: 100%|██████████| 1610/1610 [00:00<00:00, 2229.06 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 51059.36 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 54353.98 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2314.10 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET 1
TRAINING EPOCHS 1
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 80127.27 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 96817.93 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 58787.22 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 62941.14 examples/s]
Map: 100%|██████████| 1610/1610 [00:00<00:00, 2347.52 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 55335.91 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 60100.62 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2361.05 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.64,3.297258,0.4662,0.2142,0.4279,0.4279,14.0468,0.2099,0.8896,0.8953,8806,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET 2
TRAINING EPOCHS 1
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 77156.38 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 93645.00 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 51579.03 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 52873.79 examples/s]
Map: 100%|██████████| 1610/1610 [00:00<00:00, 2222.31 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 50876.54 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 56274.33 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2302.97 examples/s]
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.8206,3.275213,0.4642,0.2114,0.4245,0.4248,14.3017,0.2079,0.908,0.912,8970,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET 3
TRAINING EPOCHS 1
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 76200.67 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 91316.54 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 47632.29 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 40636.86 examples/s]
Map: 100%|██████████| 1610/1610 [00:00<00:00, 1903.45 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 47338.66 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 50640.89 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2041.91 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.334,3.351566,0.4628,0.205,0.4207,0.421,14.4473,0.2058,0.9179,0.9211,9060,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET 4
TRAINING EPOCHS 1
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 77770.37 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 99237.68 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 55808.51 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 54184.32 examples/s]
Map: 100%|██████████| 1610/1610 [00:00<00:00, 2252.63 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 53673.80 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 55456.75 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2315.60 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.047,3.469796,0.4565,0.2051,0.4141,0.4144,14.554,0.2049,0.9181,0.9213,9062,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET 7
TRAINING EPOCHS 3
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 64240.28 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 70908.54 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 47530.70 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 45628.46 examples/s]
Map: 100%|██████████| 1610/1610 [00:00<00:00, 2033.33 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 45852.75 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 45999.23 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2028.42 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.904,3.347619,0.4563,0.2067,0.4151,0.4161,14.3004,0.2029,0.8958,0.9009,8861,9836
2,2.5988,3.323474,0.4674,0.2113,0.4259,0.4268,15.2237,0.2156,0.9931,0.9931,9768,9836
3,2.3469,3.381261,0.4616,0.2004,0.4185,0.4191,14.9311,0.208,0.959,0.9598,9441,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET 9
TRAINING EPOCHS 2
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 74759.12 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 94908.02 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 53440.77 examples/s]
Filter: 100%|██████████| 1610/1610 [00:00<00:00, 55086.47 examples/s]
Map: 100%|██████████| 1610/1610 [00:00<00:00, 2300.31 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 54236.99 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 56180.24 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2332.09 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.6909,3.561371,0.4619,0.2108,0.4185,0.4189,15.4733,0.2187,0.9955,0.9955,9792,9836
2,1.8858,3.586042,0.4615,0.2088,0.4154,0.4165,15.251,0.2162,0.9899,0.9899,9737,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET 0
TRAINING EPOCHS 0
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 76483.95 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 95048.14 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 55302.84 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 54324.35 examples/s]
Map: 100%|██████████| 1568/1568 [00:00<00:00, 2283.62 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 55870.78 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 58131.77 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2375.49 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET 1
TRAINING EPOCHS 1
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 74947.79 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 88862.21 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 45477.09 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 45430.91 examples/s]
Map: 100%|██████████| 1568/1568 [00:00<00:00, 1986.29 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 46562.55 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 49067.86 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2080.23 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6821,3.233121,0.4801,0.2296,0.4398,0.4397,14.4094,0.226,0.9442,0.9457,9663,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET 2
TRAINING EPOCHS 1
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 74065.48 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 88628.58 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 43652.10 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 45900.81 examples/s]
Map: 100%|██████████| 1568/1568 [00:00<00:00, 1941.63 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 52345.70 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 53502.48 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2311.46 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.8506,3.219155,0.476,0.2259,0.4383,0.4383,14.6436,0.2247,0.9501,0.9513,9720,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET 3
TRAINING EPOCHS 1
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 75623.16 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 89755.06 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 47132.42 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 48951.39 examples/s]
Map: 100%|██████████| 1568/1568 [00:00<00:00, 1927.29 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 48924.60 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 49115.33 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 1981.94 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.3574,3.296908,0.4777,0.2251,0.4383,0.4377,14.7287,0.2272,0.9527,0.9538,9746,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET 4
TRAINING EPOCHS 1
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 64290.77 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 95439.97 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 53096.31 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 54700.73 examples/s]
Map: 100%|██████████| 1568/1568 [00:00<00:00, 2294.52 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 55390.41 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 56181.76 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2388.31 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.0702,3.403243,0.4719,0.2261,0.4285,0.4284,14.8903,0.2244,0.974,0.9744,9956,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET 7
TRAINING EPOCHS 3
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 76235.02 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 93483.57 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 54897.07 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 54188.72 examples/s]
Map: 100%|██████████| 1568/1568 [00:00<00:00, 2312.83 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 35801.21 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 55831.35 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2339.89 examples/s]
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.9127,3.290067,0.4762,0.2275,0.4351,0.4349,14.8557,0.2259,0.9624,0.9631,9841,10218
2,2.6414,3.249582,0.4834,0.2367,0.4446,0.4446,14.9273,0.2384,0.9779,0.9782,9995,10218
3,2.3781,3.300187,0.4799,0.232,0.4374,0.4372,15.0136,0.2338,0.9791,0.9794,10007,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET 9
TRAINING EPOCHS 2
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 73444.00 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 86916.28 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 45370.73 examples/s]
Filter: 100%|██████████| 1568/1568 [00:00<00:00, 46035.76 examples/s]
Map: 100%|██████████| 1568/1568 [00:00<00:00, 1957.40 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 48528.84 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 50189.31 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2116.36 examples/s]


LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.7007,3.492974,0.4585,0.2144,0.4172,0.4171,14.926,0.2176,0.9546,0.9556,9764,10218
2,1.8968,3.494753,0.467,0.2203,0.4221,0.4219,15.3465,0.2228,1.0,1.0023,10241,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])


In [17]:
########## LOAD CV RESULTS

import pickle
import pandas as pd
with open('cv_df_check_drift.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

import pickle
with open('cluster_cv_df_check_drift.pickle', 'rb') as handle:
    cv_df_cluster = pickle.load(handle)

########## ADD ZERO SHOT

# model_name="Salesforce/codet5-base-multi-sum"
# tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# zs_df = cv_df.copy()
# zs_df = zs_df.drop_duplicates(["question_id", "input_sequence"])
# zs_df["epoch_set"] = 0
# zs_df["prediction"] = infer.generate_summary(list(zs_df["input_sequence"].values), model, tokenizer, FULL_TRAIN_ARGS["ENCODER_LENGTH"], FULL_TRAIN_ARGS["DECODER_LENGTH"])[1]
# zs_df["rouge"] = rouge.compute(predictions=zs_df["prediction"].values, 
#                     references=zs_df["output_sequence"].values,
#                     use_stemmer=True, 
#                     use_aggregator=False,
#                     rouge_types=["rouge1"])["rouge1"]
# cv_df = pd.concat([cv_df, zs_df])

# with open('cv_df_chekc.pickle', 'wb') as handle:
#     pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
cv_df["model_id"] = "epoch_set_" + cv_df["epoch_set"].astype(str)
cv_df_cluster["model_id"] = "cluster_set_" + cv_df_cluster["cluster_set"].astype(str)
cv_df = pd.concat([cv_df.drop("epoch_set", axis=1), cv_df_cluster.drop("cluster_set", axis=1)]).reset_index(drop=True)

In [21]:
########## ROUGE PER SETTING

print("Mean")
print(cv_df.groupby("model_id")["rouge"].mean())

print("STD")
print(cv_df.groupby("model_id")["rouge"].std())


Mean
model_id
cluster_set_0    0.421810
cluster_set_1    0.400814
cluster_set_2    0.401374
cluster_set_3    0.432673
cluster_set_4    0.312228
cluster_set_5    0.349272
cluster_set_6    0.400410
epoch_set_0      0.312683
epoch_set_1      0.473042
epoch_set_2      0.473410
epoch_set_3      0.472201
epoch_set_4      0.466596
epoch_set_7      0.472444
epoch_set_9      0.465552
Name: rouge, dtype: float64
STD
model_id
cluster_set_0    0.183458
cluster_set_1    0.182638
cluster_set_2    0.186765
cluster_set_3    0.191407
cluster_set_4    0.174070
cluster_set_5    0.189003
cluster_set_6    0.182253
epoch_set_0      0.172770
epoch_set_1      0.191672
epoch_set_2      0.192754
epoch_set_3      0.189420
epoch_set_4      0.189238
epoch_set_7      0.192562
epoch_set_9      0.189637
Name: rouge, dtype: float64


### Step 2. Learn performance

In [26]:
def step_two(X_train, y_train, model, X_val=None, y_val=None,  save=False): 

    if model=="lr":
        reg = LinearRegression().fit(X_train, y_train)
    elif model =="svm": 
        reg = SVR().fit(X_train, y_train)
    elif model=="rf":
        reg = RandomForestRegressor.fit(X_train, y_train)
    elif model=="lgbm":
        reg = LGBMRegressor()
        reg.fit(X=X_train, y=y_train)
    elif model=="catboost":
        reg = CatBoostRegressor()
        reg.fit(X=X_train, y=y_train)

    if save:
        with open(f'./models/reg_{model}_drift.pkl','wb') as f:
            pickle.dump(reg, f)
        return f'./models/reg_{model}_drift.pkl'
    
    else:
        y_pred = reg.predict(X_val)
        y_pred[y_pred<0] = 0
        mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
        rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
        return {"pred": y_pred, "mae": mae, "rmse": rmse}

In [27]:
cv_df.fold.value_counts()

fold
2    11354
0    11186
1    10766
Name: count, dtype: int64

In [28]:
t_models = ["lr", "svm", "lgbm", "catboost"]

results = {}

cv_df["perf_hat"] = 0


for test_fold in range(cv_df.fold.max()+1):
    print(test_fold)

    # Prepare the input data
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(cv_df.loc[cv_df.fold!=test_fold, "input_sequence"])
    X_train_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold!=test_fold, "model_id"], sparse=True).sparse.to_coo().tocsr()
    X_train = hstack([X_train_column_sparse, X_train_tfidf])
    y_train = cv_df.loc[cv_df.fold!=test_fold, "rouge"]
    
    X_val_tfidf = vectorizer.transform(cv_df.loc[cv_df.fold==test_fold, "input_sequence"])
    X_val_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold==test_fold, "model_id"], sparse=True).sparse.to_coo().tocsr()
    X_val = hstack([X_val_column_sparse, X_val_tfidf])
    y_val = cv_df.loc[cv_df.fold==test_fold, "rouge"]

    results[test_fold] = {}
    for model in t_models:
        print(model)
        preds_df = step_two(X_train=X_train,
                            y_train=y_train,
                            X_val=X_val,
                            y_val=y_val,
                            model=model)
        cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
        cv_df.loc[cv_df.fold==test_fold, f"perf_hat"] = preds_df["pred"]
        results[test_fold][model] = preds_df

cv_df = cv_df.reset_index(drop=True)

0
lr


  cv_df.loc[cv_df.fold==test_fold, f"perf_hat"] = preds_df["pred"]


svm
lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008466 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6290
[LightGBM] [Info] Number of data points in the train set: 22120, number of used features: 679
[LightGBM] [Info] Start training from score 0.416246
catboost
Learning rate set to 0.066781
0:	learn: 0.1909253	total: 71.9ms	remaining: 1m 11s
1:	learn: 0.1899228	total: 79.2ms	remaining: 39.5s
2:	learn: 0.1890138	total: 85.4ms	remaining: 28.4s
3:	learn: 0.1881102	total: 91.2ms	remaining: 22.7s
4:	learn: 0.1874278	total: 95.8ms	remaining: 19.1s
5:	learn: 0.1867471	total: 101ms	remaining: 16.7s
6:	learn: 0.1861609	total: 105ms	remaining: 14.9s
7:	learn: 0.1856596	total: 110ms	remaining: 13.7s
8:	learn: 0.1852226	total: 115ms	remaining: 12.7s
9:	learn: 0.1849355	total: 120ms	remaining: 11.9s
10:	learn: 0.1845484	total: 126ms	remaining: 11.3s
11:	learn: 0.1842428	total: 131ms	remaining: 10.8s
1

In [29]:
cv_df.shape

(33306, 18)

In [30]:
cv_df.groupby("model_id")["catboost_perf_hat"].mean()

model_id
cluster_set_0    0.428039
cluster_set_1    0.406862
cluster_set_2    0.408275
cluster_set_3    0.435107
cluster_set_4    0.318257
cluster_set_5    0.353202
cluster_set_6    0.406899
epoch_set_0      0.318779
epoch_set_1      0.468518
epoch_set_2      0.468573
epoch_set_3      0.467462
epoch_set_4      0.463160
epoch_set_7      0.468741
epoch_set_9      0.463084
Name: catboost_perf_hat, dtype: float64

In [31]:
# rearrange the file

model_results = {}

for model in t_models:
    model_results[model]= {}
    model_results[model]["rmse"] = []
    model_results[model]["mae"] = [] 

    for fold in range(3):
    
        model_results[model]["mae"].append(results[fold][model]["mae"])
        model_results[model]["rmse"].append(results[fold][model]["rmse"])
    
    model_results[model]["rmse_avg"] = np.array(model_results[model]["rmse"]).mean()
    model_results[model]["mae_avg"] = np.array(model_results[model]["mae"]).mean()

for model in t_models:
    print(model)
    print("RMSE ", model_results[model]["rmse_avg"])
    print("MAE ",model_results[model]["mae_avg"])
    print("\n")

with open('cd_df_with_predictions_drift.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

lr
RMSE  0.23356784371625405
MAE  0.1801408141483302


svm
RMSE  0.19195029365664382
MAE  0.15219262011773071


lgbm
RMSE  0.185548755011633
MAE  0.14767678638052786


catboost
RMSE  0.1850049017069022
MAE  0.1470656882418074




In [32]:
with open('cd_df_with_predictions_drift.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

# TRAIN ON ALL PREDICTIONS AT ONCE

t_models = ["lr", "svm", "lgbm", "catboost"]

# Prepare the input data
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(cv_df.loc[:, "input_sequence"])
X_train_column_sparse = pd.get_dummies(cv_df.loc[:, "model_id"], sparse=True).sparse.to_coo().tocsr()
X_train = hstack([X_train_column_sparse, X_train_tfidf])
y_train = cv_df.loc[:, "rouge"]
    
with open("./models/vectorizer_drift.pkl", "wb") as file:
    pickle.dump(vectorizer, file, protocol=pickle.HIGHEST_PROTOCOL) 
      
for model in t_models:
    print(model)
    preds_df = step_two(X_train=X_train,
                        y_train=y_train,
                        model=model,
                        save=True)

lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9099
[LightGBM] [Info] Number of data points in the train set: 33306, number of used features: 870
[LightGBM] [Info] Start training from score 0.418179
catboost
Learning rate set to 0.071242
0:	learn: 0.1934260	total: 11.7ms	remaining: 11.7s
1:	learn: 0.1922120	total: 19.8ms	remaining: 9.88s
2:	learn: 0.1911202	total: 26.8ms	remaining: 8.91s
3:	learn: 0.1901598	total: 33.6ms	remaining: 8.36s
4:	learn: 0.1893403	total: 39.3ms	remaining: 7.82s
5:	learn: 0.1885599	total: 45.2ms	remaining: 7.49s
6:	learn: 0.1879252	total: 50.3ms	remaining: 7.13s
7:	learn: 0.1874046	total: 54.8ms	remaining: 6.79s
8:	learn: 0.1868667	total: 59.5ms	remaining: 6.55s
9:	learn: 0.1863687	total: 63.9ms	remaining: 6.33s
10:	learn: 0.1858978	total: 68.1

In [34]:
cv_df.model_id.unique()

array(['epoch_set_0', 'epoch_set_1', 'epoch_set_2', 'epoch_set_3',
       'epoch_set_4', 'epoch_set_7', 'epoch_set_9', 'cluster_set_0',
       'cluster_set_1', 'cluster_set_2', 'cluster_set_3', 'cluster_set_4',
       'cluster_set_5', 'cluster_set_6'], dtype=object)