### 1. Settings

In [1]:
#####################################
##########  DEPENDENCIES ############
#####################################

import os
import pickle
import numpy as np
from tqdm import tqdm # type: ignore
import pandas as pd
import copy
from datetime import datetime, date

from datasets import load_dataset, DatasetDict, Dataset
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import KFold, train_test_split # type: ignore
import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.sparse import hstack

tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
#####################################
############  CONSTANTS #############
#####################################
# Random seed; used below for shuffling the train/test frames and for KFold.
RS = 42

# Free-form model label; only stored in FULL_TRAIN_ARGS["MODEL"] in this notebook.
MODEL = "CodeT5"
# NOTE(review): BATCH_SIZE is only stored in FULL_TRAIN_ARGS; the trainer itself
# uses per_device_*_batch_size = 4 below. Confirm which value is intended.
BATCH_SIZE = 16
# Passed to infer.generate_summary together with ENCODER_LENGTH at inference time.
DECODER_LENGTH = 20
ENCODER_LENGTH = 30
# Suffix of every result/model file written by this notebook. It embeds today's
# date, so "load" cells executed on a later day will look for a different file name.
ANALYSIS_POSTFIX = f"mined_no_drift_{str(date.today())}"
# Despite the name this is an int; it is interpolated into the input data path.
DATE_STR = 20240721
# NOTE(review): not referenced by any visible cell, and ANALYSIS_POSTFIX says
# "no_drift" while this flag is True - confirm which one is correct.
SEMANTIC_DRIFT = True
# Hugging Face checkpoint used for both the tokenizer and the seq2seq model.
model_name="Salesforce/codet5-base-multi-sum"

# Shared training configuration. The CV cell deep-copies this dict per run.
FULL_TRAIN_ARGS = {
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "MODEL": MODEL,
    # Unpacked into Seq2SeqTrainingArguments(**...) by the CV cell.
    "SEQ_TRAINER_ARGS": {
        "overwrite_output_dir": True,
        # Cumulative epoch budgets to evaluate. The CV cell replaces this list
        # with a single number (the epochs still to train) before building the
        # training arguments.
        "num_train_epochs": [0, 1, 5, 8, 10, 16],
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 6e-6,
        "warmup_steps": 500,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        # No intermediate checkpoints; the CV cell saves the model explicitly.
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
        "output_dir" : 'reports/results',
        "logging_dir" : "reports/logs",
    },
}

# NOTE(review): skip_special_tokens is normally a decode-time argument; confirm
# that passing it to from_pretrained has the intended effect.
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
# Base model instance; the CV cell rebinds `model` for every fold / epoch set.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. Conala data. Preprocessing. 

In [2]:
# Load the clustered CoNaLa-mined pairs; only the first 10000 rows are used.
dataset = pd.read_csv(f"../data/processed/conala/{DATE_STR}/conala_mined_clustered.csv").head(10000)

# Hold-out split on question ids (not on rows), so every snippet of a question
# lands on the same side of the split.
qids = sorted(dataset.question_id.unique())
cut = int(len(qids)*0.785)
train_idx = qids[:cut]
test_idx = qids[cut:]
# The smallest question id is moved from the training side to the test side.
test_idx.append(train_idx.pop(0))

train_frame = dataset[dataset.question_id.isin(train_idx)]
test_frame = dataset[dataset.question_id.isin(test_idx)]

print("Train Data: ", train_frame.shape)
print("Test Data: ", test_frame.shape)

print("Train Data: Cluster", train_frame.cluster.value_counts())
print("Test Data: Cluster", test_frame.cluster.value_counts())

# Shuffle the rows reproducibly and hand them over to `datasets`.
train_dataset = Dataset.from_pandas(
    train_frame.sample(frac=1, random_state=RS).reset_index(drop=True)
)
test_dataset = Dataset.from_pandas(
    test_frame.sample(frac=1, random_state=RS).reset_index(drop=True)
)

# Tokenised test set plus a DataFrame view that carries a positional `id`.
test_data = pr.preprocess_dataset(test_dataset, tokenizer=tokenizer, intent_colum_name="intent")
test_df = pd.DataFrame(test_data).assign(id=lambda frame: frame.index)

device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
rouge = evaluate.load('rouge')


# Cross Validation
# Folds are drawn over unique question ids; `splits` stores, per fold, the
# positions into `questions_list` for the training and validation questions.
folds = KFold(n_splits=3, shuffle=True, random_state=RS)
questions_list = np.array(list(set(train_dataset["question_id"])))
splits_obj = folds.split(questions_list)
splits = []
for i, fold_pair in enumerate(splits_obj):
    print(f"Fold {i}")
    splits.append(list(fold_pair))

Train Data:  (7941, 11)
Test Data:  (2059, 11)
Train Data: Cluster cluster
3    3498
2    3409
4     997
0      19
1      18
Name: count, dtype: int64
Test Data: Cluster cluster
3    1240
2     533
4     285
1       1
Name: count, dtype: int64


Filter: 100%|████████████████████| 2059/2059 [00:00<00:00, 104699.97 examples/s]
Filter: 100%|█████████████████████| 2059/2059 [00:00<00:00, 40289.39 examples/s]
Map: 100%|█████████████████████████| 2059/2059 [00:00<00:00, 2128.70 examples/s]


Fold 0
Fold 1
Fold 2


In [None]:
# Cross-validated fine-tuning.
#
# For every fold the model is trained incrementally through the cumulative epoch
# budgets in FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"]: each budget
# continues from the checkpoint of the previous one and trains only the missing
# epochs. After every budget the validation questions are summarised and scored
# per example with ROUGE-1.
#
# Produces:
#   fold_results[epoch_set][fold] -> DataFrame of the validation fold with
#                                    `prediction` and `rouge` columns
#   cv_df                         -> all of the above stacked, with `fold` and
#                                    `epoch_set` columns, also pickled to disk.

epoch_sets = sorted(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"])
fold_results = {epoch_set: {} for epoch_set in epoch_sets}

FOLD_MODEL_PATH = "./tmp/"   # rolling checkpoint shared by consecutive epoch sets
INFER_BATCH_SIZE = 1000      # number of inputs per infer.generate_summary call

# Create every directory written to below up front. The original only created
# `reports/`, although the result pickle goes to `reports/results/`.
os.makedirs("reports/results", exist_ok=True)
os.makedirs(FOLD_MODEL_PATH, exist_ok=True)

for i, (train_idxs, val_idxs) in enumerate(splits):

    print(f"Fold {i}")
    # Resolve the fold's question ids once; the filter callbacks then do an O(1)
    # set lookup instead of re-indexing the numpy array for every row.
    train_qids = set(questions_list[train_idxs].tolist())
    val_qids = set(questions_list[val_idxs].tolist())
    fold_dataset = DatasetDict({
        "train": train_dataset.filter(lambda row: row["question_id"] in train_qids),
        "validation": train_dataset.filter(lambda row: row["question_id"] in val_qids),
    })
    fold_train = pr.preprocess_dataset(fold_dataset["train"], tokenizer=tokenizer, intent_colum_name="intent")
    fold_val = pr.preprocess_dataset(fold_dataset["validation"], tokenizer=tokenizer, intent_colum_name="intent")

    # Epochs already trained into the checkpoint at FOLD_MODEL_PATH for THIS fold.
    # Resetting it per fold means the first epoch set always starts from the
    # pretrained weights, whatever values the epoch list contains.
    latest_run_epoch = 0

    for epoch_set in epoch_sets:

        fold_df = pd.DataFrame(fold_val)
        print(f"TRAINING EPOCH SET {epoch_set}")

        TRAIN_ARGS = copy.deepcopy(FULL_TRAIN_ARGS)

        # Only the epochs missing on top of the current checkpoint are trained.
        epochs_to_run = epoch_set - latest_run_epoch
        TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = epochs_to_run

        print(f'TRAINING EPOCHS {epochs_to_run}')

        if latest_run_epoch > 0:
            # NOTE(review): only the model weights are reloaded; optimizer and
            # LR-scheduler state presumably start fresh for every segment (so
            # warmup is re-applied) - confirm this is intended.
            model = AutoModelForSeq2SeqLM.from_pretrained(FOLD_MODEL_PATH)
            print(f"LOADING MODEL {FOLD_MODEL_PATH}")
        else:
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            print(f"LOADING MODEL {model_name}")

        print(device)
        model.to(device)

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        compute_metrics = ev.compute_metric_with_params(tokenizer)

        training_args = Seq2SeqTrainingArguments(
            **TRAIN_ARGS["SEQ_TRAINER_ARGS"],
        )

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=fold_train,
            eval_dataset=fold_val,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        if epochs_to_run > 0:
            trainer.train()

        # Generate summaries for the validation fold in chunks. Element [1] of
        # generate_summary's return value is used as the list of generated strings.
        text = fold_val["input_sequence"]
        summaries = []
        for batch_start_idx in range(0, len(text), INFER_BATCH_SIZE):
            batch_text = text[batch_start_idx:batch_start_idx + INFER_BATCH_SIZE]
            summaries.extend(
                infer.generate_summary(batch_text,
                                       model,
                                       tokenizer,
                                       TRAIN_ARGS["ENCODER_LENGTH"],
                                       TRAIN_ARGS["DECODER_LENGTH"])[1]
            )
        fold_df["prediction"] = summaries

        # Per-example ROUGE-1 (use_aggregator=False keeps one score per row).
        fold_df["rouge"] = rouge.compute(predictions=fold_df["prediction"],
                                         references=fold_df["output_sequence"],
                                         use_stemmer=True,
                                         use_aggregator=False,
                                         rouge_types=["rouge1"])["rouge1"]

        fold_results[epoch_set][i] = fold_df

        ########## SAVE FOLD MODEL
        # Overwrites the rolling checkpoint that the next epoch set resumes from.
        trainer.save_model(FOLD_MODEL_PATH)

        latest_run_epoch = epoch_set

########## CONVERT TO DATAFRAME

# Tag every per-fold frame and stack them with a single concat.
cv_frames = []
for epoch_set, frames_by_fold in fold_results.items():
    for k, f_df in frames_by_fold.items():
        f_df['fold'] = k
        f_df['epoch_set'] = epoch_set
        cv_frames.append(f_df)

cv_df = pd.concat(cv_frames)

########## SAVE THE FILE

with open(f'reports/results/cv_result_{ANALYSIS_POSTFIX}.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fold 0


Filter: 100%|█████████████████████| 7941/7941 [00:00<00:00, 57650.44 examples/s]
Filter: 100%|█████████████████████| 7941/7941 [00:00<00:00, 72436.37 examples/s]
Filter: 100%|█████████████████████| 5301/5301 [00:00<00:00, 39013.80 examples/s]
Filter: 100%|█████████████████████| 5301/5301 [00:00<00:00, 38065.67 examples/s]
Map: 100%|█████████████████████████| 5301/5301 [00:02<00:00, 2283.56 examples/s]
Filter: 100%|█████████████████████| 2640/2640 [00:00<00:00, 40756.47 examples/s]
Filter: 100%|█████████████████████| 2640/2640 [00:00<00:00, 35303.00 examples/s]
Map: 100%|█████████████████████████| 2640/2640 [00:01<00:00, 2165.41 examples/s]


TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.5387,4.046259,0.3595,0.1241,0.3225,0.3226,13.6057,0.0552,1.0,1.098,28110,25600


TRAINING EPOCH SET 5
TRAINING EPOCHS 4
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9892,3.896081,0.371,0.1313,0.3315,0.3316,13.5659,0.0587,1.0,1.0918,27951,25600
2,3.8694,3.834083,0.377,0.134,0.3361,0.3359,13.675,0.0592,1.0,1.1007,28178,25600
3,3.751,3.813528,0.3756,0.1337,0.3358,0.3357,13.5015,0.0599,1.0,1.0759,27543,25600
4,3.6943,3.808077,0.3779,0.1342,0.3367,0.3366,13.5746,0.0599,1.0,1.0832,27731,25600


TRAINING EPOCH SET 8
TRAINING EPOCHS 3
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.5163,3.815096,0.3772,0.1339,0.3354,0.3353,13.4723,0.0599,1.0,1.0722,27449,25600
2,3.4842,3.80744,0.3767,0.134,0.3333,0.3334,13.6519,0.0591,1.0,1.0888,27874,25600
3,3.4552,3.804747,0.3771,0.1356,0.3337,0.3336,13.5496,0.0585,1.0,1.0761,27547,25600


TRAINING EPOCH SET 10
TRAINING EPOCHS 2
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9092,3.730794,0.3794,0.137,0.3373,0.3374,13.5621,0.0625,1.0,1.0862,27806,25600
2,3.7778,3.714485,0.3833,0.1389,0.3403,0.3403,13.6792,0.063,1.0,1.0945,28020,25600


TRAINING EPOCH SET 16
TRAINING EPOCHS 6


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6105,3.713634,0.3829,0.1384,0.339,0.3391,13.4489,0.0637,1.0,1.0698,27388,25600
2,3.5787,3.694589,0.3841,0.1379,0.3388,0.3389,13.7432,0.0614,1.0,1.0959,28056,25600
3,3.5357,3.691576,0.3818,0.1385,0.3381,0.3383,13.6489,0.0619,1.0,1.0839,27747,25600
4,3.4707,3.691134,0.383,0.1389,0.3392,0.3393,13.6549,0.0626,1.0,1.0835,27737,25600
5,3.4317,3.692607,0.3832,0.1393,0.339,0.3391,13.7295,0.0621,1.0,1.09,27903,25600
6,3.4004,3.695829,0.3826,0.1386,0.3379,0.3381,13.6636,0.0612,1.0,1.0837,27744,25600


Fold 1


Filter: 100%|█████████████████████| 7941/7941 [00:00<00:00, 66131.57 examples/s]
Filter: 100%|█████████████████████| 7941/7941 [00:00<00:00, 78039.37 examples/s]
Filter: 100%|█████████████████████| 5290/5290 [00:00<00:00, 39966.11 examples/s]
Filter: 100%|█████████████████████| 5290/5290 [00:00<00:00, 40864.87 examples/s]
Map: 100%|█████████████████████████| 5290/5290 [00:02<00:00, 2350.42 examples/s]
Filter: 100%|█████████████████████| 2651/2651 [00:00<00:00, 39549.91 examples/s]
Filter: 100%|█████████████████████| 2651/2651 [00:00<00:00, 43368.93 examples/s]
Map: 100%|█████████████████████████| 2651/2651 [00:01<00:00, 2411.18 examples/s]


TRAINING EPOCH SET 0
TRAINING EPOCHS 0


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda
TRAINING EPOCH SET 1
TRAINING EPOCHS 1


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.5353,4.048568,0.3596,0.1229,0.3192,0.3192,13.4945,0.0558,1.0,1.0859,28024,25808


TRAINING EPOCH SET 5
TRAINING EPOCHS 4


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9867,3.900426,0.3705,0.1284,0.3285,0.3282,13.2376,0.059,1.0,1.054,27202,25808
2,3.8665,3.837449,0.3764,0.133,0.3339,0.3338,13.4048,0.0603,1.0,1.0677,27555,25808
3,3.7601,3.811446,0.3714,0.129,0.3297,0.3296,13.2482,0.0587,1.0,1.0514,27134,25808
4,3.705,3.80862,0.3755,0.131,0.3313,0.3313,13.4994,0.0587,1.0,1.0759,27767,25808


TRAINING EPOCH SET 8
TRAINING EPOCHS 3


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.5201,3.818031,0.3733,0.1282,0.3292,0.3291,13.1637,0.0569,1.0,1.0431,26921,25808
2,3.4827,3.807237,0.3753,0.1295,0.3312,0.3311,13.3444,0.0589,1.0,1.0582,27309,25808
3,3.4645,3.802046,0.3736,0.1279,0.3298,0.3298,13.467,0.0565,1.0,1.0701,27618,25808


TRAINING EPOCH SET 10
TRAINING EPOCHS 2


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.2127,3.86333,0.3719,0.1239,0.3263,0.326,13.06,0.0537,1.0,1.0339,26684,25808
2,3.2196,3.857049,0.3696,0.1221,0.3242,0.324,13.2935,0.0517,1.0,1.0532,27180,25808


TRAINING EPOCH SET 16
TRAINING EPOCHS 6
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.9576,3.952627,0.3654,0.119,0.3197,0.3198,12.9042,0.0507,1.0,1.0151,26197,25808
2,3.0607,3.878911,0.3678,0.1211,0.3216,0.3216,13.1494,0.0527,1.0,1.0418,26887,25808
3,3.2071,3.83857,0.3663,0.12,0.3217,0.3217,13.1973,0.0506,1.0,1.0475,27033,25808
4,3.2472,3.82799,0.3675,0.1205,0.3217,0.3215,13.384,0.0503,1.0,1.0653,27492,25808
5,3.2245,3.835048,0.3689,0.1219,0.3223,0.3222,13.361,0.0517,1.0,1.0639,27456,25808
6,3.2034,3.83955,0.3693,0.1223,0.3234,0.3231,13.3674,0.0511,1.0,1.0628,27429,25808


Fold 2


Filter: 100%|█████████████████████| 7941/7941 [00:00<00:00, 57455.33 examples/s]
Filter: 100%|█████████████████████| 7941/7941 [00:00<00:00, 69940.59 examples/s]
Filter: 100%|█████████████████████| 5291/5291 [00:00<00:00, 31006.63 examples/s]
Filter: 100%|█████████████████████| 5291/5291 [00:00<00:00, 31246.74 examples/s]
Map: 100%|█████████████████████████| 5291/5291 [00:02<00:00, 1894.59 examples/s]
Filter: 100%|█████████████████████| 2650/2650 [00:00<00:00, 31129.48 examples/s]
Filter: 100%|█████████████████████| 2650/2650 [00:00<00:00, 34234.89 examples/s]
Map: 100%|█████████████████████████| 2650/2650 [00:01<00:00, 1957.31 examples/s]


TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.5268,4.070565,0.3686,0.1293,0.3271,0.3272,13.5117,0.0572,1.0,1.0747,27998,26052


TRAINING EPOCH SET 5
TRAINING EPOCHS 4


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9724,3.923131,0.3768,0.135,0.3334,0.3334,13.5257,0.0604,1.0,1.0731,27957,26052
2,3.8497,3.871617,0.3879,0.1404,0.3418,0.3418,13.6011,0.0619,1.0,1.0727,27946,26052


In [None]:
########## LOAD CV RESULTS

import pickle

# Read back the stacked cross-validation frame written by the training cell.
# The file name embeds ANALYSIS_POSTFIX, which contains the current date.
cv_result_path = f'reports/results/cv_result_{ANALYSIS_POSTFIX}.pickle'
with open(cv_result_path, 'rb') as handle:
    cv_df = pickle.load(handle)

########## ROUGE PER SETTING

# Per-example ROUGE grouped by cumulative epoch budget.
rouge_by_epoch_set = cv_df.groupby("epoch_set")["rouge"]

print("Mean")
print(rouge_by_epoch_set.mean())

print("STD")
print(rouge_by_epoch_set.std())


### Step 2. Learn performance

In [None]:
def step_two(X_train, y_train, model, X_val=None, y_val=None, save=False):
    """Fit one "step two" regressor and either evaluate it or persist it.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed straight to the regressor's ``fit``.
    model : str
        Which regressor to fit: ``"lr"`` (LinearRegression), ``"svm"`` (SVR),
        ``"rf"`` (RandomForestRegressor), ``"lgbm"`` (LGBMRegressor) or
        ``"catboost"`` (CatBoostRegressor). All use their default settings.
    X_val, y_val
        Validation features and targets. Required when ``save`` is False.
    save : bool
        If True, pickle the fitted regressor instead of evaluating it.

    Returns
    -------
    str
        When ``save`` is True: the path of the pickle file,
        ``./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl``.
    dict
        When ``save`` is False: ``{"pred": ..., "mae": ..., "rmse": ...}`` where
        ``pred`` holds the validation predictions with negatives clipped to 0.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names, or if ``save`` is False
        and ``X_val`` / ``y_val`` is missing.
    """
    if model == "lr":
        reg = LinearRegression()
    elif model == "svm":
        reg = SVR()
    elif model == "rf":
        # The estimator must be instantiated; calling `.fit` on the class itself
        # (as the previous version did) raises a TypeError.
        reg = RandomForestRegressor()
    elif model == "lgbm":
        reg = LGBMRegressor()
    elif model == "catboost":
        reg = CatBoostRegressor()
    else:
        raise ValueError(
            f"Unknown step-two model {model!r}; expected one of "
            "'lr', 'svm', 'rf', 'lgbm', 'catboost'."
        )

    # Fail before the (potentially slow) fit rather than after it.
    if not save and (X_val is None or y_val is None):
        raise ValueError("X_val and y_val are required when save=False.")

    reg.fit(X_train, y_train)

    if save:
        model_path = f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl'
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        with open(model_path, 'wb') as f:
            pickle.dump(reg, f)
        return model_path

    y_pred = reg.predict(X_val)
    # Predictions below zero are clipped to zero before scoring.
    y_pred[y_pred < 0] = 0
    mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
    rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
    return {"pred": y_pred, "mae": mae, "rmse": rmse}

In [None]:
# Cross-validate the step-two regressors: for every fold, fit on the other
# folds' (input text, epoch_set) -> ROUGE rows and predict the held-out fold.
t_models = ["lr", "svm", "lgbm", "catboost"]

results = {}

# Drop "ensemble" rows appended by a previous execution of this cell so that
# re-running it does not stack duplicates, and make index labels unique.
cv_df = cv_df.loc[cv_df["epoch_set"] != "ensemble"].reset_index(drop=True)

for test_fold in range(cv_df.fold.max()+1):
    print(test_fold)

    is_val = cv_df["fold"] == test_fold
    train_rows = cv_df.loc[~is_val]
    val_rows = cv_df.loc[is_val]

    # Prepare the input data: one-hot epoch_set columns followed by TF-IDF of
    # the input text. The vectorizer is fitted on the training folds only.
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(train_rows["input_sequence"])
    train_dummies = pd.get_dummies(train_rows["epoch_set"], sparse=True)
    X_train = hstack([train_dummies.sparse.to_coo().tocsr(), X_train_tfidf])
    y_train = train_rows["rouge"]

    X_val_tfidf = vectorizer.transform(val_rows["input_sequence"])
    val_dummies = pd.get_dummies(val_rows["epoch_set"], sparse=True)
    # The one-hot columns are built independently for train and validation; if
    # an epoch set were missing on one side the features would silently misalign.
    if list(val_dummies.columns) != list(train_dummies.columns):
        raise ValueError(
            f"epoch_set values differ between train and validation for fold {test_fold}"
        )
    X_val = hstack([val_dummies.sparse.to_coo().tocsr(), X_val_tfidf])
    y_val = val_rows["rouge"]

    results[test_fold] = {}
    # `reg_name` rather than `model`, so the global seq2seq `model` is not clobbered.
    for reg_name in t_models:
        print(reg_name)
        preds = step_two(X_train=X_train,
                         y_train=y_train,
                         X_val=X_val,
                         y_val=y_val,
                         model=reg_name)
        cv_df.loc[is_val, f"{reg_name}_perf_hat"] = preds["pred"]
        results[test_fold][reg_name] = preds

cv_df = cv_df.reset_index(drop=True)

# ENSEMBLE ESTIMATE (JUST HIGHEST PREDICTIONS)
# For every example pick the epoch set with the highest CatBoost-predicted ROUGE.
# NOTE(review): this assumes `id` identifies one example across all folds and
# epoch sets - confirm where `id` comes from.
models_index = cv_df.groupby("id")["catboost_perf_hat"].idxmax()
# idxmax returns index labels, so select with .loc (not positional .iloc).
optimal_ensemble = cv_df.loc[models_index, ["id", "epoch_set"]]
optimal_ensemble_map = dict(zip(optimal_ensemble.id, optimal_ensemble.epoch_set))
cv_df["opt_es_id"] = cv_df.id.map(optimal_ensemble_map)
# Explicit copy: the rows are relabelled below and must not alias cv_df.
ensemble_preds = cv_df.loc[cv_df["epoch_set"]==cv_df["opt_es_id"], :].copy()
print("Ensemble mean ROUGE:", ensemble_preds["rouge"].mean())
ensemble_preds["epoch_set"] = "ensemble"
cv_df = pd.concat([cv_df, ensemble_preds], axis=0)

In [None]:
# Mean CatBoost-predicted ROUGE per epoch set (including the "ensemble" rows).
cv_df.groupby("epoch_set")["catboost_perf_hat"].mean()

In [None]:
# Standard deviation of the CatBoost-predicted ROUGE per epoch set.
cv_df.groupby("epoch_set")["catboost_perf_hat"].std()

In [None]:
# rearrange the file
# Collapse results[fold][regressor] into per-regressor lists of fold metrics
# plus their mean and (population) standard deviation across folds.

model_results = {}

# Use the folds actually present in `results` instead of a hard-coded count.
fold_ids = sorted(results)

# `reg_name` rather than `model`, so the global seq2seq `model` is not clobbered.
for reg_name in t_models:
    rmse_per_fold = [results[fold][reg_name]["rmse"] for fold in fold_ids]
    mae_per_fold = [results[fold][reg_name]["mae"] for fold in fold_ids]

    model_results[reg_name] = {
        "rmse": rmse_per_fold,
        "mae": mae_per_fold,
        "rmse_avg": np.array(rmse_per_fold).mean(),
        "mae_avg": np.array(mae_per_fold).mean(),
        "rmse_std": np.array(rmse_per_fold).std(),
        "mae_std": np.array(mae_per_fold).std(),
    }

for reg_name in t_models:
    print(reg_name)
    print("RMSE ", model_results[reg_name]["rmse_avg"])
    print("MAE ",model_results[reg_name]["mae_avg"])
    print("\n")

    print("RMSE STD ", model_results[reg_name]["rmse_std"])
    print("MAE STD",model_results[reg_name]["mae_std"])
    print("\n")

# Persist the metric summary and the CV frame enriched with step-two predictions.
with open(f'reports/results/s2_model_results_{ANALYSIS_POSTFIX}.pickle', 'wb') as handle:
    pickle.dump(model_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(f'reports/results/cd_df_with_s2_{ANALYSIS_POSTFIX}.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
### TO SAVE THE VECTORIZER AND STEP 2 MODELS

# Reload the CV frame written by the previous cell. The file name embeds
# ANALYSIS_POSTFIX, which contains the current date.
with open(f'reports/results/cd_df_with_s2_{ANALYSIS_POSTFIX}.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

# TRAIN ON ALL PREDICTIONS AT ONCE

t_models = ["lr", "svm", "lgbm", "catboost"]

# Prepare the input data: every real (non-"ensemble") row, selected once.
base_rows = cv_df.loc[cv_df.epoch_set != "ensemble"]

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(base_rows["input_sequence"])
X_train_column_sparse = pd.get_dummies(base_rows["epoch_set"], sparse=True).sparse.to_coo().tocsr()
X_train = hstack([X_train_column_sparse, X_train_tfidf])
y_train = base_rows["rouge"]

# Make sure the output directory exists before anything is written into it.
os.makedirs("./models", exist_ok=True)

with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "wb") as file:
    pickle.dump(vectorizer, file, protocol=pickle.HIGHEST_PROTOCOL)

# With save=True step_two returns the path of the pickled regressor.
saved_model_paths = {}
for reg_name in t_models:
    print(reg_name)
    saved_model_paths[reg_name] = step_two(X_train=X_train,
                                           y_train=y_train,
                                           model=reg_name,
                                           save=True)