### 1. Settings

In [1]:
#####################################
##########  DEPENDENCIES ###########
#####################################

import os
import pickle
import numpy as np
from tqdm import tqdm # type: ignore
import pandas as pd
import copy
from datetime import datetime, date

from datasets import load_dataset, DatasetDict, Dataset
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import KFold, train_test_split # type: ignore
import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.sparse import hstack

# Register tqdm's pandas integration (e.g. `progress_apply`).
tqdm.pandas()
import warnings
# Silence every Python warning for the whole notebook. This also hides pandas
# chained-assignment warnings and library deprecation notices.
warnings.filterwarnings("ignore")

# Process-wide environment switches; set before `torch` is imported below.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
# Expose only the CUDA device with index 1 to this process.
# NOTE(review): `transformers` is imported earlier in this cell and may already
# have imported torch; presumably this still takes effect because CUDA is
# initialised lazily - confirm on the target machine.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
#####################################
############  CONSTANTS #############
#####################################
# Seed shared by every sampling / shuffling / KFold call in the notebook.
RS = 42

# Checkpoint used for both the tokenizer and the seq2seq model.
MODEL = "CodeT5"
model_name = "Salesforce/codet5-base-multi-sum"

BATCH_SIZE = 16
ENCODER_LENGTH = 30
DECODER_LENGTH = 20

# Suffix of every artefact written by this notebook. It embeds today's date,
# so artefacts saved on one day are not found by cells re-run on another day.
ANALYSIS_POSTFIX = "mined_sudden_" + date.today().isoformat()
# Integer despite the name; it is only interpolated into the input data path.
DATE_STR = 20240721
# True -> build the train/test split that simulates a shift towards cluster 4.
SEMANTIC_DRIFT = True

# Keyword arguments expanded into Seq2SeqTrainingArguments. `num_train_epochs`
# holds the list of cumulative epoch budgets that the training cell evaluates;
# that cell replaces it with a single number in a deep copy before use.
FULL_TRAIN_ARGS = dict(
    BATCH_SIZE=BATCH_SIZE,
    DECODER_LENGTH=DECODER_LENGTH,
    ENCODER_LENGTH=ENCODER_LENGTH,
    MODEL=MODEL,
    SEQ_TRAINER_ARGS=dict(
        overwrite_output_dir=True,
        num_train_epochs=[0, 1, 5, 8, 10, 16],
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=6e-5,
        warmup_steps=500,
        weight_decay=0.1,
        label_smoothing_factor=0.1,
        predict_with_generate=True,
        logging_steps=100,
        save_total_limit=1,
        save_strategy="no",
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=False,
        output_dir='reports/results',
        logging_dir="reports/logs",
    ),
)

# NOTE(review): `skip_special_tokens` is normally a decode-time argument;
# confirm that passing it to `from_pretrained` has the intended effect.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    skip_special_tokens=False,
)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. CoNaLa data: preprocessing

In [2]:

# Load the clustered CoNaLa-mined snippets for the configured data snapshot.
dataset = pd.read_csv(f"../data/processed/conala/{DATE_STR}/conala_mined_clustered.csv")

if SEMANTIC_DRIFT:
    # Simulated drift: only the last 1% of cluster-4 question ids go to train
    # (the rest go to test), whereas for all other clusters the first 99% of
    # question ids go to train and the last 1% to test.
    dataset_4_cl = dataset.loc[dataset["cluster"] == 4].sample(n=2000, random_state=RS)
    dataset_non_4_cl = dataset.loc[dataset["cluster"] != 4].sample(n=8000, random_state=RS)

    qids_4_cl = sorted(dataset_4_cl["question_id"].unique())
    cut_4_cl = int(len(qids_4_cl) * 0.99)
    train_idx_4_cl = qids_4_cl[cut_4_cl:]
    test_idx_4_cl = qids_4_cl[:cut_4_cl]

    qids_non4_cl = sorted(dataset_non_4_cl["question_id"].unique())
    cut_non4_cl = int(len(qids_non4_cl) * 0.99)
    train_idx_non4_cl = qids_non4_cl[:cut_non4_cl]
    test_idx_non4_cl = qids_non4_cl[cut_non4_cl:]

    # Rows follow their question id, so one question never straddles train/test
    # within a cluster group.
    train_dataset_4cl = dataset_4_cl.loc[dataset_4_cl["question_id"].isin(train_idx_4_cl)]
    test_dataset_4cl = dataset_4_cl.loc[dataset_4_cl["question_id"].isin(test_idx_4_cl)]

    train_dataset_non4cl = dataset_non_4_cl.loc[dataset_non_4_cl["question_id"].isin(train_idx_non4_cl)]
    test_dataset_non4cl = dataset_non_4_cl.loc[dataset_non_4_cl["question_id"].isin(test_idx_non4_cl)]

    train_dataset = (
        pd.concat([train_dataset_4cl, train_dataset_non4cl], axis=0)
        .sample(frac=1, random_state=RS)
        .reset_index(drop=True)
    )
    test_dataset = (
        pd.concat([test_dataset_4cl, test_dataset_non4cl], axis=0)
        .sample(frac=1, random_state=RS)
        .reset_index(drop=True)
    )

else:
    # No drift: plain 80/20 split over the sorted question ids.
    qids = sorted(dataset["question_id"].unique())
    cut = int(len(qids) * 0.8)
    train_idx, test_idx = qids[:cut], qids[cut:]
    train_dataset = dataset.loc[dataset["question_id"].isin(train_idx)]
    test_dataset = dataset.loc[dataset["question_id"].isin(test_idx)]


print("Train Data: ", train_dataset.shape)
print("Test Data: ", test_dataset.shape)

print("Train Data: Cluster", train_dataset.cluster.value_counts())
print("Test Data: Cluster", test_dataset.cluster.value_counts())

# Shuffle once more and convert both splits to Hugging Face datasets.
train_dataset = Dataset.from_pandas(
    train_dataset.sample(frac=1, random_state=RS).reset_index(drop=True)
)
test_dataset = Dataset.from_pandas(
    test_dataset.sample(frac=1, random_state=RS).reset_index(drop=True)
)

# Preprocess the held-out set and keep a DataFrame view with a positional id.
test_data = pr.preprocess_dataset(test_dataset, tokenizer=tokenizer, intent_colum_name="intent")
test_df = pd.DataFrame(test_data)
test_df["id"] = test_df.index

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
rouge = evaluate.load('rouge')


# Cross Validation
# Folds are drawn over unique question ids (not rows); each entry of `splits`
# is a [train_positions, val_positions] pair indexing into `questions_list`.
folds = KFold(n_splits=3, random_state=RS, shuffle=True)
questions_list = np.array(list(set(train_dataset["question_id"])))
splits_obj = folds.split(questions_list)
splits = []
for i, (train_idxs, val_idxs) in enumerate(splits_obj):
    print(f"Fold {i}")
    splits += [[train_idxs, val_idxs]]

Train Data:  (7942, 11)
Test Data:  (2058, 11)
Train Data: Cluster cluster
2    3632
3    2204
1    1672
0     414
4      20
Name: count, dtype: int64
Test Data: Cluster cluster
4    1980
3      39
2      25
1      12
0       2
Name: count, dtype: int64


Filter:   0%|          | 0/2058 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2058 [00:00<?, ? examples/s]

Map:   0%|          | 0/2058 [00:00<?, ? examples/s]

Fold 0
Fold 1
Fold 2


In [3]:
# Cross-validated fine-tuning.
#
# For every fold and every cumulative epoch budget in
# FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"], the model is trained
# up to that budget, summaries are generated for the fold's validation split,
# and a per-example ROUGE-1 score is stored. Results are collected in
# fold_results[epoch_set][fold] and finally stacked into `cv_df`.
#
# Training is incremental: after each epoch set the model is saved to
# FOLD_MODEL_PATH, and the next epoch set reloads it and trains only the
# missing epochs.
# NOTE(review): a new Seq2SeqTrainer is built for every epoch set, so the
# optimizer / LR schedule (including `warmup_steps`) presumably restarts each
# time; an "epoch set 16" model is therefore not the same as 16 uninterrupted
# epochs - confirm this is intended.

epoch_sets = sorted(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"])
fold_results = {epoch_set: {} for epoch_set in epoch_sets}

FOLD_MODEL_PATH = "./tmp/"
# Maximum number of inputs passed to one `infer.generate_summary` call.
GENERATION_BATCH_SIZE = 1000

# Create output locations once, up front (the final pickle goes to reports/results).
os.makedirs("reports/results", exist_ok=True)
os.makedirs(FOLD_MODEL_PATH, exist_ok=True)

for i, (train_idxs, val_idxs) in enumerate(splits):

    print(f"Fold {i}")

    # Materialise each fold's question ids once as a set: O(1) membership per
    # row instead of re-indexing and scanning the numpy array for every row.
    train_qids = set(questions_list[train_idxs].tolist())
    val_qids = set(questions_list[val_idxs].tolist())
    fold_dataset = DatasetDict({
        "train": train_dataset.filter(lambda row: row["question_id"] in train_qids),
        "validation": train_dataset.filter(lambda row: row["question_id"] in val_qids),
    })
    fold_train = pr.preprocess_dataset(fold_dataset["train"], tokenizer=tokenizer, intent_colum_name="intent")
    fold_val = pr.preprocess_dataset(fold_dataset["validation"], tokenizer=tokenizer, intent_colum_name="intent")

    # Epochs already trained into the checkpoint at FOLD_MODEL_PATH for THIS
    # fold. Resetting it per fold guarantees that a fold never resumes from the
    # previous fold's checkpoint and that the first epoch set always starts
    # from the pretrained weights, whatever the configured schedule is.
    latest_run_epoch = 0

    for epoch_set in epoch_sets:

        fold_df = pd.DataFrame(fold_val)
        print(f"TRAINING EPOCH SET {epoch_set}")

        TRAIN_ARGS = copy.deepcopy(FULL_TRAIN_ARGS)

        # Only the epochs missing from the current checkpoint are trained.
        epochs_to_train = epoch_set - latest_run_epoch
        TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = epochs_to_train

        print(f'TRAINING EPOCHS {TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"]}')

        if latest_run_epoch > 0:
            # Continue from the checkpoint written earlier in this fold.
            model = AutoModelForSeq2SeqLM.from_pretrained(FOLD_MODEL_PATH)
            print(f"LOADING MODEL {FOLD_MODEL_PATH}")
        else:
            # Nothing trained yet in this fold: start from the pretrained weights.
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            print(f"LOADING MODEL {model_name}")

        print(device)
        model.to(device)

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        compute_metrics = ev.compute_metric_with_params(tokenizer)

        training_args = Seq2SeqTrainingArguments(
            **TRAIN_ARGS["SEQ_TRAINER_ARGS"],
        )

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=fold_train,
            eval_dataset=fold_val,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        # Epoch set 0 is the zero-shot baseline: evaluate without training.
        if epochs_to_train > 0:
            trainer.train()

        # Generate predictions for the validation split in fixed-size batches;
        # element [1] of the helper's return value holds the decoded summaries.
        text = fold_val["input_sequence"]
        summaries = []
        for batch_start_idx in range(0, len(text), GENERATION_BATCH_SIZE):
            batch_end_idx = batch_start_idx + GENERATION_BATCH_SIZE
            batch_summaries = infer.generate_summary(text[batch_start_idx:batch_end_idx],
                                                     model,
                                                     tokenizer,
                                                     TRAIN_ARGS["ENCODER_LENGTH"],
                                                     TRAIN_ARGS["DECODER_LENGTH"])[1]
            summaries.extend(batch_summaries)
        fold_df["prediction"] = summaries

        # Per-example (non-aggregated) ROUGE-1 against the reference output.
        fold_df["rouge"] = rouge.compute(predictions=fold_df["prediction"],
                    references=fold_df["output_sequence"],
                    use_stemmer=True,
                    use_aggregator=False,
                    rouge_types=["rouge1"])["rouge1"]

        fold_results[epoch_set][i] = fold_df

        ########## SAVE FOLD MODEL
        trainer.save_model(FOLD_MODEL_PATH)

        latest_run_epoch = epoch_set

########## CONVERT TO DATAFRAME

# Tag each per-fold frame with its fold / epoch set, then concatenate once
# (instead of growing `cv_df` with a concat per iteration).
cv_frames = []
for epoch_set, frames_by_fold in fold_results.items():
    for k, f_df in frames_by_fold.items():
        f_df['fold'] = k
        f_df['epoch_set'] = epoch_set
        cv_frames.append(f_df)

cv_df = pd.concat(cv_frames)

########## SAVE THE FILE

with open(f'reports/results/cv_result_{ANALYSIS_POSTFIX}.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fold 0


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5298 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5298 [00:00<?, ? examples/s]

Map:   0%|          | 0/5298 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2644 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2644 [00:00<?, ? examples/s]

Map:   0%|          | 0/2644 [00:00<?, ? examples/s]

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.775,4.417843,0.2616,0.0672,0.2305,0.2304,14.2315,0.0216,1.0,1.1596,29371,25328


TRAINING EPOCH SET 5
TRAINING EPOCHS 4
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.8618,4.406465,0.2622,0.068,0.233,0.2331,13.697,0.021,1.0,1.1015,27900,25328
2,3.8348,4.479659,0.2692,0.0705,0.2355,0.2356,13.9346,0.023,1.0,1.1208,28388,25328
3,3.3874,4.586398,0.2622,0.064,0.2278,0.2281,13.5715,0.0225,1.0,1.0811,27381,25328
4,3.0956,4.68704,0.2533,0.0607,0.2209,0.221,13.7474,0.0201,1.0,1.0943,27716,25328


TRAINING EPOCH SET 8
TRAINING EPOCHS 3
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.4992,4.80049,0.2385,0.0545,0.2091,0.2091,13.4724,0.0173,1.0,1.0628,26918,25328
2,2.3705,5.015721,0.2335,0.052,0.2028,0.2027,13.2368,0.0174,1.0,1.0484,26553,25328
3,2.179,5.132433,0.2248,0.0475,0.1946,0.1947,13.0662,0.0148,1.0,1.0283,26046,25328


TRAINING EPOCH SET 10
TRAINING EPOCHS 2
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.8425,5.226613,0.2221,0.0479,0.1933,0.1934,12.944,0.0158,1.0,1.0091,25558,25328
2,1.7592,5.266448,0.2184,0.0448,0.189,0.1891,13.3075,0.0137,1.0,1.0484,26555,25328


TRAINING EPOCH SET 16
TRAINING EPOCHS 6
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.6999,5.335461,0.2108,0.0431,0.1835,0.1835,12.5518,0.0139,0.9647,0.9653,24449,25328
2,1.7016,5.398447,0.2164,0.0436,0.1873,0.1872,13.2432,0.0134,1.0,1.0556,26736,25328
3,1.8857,5.326833,0.2145,0.045,0.1869,0.1869,13.1203,0.014,1.0,1.0288,26058,25328
4,2.0222,5.357116,0.2053,0.0395,0.1778,0.178,13.0042,0.012,1.0,1.0259,25985,25328
5,1.9639,5.391723,0.2149,0.045,0.187,0.1871,13.4459,0.0142,1.0,1.0683,27058,25328
6,1.8628,5.403353,0.2093,0.0425,0.1817,0.1818,13.2152,0.0132,1.0,1.046,26494,25328


Fold 1


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5295 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5295 [00:00<?, ? examples/s]

Map:   0%|          | 0/5295 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2647 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2647 [00:00<?, ? examples/s]

Map:   0%|          | 0/2647 [00:00<?, ? examples/s]

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.7624,4.422719,0.2758,0.0751,0.242,0.2421,14.0215,0.0282,1.0,1.1289,28891,25592


TRAINING EPOCH SET 5
TRAINING EPOCHS 4
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.8591,4.437797,0.2724,0.0734,0.2406,0.2407,13.6664,0.0258,1.0,1.0906,27910,25592
2,3.8171,4.504949,0.2657,0.0678,0.233,0.233,13.909,0.024,1.0,1.1107,28425,25592
3,3.3721,4.598738,0.2569,0.0617,0.2244,0.2244,13.7896,0.0203,1.0,1.0869,27816,25592
4,3.0765,4.726808,0.2504,0.0574,0.217,0.2169,13.8836,0.0188,1.0,1.0997,28143,25592


TRAINING EPOCH SET 8
TRAINING EPOCHS 3
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.4886,4.850921,0.2434,0.0577,0.2134,0.2136,13.6619,0.018,1.0,1.0796,27630,25592
2,2.3563,5.052765,0.2308,0.0496,0.2006,0.2008,13.5863,0.0149,1.0,1.0733,27469,25592
3,2.1696,5.17634,0.2227,0.0485,0.1942,0.1944,13.3649,0.0151,1.0,1.045,26744,25592


TRAINING EPOCH SET 10
TRAINING EPOCHS 2
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.8361,5.258586,0.2183,0.0462,0.1902,0.1905,13.4141,0.014,1.0,1.0479,26819,25592
2,1.7534,5.293366,0.2181,0.0448,0.1902,0.1905,13.3763,0.0144,1.0,1.0491,26849,25592


TRAINING EPOCH SET 16
TRAINING EPOCHS 6
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.6999,5.339198,0.2194,0.0434,0.1878,0.1883,14.0253,0.0142,1.0,1.1203,28670,25592
2,1.7019,5.378943,0.2082,0.0391,0.1809,0.1811,12.6116,0.0138,0.9713,0.9717,24867,25592
3,1.8847,5.382474,0.2103,0.0419,0.1837,0.1839,13.224,0.012,1.0,1.0367,26532,25592
4,2.0155,5.364723,0.207,0.0401,0.1801,0.1803,12.9422,0.0142,1.0,1.0072,25777,25592
5,1.9617,5.414166,0.2071,0.0404,0.1797,0.1799,13.2029,0.0125,1.0,1.0244,26216,25592
6,1.8586,5.431931,0.2063,0.0407,0.1798,0.18,13.1953,0.0119,1.0,1.0267,26275,25592


Fold 2


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Map:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2651 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2651 [00:00<?, ? examples/s]

Map:   0%|          | 0/2651 [00:00<?, ? examples/s]

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.7717,4.431364,0.2664,0.0734,0.2356,0.2357,14.0909,0.0253,1.0,1.147,28997,25280


TRAINING EPOCH SET 5
TRAINING EPOCHS 4
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.8505,4.444387,0.2712,0.0748,0.2396,0.2395,14.5111,0.0247,1.0,1.182,29882,25280
2,3.8141,4.517859,0.2681,0.0726,0.236,0.236,13.9344,0.0247,1.0,1.1222,28370,25280
3,3.3612,4.632855,0.2552,0.0669,0.2251,0.2249,13.8989,0.0223,1.0,1.1066,27975,25280
4,3.0726,4.742863,0.2492,0.0622,0.2185,0.2185,13.8699,0.0203,1.0,1.1129,28135,25280


TRAINING EPOCH SET 8
TRAINING EPOCHS 3
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.488,4.859555,0.2422,0.0612,0.2151,0.2154,14.2177,0.0204,1.0,1.1342,28672,25280
2,2.3595,5.068493,0.232,0.055,0.2045,0.2044,13.8325,0.0173,1.0,1.1144,28173,25280
3,2.1669,5.175135,0.2279,0.0503,0.2007,0.2006,13.5025,0.0157,1.0,1.0746,27167,25280


TRAINING EPOCH SET 10
TRAINING EPOCHS 2
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.8368,5.275476,0.2262,0.0508,0.1982,0.1982,13.7835,0.0155,1.0,1.0973,27739,25280
2,1.757,5.293399,0.2206,0.0484,0.1925,0.1926,13.3384,0.0171,1.0,1.0584,26757,25280


TRAINING EPOCH SET 16
TRAINING EPOCHS 6
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.6971,5.374691,0.2263,0.0506,0.1978,0.1979,13.9528,0.0157,1.0,1.1062,27966,25280
2,1.6992,5.435202,0.2097,0.044,0.1829,0.1828,13.3882,0.0136,1.0,1.0713,27082,25280
3,1.8831,5.391041,0.2129,0.045,0.1852,0.1854,13.625,0.0137,1.0,1.0797,27295,25280
4,2.0179,5.378398,0.2128,0.0441,0.1859,0.1861,13.2969,0.012,1.0,1.0538,26641,25280
5,1.9608,5.40956,0.2089,0.0434,0.1824,0.1825,13.5504,0.0145,1.0,1.0787,27270,25280
6,1.8619,5.423164,0.2109,0.0424,0.1836,0.1836,13.47,0.0153,1.0,1.0716,27089,25280


In [4]:
########## LOAD CV RESULTS
# Reload the cross-validation frame from disk so the analysis below can be run
# without repeating the training cell. Only load pickles written by this
# notebook: unpickling untrusted files can execute arbitrary code.

import pickle

cv_result_path = f'reports/results/cv_result_{ANALYSIS_POSTFIX}.pickle'
with open(cv_result_path, 'rb') as cv_file:
    cv_df = pickle.load(cv_file)

########## ROUGE PER SETTING
# Mean and standard deviation of the per-example ROUGE-1 for each epoch set.

rouge_by_epoch_set = cv_df.groupby("epoch_set")["rouge"]

for stat_label, stat_values in (("Mean", rouge_by_epoch_set.mean()),
                                ("STD", rouge_by_epoch_set.std())):
    print(stat_label)
    print(stat_values)


Mean
epoch_set
0     0.113737
1     0.269006
5     0.251882
8     0.226802
10    0.220457
16    0.209831
Name: rouge, dtype: float64
STD
epoch_set
0     0.124526
1     0.159247
5     0.156830
8     0.151234
10    0.149064
16    0.146883
Name: rouge, dtype: float64


### 3. Step 2: Learn to predict performance (per-example ROUGE)

In [5]:
def step_two(X_train, y_train, model, X_val=None, y_val=None,  save=False):
    """Fit one step-two regressor and either evaluate it or persist it.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed unchanged to the regressor's
        ``fit``.
    model : str
        Which regressor to fit: ``"lr"``, ``"svm"``, ``"rf"``, ``"lgbm"`` or
        ``"catboost"``.
    X_val, y_val
        Validation features and targets. Required when ``save`` is false.
    save : bool
        If true, pickle the fitted regressor under ``./models/`` (the file name
        contains ``model`` and the module-level ``ANALYSIS_POSTFIX``) and
        return that path instead of evaluating.

    Returns
    -------
    str or dict
        The pickle path when ``save`` is true; otherwise a dict with ``"pred"``
        (validation predictions with negatives clipped to 0), ``"mae"`` and
        ``"rmse"``.

    Raises
    ------
    ValueError
        If ``model`` is not a supported key, or if ``save`` is false and
        ``X_val`` / ``y_val`` is missing.
    """
    # Constructors are wrapped in lambdas so that only the requested regressor
    # is instantiated.
    regressor_factories = {
        "lr": lambda: LinearRegression(),
        "svm": lambda: SVR(),
        # The estimator must be instantiated; calling `.fit` on the class fails.
        "rf": lambda: RandomForestRegressor(),
        "lgbm": lambda: LGBMRegressor(),
        "catboost": lambda: CatBoostRegressor(),
    }

    # Validate the request before spending time on a fit.
    if model not in regressor_factories:
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(regressor_factories)}"
        )
    if not save and (X_val is None or y_val is None):
        raise ValueError("X_val and y_val are required when save=False")

    reg = regressor_factories[model]()
    reg.fit(X_train, y_train)

    if save:
        model_path = f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl'
        # Make sure the target directory exists on a fresh checkout.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        with open(model_path, 'wb') as f:
            pickle.dump(reg, f)
        return model_path

    y_pred = reg.predict(X_val)
    # Negative predictions are clipped to 0 before scoring.
    y_pred[y_pred < 0] = 0
    mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
    rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
    return {"pred": y_pred, "mae": mae, "rmse": rmse}

In [6]:
# Fold-wise evaluation of the step-two regressors.
#
# For each CV fold, every regressor in `t_models` is fitted on the rows of the
# other folds (one-hot epoch_set + TF-IDF of the input text -> per-example
# ROUGE) and scored on the held-out fold. Its predictions are written to
# `<model>_perf_hat` in `cv_df`; metrics go to results[fold][model].
t_models = ["lr", "svm", "lgbm", "catboost"]

results = {}

# Keep the cell idempotent: drop "ensemble" rows appended by a previous run so
# they are neither used for training nor duplicated. Resetting the index here
# gives unique row labels for the `idxmax` lookup below.
cv_df = cv_df.loc[cv_df["epoch_set"] != "ensemble"].reset_index(drop=True)

# NOTE: the loop variable `model` below rebinds the global name that previously
# held the seq2seq model.
for test_fold in range(cv_df.fold.max()+1):
    print(test_fold)

    is_val = cv_df["fold"] == test_fold
    train_rows = cv_df.loc[~is_val]
    val_rows = cv_df.loc[is_val]

    # Prepare the input data
    # The vectorizer is fitted on the training folds only and reused for validation.
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(train_rows["input_sequence"])
    X_train_column_sparse = pd.get_dummies(train_rows["epoch_set"], sparse=True).sparse.to_coo().tocsr()
    X_train = hstack([X_train_column_sparse, X_train_tfidf])
    y_train = train_rows["rouge"]

    X_val_tfidf = vectorizer.transform(val_rows["input_sequence"])
    X_val_column_sparse = pd.get_dummies(val_rows["epoch_set"], sparse=True).sparse.to_coo().tocsr()
    X_val = hstack([X_val_column_sparse, X_val_tfidf])
    y_val = val_rows["rouge"]

    results[test_fold] = {}
    for model in t_models:
        print(model)
        preds_df = step_two(X_train=X_train,
                            y_train=y_train,
                            X_val=X_val,
                            y_val=y_val,
                            model=model)
        cv_df.loc[is_val, f"{model}_perf_hat"] = preds_df["pred"]
        results[test_fold][model] = preds_df

# ENSEMBLE ESTIMATE (JUST HIGHEST PREDICTIONS)
# For every example id, pick the epoch set with the highest CatBoost-predicted
# ROUGE and copy that row as an additional row labelled "ensemble".
models_index = cv_df.groupby("id")["catboost_perf_hat"].idxmax()
# `idxmax` returns index labels, so select with `.loc`.
optimal_ensemble = cv_df.loc[models_index, ["id", "epoch_set"]]
optimal_ensemble_map = dict(zip(optimal_ensemble.id, optimal_ensemble.epoch_set))
cv_df["opt_es_id"] = cv_df.id.map(optimal_ensemble_map)
# Work on an explicit copy so relabelling does not write into a slice of cv_df.
ensemble_preds = cv_df.loc[cv_df["epoch_set"] == cv_df["opt_es_id"], :].copy()
ensemble_preds["epoch_set"] = "ensemble"
cv_df = pd.concat([cv_df, ensemble_preds], axis=0)

0
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012346 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13596
[LightGBM] [Info] Number of data points in the train set: 31788, number of used features: 805
[LightGBM] [Info] Start training from score 0.215539
catboost
Learning rate set to 0.070719
0:	learn: 0.1563618	total: 74.9ms	remaining: 1m 14s
1:	learn: 0.1554183	total: 89.1ms	remaining: 44.5s
2:	learn: 0.1547072	total: 101ms	remaining: 33.6s
3:	learn: 0.1539364	total: 112ms	remaining: 28s
4:	learn: 0.1533091	total: 124ms	remaining: 24.7s
5:	learn: 0.1527386	total: 136ms	remaining: 22.6s
6:	learn: 0.1522633	total: 149ms	remaining: 21.2s
7:	learn: 0.1518403	total: 162ms	remaining: 20.1s
8:	learn: 0.1514837	total: 176ms	remaining: 19.3s
9:	learn: 0.1511022	total: 188ms	remaining: 18.6s
10:	learn: 0.1507680	total: 200ms	rema

In [7]:
# Mean CatBoost-predicted ROUGE per epoch set (including the "ensemble" rows).
cv_df.groupby("epoch_set")["catboost_perf_hat"].mean()

epoch_set
0           0.113360
1           0.265932
5           0.248058
8           0.225815
10          0.223279
16          0.212771
ensemble    0.265938
Name: catboost_perf_hat, dtype: float64

In [8]:
# Spread of the CatBoost-predicted ROUGE per epoch set (including "ensemble").
cv_df.groupby("epoch_set")["catboost_perf_hat"].std()

epoch_set
0           0.036543
1           0.040324
5           0.040190
8           0.040232
10          0.040186
16          0.039770
ensemble    0.040332
Name: catboost_perf_hat, dtype: float64

In [9]:
# Regroup the step-two metrics from results[fold][model] into
# model_results[model], holding the per-fold RMSE / MAE lists plus their mean
# and standard deviation across folds.

# Use the folds actually present in `results` instead of a hard-coded count,
# so this cell stays correct if the number of CV splits changes.
evaluated_folds = sorted(results)

model_results = {}

for model in t_models:
    rmse_per_fold = [results[fold][model]["rmse"] for fold in evaluated_folds]
    mae_per_fold = [results[fold][model]["mae"] for fold in evaluated_folds]

    model_results[model] = {
        "rmse": rmse_per_fold,
        "mae": mae_per_fold,
        "rmse_avg": np.array(rmse_per_fold).mean(),
        "mae_avg": np.array(mae_per_fold).mean(),
        "rmse_std": np.array(rmse_per_fold).std(),
        "mae_std": np.array(mae_per_fold).std(),
    }

for model in t_models:
    print(model)
    print("RMSE ", model_results[model]["rmse_avg"])
    print("MAE ",model_results[model]["mae_avg"])
    print("\n")

    print("RMSE STD ", model_results[model]["rmse_std"])
    print("MAE STD",model_results[model]["mae_std"])
    print("\n")

# Persist both the aggregated metrics and the CV frame that now carries the
# step-two predictions and the "ensemble" rows.
with open(f'reports/results/s2_model_results_{ANALYSIS_POSTFIX}.pickle', 'wb') as handle:
    pickle.dump(model_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(f'reports/results/cd_df_with_s2_{ANALYSIS_POSTFIX}.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

lr
RMSE  0.19403935059755759
MAE  0.1466663206761671


RMSE STD  0.006754343751349654
MAE STD 0.0029150317298508796


svm
RMSE  0.1555344031994367
MAE  0.1241867621407618


RMSE STD  0.0006193557181652776
MAE STD 0.0004571516481080589


lgbm
RMSE  0.1492709409196213
MAE  0.11948104297551726


RMSE STD  0.0011105031188999553
MAE STD 0.0005557678249745369


catboost
RMSE  0.14854096538873587
MAE  0.1188749287517646


RMSE STD  0.0013325920691760799
MAE STD 0.000764375565019631




In [10]:
### TO SAVE THE VECTORIZER AND STEP 2 MODELS
# Refit the TF-IDF vectorizer and every step-two regressor on ALL
# cross-validation predictions (no held-out fold) and pickle them under
# ./models/ for later use.

# Only load pickles written by this notebook: unpickling untrusted files can
# execute arbitrary code.
with open(f'reports/results/cd_df_with_s2_{ANALYSIS_POSTFIX}.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

# TRAIN ON ALL PREDICTIONS AT ONCE

t_models = ["lr", "svm", "lgbm", "catboost"]

# The vectorizer is written to ./models/ below; make sure the directory exists
# on a fresh checkout.
os.makedirs("./models", exist_ok=True)

# Prepare the input data
# The synthetic "ensemble" rows are copies of real rows and are excluded.
is_base_row = cv_df.epoch_set != "ensemble"
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(cv_df.loc[is_base_row, "input_sequence"])
# NOTE(review): the one-hot epoch_set columns are positional and are not saved
# with the models; presumably inference code must rebuild them in the same
# order - confirm against the consumer of these pickles.
X_train_column_sparse = pd.get_dummies(cv_df.loc[is_base_row, "epoch_set"], sparse=True).sparse.to_coo().tocsr()
X_train = hstack([X_train_column_sparse, X_train_tfidf])
y_train = cv_df.loc[is_base_row, "rouge"]

with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "wb") as file:
    pickle.dump(vectorizer, file, protocol=pickle.HIGHEST_PROTOCOL)

for model in t_models:
    print(model)
    # With save=True, step_two returns the path of the pickled regressor.
    saved_model_path = step_two(X_train=X_train,
                                y_train=y_train,
                                model=model,
                                save=True)

lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018553 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19860
[LightGBM] [Info] Number of data points in the train set: 47652, number of used features: 1084
[LightGBM] [Info] Start training from score 0.215286
catboost
Learning rate set to 0.07539
0:	learn: 0.1552973	total: 23.9ms	remaining: 23.9s
1:	learn: 0.1543123	total: 42.2ms	remaining: 21.1s
2:	learn: 0.1534450	total: 58.9ms	remaining: 19.6s
3:	learn: 0.1527198	total: 74.6ms	remaining: 18.6s
4:	learn: 0.1520817	total: 91.4ms	remaining: 18.2s
5:	learn: 0.1515113	total: 107ms	remaining: 17.8s
6:	learn: 0.1510694	total: 123ms	remaining: 17.5s
7:	learn: 0.1506443	total: 140ms	remaining: 17.3s
8:	learn: 0.1502840	total: 156ms	remaining: 17.1s
9:	learn: 0.1499380	total: 171ms	remaining: 16.9s
10:	learn: 0.1496441	total: 186ms	re

In [11]:
# Show the artefact suffix used by this run; it is part of every file name
# written above.
print(ANALYSIS_POSTFIX)

mined_sudden_2024-07-30
