### 1. Settings

In [1]:
#####################################
##########  DEPENDENCIES ###########
#####################################

import os
import pickle
import numpy as np
from tqdm import tqdm # type: ignore
import pandas as pd
import copy
from datetime import datetime, date

from datasets import load_dataset, DatasetDict, Dataset
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import KFold, train_test_split # type: ignore
import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.sparse import hstack

tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import torch
#####################################
############  CONSTANTS #############
#####################################
# Random state passed to the sampling, shuffling and K-fold calls further down.
RS = 42

# Model identifiers: a short label kept in the run config and the checkpoint
# name handed to `from_pretrained`.
MODEL = "CodeT5"
model_name = "Salesforce/codet5-base-multi-sum"

# Batch / sequence-length settings stored in the run config.
BATCH_SIZE = 16
ENCODER_LENGTH = 30
DECODER_LENGTH = 20

# Run bookkeeping: DATE_STR selects the processed-data folder that is read,
# ANALYSIS_POSTFIX is appended to every artefact file name that is written.
DATE_STR = 20240721
ANALYSIS_POSTFIX = "mined_slight_" + str(date.today())
SEMANTIC_DRIFT = True

# Full run configuration. "num_train_epochs" is a *list* of cumulative epoch
# settings here; the training loop replaces it with a single number on a copy
# of this dict before building the trainer arguments.
FULL_TRAIN_ARGS = dict(
    BATCH_SIZE=BATCH_SIZE,
    DECODER_LENGTH=DECODER_LENGTH,
    ENCODER_LENGTH=ENCODER_LENGTH,
    MODEL=MODEL,
    SEQ_TRAINER_ARGS=dict(
        overwrite_output_dir=True,
        num_train_epochs=[0, 1, 5, 8, 10, 16],
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=6e-6,
        warmup_steps=500,
        weight_decay=0.1,
        label_smoothing_factor=0.1,
        predict_with_generate=True,
        logging_steps=100,
        save_total_limit=1,
        save_strategy="no",
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=False,
        output_dir='reports/results',
        logging_dir="reports/logs",
    ),
)

# Pretrained tokenizer and seq2seq model for the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    skip_special_tokens=False,
)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. CoNaLa data: preprocessing

In [2]:

# Load the clustered CoNaLa-mined table from the folder selected by DATE_STR.
dataset = pd.read_csv(f"../data/processed/conala/{DATE_STR}/conala_mined_clustered.csv")

if SEMANTIC_DRIFT:
    # Draw fixed-size, seeded samples: 2000 rows from cluster 4 and 8000 rows
    # from all other clusters.
    dataset_4_cl = dataset[dataset.cluster==4].sample(n=2000, random_state=RS)
    dataset_non_4_cl = dataset[dataset.cluster!=4].sample(n=8000, random_state=RS)

    # Cluster 4: the LAST 10% of the sorted question ids go to train and the
    # FIRST 90% go to test, so cluster 4 ends up mostly in the test set.
    # NOTE(review): presumably intentional to simulate semantic drift - confirm.
    qids_4_cl = sorted(dataset_4_cl.question_id.unique())
    train_idx_4_cl, test_idx_4_cl = qids_4_cl[int(len(qids_4_cl)*0.9):], qids_4_cl[:int(len(qids_4_cl)*0.9)]

    # Other clusters: first 96.8% of the sorted question ids -> train, rest -> test.
    qids_non4_cl = sorted(dataset_non_4_cl.question_id.unique())
    train_idx_non4_cl, test_idx_non4_cl = qids_non4_cl[:int(len(qids_non4_cl)*0.968)], qids_non4_cl[int(len(qids_non4_cl)*0.968):]

    # Move the smallest non-cluster-4 question id from the train list to the test list.
    # NOTE(review): the reason for this single-id adjustment is not visible here.
    test_idx_non4_cl.append(train_idx_non4_cl[0])
    train_idx_non4_cl.pop(0)
    
    
    # Rows are assigned by question_id, so all rows of one question land on the same side.
    train_dataset_4cl = dataset_4_cl[dataset_4_cl.question_id.isin(train_idx_4_cl)]
    test_dataset_4cl = dataset_4_cl[dataset_4_cl.question_id.isin(test_idx_4_cl)]

    train_dataset_non4cl = dataset_non_4_cl[dataset_non_4_cl.question_id.isin(train_idx_non4_cl)]
    test_dataset_non4cl = dataset_non_4_cl[dataset_non_4_cl.question_id.isin(test_idx_non4_cl)]

    # Merge both parts and shuffle the rows (seeded).
    train_dataset = pd.concat([train_dataset_4cl, train_dataset_non4cl], axis=0).sample(frac=1, random_state=RS).reset_index(drop=True)
    test_dataset = pd.concat([test_dataset_4cl, test_dataset_non4cl], axis=0).sample(frac=1, random_state=RS).reset_index(drop=True)
    
else:
    # No drift: 80/20 split over the sorted question ids of the whole table.
    qids = sorted(dataset.question_id.unique())
    train_idx, test_idx = qids[:int(len(qids)*0.8)], qids[int(len(qids)*0.8):]
    train_dataset = dataset[dataset.question_id.isin(train_idx)]
    test_dataset = dataset[dataset.question_id.isin(test_idx)]


# Report split sizes and the per-cluster row counts of each side.
print("Train Data: ", train_dataset.shape)
print("Test Data: ", test_dataset.shape)

print("Train Data: Cluster", train_dataset.cluster.value_counts())
print("Test Data: Cluster", test_dataset.cluster.value_counts())

# Shuffle (seeded) and convert the pandas frames into `datasets.Dataset` objects.
train_dataset = Dataset.from_pandas(train_dataset.sample(frac=1, random_state=RS).reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_dataset.sample(frac=1, random_state=RS).reset_index(drop=True))

# Run the project preprocessing on the test split and keep it as a DataFrame;
# "id" is set to the row index of that frame.
test_data = pr.preprocess_dataset(test_dataset, tokenizer=tokenizer, intent_colum_name="intent")
test_df = pd.DataFrame(test_data)
test_df["id"] = test_df.index

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rouge = evaluate.load('rouge')


# Cross Validation: 3 shuffled folds over the unique question ids of the
# training split. The (train, val) index pairs index into `questions_list`
# and are stored in `splits` so they can be reused by later cells.
folds = KFold(n_splits=3, random_state=RS, shuffle=True)
questions_list = np.array(list(set(train_dataset["question_id"])))
splits_obj = folds.split(questions_list)
splits = []
for i, (train_idxs, val_idxs) in enumerate(splits_obj):
    print(f"Fold {i}")
    splits.append([train_idxs, val_idxs])

Train Data:  (7942, 11)
Test Data:  (2058, 11)
Train Data: Cluster cluster
2    3570
3    2129
1    1639
0     408
4     196
Name: count, dtype: int64
Test Data: Cluster cluster
4    1804
3     114
2      87
1      45
0       8
Name: count, dtype: int64


Filter: 100%|████████████████████| 2058/2058 [00:00<00:00, 106120.94 examples/s]
Filter: 100%|█████████████████████| 2058/2058 [00:00<00:00, 38306.19 examples/s]
Map: 100%|█████████████████████████| 2058/2058 [00:01<00:00, 2053.37 examples/s]


Fold 0
Fold 1
Fold 2


In [3]:
# Cross-validated fine-tuning.
#
# For every fold the model is evaluated at each cumulative epoch setting in
# FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"]. Training is
# incremental: the weights reached at one setting are saved to FOLD_MODEL_PATH
# and the next setting only trains the missing epochs on top of them.
# NOTE(review): every setting builds a fresh Seq2SeqTrainer from saved weights,
# so the optimizer state and LR schedule (incl. warmup_steps) presumably start
# over for each segment - confirm this is intended.

FOLD_MODEL_PATH = "./tmp/"      # running checkpoint of the current fold
INFERENCE_CHUNK_SIZE = 1000     # number of inputs per generate_summary call

epoch_sets = sorted(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"])
fold_results = {epoch_set: {} for epoch_set in epoch_sets}

# Create all output locations up front (the result pickle is written to
# reports/results at the end of this cell).
for out_dir in ("reports/results", FOLD_MODEL_PATH):
    os.makedirs(out_dir, exist_ok=True)

for i, (train_idxs, val_idxs) in enumerate(splits):

    print(f"Fold {i}")

    # Build the id sets once per fold; set membership avoids re-indexing and
    # scanning the numpy array for every row inside the filter.
    train_qids = set(questions_list[train_idxs].tolist())
    val_qids = set(questions_list[val_idxs].tolist())
    fold_dataset = DatasetDict({
        "train": train_dataset.filter(lambda row: row["question_id"] in train_qids),
        "validation": train_dataset.filter(lambda row: row["question_id"] in val_qids),
    })
    fold_train = pr.preprocess_dataset(fold_dataset["train"], tokenizer=tokenizer, intent_colum_name="intent")
    fold_val = pr.preprocess_dataset(fold_dataset["validation"], tokenizer=tokenizer, intent_colum_name="intent")

    # Epochs already trained into this fold's checkpoint. Reset for every fold
    # so nothing leaks over from the previous fold (or from an earlier run).
    latest_run_epoch = 0

    for epoch_set in epoch_sets:

        fold_df = pd.DataFrame(fold_val)
        print(f"TRAINING EPOCH SET {epoch_set}")

        TRAIN_ARGS = copy.deepcopy(FULL_TRAIN_ARGS)

        # Only train the epochs that are still missing for this setting.
        epochs_to_train = epoch_set - latest_run_epoch
        TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = epochs_to_train

        print(f'TRAINING EPOCHS {TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"]}')

        # Start from the pretrained checkpoint until some training has been
        # done in this fold; afterwards continue from the saved fold model.
        model_source = FOLD_MODEL_PATH if latest_run_epoch > 0 else model_name
        model = AutoModelForSeq2SeqLM.from_pretrained(model_source)
        print(f"LOADING MODEL {model_source}")

        print(device)
        model.to(device)

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        compute_metrics = ev.compute_metric_with_params(tokenizer)

        training_args = Seq2SeqTrainingArguments(
            **TRAIN_ARGS["SEQ_TRAINER_ARGS"],
        )

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=fold_train,
            eval_dataset=fold_val,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        if epochs_to_train > 0:
            trainer.train()

        # Generate predictions for the validation fold in chunks; element [1]
        # of generate_summary's return value holds the generated texts.
        text = fold_val["input_sequence"]
        summaries = []
        for chunk_start in range(0, len(text), INFERENCE_CHUNK_SIZE):
            chunk = text[chunk_start:chunk_start + INFERENCE_CHUNK_SIZE]
            summaries.extend(
                infer.generate_summary(chunk,
                                       model,
                                       tokenizer,
                                       TRAIN_ARGS["ENCODER_LENGTH"],
                                       TRAIN_ARGS["DECODER_LENGTH"])[1]
            )
        fold_df["prediction"] = summaries

        # Per-example ROUGE-1 (no aggregation) against the reference text.
        fold_df["rouge"] = rouge.compute(predictions=fold_df["prediction"],
                    references=fold_df["output_sequence"],
                    use_stemmer=True,
                    use_aggregator=False,
                    rouge_types=["rouge1"])["rouge1"]

        fold_results[epoch_set][i] = fold_df

        ########## SAVE FOLD MODEL
        trainer.save_model(FOLD_MODEL_PATH)

        latest_run_epoch = epoch_set

########## CONVERT TO DATAFRAME

# One concat over all (epoch setting, fold) frames instead of growing cv_df
# inside a loop; row order matches iterating epoch settings, then folds.
cv_df = pd.concat(
    [
        f_df.assign(fold=k, epoch_set=epoch_set)
        for epoch_set, per_fold in fold_results.items()
        for k, f_df in per_fold.items()
    ]
)

########## SAVE THE FILE

with open(f'reports/results/cv_result_{ANALYSIS_POSTFIX}.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fold 0


Filter: 100%|█████████████████████| 7942/7942 [00:00<00:00, 55320.37 examples/s]
Filter: 100%|█████████████████████| 7942/7942 [00:00<00:00, 64046.50 examples/s]
Filter: 100%|█████████████████████| 5295/5295 [00:00<00:00, 36825.45 examples/s]
Filter: 100%|█████████████████████| 5295/5295 [00:00<00:00, 34544.83 examples/s]
Map: 100%|█████████████████████████| 5295/5295 [00:02<00:00, 2005.27 examples/s]
Filter: 100%|█████████████████████| 2647/2647 [00:00<00:00, 31758.12 examples/s]
Filter: 100%|█████████████████████| 2647/2647 [00:00<00:00, 36553.99 examples/s]
Map: 100%|█████████████████████████| 2647/2647 [00:01<00:00, 1832.87 examples/s]


TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.1653,4.729744,0.2437,0.06,0.2161,0.2166,14.0835,0.0192,1.0,1.1714,29365,25069


TRAINING EPOCH SET 5
TRAINING EPOCHS 4


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.6648,4.591383,0.2564,0.0671,0.2256,0.2259,14.2765,0.0216,1.0,1.1853,29715,25069
2,4.5622,4.533464,0.2645,0.0698,0.2319,0.2324,13.9754,0.0238,1.0,1.1522,28884,25069
3,4.4596,4.512168,0.2657,0.0702,0.2323,0.2327,14.1371,0.023,1.0,1.171,29356,25069
4,4.4101,4.508818,0.266,0.0712,0.2327,0.2329,14.1107,0.0241,1.0,1.1627,29147,25069


TRAINING EPOCH SET 8
TRAINING EPOCHS 3
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.2265,4.51259,0.2638,0.07,0.2308,0.2312,14.2444,0.0234,1.0,1.1758,29477,25069
2,4.1973,4.50025,0.2645,0.0699,0.2305,0.2308,13.9815,0.0235,1.0,1.1479,28777,25069
3,4.1782,4.499527,0.2663,0.0706,0.2322,0.2326,14.085,0.0238,1.0,1.1581,29033,25069


TRAINING EPOCH SET 10
TRAINING EPOCHS 2


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.919,4.552171,0.2621,0.0678,0.2283,0.2287,14.2017,0.0229,1.0,1.1681,29284,25069
2,3.9348,4.544191,0.2641,0.0696,0.23,0.2304,14.0737,0.0227,1.0,1.1547,28948,25069


TRAINING EPOCH SET 16
TRAINING EPOCHS 6
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6528,4.630479,0.2583,0.0644,0.2239,0.2243,14.1779,0.0209,1.0,1.1608,29101,25069
2,3.7731,4.546791,0.2643,0.0681,0.23,0.2304,13.9411,0.0225,1.0,1.1377,28520,25069
3,3.924,4.52118,0.265,0.0684,0.2297,0.23,14.0589,0.0218,1.0,1.1584,29041,25069
4,3.9645,4.517269,0.2651,0.0687,0.2305,0.2309,14.0132,0.0228,1.0,1.1487,28796,25069
5,3.9541,4.518394,0.2646,0.0699,0.2302,0.2304,14.1862,0.0237,1.0,1.1658,29225,25069
6,3.9249,4.522631,0.2642,0.0695,0.2296,0.2298,14.1065,0.0239,1.0,1.156,28980,25069


Fold 1


Filter: 100%|█████████████████████| 7942/7942 [00:00<00:00, 51061.06 examples/s]
Filter: 100%|█████████████████████| 7942/7942 [00:00<00:00, 67663.26 examples/s]
Filter: 100%|█████████████████████| 5288/5288 [00:00<00:00, 30799.14 examples/s]
Filter: 100%|█████████████████████| 5288/5288 [00:00<00:00, 28099.55 examples/s]
Map: 100%|█████████████████████████| 5288/5288 [00:02<00:00, 2052.22 examples/s]
Filter: 100%|█████████████████████| 2654/2654 [00:00<00:00, 34905.51 examples/s]
Filter: 100%|█████████████████████| 2654/2654 [00:00<00:00, 36148.99 examples/s]
Map: 100%|█████████████████████████| 2654/2654 [00:01<00:00, 2221.41 examples/s]


TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.1828,4.695487,0.2533,0.0644,0.2258,0.2258,13.8625,0.0237,1.0,1.1213,28786,25671


TRAINING EPOCH SET 5
TRAINING EPOCHS 4
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.6838,4.552226,0.2696,0.0741,0.239,0.239,13.636,0.0249,1.0,1.0921,28035,25671
2,4.5901,4.493259,0.2766,0.0753,0.2444,0.2443,13.8267,0.027,1.0,1.1109,28519,25671
3,4.4873,4.469931,0.2776,0.0752,0.2438,0.2437,13.9472,0.0265,1.0,1.1234,28838,25671
4,4.4352,4.466553,0.2783,0.0758,0.245,0.245,14.0038,0.0266,1.0,1.1261,28909,25671


TRAINING EPOCH SET 8
TRAINING EPOCHS 3
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.2486,4.464822,0.277,0.0749,0.2433,0.2432,13.7566,0.0278,1.0,1.094,28084,25671
2,4.227,4.454075,0.278,0.0756,0.2458,0.2457,13.9691,0.0276,1.0,1.1173,28681,25671
3,4.2089,4.450546,0.2798,0.0767,0.2466,0.2466,14.0249,0.0277,1.0,1.1222,28807,25671


TRAINING EPOCH SET 10
TRAINING EPOCHS 2
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9414,4.49273,0.2729,0.0731,0.2405,0.2405,13.7886,0.0275,1.0,1.0944,28094,25671
2,3.9668,4.489203,0.2763,0.0747,0.2431,0.2431,14.0173,0.0271,1.0,1.1201,28754,25671


TRAINING EPOCH SET 16
TRAINING EPOCHS 6
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6748,4.557837,0.2681,0.071,0.2361,0.2362,13.7216,0.026,1.0,1.0845,27839,25671
2,3.8045,4.492517,0.2718,0.0729,0.2411,0.2411,13.9035,0.0255,1.0,1.1075,28431,25671
3,3.9586,4.465343,0.2717,0.0706,0.2387,0.2386,14.023,0.0246,1.0,1.1236,28844,25671
4,3.9951,4.460639,0.2742,0.0735,0.2419,0.2419,14.2886,0.025,1.0,1.1417,29308,25671
5,3.9798,4.462881,0.2748,0.073,0.2417,0.2418,14.1096,0.0247,1.0,1.1268,28925,25671
6,3.954,4.465843,0.2727,0.0715,0.2399,0.24,14.1085,0.0248,1.0,1.1271,28934,25671


Fold 2


Filter: 100%|█████████████████████| 7942/7942 [00:00<00:00, 50551.73 examples/s]
Filter: 100%|█████████████████████| 7942/7942 [00:00<00:00, 67833.70 examples/s]
Filter: 100%|█████████████████████| 5301/5301 [00:00<00:00, 32548.64 examples/s]
Filter: 100%|█████████████████████| 5301/5301 [00:00<00:00, 33056.46 examples/s]
Map: 100%|█████████████████████████| 5301/5301 [00:02<00:00, 2049.96 examples/s]
Filter: 100%|█████████████████████| 2641/2641 [00:00<00:00, 37976.05 examples/s]
Filter: 100%|█████████████████████| 2641/2641 [00:00<00:00, 36805.22 examples/s]
Map: 100%|█████████████████████████| 2641/2641 [00:01<00:00, 2160.00 examples/s]


TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.178,4.692172,0.2495,0.0647,0.2236,0.2237,13.9886,0.0212,1.0,1.1414,29016,25422


TRAINING EPOCH SET 5
TRAINING EPOCHS 4
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.6753,4.558427,0.2647,0.0711,0.2354,0.2357,13.9807,0.0243,1.0,1.1378,28926,25422
2,4.575,4.503981,0.2695,0.0729,0.2387,0.239,14.2101,0.0238,1.0,1.1561,29391,25422
3,4.4743,4.483448,0.2737,0.0752,0.2417,0.2419,14.1427,0.0248,1.0,1.1459,29130,25422
4,4.4191,4.480166,0.2724,0.0744,0.2407,0.2409,14.1106,0.0249,1.0,1.1436,29072,25422


TRAINING EPOCH SET 8
TRAINING EPOCHS 3
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.2354,4.481641,0.2715,0.0729,0.2381,0.2384,14.1162,0.0237,1.0,1.1399,28979,25422
2,4.2078,4.468441,0.272,0.0745,0.24,0.2402,14.2276,0.0257,1.0,1.1483,29192,25422
3,4.1912,4.469007,0.273,0.0743,0.2407,0.2409,14.1783,0.0253,1.0,1.1415,29019,25422


TRAINING EPOCH SET 10
TRAINING EPOCHS 2
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9307,4.51504,0.2692,0.0701,0.236,0.2361,14.1367,0.0225,1.0,1.1357,28872,25422
2,3.9481,4.506023,0.2708,0.0721,0.238,0.2383,14.0644,0.0236,1.0,1.1272,28656,25422


TRAINING EPOCH SET 16
TRAINING EPOCHS 6
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6686,4.593952,0.2631,0.0673,0.2311,0.2312,14.1159,0.0213,1.0,1.131,28753,25422
2,3.7874,4.513627,0.2658,0.0699,0.2339,0.2341,14.334,0.0221,1.0,1.1525,29298,25422
3,3.9349,4.493846,0.2716,0.0717,0.238,0.238,14.2351,0.0225,1.0,1.1392,28960,25422
4,3.9709,4.48829,0.2678,0.0704,0.2353,0.2354,14.0867,0.023,1.0,1.1257,28618,25422
5,3.9589,4.486603,0.27,0.07,0.236,0.2362,14.1761,0.0233,1.0,1.1359,28877,25422
6,3.9341,4.492264,0.2694,0.0701,0.2362,0.2363,14.1836,0.0232,1.0,1.1345,28840,25422


In [4]:
########## LOAD CV RESULTS
# Reload the cross-validation frame that the training cell pickled.

import pickle

cv_result_path = f'reports/results/cv_result_{ANALYSIS_POSTFIX}.pickle'
with open(cv_result_path, 'rb') as cv_file:
    cv_df = pickle.load(cv_file)

########## ROUGE PER SETTING
# Mean and standard deviation of the per-example ROUGE-1 score for each
# epoch setting, pooled over all folds.

rouge_by_epoch_set = cv_df.groupby("epoch_set")["rouge"]
for label, stat in (("Mean", rouge_by_epoch_set.mean()),
                    ("STD", rouge_by_epoch_set.std())):
    print(label)
    print(stat)


Mean
epoch_set
0     0.113179
1     0.249767
5     0.273433
8     0.273033
10    0.271249
16    0.270074
Name: rouge, dtype: float64
STD
epoch_set
0     0.124336
1     0.158044
5     0.158529
8     0.159088
10    0.159535
16    0.159240
Name: rouge, dtype: float64


### Step 2. Learn performance

In [5]:
def step_two(X_train, y_train, model, X_val=None, y_val=None,  save=False):
    """Fit one step-two regressor and either evaluate it or persist it.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed straight to the regressor's
        ``fit``.
    model : str
        Which regressor to fit: ``"lr"`` (LinearRegression), ``"svm"`` (SVR),
        ``"rf"`` (RandomForestRegressor), ``"lgbm"`` (LGBMRegressor) or
        ``"catboost"`` (CatBoostRegressor). All are built with default
        hyper-parameters.
    X_val, y_val
        Validation features and targets. Required when ``save`` is False.
    save : bool
        If True, pickle the fitted regressor to
        ``./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl`` and return that path.

    Returns
    -------
    str
        Path of the pickled regressor when ``save`` is True.
    dict
        Otherwise ``{"pred": y_pred, "mae": mae, "rmse": rmse}``, where
        negative predictions have been clipped to 0 before scoring.

    Raises
    ------
    ValueError
        If ``model`` is not a supported name, or if validation data is
        missing while ``save`` is False.
    """
    supported = ("lr", "svm", "rf", "lgbm", "catboost")
    if model not in supported:
        raise ValueError(f"Unknown model {model!r}; expected one of {supported}")
    # Fail before the (potentially slow) fit rather than at predict time.
    if not save and (X_val is None or y_val is None):
        raise ValueError("X_val and y_val are required when save=False")

    if model == "lr":
        reg = LinearRegression()
    elif model == "svm":
        reg = SVR()
    elif model == "rf":
        # Instantiate first: calling .fit on the class itself raises.
        reg = RandomForestRegressor()
    elif model == "lgbm":
        reg = LGBMRegressor()
    else:  # "catboost"
        reg = CatBoostRegressor()
    reg.fit(X_train, y_train)

    if save:
        model_path = f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl'
        # Make sure the target folder exists so the dump cannot fail on a
        # fresh checkout.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        with open(model_path, 'wb') as f:
            pickle.dump(reg, f)
        return model_path

    y_pred = reg.predict(X_val)
    # Negative predictions are replaced by 0 before the errors are computed.
    y_pred[y_pred<0] = 0
    mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
    rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
    return {"pred": y_pred, "mae": mae, "rmse": rmse}

In [6]:
# Leave-one-fold-out evaluation of the step-two regressors: each regressor is
# fitted on the rows of all other folds and predicts the per-example ROUGE of
# the held-out fold from [one-hot(epoch_set) | TF-IDF(input_sequence)].
t_models = ["lr", "svm", "lgbm", "catboost"]

results = {}


for test_fold in range(cv_df.fold.max()+1):
    print(test_fold)

    val_mask = cv_df.fold == test_fold
    train_rows = cv_df.loc[~val_mask]
    val_rows = cv_df.loc[val_mask]

    # Prepare the input data. The vectorizer is fitted on the training rows
    # only and merely applied to the held-out rows.
    # NOTE(review): the one-hot columns of train and val line up only if both
    # contain the same epoch_set values - true for the frame built above.
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(train_rows["input_sequence"])
    X_train_column_sparse = pd.get_dummies(train_rows["epoch_set"], sparse=True).sparse.to_coo().tocsr()
    X_train = hstack([X_train_column_sparse, X_train_tfidf])
    y_train = train_rows["rouge"]

    X_val_tfidf = vectorizer.transform(val_rows["input_sequence"])
    X_val_column_sparse = pd.get_dummies(val_rows["epoch_set"], sparse=True).sparse.to_coo().tocsr()
    X_val = hstack([X_val_column_sparse, X_val_tfidf])
    y_val = val_rows["rouge"]

    results[test_fold] = {}
    # `reg_name` (not `model`) so the loop does not rebind the notebook-level
    # `model` variable to a string.
    for reg_name in t_models:
        print(reg_name)
        preds_df = step_two(X_train=X_train,
                            y_train=y_train,
                            X_val=X_val,
                            y_val=y_val,
                            model=reg_name)
        cv_df.loc[val_mask, f"{reg_name}_perf_hat"] = preds_df["pred"]
        results[test_fold][reg_name] = preds_df

cv_df = cv_df.reset_index(drop=True)

# ENSEMBLE ESTIMATE (JUST HIGHEST PREDICTIONS)
# For every example id pick the epoch setting with the highest CatBoost
# estimate. idxmax returns index *labels*, so look the rows up with .loc.
best_row_labels = cv_df.groupby("id")["catboost_perf_hat"].idxmax()
optimal_ensemble = cv_df.loc[best_row_labels, ["id", "epoch_set"]]
optimal_ensemble_map = dict(zip(optimal_ensemble.id, optimal_ensemble.epoch_set))
cv_df["opt_es_id"] = cv_df.id.map(optimal_ensemble_map)

# Copy before relabelling so the assignment never targets a view of cv_df.
ensemble_preds = cv_df.loc[cv_df["epoch_set"]==cv_df["opt_es_id"], :].copy()
ensemble_preds["epoch_set"] = "ensemble"

# Append the selected rows as an extra pseudo-setting called "ensemble".
cv_df = pd.concat([cv_df, ensemble_preds], axis=0)

0
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 7.866961 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13396
[LightGBM] [Info] Number of data points in the train set: 31770, number of used features: 787
[LightGBM] [Info] Start training from score 0.244510
catboost
Learning rate set to 0.070713
0:	learn: 0.1630651	total: 66.8ms	remaining: 1m 6s
1:	learn: 0.1617789	total: 79ms	remaining: 39.4s
2:	learn: 0.1605932	total: 90.4ms	remaining: 30s
3:	learn: 0.1596134	total: 102ms	remaining: 25.5s
4:	learn: 0.1587366	total: 118ms	remaining: 23.5s
5:	learn: 0.1579606	total: 130ms	remaining: 21.5s
6:	learn: 0.1572956	total: 148ms	remaining: 20.9s
7:	learn: 0.1566981	total: 163ms	remaining: 20.2s
8:	learn: 0.1561204	total: 179ms	remaining: 19.7s
9:	learn: 0.1556037	total: 192ms	remaining: 19s
10:	learn: 0.1552219	total: 204ms	remainin

In [7]:
# Mean CatBoost-estimated ROUGE per epoch setting (the "ensemble" rows included).
cv_df.groupby("epoch_set")["catboost_perf_hat"].mean()

epoch_set
0           0.113942
1           0.254337
5           0.271368
8           0.271626
10          0.271508
16          0.271225
ensemble    0.271650
Name: catboost_perf_hat, dtype: float64

In [8]:
# Spread (standard deviation) of the CatBoost-estimated ROUGE per epoch setting.
cv_df.groupby("epoch_set")["catboost_perf_hat"].std()

epoch_set
0           0.038600
1           0.042956
5           0.043281
8           0.043261
10          0.043242
16          0.043238
ensemble    0.043274
Name: catboost_perf_hat, dtype: float64

In [9]:
# Rearrange the per-fold step-two results into one entry per regressor:
# the list of per-fold RMSE / MAE values plus their mean and (population)
# standard deviation across folds.

model_results = {}

# Take the folds from `results` itself instead of assuming there are exactly 3.
fold_ids = sorted(results)

for model in t_models:
    rmse_per_fold = [results[fold][model]["rmse"] for fold in fold_ids]
    mae_per_fold = [results[fold][model]["mae"] for fold in fold_ids]

    model_results[model] = {
        "rmse": rmse_per_fold,
        "mae": mae_per_fold,
        "rmse_avg": np.mean(rmse_per_fold),
        "mae_avg": np.mean(mae_per_fold),
        "rmse_std": np.std(rmse_per_fold),
        "mae_std": np.std(mae_per_fold),
    }

for model in t_models:
    print(model)
    print("RMSE ", model_results[model]["rmse_avg"])
    print("MAE ",model_results[model]["mae_avg"])
    print("\n")

    print("RMSE STD ", model_results[model]["rmse_std"])
    print("MAE STD",model_results[model]["mae_std"])
    print("\n")

# Persist the error summary and the CV frame that now carries the
# step-two predictions.
with open(f'reports/results/s2_model_results_{ANALYSIS_POSTFIX}.pickle', 'wb') as handle:
    pickle.dump(model_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(f'reports/results/cd_df_with_s2_{ANALYSIS_POSTFIX}.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

lr
RMSE  0.20352146390395087
MAE  0.1576347894345456


RMSE STD  0.0028135543010467893
MAE STD 0.0014060955455891407


svm
RMSE  0.16194377107147542
MAE  0.12956459256153322


RMSE STD  0.0006247477823232549
MAE STD 0.0009524230341970948


lgbm
RMSE  0.15628657573806196
MAE  0.1253384609965067


RMSE STD  0.00046297093904505547
MAE STD 0.00080434825547944


catboost
RMSE  0.1549047082913871
MAE  0.12419672593061004


RMSE STD  0.0003226794570741731
MAE STD 0.0008170206803472665




In [10]:
### TO SAVE THE VECTORIZER AND STEP 2 MODELS

# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
with open(f'reports/results/cd_df_with_s2_{ANALYSIS_POSTFIX}.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

# TRAIN ON ALL PREDICTIONS AT ONCE

t_models = ["lr", "svm", "lgbm", "catboost"]

# Prepare the input data. The synthetic "ensemble" rows are duplicates of
# real rows with a relabelled epoch_set, so they are left out of the fit.
base_rows = cv_df.loc[cv_df.epoch_set!="ensemble"]
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(base_rows["input_sequence"])
X_train_column_sparse = pd.get_dummies(base_rows["epoch_set"], sparse=True).sparse.to_coo().tocsr()
X_train = hstack([X_train_column_sparse, X_train_tfidf])
y_train = base_rows["rouge"]

# The vectorizer and the regressors are all written to ./models; create the
# folder first so the writes cannot fail on a fresh checkout.
os.makedirs("./models", exist_ok=True)

with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "wb") as file:
    pickle.dump(vectorizer, file, protocol=pickle.HIGHEST_PROTOCOL)

for model in t_models:
    print(model)
    # With save=True step_two returns the path of the pickled regressor.
    saved_model_path = step_two(X_train=X_train,
                                y_train=y_train,
                                model=model,
                                save=True)

lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 8.954804 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19721
[LightGBM] [Info] Number of data points in the train set: 47652, number of used features: 1093
[LightGBM] [Info] Start training from score 0.241789
catboost
Learning rate set to 0.07539
0:	learn: 0.1627395	total: 22.1ms	remaining: 22.1s
1:	learn: 0.1614162	total: 36.6ms	remaining: 18.3s
2:	learn: 0.1603203	total: 49ms	remaining: 16.3s
3:	learn: 0.1593231	total: 60.7ms	remaining: 15.1s
4:	learn: 0.1584625	total: 72.6ms	remaining: 14.5s
5:	learn: 0.1576963	total: 83.3ms	remaining: 13.8s
6:	learn: 0.1570391	total: 93.6ms	remaining: 13.3s
7:	learn: 0.1564160	total: 104ms	remaining: 12.9s
8:	learn: 0.1559253	total: 114ms	remaining: 12.5s
9:	learn: 0.1554659	total: 124ms	remaining: 12.2s
10:	learn: 0.1551260	total: 134ms	re

In [11]:
# Display the run suffix used in the artefact file names of this notebook.
# It embeds date.today() from the settings cell, so a kernel started on a
# different day looks for (and writes) differently named files.
ANALYSIS_POSTFIX

'mined_slight_2024-07-28'