### 1. Settings

In [1]:
#####################################
##########  DEPENDENCIES ###########
#####################################

import os
import pickle
import numpy as np
from tqdm import tqdm # type: ignore
import pandas as pd
import copy
from datetime import datetime, date

from datasets import load_dataset, DatasetDict, Dataset
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import KFold, train_test_split # type: ignore
import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.sparse import hstack

tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
#####################################
############  CONSTANTS #############
#####################################
# Random seed passed to the sampling and KFold calls below.
RS = 42

# Pretrained checkpoint on the Hugging Face hub, plus a short label for it.
model_name = "Salesforce/codet5-base-multi-sum"
MODEL = "CodeT5"

# Batch size and sequence-length limits carried inside FULL_TRAIN_ARGS.
BATCH_SIZE = 16
ENCODER_LENGTH = 30
DECODER_LENGTH = 20

# Run tag embedded in every artifact file name; it contains today's date,
# so artifacts written on one day are not found by a session started on another.
ANALYSIS_POSTFIX = f"mined_sudden_{date.today().isoformat()}"
# Name of the dated sub-folder under ../data/processed/conala/ that is read below.
DATE_STR = 20240721
# True -> cluster-based drift split; False -> plain 80/20 split by question id.
SEMANTIC_DRIFT = True

FULL_TRAIN_ARGS = {
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "MODEL": MODEL,
    # Keyword arguments for Seq2SeqTrainingArguments.
    "SEQ_TRAINER_ARGS": {
        "overwrite_output_dir": True,
        # Cumulative epoch budgets to evaluate; the CV loop replaces this list
        # with a single integer before the arguments object is built.
        "num_train_epochs": [0, 1, 5, 8, 10, 16],
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 6e-6,
        "warmup_steps": 500,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
        "output_dir": "reports/results",
        "logging_dir": "reports/logs",
    },
}

# Tokenizer and base model for the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. Conala data. Preprocessing. 

In [2]:

# Load the clustered CoNaLa-mined table; the code below reads its `cluster`,
# `question_id` and `intent` columns.
dataset = pd.read_csv(f"../data/processed/conala/{DATE_STR}/conala_mined_clustered.csv")

if SEMANTIC_DRIFT:
    # Fixed-size samples: 2000 rows from cluster 4 and 8000 rows from all other clusters.
    dataset_4_cl = dataset[dataset.cluster==4].sample(n=2000, random_state=RS)
    dataset_non_4_cl = dataset[dataset.cluster!=4].sample(n=8000, random_state=RS)

    # Cluster 4: only the LAST 1% of the sorted question ids go to train, the first
    # 99% go to test, so cluster 4 is almost absent from training.
    # NOTE(review): presumably this is the intended "sudden drift" setup - confirm.
    qids_4_cl = sorted(dataset_4_cl.question_id.unique())
    train_idx_4_cl, test_idx_4_cl = qids_4_cl[int(len(qids_4_cl)*0.99):], qids_4_cl[:int(len(qids_4_cl)*0.99)]

    # All other clusters: the usual direction, first 99% of ids to train, last 1% to test.
    qids_non4_cl = sorted(dataset_non_4_cl.question_id.unique())
    train_idx_non4_cl, test_idx_non4_cl = qids_non4_cl[:int(len(qids_non4_cl)*0.99)], qids_non4_cl[int(len(qids_non4_cl)*0.99):]

    # Rows are assigned by question id, so one question never straddles train and test.
    train_dataset_4cl = dataset_4_cl[dataset_4_cl.question_id.isin(train_idx_4_cl)]
    test_dataset_4cl = dataset_4_cl[dataset_4_cl.question_id.isin(test_idx_4_cl)]

    train_dataset_non4cl = dataset_non_4_cl[dataset_non_4_cl.question_id.isin(train_idx_non4_cl)]
    test_dataset_non4cl = dataset_non_4_cl[dataset_non_4_cl.question_id.isin(test_idx_non4_cl)]

    # Merge both parts and shuffle with the fixed seed.
    train_dataset = pd.concat([train_dataset_4cl, train_dataset_non4cl], axis=0).sample(frac=1, random_state=RS).reset_index(drop=True)
    test_dataset = pd.concat([test_dataset_4cl, test_dataset_non4cl], axis=0).sample(frac=1, random_state=RS).reset_index(drop=True)
    
else:
    # No drift: 80/20 split over the sorted question ids.
    qids = sorted(dataset.question_id.unique())
    train_idx, test_idx = qids[:int(len(qids)*0.8)], qids[int(len(qids)*0.8):]
    train_dataset = dataset[dataset.question_id.isin(train_idx)]
    test_dataset = dataset[dataset.question_id.isin(test_idx)]


print("Train Data: ", train_dataset.shape)
print("Test Data: ", test_dataset.shape)

print("Train Data: Cluster", train_dataset.cluster.value_counts())
print("Test Data: Cluster", test_dataset.cluster.value_counts())

# Convert to Hugging Face Datasets (shuffled once more with the same seed).
train_dataset = Dataset.from_pandas(train_dataset.sample(frac=1, random_state=RS).reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_dataset.sample(frac=1, random_state=RS).reset_index(drop=True))

# Prepare the held-out set with the project helper (see utils.prep) and give each
# prepared row a positional id.
test_data = pr.preprocess_dataset(test_dataset, tokenizer=tokenizer, intent_colum_name="intent")
test_df = pd.DataFrame(test_data)
test_df["id"] = test_df.index

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rouge = evaluate.load('rouge')


# Cross Validation
# The 3 folds are drawn over unique question ids (not rows); `splits` stores, per
# fold, positions into `questions_list` for the train and validation ids.
folds = KFold(n_splits=3, random_state=RS, shuffle=True)
questions_list = np.array(list(set(train_dataset["question_id"])))
splits_obj = folds.split(questions_list)
splits = []
for i, (train_idxs, val_idxs) in enumerate(splits_obj):
    print(f"Fold {i}")
    splits.append([train_idxs, val_idxs])

Train Data:  (7942, 11)
Test Data:  (2058, 11)
Train Data: Cluster cluster
2    3632
3    2204
1    1672
0     414
4      20
Name: count, dtype: int64
Test Data: Cluster cluster
4    1980
3      39
2      25
1      12
0       2
Name: count, dtype: int64


Filter: 100%|█████████████████████| 2058/2058 [00:00<00:00, 95911.88 examples/s]
Filter: 100%|█████████████████████| 2058/2058 [00:00<00:00, 33375.65 examples/s]
Map: 100%|█████████████████████████| 2058/2058 [00:01<00:00, 1801.51 examples/s]


Fold 0
Fold 1
Fold 2


In [3]:
# Cross-validated fine-tuning.
#
# For every fold the model is trained in stages up to each cumulative epoch
# budget in `num_train_epochs` (sorted ascending). After each stage the
# validation part of the fold is summarised, scored with per-row ROUGE-1 and the
# weights are written to FOLD_MODEL_PATH so the next stage can continue from them.
#
# NOTE(review): every stage builds a fresh Seq2SeqTrainer from saved weights only,
# so optimizer / LR-scheduler state (including warm-up) presumably restarts at each
# stage - a budget of N epochs may differ from one uninterrupted N-epoch run.

epoch_sets = sorted(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"])
FOLD_MODEL_PATH = "./tmp/"
INFERENCE_CHUNK_SIZE = 1000  # number of inputs handed to generate_summary per call

# fold_results[epoch_set][fold] -> DataFrame of validation rows with predictions
fold_results = {epoch_set: {} for epoch_set in epoch_sets}

os.makedirs('reports/', exist_ok=True)

for i, (train_idxs, val_idxs) in enumerate(splits):

    print(f"Fold {i}")

    # Build the membership sets once per fold instead of re-indexing the NumPy
    # array (and scanning it linearly) for every row that is filtered.
    train_qids = set(questions_list[train_idxs].tolist())
    val_qids = set(questions_list[val_idxs].tolist())

    fold_dataset = DatasetDict({
        "train": train_dataset.filter(lambda row: row["question_id"] in train_qids),
        "validation": train_dataset.filter(lambda row: row["question_id"] in val_qids),
    })
    fold_train = pr.preprocess_dataset(fold_dataset["train"], tokenizer=tokenizer, intent_colum_name="intent")
    fold_val = pr.preprocess_dataset(fold_dataset["validation"], tokenizer=tokenizer, intent_colum_name="intent")

    # Epochs already trained into the checkpoint at FOLD_MODEL_PATH *for this fold*.
    # Resetting it here keeps the value from leaking in from the previous fold and
    # makes the loop valid for schedules that do not start with 0 or 1.
    latest_run_epoch = 0

    for epoch_set in epoch_sets:

        fold_df = pd.DataFrame(fold_val)
        print(f"TRAINING EPOCH SET {epoch_set}")

        # Only the missing epochs are trained on top of the previous stage.
        TRAIN_ARGS = copy.deepcopy(FULL_TRAIN_ARGS)
        epochs_to_train = epoch_set - latest_run_epoch
        TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = epochs_to_train

        print(f'TRAINING EPOCHS {TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"]}')

        # Continue from this fold's checkpoint when it holds trained epochs,
        # otherwise start from the pretrained weights.
        checkpoint = FOLD_MODEL_PATH if latest_run_epoch > 0 else model_name
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
        print(f"LOADING MODEL {checkpoint}")

        print(device)
        model.to(device)

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        compute_metrics = ev.compute_metric_with_params(tokenizer)

        training_args = Seq2SeqTrainingArguments(
            **TRAIN_ARGS["SEQ_TRAINER_ARGS"],
        )

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=fold_train,
            eval_dataset=fold_val,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        # A budget of 0 evaluates the untouched pretrained model.
        if epochs_to_train > 0:
            trainer.train()

        # Generate summaries for the validation inputs in fixed-size chunks;
        # element [1] of generate_summary's result holds the generated texts.
        text = fold_val["input_sequence"]
        summaries = []
        for chunk_start in range(0, len(text), INFERENCE_CHUNK_SIZE):
            chunk = text[chunk_start:chunk_start + INFERENCE_CHUNK_SIZE]
            summaries.extend(
                infer.generate_summary(chunk,
                                       model,
                                       tokenizer,
                                       TRAIN_ARGS["ENCODER_LENGTH"],
                                       TRAIN_ARGS["DECODER_LENGTH"])[1]
            )
        fold_df["prediction"] = summaries

        # use_aggregator=False -> one ROUGE-1 score per row.
        fold_df["rouge"] = rouge.compute(predictions=fold_df["prediction"],
                    references=fold_df["output_sequence"],
                    use_stemmer=True,
                    use_aggregator=False,
                    rouge_types=["rouge1"])["rouge1"]

        fold_results[epoch_set][i] = fold_df

        ########## SAVE FOLD MODEL
        os.makedirs(FOLD_MODEL_PATH, exist_ok=True)
        trainer.save_model(FOLD_MODEL_PATH)

        latest_run_epoch = epoch_set

########## CONVERT TO DATAFRAME

# Tag every frame with its fold and epoch budget, then concatenate once.
tagged_frames = []
for epoch_set, frames_by_fold in fold_results.items():
    for k, f_df in frames_by_fold.items():
        f_df['fold'] = k
        f_df['epoch_set'] = epoch_set
        tagged_frames.append(f_df)

cv_df = pd.concat(tagged_frames)

########## SAVE THE FILE

os.makedirs('reports/results', exist_ok=True)
with open(f'reports/results/cv_result_{ANALYSIS_POSTFIX}.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fold 0


Filter: 100%|█████████████████████| 7942/7942 [00:00<00:00, 47300.06 examples/s]
Filter: 100%|█████████████████████| 7942/7942 [00:00<00:00, 65107.66 examples/s]
Filter: 100%|█████████████████████| 5298/5298 [00:00<00:00, 29565.92 examples/s]
Filter: 100%|█████████████████████| 5298/5298 [00:00<00:00, 32420.68 examples/s]
Map: 100%|█████████████████████████| 5298/5298 [00:02<00:00, 1845.86 examples/s]
Filter: 100%|█████████████████████| 2644/2644 [00:00<00:00, 33774.66 examples/s]
Filter: 100%|█████████████████████| 2644/2644 [00:00<00:00, 33875.97 examples/s]
Map: 100%|█████████████████████████| 2644/2644 [00:01<00:00, 2374.32 examples/s]


TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.1817,4.698771,0.2487,0.0616,0.2212,0.2213,14.1649,0.0215,1.0,1.1625,29444,25328


TRAINING EPOCH SET 5
TRAINING EPOCHS 4
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.6798,4.561677,0.2599,0.0666,0.2298,0.2299,14.1528,0.0207,1.0,1.1572,29309,25328
2,4.5816,4.505205,0.265,0.0697,0.2336,0.2335,14.0564,0.0225,1.0,1.1441,28979,25328
3,4.4846,4.484755,0.2674,0.0715,0.2352,0.2352,14.0061,0.0233,1.0,1.1344,28732,25328
4,4.4337,4.477675,0.2687,0.0728,0.2366,0.2367,14.09,0.0243,1.0,1.1442,28981,25328


TRAINING EPOCH SET 8
TRAINING EPOCHS 3
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.2429,4.479573,0.2658,0.0693,0.2327,0.2327,14.1483,0.0232,1.0,1.1477,29068,25328
2,4.2185,4.470181,0.2703,0.0724,0.2367,0.2366,14.0344,0.0233,1.0,1.1356,28762,25328
3,4.2043,4.4659,0.2692,0.0716,0.2354,0.2353,14.166,0.0227,1.0,1.1473,29058,25328


TRAINING EPOCH SET 10
TRAINING EPOCHS 2


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9388,4.5065,0.2631,0.0672,0.2311,0.2311,14.0692,0.0224,1.0,1.136,28773,25328
2,3.9584,4.50456,0.2677,0.0696,0.234,0.2339,13.9391,0.0229,1.0,1.1237,28461,25328


TRAINING EPOCH SET 16
TRAINING EPOCHS 6


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6751,4.572701,0.2582,0.0665,0.2271,0.2272,13.9482,0.0206,1.0,1.1203,28375,25328
2,3.798,4.518637,0.2659,0.0688,0.2329,0.2328,13.9939,0.0218,1.0,1.128,28571,25328
3,3.954,4.4866,0.2683,0.0709,0.235,0.2351,13.9346,0.0223,1.0,1.121,28393,25328
4,3.9976,4.47,0.2661,0.0709,0.2331,0.2333,13.924,0.0224,1.0,1.1203,28375,25328
5,3.9793,4.47636,0.2667,0.0701,0.2336,0.2337,14.0004,0.0216,1.0,1.1258,28514,25328
6,3.9533,4.479221,0.2654,0.069,0.2321,0.2323,13.9875,0.021,1.0,1.1269,28543,25328


Fold 1


Filter: 100%|█████████████████████| 7942/7942 [00:00<00:00, 53509.59 examples/s]
Filter: 100%|█████████████████████| 7942/7942 [00:00<00:00, 67424.81 examples/s]
Filter: 100%|█████████████████████| 5295/5295 [00:00<00:00, 32795.00 examples/s]
Filter: 100%|█████████████████████| 5295/5295 [00:00<00:00, 33722.46 examples/s]
Map: 100%|█████████████████████████| 5295/5295 [00:02<00:00, 1856.46 examples/s]
Filter: 100%|█████████████████████| 2647/2647 [00:00<00:00, 33507.44 examples/s]
Filter: 100%|█████████████████████| 2647/2647 [00:00<00:00, 32330.11 examples/s]
Map: 100%|█████████████████████████| 2647/2647 [00:01<00:00, 1882.11 examples/s]


TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.1708,4.710202,0.2492,0.061,0.2217,0.2219,14.0385,0.0195,1.0,1.1426,29242,25592


TRAINING EPOCH SET 5
TRAINING EPOCHS 4
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.6706,4.571948,0.2701,0.0716,0.2375,0.2374,14.0831,0.0246,1.0,1.1401,29178,25592
2,4.571,4.509912,0.2721,0.0733,0.2382,0.2383,13.9698,0.0262,1.0,1.1286,28884,25592
3,4.4792,4.48725,0.2734,0.0733,0.2394,0.2394,14.1073,0.0253,1.0,1.138,29124,25592
4,4.4222,4.483542,0.2755,0.0734,0.2405,0.2407,14.071,0.0256,1.0,1.1332,29000,25592


TRAINING EPOCH SET 8
TRAINING EPOCHS 3
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.2315,4.491459,0.2759,0.0734,0.2404,0.2406,14.1711,0.0264,1.0,1.1404,29184,25592
2,4.2073,4.473559,0.2752,0.0727,0.2402,0.24,14.0608,0.0266,1.0,1.1286,28884,25592
3,4.1976,4.472107,0.2762,0.0726,0.2411,0.2411,14.0257,0.026,1.0,1.1232,28746,25592


TRAINING EPOCH SET 10
TRAINING EPOCHS 2


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9242,4.526834,0.2709,0.0697,0.2367,0.2368,14.1572,0.0244,1.0,1.1351,29050,25592
2,3.9467,4.513162,0.2711,0.0693,0.2369,0.2369,13.9868,0.0241,1.0,1.1177,28603,25592


TRAINING EPOCH SET 16
TRAINING EPOCHS 6


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6578,4.604552,0.2663,0.0667,0.232,0.232,14.1099,0.0224,1.0,1.1322,28975,25592
2,3.7852,4.518794,0.27,0.0692,0.2359,0.236,14.0903,0.0243,1.0,1.1323,28977,25592
3,3.9451,4.487727,0.2686,0.0697,0.2361,0.236,14.034,0.0254,1.0,1.1219,28711,25592
4,3.9782,4.487778,0.2709,0.0695,0.2365,0.2366,14.1564,0.0245,1.0,1.1368,29094,25592
5,3.9615,4.488613,0.2699,0.0701,0.2366,0.2367,14.1012,0.0247,1.0,1.1263,28825,25592
6,3.9338,4.492178,0.2697,0.0701,0.2361,0.2363,14.1371,0.0248,1.0,1.1316,28961,25592


Fold 2


Filter: 100%|█████████████████████| 7942/7942 [00:00<00:00, 53473.34 examples/s]
Filter: 100%|█████████████████████| 7942/7942 [00:00<00:00, 64258.88 examples/s]
Filter: 100%|█████████████████████| 5291/5291 [00:00<00:00, 32619.75 examples/s]
Filter: 100%|█████████████████████| 5291/5291 [00:00<00:00, 33629.43 examples/s]
Map: 100%|█████████████████████████| 5291/5291 [00:02<00:00, 1903.93 examples/s]
Filter: 100%|█████████████████████| 2651/2651 [00:00<00:00, 35592.05 examples/s]
Filter: 100%|█████████████████████| 2651/2651 [00:00<00:00, 35963.43 examples/s]
Map: 100%|█████████████████████████| 2651/2651 [00:01<00:00, 2047.11 examples/s]


TRAINING EPOCH SET 0
TRAINING EPOCHS 0


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda
TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.18,4.713183,0.2403,0.0617,0.2159,0.216,14.017,0.0202,1.0,1.1579,29271,25280


TRAINING EPOCH SET 5
TRAINING EPOCHS 4
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.6788,4.577013,0.2608,0.0722,0.232,0.232,14.0219,0.0245,1.0,1.1485,29033,25280
2,4.5745,4.520506,0.2694,0.0738,0.238,0.2381,13.9415,0.0249,1.0,1.1382,28773,25280
3,4.4696,4.501167,0.2737,0.0753,0.2418,0.2417,14.143,0.0252,1.0,1.1571,29252,25280
4,4.4227,4.495306,0.2736,0.0751,0.2413,0.2414,14.0932,0.0245,1.0,1.1517,29114,25280


TRAINING EPOCH SET 8
TRAINING EPOCHS 3


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.2411,4.499412,0.2682,0.0735,0.2372,0.2372,14.0034,0.0238,1.0,1.1372,28748,25280
2,4.2089,4.486858,0.271,0.0739,0.2392,0.2392,13.9694,0.0247,1.0,1.1362,28722,25280
3,4.1886,4.485295,0.2703,0.074,0.2382,0.2383,14.0645,0.0245,1.0,1.1451,28949,25280


TRAINING EPOCH SET 10
TRAINING EPOCHS 2


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9371,4.532994,0.2643,0.0704,0.2328,0.233,13.9842,0.0238,1.0,1.1303,28574,25280
2,3.9495,4.521092,0.2667,0.0705,0.2343,0.2346,13.9608,0.0235,1.0,1.1312,28596,25280


TRAINING EPOCH SET 16
TRAINING EPOCHS 6
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6765,4.609656,0.2598,0.0682,0.2283,0.2286,13.9834,0.023,1.0,1.1252,28446,25280
2,3.79,4.533356,0.2671,0.0718,0.2344,0.2345,13.7842,0.0244,1.0,1.1131,28140,25280
3,3.9354,4.508908,0.2696,0.0724,0.2365,0.2366,14.0634,0.025,1.0,1.1408,28840,25280
4,3.9802,4.49993,0.2669,0.0714,0.235,0.2351,13.9694,0.0243,1.0,1.1322,28621,25280
5,3.964,4.499954,0.2673,0.0715,0.2345,0.2345,14.1203,0.0245,1.0,1.1453,28953,25280
6,3.9399,4.503712,0.2669,0.0715,0.2342,0.2342,14.0415,0.0248,1.0,1.1388,28789,25280


In [4]:
########## LOAD CV RESULTS

import pickle

cv_result_path = f'reports/results/cv_result_{ANALYSIS_POSTFIX}.pickle'
with open(cv_result_path, 'rb') as handle:
    cv_df = pickle.load(handle)

########## ROUGE PER SETTING

# Per-row ROUGE-1 grouped by cumulative epoch budget, pooled over all folds.
rouge_by_epoch_set = cv_df.groupby("epoch_set")["rouge"]

for label, stat in (("Mean", rouge_by_epoch_set.mean()), ("STD", rouge_by_epoch_set.std())):
    print(label)
    print(stat)


Mean
epoch_set
0     0.113737
1     0.247452
5     0.273484
8     0.273170
10    0.270033
16    0.268677
Name: rouge, dtype: float64
STD
epoch_set
0     0.124526
1     0.157347
5     0.158682
8     0.158973
10    0.158828
16    0.158742
Name: rouge, dtype: float64


### Step 2. Learn performance

In [5]:
def step_two(X_train, y_train, model, X_val=None, y_val=None,  save=False):
    """Fit one step-2 regressor that predicts the ROUGE score of a sample.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed straight to the regressor's ``fit``.
    model : str
        Which regressor to fit: "lr", "svm", "rf", "lgbm" or "catboost".
    X_val, y_val
        Validation features and targets; required when ``save`` is False.
    save : bool
        If True, pickle the fitted regressor under ``./models`` and return the
        file path instead of evaluating it.

    Returns
    -------
    str
        Path of the pickled regressor, when ``save`` is True.
    dict
        ``{"pred", "mae", "rmse"}`` for the validation data, when ``save`` is
        False. Negative predictions are clipped to 0 before scoring.

    Raises
    ------
    ValueError
        If ``model`` is not a supported name, or if ``save`` is False and
        validation data is missing.
    """
    supported = ("lr", "svm", "rf", "lgbm", "catboost")
    if model not in supported:
        raise ValueError(f"Unknown step-2 model {model!r}; expected one of {supported}")
    if not save and (X_val is None or y_val is None):
        raise ValueError("X_val and y_val are required when save=False")

    if model == "lr":
        reg = LinearRegression()
    elif model == "svm":
        reg = SVR()
    elif model == "rf":
        # Instantiate first: calling .fit on the class itself raises a TypeError.
        reg = RandomForestRegressor()
    elif model == "lgbm":
        reg = LGBMRegressor()
    else:
        reg = CatBoostRegressor()

    reg.fit(X_train, y_train)

    if save:
        os.makedirs('./models', exist_ok=True)
        model_path = f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl'
        with open(model_path, 'wb') as f:
            pickle.dump(reg, f)
        return model_path

    y_pred = reg.predict(X_val)
    # Predictions below zero are replaced by 0 before the errors are computed.
    y_pred[y_pred<0] = 0
    mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
    rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
    return {"pred": y_pred, "mae": mae, "rmse": rmse}

In [6]:
# Leave-one-fold-out evaluation of the step-2 regressors, followed by a simple
# "ensemble" that picks, per sample id, the epoch budget with the highest
# CatBoost-predicted ROUGE.
t_models = ["lr", "svm", "lgbm", "catboost"]

results = {}

# Make the cell re-runnable: drop "ensemble" rows appended by an earlier run so
# they are not fed back into training.
cv_df = cv_df.loc[cv_df["epoch_set"] != "ensemble"].copy()

# One fixed category list so train and validation one-hot blocks always have the
# same columns in the same order, even if a fold lacks some epoch budget.
epoch_dtype = pd.CategoricalDtype(categories=sorted(cv_df["epoch_set"].unique()))

for test_fold in range(cv_df.fold.max()+1):
    print(test_fold)

    is_val = cv_df["fold"] == test_fold
    train_rows = cv_df.loc[~is_val]
    val_rows = cv_df.loc[is_val]

    # Prepare the input data: one-hot epoch budget followed by TF-IDF of the input
    # text. The vectorizer is fitted on the training folds only.
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(train_rows["input_sequence"])
    X_train_column_sparse = pd.get_dummies(train_rows["epoch_set"].astype(epoch_dtype), sparse=True).sparse.to_coo().tocsr()
    X_train = hstack([X_train_column_sparse, X_train_tfidf])
    y_train = train_rows["rouge"]

    X_val_tfidf = vectorizer.transform(val_rows["input_sequence"])
    X_val_column_sparse = pd.get_dummies(val_rows["epoch_set"].astype(epoch_dtype), sparse=True).sparse.to_coo().tocsr()
    X_val = hstack([X_val_column_sparse, X_val_tfidf])
    y_val = val_rows["rouge"]

    results[test_fold] = {}
    # `reg_name` rather than `model`, so the seq2seq model object is not overwritten.
    for reg_name in t_models:
        print(reg_name)
        preds_df = step_two(X_train=X_train,
                            y_train=y_train,
                            X_val=X_val,
                            y_val=y_val,
                            model=reg_name)
        # Predictions come back in the row order of the validation mask.
        cv_df.loc[is_val, f"{reg_name}_perf_hat"] = preds_df["pred"]
        results[test_fold][reg_name] = preds_df

cv_df = cv_df.reset_index(drop=True)

# ENSEMBLE ESTIMATE (JUST HIGHEST PREDICTIONS)
# idxmax returns index labels, hence .loc (not .iloc) for the lookup.
models_index = cv_df.groupby("id")["catboost_perf_hat"].idxmax()
optimal_ensemble = cv_df.loc[models_index, ["id", "epoch_set"]]
optimal_ensemble_map = dict(zip(optimal_ensemble.id, optimal_ensemble.epoch_set))
cv_df["opt_es_id"] = cv_df.id.map(optimal_ensemble_map)
# Copy before relabelling so the selected rows inside cv_df keep their epoch_set.
ensemble_preds = cv_df.loc[cv_df["epoch_set"]==cv_df["opt_es_id"], :].copy()
ensemble_preds["epoch_set"] = "ensemble"
cv_df = pd.concat([cv_df, ensemble_preds], axis=0)

0
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 6.716822 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13596
[LightGBM] [Info] Number of data points in the train set: 31788, number of used features: 805
[LightGBM] [Info] Start training from score 0.241274
catboost
Learning rate set to 0.070719
0:	learn: 0.1633370	total: 80.7ms	remaining: 1m 20s
1:	learn: 0.1621529	total: 97.5ms	remaining: 48.6s
2:	learn: 0.1610815	total: 107ms	remaining: 35.7s
3:	learn: 0.1601294	total: 115ms	remaining: 28.7s
4:	learn: 0.1592818	total: 124ms	remaining: 24.7s
5:	learn: 0.1585113	total: 131ms	remaining: 21.8s
6:	learn: 0.1578617	total: 139ms	remaining: 19.8s
7:	learn: 0.1572935	total: 145ms	remaining: 18s
8:	learn: 0.1567605	total: 151ms	remaining: 16.7s
9:	learn: 0.1562438	total: 157ms	remaining: 15.6s
10:	learn: 0.1558050	total: 164ms	rema

In [7]:
# Mean CatBoost-predicted ROUGE per epoch budget (including the "ensemble" rows).
cv_df.groupby("epoch_set")["catboost_perf_hat"].mean()

epoch_set
0           0.114959
1           0.252255
5           0.271516
8           0.271150
10          0.271108
16          0.270974
ensemble    0.271516
Name: catboost_perf_hat, dtype: float64

In [8]:
# Spread of the CatBoost-predicted ROUGE per epoch budget (including the "ensemble" rows).
cv_df.groupby("epoch_set")["catboost_perf_hat"].std()

epoch_set
0           0.040594
1           0.044462
5           0.044575
8           0.044559
10          0.044555
16          0.044547
ensemble    0.044574
Name: catboost_perf_hat, dtype: float64

In [9]:
# rearrange the file
# Collect the per-fold RMSE / MAE of every step-2 regressor and summarise them
# (mean and population standard deviation across folds).

model_results = {}

# Use the folds that were actually evaluated instead of assuming there are three.
fold_ids = sorted(results)

for reg_name in t_models:
    rmse_per_fold = [results[fold][reg_name]["rmse"] for fold in fold_ids]
    mae_per_fold = [results[fold][reg_name]["mae"] for fold in fold_ids]

    model_results[reg_name] = {
        "rmse": rmse_per_fold,
        "mae": mae_per_fold,
        "rmse_avg": np.array(rmse_per_fold).mean(),
        "mae_avg": np.array(mae_per_fold).mean(),
        "rmse_std": np.array(rmse_per_fold).std(),
        "mae_std": np.array(mae_per_fold).std(),
    }

for reg_name in t_models:
    print(reg_name)
    print("RMSE ", model_results[reg_name]["rmse_avg"])
    print("MAE ",model_results[reg_name]["mae_avg"])
    print("\n")

    print("RMSE STD ", model_results[reg_name]["rmse_std"])
    print("MAE STD",model_results[reg_name]["mae_std"])
    print("\n")

# Persist the metric summary and the CV frame extended with step-2 predictions.
with open(f'reports/results/s2_model_results_{ANALYSIS_POSTFIX}.pickle', 'wb') as handle:
    pickle.dump(model_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(f'reports/results/cd_df_with_s2_{ANALYSIS_POSTFIX}.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

lr
RMSE  0.21065228981774062
MAE  0.1593661271131709


RMSE STD  0.006985668136214907
MAE STD 0.0029113889319855673


svm
RMSE  0.1620067900624161
MAE  0.1293840717021081


RMSE STD  0.00013322340138227645
MAE STD 6.310339542271578e-05


lgbm
RMSE  0.15578010522471017
MAE  0.12479896677886153


RMSE STD  0.00028558116856165425
MAE STD 0.00019480164091058652


catboost
RMSE  0.1548901173693615
MAE  0.12411328433051139


RMSE STD  0.000513444056344368
MAE STD 0.00031925492375107643




In [10]:
### TO SAVE THE VECTORIZER AND STEP 2 MODELS

with open(f'reports/results/cd_df_with_s2_{ANALYSIS_POSTFIX}.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

# TRAIN ON ALL PREDICTIONS AT ONCE

t_models = ["lr", "svm", "lgbm", "catboost"]

# Prepare the input data: every real epoch-budget row (the synthetic "ensemble"
# rows are excluded), one-hot epoch budget followed by TF-IDF of the input text.
is_real_row = cv_df.epoch_set != "ensemble"
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(cv_df.loc[is_real_row, "input_sequence"])
X_train_column_sparse = pd.get_dummies(cv_df.loc[is_real_row, "epoch_set"], sparse=True).sparse.to_coo().tocsr()
X_train = hstack([X_train_column_sparse, X_train_tfidf])
y_train = cv_df.loc[is_real_row, "rouge"]

# The target directory is not created anywhere else before it is written to.
os.makedirs("./models", exist_ok=True)

with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "wb") as file:
    pickle.dump(vectorizer, file, protocol=pickle.HIGHEST_PROTOCOL)

# With save=True, step_two returns the path of the pickled regressor.
saved_model_paths = {}
for reg_name in t_models:
    print(reg_name)
    saved_model_paths[reg_name] = step_two(X_train=X_train,
                                           y_train=y_train,
                                           model=reg_name,
                                           save=True)

lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 8.989606 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19860
[LightGBM] [Info] Number of data points in the train set: 47652, number of used features: 1084
[LightGBM] [Info] Start training from score 0.241092
catboost
Learning rate set to 0.07539
0:	learn: 0.1623836	total: 19.7ms	remaining: 19.7s
1:	learn: 0.1610802	total: 35.2ms	remaining: 17.6s
2:	learn: 0.1599521	total: 48.9ms	remaining: 16.2s
3:	learn: 0.1589962	total: 63.4ms	remaining: 15.8s
4:	learn: 0.1581307	total: 75.9ms	remaining: 15.1s
5:	learn: 0.1573688	total: 86.4ms	remaining: 14.3s
6:	learn: 0.1567248	total: 109ms	remaining: 15.4s
7:	learn: 0.1561572	total: 120ms	remaining: 14.9s
8:	learn: 0.1556448	total: 130ms	remaining: 14.3s
9:	learn: 0.1552119	total: 141ms	remaining: 14s
10:	learn: 0.1548476	total: 152ms	rem

In [11]:
# Show the run tag that is embedded in the artifact file names written by this notebook.
print(ANALYSIS_POSTFIX)

mined_sudden_2024-07-28
