### 1. Settings

In [1]:
#####################################
##########  DEPENDENCIES ############
#####################################

import os
import pickle
import numpy as np
from tqdm import tqdm # type: ignore
import pandas as pd
import copy

from datasets import load_dataset, DatasetDict, Dataset
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import KFold, train_test_split # type: ignore
import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.sparse import hstack

tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
#####################################
############  CONSTANTS #############
#####################################
# Seed reused by every sampling / splitting call in this notebook.
RS = 42

MODEL = "CodeT5"
# NOTE(review): BATCH_SIZE is stored in FULL_TRAIN_ARGS, but the trainer arguments below
# set per_device_*_batch_size to 4 -- confirm which of the two is meant to apply.
BATCH_SIZE = 16
DECODER_LENGTH = 20
ENCODER_LENGTH = 30

# Master configuration. "SEQ_TRAINER_ARGS" is unpacked into Seq2SeqTrainingArguments by
# the training cell, which first replaces the "num_train_epochs" list by a single number
# for the run at hand.
FULL_TRAIN_ARGS = dict(
    BATCH_SIZE=BATCH_SIZE,
    DECODER_LENGTH=DECODER_LENGTH,
    ENCODER_LENGTH=ENCODER_LENGTH,
    MODEL=MODEL,
    SEQ_TRAINER_ARGS=dict(
        overwrite_output_dir=True,
        num_train_epochs=[0, 1, 2, 3, 4, 8, 10, 16],
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=6e-6,
        warmup_steps=500,
        weight_decay=0.1,
        label_smoothing_factor=0.1,
        predict_with_generate=True,
        logging_steps=100,
        save_total_limit=1,
        save_strategy="no",
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=False,
        output_dir='reports/results',
        logging_dir='reports/logs',
    ),
)

# Pretrained checkpoint that every fold starts from.
model_name = "Salesforce/codet5-base-multi-sum"
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. Conala data. Preprocessing. Sampling as in the paper (further, random sampling)

In [2]:
# Date-stamped folder of the processed data, and the switch that selects the
# cluster-based ("semantic drift") split over the plain question-id split.
DATE_STR = 20240721
SEMANTIC_DRIFT = True
conala_mined_csv = f"../data/processed/conala/{DATE_STR}/conala_mined_clustered.csv"
dataset = pd.read_csv(conala_mined_csv)  # debugging leftover: .head(5000) was appended here

In [3]:
# Train/test split. Splitting is always done on question ids, so all rows that share a
# question id end up on the same side.
if SEMANTIC_DRIFT:
    # Fixed-size samples: 1000 rows from cluster 4, 4000 rows from all other clusters.
    dataset_4_cl = dataset.loc[dataset.cluster == 4].sample(n=1000, random_state=RS)
    dataset_non_4_cl = dataset.loc[dataset.cluster != 4].sample(n=4000, random_state=RS)

    # Cluster 4: the FIRST 90% of the sorted question ids go to test and only the last
    # 10% to train (the reverse of the 80/20 split used for the other clusters).
    # NOTE(review): presumably this is what simulates the drift -- confirm.
    qids_4_cl = sorted(dataset_4_cl.question_id.unique())
    cut_4_cl = int(len(qids_4_cl) * 0.9)
    test_idx_4_cl = qids_4_cl[:cut_4_cl]
    train_idx_4_cl = qids_4_cl[cut_4_cl:]

    # Remaining clusters: first 80% of the sorted question ids train, last 20% test.
    qids_non4_cl = sorted(dataset_non_4_cl.question_id.unique())
    cut_non4_cl = int(len(qids_non4_cl) * 0.8)
    train_idx_non4_cl = qids_non4_cl[:cut_non4_cl]
    test_idx_non4_cl = qids_non4_cl[cut_non4_cl:]

    train_dataset_4cl = dataset_4_cl.loc[dataset_4_cl.question_id.isin(train_idx_4_cl)]
    test_dataset_4cl = dataset_4_cl.loc[dataset_4_cl.question_id.isin(test_idx_4_cl)]

    train_dataset_non4cl = dataset_non_4_cl.loc[dataset_non_4_cl.question_id.isin(train_idx_non4_cl)]
    test_dataset_non4cl = dataset_non_4_cl.loc[dataset_non_4_cl.question_id.isin(test_idx_non4_cl)]

    # Merge both parts and shuffle the rows.
    train_dataset = (
        pd.concat([train_dataset_4cl, train_dataset_non4cl])
        .sample(frac=1, random_state=RS)
        .reset_index(drop=True)
    )
    test_dataset = (
        pd.concat([test_dataset_4cl, test_dataset_non4cl])
        .sample(frac=1, random_state=RS)
        .reset_index(drop=True)
    )

else:
    # Plain split: first 80% of the sorted question ids train, the rest test.
    qids = sorted(dataset.question_id.unique())
    cut = int(len(qids) * 0.8)
    train_idx = qids[:cut]
    test_idx = qids[cut:]
    train_dataset = dataset.loc[dataset.question_id.isin(train_idx)]
    test_dataset = dataset.loc[dataset.question_id.isin(test_idx)]

print("Train Data: ", train_dataset.shape)
print("Test Data: ", test_dataset.shape)

# Shuffle (again) and convert to Hugging Face datasets.
train_shuffled = train_dataset.sample(frac=1, random_state=RS).reset_index(drop=True)
test_shuffled = test_dataset.sample(frac=1, random_state=RS).reset_index(drop=True)
train_dataset = Dataset.from_pandas(train_shuffled)
test_dataset = Dataset.from_pandas(test_shuffled)

# Preprocessed hold-out data as a frame, with a positional "id" column.
test_data = pr.preprocess_dataset(test_dataset, tokenizer=tokenizer, intent_colum_name="intent")
test_df = pd.DataFrame(test_data)
test_df["id"] = test_df.index

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
rouge = evaluate.load('rouge')


# Cross Validation
# The folds are built over unique question ids; `splits` stores index arrays into
# `questions_list`, not row numbers of `train_dataset`.
# NOTE(review): `questions_list` inherits the iteration order of a Python set -- confirm
# that this order is stable for the id type in use if fold membership must be
# reproducible across kernels.
folds = KFold(n_splits=3, shuffle=True, random_state=RS)
questions_list = np.array(list(set(train_dataset["question_id"])))
splits_obj = folds.split(questions_list)
splits = []
for i, fold_pair in enumerate(splits_obj):
    train_idxs, val_idxs = fold_pair
    print(f"Fold {i}")
    splits.append([train_idxs, val_idxs])
    print(val_idxs)

Train Data:  (3294, 11)
Test Data:  (1706, 11)


Filter: 100%|██████████| 1706/1706 [00:00<00:00, 97895.59 examples/s]
Filter:   0%|          | 0/1706 [00:00<?, ? examples/s]

Filter: 100%|██████████| 1706/1706 [00:00<00:00, 38266.86 examples/s]
Map: 100%|██████████| 1706/1706 [00:00<00:00, 1871.65 examples/s]


Fold 0
[   0    7   12 ... 3104 3107 3110]
Fold 1
[   2    6    8 ... 3105 3108 3109]
Fold 2
[   1    3    4 ... 3092 3102 3106]


In [4]:
# Staged, cross-validated fine-tuning.
#
# Every entry of FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] is a cumulative
# epoch budget. Within a fold the budgets are visited in ascending order; each stage
# continues from the checkpoint written by the previous stage of the SAME fold and trains
# only the missing epochs. After each stage the validation part of the fold is
# summarised and scored with per-example ROUGE-1.
#
# NOTE(review): a new Seq2SeqTrainer is created for every stage and only
# trainer.save_model() output is carried over, so optimizer / LR-scheduler state
# (including warmup_steps) presumably restarts at each stage -- confirm this is intended
# before comparing a budget with an uninterrupted run of the same length.

# Rolling checkpoint shared by consecutive stages of one fold.
FOLD_MODEL_PATH = "./tmp/"
# Upper bound on the number of inputs passed to infer.generate_summary in one call.
INFERENCE_CHUNK_SIZE = 1000

epoch_sets = sorted(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"])

# fold_results[epoch_set][fold] -> validation frame with "prediction" and "rouge" columns
fold_results = {epoch_set: {} for epoch_set in epoch_sets}

os.makedirs('reports/', exist_ok=True)
os.makedirs(FOLD_MODEL_PATH, exist_ok=True)

for i, (train_idxs, val_idxs) in enumerate(splits):

    print(f"Fold {i}")
    # `train_idxs` / `val_idxs` index into `questions_list`, so rows are selected by
    # question id.
    fold_dataset = DatasetDict({
        "train": train_dataset.filter(lambda example: example["question_id"] in questions_list[train_idxs]),
        "validation": train_dataset.filter(lambda example: example["question_id"] in questions_list[val_idxs]),
    })
    fold_train = pr.preprocess_dataset(fold_dataset["train"], tokenizer=tokenizer, intent_colum_name="intent")
    fold_val = pr.preprocess_dataset(fold_dataset["validation"], tokenizer=tokenizer, intent_colum_name="intent")

    # Epochs already trained into FOLD_MODEL_PATH for THIS fold. Resetting it here keeps
    # a new fold from resuming the previous fold's checkpoint, and makes the loop work
    # when the smallest budget is greater than 1.
    latest_run_epoch = 0

    for epoch_set in epoch_sets:

        fold_df = pd.DataFrame(fold_val)
        print(f"TRAINING EPOCH SET {epoch_set}")

        TRAIN_ARGS = copy.deepcopy(FULL_TRAIN_ARGS)
        # Only the epochs missing to reach the cumulative budget are trained.
        TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = epoch_set - latest_run_epoch

        print(f'TRAINING EPOCHS {TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"]}')

        if latest_run_epoch > 0:
            # An earlier stage of this fold has trained: continue from its weights.
            model = AutoModelForSeq2SeqLM.from_pretrained(FOLD_MODEL_PATH)
            print(f"LOADING MODEL {FOLD_MODEL_PATH}")
        else:
            # Nothing trained yet in this fold: start from the pretrained checkpoint.
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
            print(f"LOADING MODEL {model_name}")

        print(device)
        model.to(device)

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        compute_metrics = ev.compute_metric_with_params(tokenizer)

        training_args = Seq2SeqTrainingArguments(
            **TRAIN_ARGS["SEQ_TRAINER_ARGS"],
        )

        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=fold_train,
            eval_dataset=fold_val,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        # Budget 0 is the zero-shot baseline: the trainer is still needed for saving.
        if epoch_set != 0:
            trainer.train()

        # Summarise the validation inputs in chunks; element [1] of the helper's return
        # value is the sequence of generated summaries.
        text = fold_val["input_sequence"]
        summaries = []
        for batch_start_idx in range(0, len(text), INFERENCE_CHUNK_SIZE):
            batch_end_idx = min(batch_start_idx + INFERENCE_CHUNK_SIZE, len(text))
            summary = infer.generate_summary(text[batch_start_idx:batch_end_idx],
                                             model,
                                             tokenizer,
                                             TRAIN_ARGS["ENCODER_LENGTH"],
                                             TRAIN_ARGS["DECODER_LENGTH"])[1]
            summaries.extend(summary)

        fold_df["prediction"] = summaries

        # use_aggregator=False yields one ROUGE-1 score per validation example.
        fold_df["rouge"] = rouge.compute(predictions=fold_df["prediction"],
                                         references=fold_df["output_sequence"],
                                         use_stemmer=True,
                                         use_aggregator=False,
                                         rouge_types=["rouge1"])["rouge1"]

        fold_results[epoch_set][i] = fold_df

        ########## SAVE FOLD MODEL
        trainer.save_model(FOLD_MODEL_PATH)

        latest_run_epoch = epoch_set

########## CONVERT TO DATAFRAME

# Tag every frame with its fold and epoch budget, then stack them in one concat
# (epoch budgets ascending, folds ascending within a budget). The original row index of
# each frame is kept.
for epoch_set, frames_by_fold in fold_results.items():
    for k, f_df in frames_by_fold.items():
        f_df['fold'] = k
        f_df['epoch_set'] = epoch_set

cv_df = pd.concat(
    [f_df for frames_by_fold in fold_results.values() for f_df in frames_by_fold.values()]
)



Fold 0


Filter: 100%|██████████| 3294/3294 [00:00<00:00, 63645.20 examples/s]
Filter: 100%|██████████| 3294/3294 [00:00<00:00, 74628.04 examples/s]
Filter: 100%|██████████| 2201/2201 [00:00<00:00, 33943.18 examples/s]
Filter: 100%|██████████| 2201/2201 [00:00<00:00, 37447.32 examples/s]
Map: 100%|██████████| 2201/2201 [00:01<00:00, 2067.65 examples/s]
Filter: 100%|██████████| 1093/1093 [00:00<00:00, 45352.13 examples/s]
Filter: 100%|██████████| 1093/1093 [00:00<00:00, 47699.24 examples/s]
Map: 100%|██████████| 1093/1093 [00:00<00:00, 2595.77 examples/s]


TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.5585,4.872462,0.227,0.0551,0.2051,0.2054,13.6761,0.0198,1.0,1.126,11853,10527


TRAINING EPOCH SET 2
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.8842,4.723914,0.2476,0.064,0.2247,0.2247,13.6862,0.0215,1.0,1.1091,11676,10527


TRAINING EPOCH SET 3
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.6278,4.675978,0.2626,0.0682,0.2353,0.2353,13.7511,0.0212,1.0,1.1128,11714,10527


TRAINING EPOCH SET 4
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.4313,4.68272,0.2613,0.068,0.2344,0.2343,13.7475,0.0223,1.0,1.1151,11739,10527


TRAINING EPOCH SET 8
TRAINING EPOCHS 4
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.2512,4.672801,0.2608,0.0678,0.2324,0.2326,13.9021,0.0213,1.0,1.1282,11877,10527
2,4.5621,4.553144,0.2649,0.0708,0.2388,0.2385,13.9671,0.0254,1.0,1.136,11959,10527
3,4.4617,4.532789,0.2723,0.0738,0.2433,0.2433,14.065,0.0238,1.0,1.1395,11995,10527
4,4.4005,4.529425,0.2716,0.0744,0.2441,0.2442,13.9478,0.0261,1.0,1.1302,11898,10527


TRAINING EPOCH SET 10
TRAINING EPOCHS 2
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9244,4.682295,0.2625,0.0678,0.2308,0.2311,14.0494,0.0221,1.0,1.1344,11942,10527
2,4.1668,4.57616,0.2692,0.0707,0.2375,0.2378,13.936,0.0241,1.0,1.1319,11916,10527


TRAINING EPOCH SET 16
TRAINING EPOCHS 6
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6764,4.818122,0.2508,0.0643,0.2213,0.2217,13.9588,0.0198,1.0,1.1216,11807,10527
2,3.9578,4.582401,0.2676,0.0696,0.2353,0.2352,13.7539,0.0232,1.0,1.1139,11726,10527
3,4.0938,4.559391,0.2702,0.0722,0.2375,0.2378,13.9835,0.0243,1.0,1.1293,11888,10527
4,4.1048,4.543438,0.2737,0.0732,0.2389,0.2393,13.979,0.025,1.0,1.1301,11897,10527
5,4.1015,4.548637,0.2737,0.0734,0.2399,0.2399,14.0339,0.0258,1.0,1.1364,11963,10527
6,4.0671,4.547963,0.2732,0.0733,0.2396,0.2397,14.0073,0.0252,1.0,1.1324,11921,10527


Fold 1


Filter: 100%|██████████| 3294/3294 [00:00<00:00, 64551.57 examples/s]
Filter: 100%|██████████| 3294/3294 [00:00<00:00, 76538.90 examples/s]
Filter: 100%|██████████| 2185/2185 [00:00<00:00, 36506.79 examples/s]
Filter: 100%|██████████| 2185/2185 [00:00<00:00, 37984.80 examples/s]
Map: 100%|██████████| 2185/2185 [00:01<00:00, 2001.31 examples/s]
Filter: 100%|██████████| 1109/1109 [00:00<00:00, 36195.21 examples/s]
Filter: 100%|██████████| 1109/1109 [00:00<00:00, 37841.86 examples/s]
Map: 100%|██████████| 1109/1109 [00:00<00:00, 2016.81 examples/s]


TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.545,4.889689,0.2229,0.0479,0.2003,0.2004,13.8224,0.0126,1.0,1.16,12216,10531


TRAINING EPOCH SET 2
TRAINING EPOCHS 1


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.8775,4.741844,0.2416,0.0562,0.215,0.2151,13.9648,0.0155,1.0,1.1593,12209,10531


TRAINING EPOCH SET 3
TRAINING EPOCHS 1


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.6225,4.691846,0.2558,0.0639,0.2264,0.2263,13.7295,0.0182,1.0,1.1332,11934,10531


TRAINING EPOCH SET 4
TRAINING EPOCHS 1


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.4249,4.700852,0.2579,0.0642,0.2276,0.2276,13.8521,0.0182,1.0,1.1453,12061,10531


TRAINING EPOCH SET 8
TRAINING EPOCHS 4


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.2425,4.697736,0.2566,0.0629,0.2263,0.2261,13.8909,0.0174,1.0,1.148,12090,10531
2,4.5525,4.58403,0.2654,0.0691,0.2359,0.2359,13.716,0.0195,1.0,1.1216,11812,10531
3,4.4471,4.562257,0.2679,0.0679,0.2358,0.2356,13.9729,0.0188,1.0,1.1545,12158,10531
4,4.3916,4.559991,0.2695,0.0684,0.2373,0.2373,13.9531,0.0193,1.0,1.1474,12083,10531


TRAINING EPOCH SET 10
TRAINING EPOCHS 2


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9189,4.721393,0.2581,0.0652,0.2275,0.2275,13.8729,0.0203,1.0,1.1374,11978,10531
2,4.1556,4.607701,0.2691,0.068,0.2345,0.2347,13.8611,0.0196,1.0,1.1384,11988,10531


TRAINING EPOCH SET 16
TRAINING EPOCHS 6


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6743,4.869548,0.2517,0.0611,0.2221,0.2222,13.7024,0.0187,1.0,1.1197,11792,10531
2,3.9456,4.618675,0.2607,0.065,0.229,0.2289,13.771,0.02,1.0,1.1217,11813,10531
3,4.0817,4.594588,0.265,0.0656,0.2314,0.2312,13.9594,0.0193,1.0,1.1503,12114,10531
4,4.093,4.577476,0.2674,0.0693,0.2351,0.2349,14.0667,0.0205,1.0,1.154,12153,10531
5,4.0811,4.581172,0.2687,0.0692,0.2366,0.2364,13.9883,0.0208,1.0,1.1453,12061,10531
6,4.0473,4.583351,0.2687,0.0692,0.2365,0.2363,13.9531,0.0204,1.0,1.1422,12028,10531


Fold 2


Filter: 100%|██████████| 3294/3294 [00:00<00:00, 65420.56 examples/s]
Filter: 100%|██████████| 3294/3294 [00:00<00:00, 75922.72 examples/s]
Filter: 100%|██████████| 2202/2202 [00:00<00:00, 36442.85 examples/s]
Filter: 100%|██████████| 2202/2202 [00:00<00:00, 37632.40 examples/s]
Map: 100%|██████████| 2202/2202 [00:01<00:00, 1995.81 examples/s]
Filter: 100%|██████████| 1092/1092 [00:00<00:00, 35568.69 examples/s]
Filter: 100%|██████████| 1092/1092 [00:00<00:00, 36665.79 examples/s]
Map: 100%|██████████| 1092/1092 [00:00<00:00, 2019.57 examples/s]


TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.5214,4.946936,0.2198,0.0487,0.1974,0.1974,13.5989,0.0184,1.0,1.1116,11695,10521


TRAINING EPOCH SET 2
TRAINING EPOCHS 1


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.8516,4.797902,0.2413,0.0581,0.2162,0.2162,13.8663,0.0206,1.0,1.1267,11854,10521


TRAINING EPOCH SET 3
TRAINING EPOCHS 1


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.5923,4.748438,0.2537,0.0638,0.224,0.2244,13.794,0.0251,1.0,1.1155,11736,10521


TRAINING EPOCH SET 4
TRAINING EPOCHS 1


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.3938,4.753319,0.2554,0.0657,0.2243,0.2248,13.8242,0.0237,1.0,1.116,11741,10521


TRAINING EPOCH SET 8
TRAINING EPOCHS 4
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.2115,4.740356,0.2519,0.0633,0.2224,0.2226,13.7015,0.0201,1.0,1.0966,11537,10521
2,4.5252,4.626261,0.2639,0.0692,0.2316,0.2318,13.8049,0.0244,1.0,1.118,11763,10521
3,4.4218,4.608314,0.2614,0.0689,0.2304,0.2309,13.9679,0.0231,1.0,1.1311,11900,10521
4,4.3627,4.606149,0.2639,0.0697,0.2321,0.2325,13.9231,0.0223,1.0,1.1237,11822,10521


TRAINING EPOCH SET 10
TRAINING EPOCHS 2


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.8883,4.763933,0.2512,0.0606,0.2199,0.2203,13.9423,0.0202,1.0,1.1204,11788,10521
2,4.1306,4.648181,0.2589,0.065,0.2262,0.2265,13.7299,0.0217,1.0,1.1045,11620,10521


TRAINING EPOCH SET 16
TRAINING EPOCHS 6
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6437,4.894735,0.2419,0.0558,0.2128,0.213,13.9231,0.0189,1.0,1.1146,11727,10521
2,3.9212,4.660182,0.2626,0.0652,0.2279,0.2281,13.7445,0.0205,1.0,1.1079,11656,10521
3,4.0536,4.639095,0.2571,0.0637,0.224,0.2244,14.0174,0.021,1.0,1.1314,11903,10521
4,4.0644,4.623759,0.2603,0.0651,0.2267,0.227,13.8745,0.0216,1.0,1.1188,11771,10521
5,4.06,4.620718,0.2607,0.0642,0.2253,0.2256,13.8571,0.0209,1.0,1.1161,11743,10521
6,4.0369,4.624347,0.2605,0.0641,0.2259,0.2263,13.8874,0.0216,1.0,1.1199,11782,10521


In [5]:
########## SAVE THE FILE

# Persist the cross-validation frame so the cells below can run without retraining.
cv_results_path = 'reports/results/mined_conala.pickle'
with open(cv_results_path, 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
########## LOAD CV RESULTS

# Re-imported here so that this cell also works when it is the first one executed.
import pickle

# Unpickling runs arbitrary code: only load files produced by this notebook.
cv_results_path = 'reports/results/mined_conala.pickle'
with open(cv_results_path, 'rb') as handle:
    cv_df = pickle.load(handle)

In [7]:
########## ROUGE PER SETTING

# Per-example ROUGE-1 pooled over all folds, summarised per epoch budget.
rouge_by_epoch_set = cv_df.groupby("epoch_set")["rouge"]

print("Mean")
print(rouge_by_epoch_set.mean())

print("STD")
print(rouge_by_epoch_set.std())


Mean
epoch_set
0     0.112806
1     0.221793
2     0.242544
3     0.258548
4     0.258927
8     0.270442
10    0.266978
16    0.270702
Name: rouge, dtype: float64
STD
epoch_set
0     0.124702
1     0.155665
2     0.157290
3     0.157945
4     0.156677
8     0.158442
10    0.159420
16    0.159584
Name: rouge, dtype: float64


### Step 2. Learn performance

In [8]:
def step_two(X_train, y_train, model, X_val=None, y_val=None,  save=False): 
    """Fit a performance regressor and either evaluate it or pickle it.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed unchanged to the regressor's ``fit``.
    model : str
        Regressor to use: ``"lr"`` (LinearRegression), ``"svm"`` (SVR),
        ``"rf"`` (RandomForestRegressor), ``"lgbm"`` (LGBMRegressor) or
        ``"catboost"`` (CatBoostRegressor). All are built with default settings.
    X_val, y_val
        Validation features and targets; required when ``save`` is False.
    save : bool
        When True, the fitted regressor is pickled and no evaluation takes place.

    Returns
    -------
    str
        ``./models/reg_{model}_mined.pkl`` when ``save`` is True.
    dict
        ``{"pred", "mae", "rmse"}`` when ``save`` is False; predictions are clipped
        at 0 from below before the errors are computed.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names, or if ``save`` is False and
        ``X_val`` / ``y_val`` is missing.
    """
    # Fail before the (possibly slow) fit when evaluation cannot take place.
    if not save and (X_val is None or y_val is None):
        raise ValueError("X_val and y_val are required when save=False")

    if model == "lr":
        reg = LinearRegression()
    elif model == "svm":
        reg = SVR()
    elif model == "rf":
        # The estimator has to be instantiated; calling fit on the class itself fails.
        reg = RandomForestRegressor()
    elif model == "lgbm":
        reg = LGBMRegressor()
    elif model == "catboost":
        reg = CatBoostRegressor()
    else:
        raise ValueError(
            f"Unknown model {model!r}; expected one of 'lr', 'svm', 'rf', 'lgbm', 'catboost'"
        )

    reg.fit(X_train, y_train)

    if save:
        model_path = f'./models/reg_{model}_mined.pkl'
        # A fresh checkout may not have the target directory yet.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        with open(model_path, 'wb') as f:
            pickle.dump(reg, f)
        return model_path

    y_pred = reg.predict(X_val)
    # Negative predictions are replaced by 0.
    y_pred[y_pred<0] = 0
    mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
    rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
    return {"pred": y_pred, "mae": mae, "rmse": rmse}

In [9]:
# Leave-one-fold-out evaluation of the performance regressors: for every fold of the
# fine-tuning CV, fit on the rows of the other folds and predict ROUGE for this fold.
t_models = ["lr", "svm", "lgbm", "catboost"]

# results[test_fold][regressor name] -> {"pred", "mae", "rmse"} as returned by step_two
results = {}

# The one-hot layout of epoch_set is fixed once from the whole frame, so train and
# validation matrices always have the same columns in the same order, even if some
# epoch budget is absent from one side.
epoch_dtype = pd.CategoricalDtype(categories=sorted(cv_df["epoch_set"].unique()))


def _epoch_onehot(epoch_values):
    """Return a sparse CSR one-hot matrix with one column per known epoch budget."""
    return pd.get_dummies(epoch_values.astype(epoch_dtype), sparse=True).sparse.to_coo().tocsr()


for test_fold in range(cv_df.fold.max()+1):
    print(test_fold)

    is_val = cv_df.fold == test_fold
    is_train = cv_df.fold != test_fold

    # Prepare the input data: [one-hot epoch_set | TF-IDF of the input text].
    # The vectorizer is fitted on the training rows only.
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(cv_df.loc[is_train, "input_sequence"])
    X_train_column_sparse = _epoch_onehot(cv_df.loc[is_train, "epoch_set"])
    X_train = hstack([X_train_column_sparse, X_train_tfidf])
    y_train = cv_df.loc[is_train, "rouge"]

    X_val_tfidf = vectorizer.transform(cv_df.loc[is_val, "input_sequence"])
    X_val_column_sparse = _epoch_onehot(cv_df.loc[is_val, "epoch_set"])
    X_val = hstack([X_val_column_sparse, X_val_tfidf])
    y_val = cv_df.loc[is_val, "rouge"]

    results[test_fold] = {}
    # `reg_name` rather than `model`, so the fine-tuned seq2seq `model` is not overwritten.
    for reg_name in t_models:
        print(reg_name)
        preds_df = step_two(X_train=X_train,
                            y_train=y_train,
                            X_val=X_val,
                            y_val=y_val,
                            model=reg_name)
        # Out-of-fold predictions are written back next to the true ROUGE values.
        cv_df.loc[is_val, f"{reg_name}_perf_hat"] = preds_df["pred"]
        results[test_fold][reg_name] = preds_df

cv_df = cv_df.reset_index(drop=True)

0
lr


svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007170 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5941
[LightGBM] [Info] Number of data points in the train set: 17608, number of used features: 530
[LightGBM] [Info] Start training from score 0.235466
catboost
Learning rate set to 0.064417
0:	learn: 0.1605298	total: 68.5ms	remaining: 1m 8s
1:	learn: 0.1596492	total: 78.4ms	remaining: 39.1s
2:	learn: 0.1588408	total: 87.7ms	remaining: 29.2s
3:	learn: 0.1581226	total: 97.4ms	remaining: 24.2s
4:	learn: 0.1575047	total: 107ms	remaining: 21.4s
5:	learn: 0.1569669	total: 117ms	remaining: 19.3s
6:	learn: 0.1564099	total: 126ms	remaining: 17.9s
7:	learn: 0.1559386	total: 135ms	remaining: 16.7s
8:	learn: 0.1555001	total: 143ms	remaining: 15.8s
9:	learn: 0.1550713	total: 152ms	remaining: 15.1s
10:	learn: 0.1547163	total: 162ms	remaini

In [10]:
# Mean out-of-fold LightGBM prediction per epoch budget.
cv_df.groupby("epoch_set")["lgbm_perf_hat"].mean()

epoch_set
0     0.116649
1     0.226129
2     0.247001
3     0.258544
4     0.258544
8     0.265850
10    0.263301
16    0.266283
Name: lgbm_perf_hat, dtype: float64

In [11]:
# Spread of the out-of-fold CatBoost predictions per epoch budget.
cv_df.groupby("epoch_set")["catboost_perf_hat"].std()

epoch_set
0     0.047751
1     0.052327
2     0.053332
3     0.053542
4     0.053498
8     0.053455
10    0.053488
16    0.053415
Name: catboost_perf_hat, dtype: float64

In [12]:
# Regroup the per-fold errors by regressor and average them over the folds.
# model_results[name] -> {"rmse": [...], "mae": [...], "rmse_avg": ..., "mae_avg": ...}

model_results = {}

# The folds are taken from `results` itself, so this cell follows whatever number of
# splits the cross-validation used.
evaluated_folds = sorted(results)

for reg_name in t_models:
    rmse_per_fold = [results[fold][reg_name]["rmse"] for fold in evaluated_folds]
    mae_per_fold = [results[fold][reg_name]["mae"] for fold in evaluated_folds]

    model_results[reg_name] = {
        "rmse": rmse_per_fold,
        "mae": mae_per_fold,
        "rmse_avg": np.array(rmse_per_fold).mean(),
        "mae_avg": np.array(mae_per_fold).mean(),
    }

for reg_name in t_models:
    print(reg_name)
    print("RMSE ", model_results[reg_name]["rmse_avg"])
    print("MAE ", model_results[reg_name]["mae_avg"])
    print("\n")

# Store the CV frame together with the out-of-fold predictions of every regressor.
with open('reports/results/cd_df_mined_with_predictions.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

lr
RMSE  0.22740389156049523
MAE  0.1703610852394782


svm
RMSE  0.160277467576772
MAE  0.1284488477929678


lgbm
RMSE  0.1586082129263878
MAE  0.12716157211558574


catboost
RMSE  0.15633045155763117
MAE  0.12554363072872834




In [13]:
# Unpickling runs arbitrary code: only load files produced by this notebook.
with open('reports/results/cd_df_mined_with_predictions.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

# TRAIN ON ALL PREDICTIONS AT ONCE

t_models = ["lr", "svm", "lgbm", "catboost"]

# Prepare the input data: [one-hot epoch_set | TF-IDF of the input text] over all rows.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(cv_df.loc[:, "input_sequence"])
X_train_column_sparse = pd.get_dummies(cv_df.loc[:, "epoch_set"], sparse=True).sparse.to_coo().tocsr()
X_train = hstack([X_train_column_sparse, X_train_tfidf])
y_train = cv_df.loc[:, "rouge"]

# The target directory may not exist on a fresh checkout.
os.makedirs("./models", exist_ok=True)

# NOTE(review): only the TF-IDF vectorizer is persisted here; the column layout of the
# epoch_set one-hot block is not stored and has to be rebuilt identically at inference.
with open("./models/mined_vectorizer.pkl", "wb") as file:
    pickle.dump(vectorizer, file, protocol=pickle.HIGHEST_PROTOCOL)

# With save=True, step_two returns the path of the pickled regressor.
# `reg_name` is used so the fine-tuned seq2seq `model` is not overwritten.
for reg_name in t_models:
    print(reg_name)
    saved_model_path = step_two(X_train=X_train,
                                y_train=y_train,
                                model=reg_name,
                                save=True)

lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009457 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9059
[LightGBM] [Info] Number of data points in the train set: 26352, number of used features: 732
[LightGBM] [Info] Start training from score 0.237842
catboost
Learning rate set to 0.068654
0:	learn: 0.1608050	total: 13.1ms	remaining: 13.1s
1:	learn: 0.1599369	total: 23.2ms	remaining: 11.6s
2:	learn: 0.1591390	total: 32.8ms	remaining: 10.9s
3:	learn: 0.1584232	total: 41.6ms	remaining: 10.4s
4:	learn: 0.1578024	total: 51.2ms	remaining: 10.2s
5:	learn: 0.1572559	total: 59.9ms	remaining: 9.92s
6:	learn: 0.1567367	total: 69.1ms	remaining: 9.8s
7:	learn: 0.1562645	total: 78ms	remaining: 9.67s
8:	learn: 0.1558158	total: 87.5ms	remaining: 9.64s
9:	learn: 0.1554294	total: 96.2ms	remaining: 9.52s
10:	learn: 0.1550948	total: 105ms	r