### 1. Settings

In [14]:
#####################################
##########  DEPENDECIES ############
#####################################

import os
import pickle
import numpy as np
from tqdm import tqdm # type: ignore
import pandas as pd
import copy

from datasets import load_dataset, DatasetDict, Dataset
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import KFold # type: ignore
import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.sparse import hstack

tqdm.pandas()

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import torch
#####################################
############  CONSTANTS #############
#####################################
RS = 42

MODEL = "CodeT5"
TRAIN_N = 330
BATCH_SIZE = 15
DECODER_LENGTH = 20
ENCODER_LENGTH = 15

FULL_TRAIN_ARGS = {
    "TRAIN_N": TRAIN_N,
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "MODEL": MODEL,
    "SEQ_TRAINER_ARGS": {
        "overwrite_output_dir": True,
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 5e-5,
        "warmup_steps": 100,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
    },
}
FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["output_dir"] = f'reports/results'
FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["logging_dir"] = f'reports/logs'

model_name="Salesforce/codet5-base-multi-sum"
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



### 2. Conala data. Preprocessing. Sampling as in the paper (further, random sampling)

In [15]:
dataset = pd.read_csv(f"../data/processed/conala/20240327/conala_clustered.csv")
dataset = dataset.drop("time_batch", axis=1)

test_4_examples = dataset[dataset["cluster"]==4].sample(frac=0.85, random_state=RS)
print("Cluster 4 obsevations: ", test_4_examples.shape)
test_non4_examples = dataset[dataset["cluster"]!=4].sample(n=156, random_state=RS)
print("Cluster not 4 obsevations: ", test_non4_examples.shape)

test_dataset = pd.concat([test_4_examples, test_non4_examples])
train_dataset = dataset[~dataset.index.isin(test_dataset.index)]
print("Train Data: ", train_dataset.shape)
print("Test Data: ", test_dataset.shape)

train_dataset = Dataset.from_pandas(train_dataset.sample(frac=1, random_state=RS).reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_dataset.sample(frac=1, random_state=RS).reset_index(drop=True))

# Cross Validation
folds = KFold(n_splits=3, random_state=RS, shuffle=True)
questions_list = np.array(list(set(train_dataset["question_id"])))
splits_obj = folds.split(questions_list)
splits = []
for i, (train_idxs, val_idxs) in enumerate(splits_obj):
    print(f"Fold {i}")
    splits.append([train_idxs, val_idxs])

test_data = pr.preprocess_dataset(test_dataset, tokenizer=tokenizer)
test_df = pd.DataFrame(test_data)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rouge = evaluate.load('rouge')

Cluster 4 obsevations:  (344, 6)
Cluster not 4 obsevations:  (156, 6)
Train Data:  (2379, 6)
Test Data:  (500, 6)
Fold 0
Fold 1
Fold 2


Filter: 100%|██████████| 500/500 [00:00<00:00, 87549.14 examples/s]
Filter: 100%|██████████| 500/500 [00:00<00:00, 45340.89 examples/s]
Map: 100%|██████████| 499/499 [00:00<00:00, 2094.76 examples/s]


In [16]:
train_dataset

Dataset({
    features: ['question_id', 'intent', 'rewritten_intent', 'snippet', 'idx', 'cluster'],
    num_rows: 2379
})

In [17]:
test_dataset


Dataset({
    features: ['question_id', 'intent', 'rewritten_intent', 'snippet', 'idx', 'cluster'],
    num_rows: 500
})

In [19]:
fold_results = {}

cluster_models = [[1,2], [3,4], [5,6], [7,8], [4], [5], [1]]

for cluster_i, cluster_set in enumerate(cluster_models):
    fold_results[cluster_i] = {}
    
for i, (train_idxs, val_idxs) in enumerate(splits):
    latest_run_epoch = 0

    for cluster_i, cluster_set in enumerate(cluster_models):
        print(f"TRAINING EPOCH SET {cluster_set}")

        TRAIN_ARGS = copy.deepcopy(FULL_TRAIN_ARGS)
        FOLD_MODEL_PATH = "./tmp_cluster/"

        TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = 2
        
        print(f'TRAINING EPOCHS {TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"]}')

        print(f"Fold {i}")
        fold_dataset = DatasetDict({
            "train": train_dataset.filter(lambda q_id: (q_id["question_id"] in questions_list[train_idxs]) and (q_id["cluster"] in cluster_set)),
            "validation": train_dataset.filter(lambda q_id: q_id["question_id"] in questions_list[val_idxs]),
        })
        fold_train = pr.preprocess_dataset(fold_dataset["train"], tokenizer=tokenizer)
        fold_val = pr.preprocess_dataset(fold_dataset["validation"], tokenizer=tokenizer)
        fold_df = pd.DataFrame(fold_val)


        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        print(f"LOADING MODEL {model_name}")

        print(device)
        model.to(device)

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        compute_metrics = ev.compute_metric_with_params(tokenizer) 

        if not os.path.exists(f'reports_cluster/'): 
            os.mkdir(f'reports_cluster/')

        training_args = Seq2SeqTrainingArguments(
                **TRAIN_ARGS["SEQ_TRAINER_ARGS"],
            )
        
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=fold_train,
            eval_dataset=fold_val,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        text = fold_val["input_sequence"]
        summaries = infer.generate_summary(text, model, tokenizer, TRAIN_ARGS["ENCODER_LENGTH"], TRAIN_ARGS["DECODER_LENGTH"])
        
        fold_df["prediction"] = summaries[1]
        fold_df["rouge"] = rouge.compute(predictions=fold_df["prediction"], 
                    references=fold_df["output_sequence"],
                    use_stemmer=True, 
                    use_aggregator=False,
                    rouge_types=["rouge1"])["rouge1"]
        
        fold_results[cluster_i][i] = fold_df

        print("FOLDS IN RESULTS ", fold_results[cluster_i].keys())
        
        ########## SAVE FOLD MODEL
        if not os.path.exists(FOLD_MODEL_PATH): 
            os.mkdir(FOLD_MODEL_PATH)

        trainer.save_model(FOLD_MODEL_PATH)


TRAINING EPOCH SET [1, 2]
TRAINING EPOCHS 2
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 82214.15 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 93499.34 examples/s]
Filter: 100%|██████████| 266/266 [00:00<00:00, 37560.09 examples/s]
Filter: 100%|██████████| 266/266 [00:00<00:00, 41172.22 examples/s]
Map: 100%|██████████| 266/266 [00:00<00:00, 2061.67 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 57035.74 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 56843.22 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2167.57 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.5671,3.78844,0.3706,0.1154,0.3367,0.3378,10.6783,0.0477,0.5849,0.6509,6676,10256
2,3.3363,3.52631,0.4226,0.1633,0.3809,0.3814,13.7359,0.1378,0.8716,0.8792,9017,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET [3, 4]
TRAINING EPOCHS 2
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 74666.26 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 95049.95 examples/s]
Filter: 100%|██████████| 202/202 [00:00<00:00, 31532.60 examples/s]
Filter: 100%|██████████| 202/202 [00:00<00:00, 34060.28 examples/s]
Map: 100%|██████████| 202/202 [00:00<00:00, 1939.87 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 50809.60 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 57326.48 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2136.68 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.83,4.145144,0.3355,0.0889,0.3066,0.3067,9.2904,0.0237,0.422,0.5369,5506,10256
2,3.7498,3.633107,0.398,0.1348,0.3541,0.3536,12.6308,0.0958,0.767,0.7904,8106,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET [5, 6]
TRAINING EPOCHS 2
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 76133.23 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 94242.00 examples/s]
Filter: 100%|██████████| 220/220 [00:00<00:00, 33880.92 examples/s]
Filter: 100%|██████████| 220/220 [00:00<00:00, 36290.04 examples/s]
Map: 100%|██████████| 220/220 [00:00<00:00, 2034.40 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 52044.49 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 55260.10 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2141.63 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.6949,4.06932,0.3488,0.1012,0.3174,0.3177,9.1101,0.0265,0.4077,0.5271,5406,10256
2,3.6042,3.600139,0.4046,0.1513,0.3674,0.3679,12.2954,0.1173,0.7408,0.7692,7889,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET [7, 8]
TRAINING EPOCHS 2
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 58479.90 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 93677.53 examples/s]
Filter: 100%|██████████| 602/602 [00:00<00:00, 45974.60 examples/s]
Filter: 100%|██████████| 602/602 [00:00<00:00, 51430.31 examples/s]
Map: 100%|██████████| 602/602 [00:00<00:00, 2141.00 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 52174.13 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 55392.54 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2184.54 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.0059,3.449053,0.42,0.1729,0.3849,0.385,14.3955,0.1558,0.9051,0.9093,9326,10256
2,3.0676,3.390378,0.4358,0.1827,0.3988,0.3988,14.164,0.182,0.9123,0.916,9394,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET [4]
TRAINING EPOCHS 2
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 76105.36 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 95754.12 examples/s]
Filter: 100%|██████████| 34/34 [00:00<00:00, 10500.43 examples/s]
Filter: 100%|██████████| 34/34 [00:00<00:00, 11638.48 examples/s]
Map: 100%|██████████| 34/34 [00:00<00:00, 1708.70 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 51671.35 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 55956.74 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2155.19 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.3205,5.028266,0.3105,0.0741,0.2749,0.2762,10.0939,0.0193,0.5178,0.6031,6185,10256
2,5.0645,4.898468,0.308,0.0765,0.2749,0.2763,9.9362,0.0201,0.4972,0.5886,6037,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET [5]
TRAINING EPOCHS 2
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 81947.45 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 95238.66 examples/s]
Filter: 100%|██████████| 109/109 [00:00<00:00, 24349.12 examples/s]
Filter: 100%|██████████| 109/109 [00:00<00:00, 25088.03 examples/s]
Map: 100%|██████████| 109/109 [00:00<00:00, 1835.17 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 52102.75 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 54917.80 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2150.16 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.1092,4.679547,0.3301,0.0812,0.2945,0.2966,9.8924,0.0203,0.4873,0.5818,5967,10256
2,4.3499,4.150532,0.3492,0.1005,0.3142,0.3147,8.9474,0.0252,0.3891,0.5144,5276,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET [1]
TRAINING EPOCHS 2
Fold 0


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 77103.92 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 95372.47 examples/s]
Filter: 100%|██████████| 198/198 [00:00<00:00, 34945.18 examples/s]
Filter: 100%|██████████| 198/198 [00:00<00:00, 35723.84 examples/s]
Map: 100%|██████████| 198/198 [00:00<00:00, 2013.26 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 50432.64 examples/s]
Filter: 100%|██████████| 799/799 [00:00<00:00, 55069.41 examples/s]
Map: 100%|██████████| 799/799 [00:00<00:00, 2101.72 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.716,4.101206,0.3412,0.0959,0.307,0.3071,10.2791,0.0267,0.5488,0.625,6410,10256
2,3.4726,3.589227,0.4011,0.1519,0.3615,0.3609,14.4768,0.1458,0.9562,0.9571,9816,10256


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0])
TRAINING EPOCH SET [1, 2]
TRAINING EPOCHS 2
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 76926.18 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 96438.95 examples/s]
Filter: 100%|██████████| 279/279 [00:00<00:00, 39828.83 examples/s]
Filter: 100%|██████████| 279/279 [00:00<00:00, 42465.10 examples/s]
Map: 100%|██████████| 279/279 [00:00<00:00, 2071.10 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 50478.42 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 54218.76 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2120.48 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.5365,3.823821,0.3708,0.1135,0.3342,0.3348,11.0169,0.0587,0.6342,0.6871,6758,9836
2,3.3432,3.58173,0.4216,0.1685,0.3821,0.3822,13.3888,0.147,0.8587,0.8678,8536,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET [3, 4]
TRAINING EPOCHS 2
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 77608.26 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 95375.20 examples/s]
Filter: 100%|██████████| 238/238 [00:00<00:00, 34896.33 examples/s]
Filter: 100%|██████████| 238/238 [00:00<00:00, 35746.06 examples/s]
Map: 100%|██████████| 238/238 [00:00<00:00, 1969.82 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 49387.82 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 53775.82 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2094.73 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.7906,4.022253,0.3525,0.1005,0.316,0.3172,9.8088,0.0292,0.4809,0.5774,5679,9836
2,3.6171,3.659619,0.3997,0.1374,0.3554,0.3559,12.7659,0.0975,0.7926,0.8114,7981,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET [5, 6]
TRAINING EPOCHS 2
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 77071.16 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 95929.95 examples/s]
Filter: 100%|██████████| 224/224 [00:00<00:00, 35140.79 examples/s]
Filter: 100%|██████████| 224/224 [00:00<00:00, 36685.83 examples/s]
Map: 100%|██████████| 224/224 [00:00<00:00, 2086.65 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 51056.94 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 54423.69 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2096.16 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.4725,4.1221,0.3439,0.0979,0.31,0.3109,9.1495,0.0245,0.4188,0.5347,5259,9836
2,3.3622,3.66198,0.4075,0.153,0.3683,0.369,12.606,0.1258,0.7681,0.7913,7783,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET [7, 8]
TRAINING EPOCHS 2
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 78314.22 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 69505.78 examples/s]
Filter: 100%|██████████| 600/600 [00:00<00:00, 45687.10 examples/s]
Filter: 100%|██████████| 600/600 [00:00<00:00, 50311.52 examples/s]
Map: 100%|██████████| 600/600 [00:00<00:00, 2070.66 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 48794.59 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 53634.53 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2106.36 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9397,3.505379,0.4258,0.1778,0.3889,0.3901,13.7282,0.1724,0.8775,0.8844,8699,9836
2,3.0187,3.455814,0.4273,0.1786,0.3896,0.3899,13.827,0.1732,0.8834,0.8897,8751,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET [4]
TRAINING EPOCHS 2
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 76094.91 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 94963.11 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 11137.29 examples/s]
Filter: 100%|██████████| 40/40 [00:00<00:00, 13400.33 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 1739.96 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 48660.61 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 54571.01 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2094.76 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.4504,5.03388,0.3237,0.0842,0.284,0.2846,10.0663,0.0195,0.5178,0.6031,5932,9836
2,5.2057,4.882424,0.3255,0.0842,0.2856,0.2865,10.1899,0.0199,0.5307,0.6121,6021,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET [5]
TRAINING EPOCHS 2
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 74874.68 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 93361.11 examples/s]
Filter: 100%|██████████| 125/125 [00:00<00:00, 24191.95 examples/s]
Filter: 100%|██████████| 125/125 [00:00<00:00, 29213.13 examples/s]
Map: 100%|██████████| 125/125 [00:00<00:00, 2094.33 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 50716.54 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 55867.87 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2145.24 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.6858,4.667909,0.3393,0.0871,0.2996,0.3006,9.7191,0.0206,0.477,0.5746,5652,9836
2,3.832,4.113112,0.35,0.107,0.3154,0.3165,9.8947,0.0317,0.5,0.5906,5809,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET [1]
TRAINING EPOCHS 2
Fold 1


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 79327.18 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 95366.09 examples/s]
Filter: 100%|██████████| 199/199 [00:00<00:00, 31055.05 examples/s]
Filter: 100%|██████████| 199/199 [00:00<00:00, 34853.29 examples/s]
Map: 100%|██████████| 199/199 [00:00<00:00, 2037.32 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 48135.56 examples/s]
Filter: 100%|██████████| 769/769 [00:00<00:00, 54373.23 examples/s]
Map: 100%|██████████| 769/769 [00:00<00:00, 2107.98 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.7519,4.149573,0.3503,0.0985,0.3111,0.3115,10.0416,0.0278,0.5236,0.6072,5972,9836
2,3.4883,3.654356,0.402,0.1449,0.365,0.3651,12.3693,0.1075,0.7512,0.7776,7648,9836


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1])
TRAINING EPOCH SET [1, 2]
TRAINING EPOCHS 2
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 81440.47 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 95528.60 examples/s]
Filter: 100%|██████████| 245/245 [00:00<00:00, 35525.29 examples/s]
Filter: 100%|██████████| 245/245 [00:00<00:00, 38429.49 examples/s]
Map: 100%|██████████| 245/245 [00:00<00:00, 2044.84 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 50569.09 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 55412.07 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2169.12 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.5902,3.862168,0.3781,0.1189,0.3402,0.3398,10.524,0.0373,0.5937,0.6573,6716,10218
2,3.4085,3.535324,0.4208,0.1621,0.3767,0.3765,13.9864,0.1452,0.9136,0.9171,9371,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET [3, 4]
TRAINING EPOCHS 2
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 75910.24 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 94509.79 examples/s]
Filter: 100%|██████████| 234/234 [00:00<00:00, 36205.81 examples/s]
Filter: 100%|██████████| 234/234 [00:00<00:00, 38192.35 examples/s]
Map: 100%|██████████| 234/234 [00:00<00:00, 1958.47 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 51077.08 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 54471.48 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2096.32 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.855,4.00618,0.3518,0.1046,0.3181,0.3182,9.455,0.0281,0.4477,0.5544,5665,10218
2,3.6567,3.587981,0.4077,0.1489,0.3681,0.3685,11.7891,0.1022,0.7022,0.7388,7549,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET [5, 6]
TRAINING EPOCHS 2
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 75813.92 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 95272.30 examples/s]
Filter: 100%|██████████| 222/222 [00:00<00:00, 34277.03 examples/s]
Filter: 100%|██████████| 222/222 [00:00<00:00, 37550.33 examples/s]
Map: 100%|██████████| 222/222 [00:00<00:00, 2086.84 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 50366.18 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 54947.51 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2151.77 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.6102,4.065082,0.3441,0.101,0.3142,0.314,9.3576,0.029,0.4529,0.558,5702,10218
2,3.4892,3.63864,0.3955,0.1474,0.3596,0.3598,11.7941,0.1177,0.714,0.748,7643,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET [7, 8]
TRAINING EPOCHS 2
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 76689.69 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 96778.49 examples/s]
Filter: 100%|██████████| 560/560 [00:00<00:00, 45662.94 examples/s]
Filter: 100%|██████████| 560/560 [00:00<00:00, 52236.41 examples/s]
Map: 100%|██████████| 560/560 [00:00<00:00, 2144.49 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 52278.12 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 57330.33 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2182.59 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.0013,3.44678,0.4301,0.1798,0.3915,0.3914,13.7978,0.1752,0.9053,0.9095,9293,10218
2,3.0783,3.382052,0.4384,0.1895,0.3988,0.3984,14.0271,0.1818,0.9202,0.9232,9433,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET [4]
TRAINING EPOCHS 2
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 79721.40 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 95557.88 examples/s]
Filter: 100%|██████████| 48/48 [00:00<00:00, 12428.33 examples/s]
Filter: 100%|██████████| 48/48 [00:00<00:00, 14370.21 examples/s]
Map: 100%|██████████| 48/48 [00:00<00:00, 1810.21 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 49864.85 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 53959.08 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2068.38 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.3251,4.953931,0.3111,0.0779,0.2748,0.276,10.1319,0.0219,0.544,0.6216,6351,10218
2,5.1004,4.755752,0.3104,0.0782,0.2773,0.2782,10.2022,0.0208,0.5463,0.6232,6368,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET [5]
TRAINING EPOCHS 2
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 78223.97 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 95582.59 examples/s]
Filter: 100%|██████████| 122/122 [00:00<00:00, 24923.53 examples/s]
Filter: 100%|██████████| 122/122 [00:00<00:00, 27652.26 examples/s]
Map: 100%|██████████| 122/122 [00:00<00:00, 2009.59 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 48963.33 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 54821.76 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2078.16 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.9335,4.617236,0.3306,0.0866,0.2961,0.2965,9.434,0.0233,0.4564,0.5604,5726,10218
2,3.9699,4.101176,0.3525,0.1182,0.3181,0.3182,10.1319,0.039,0.5151,0.6012,6143,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])
TRAINING EPOCH SET [1]
TRAINING EPOCHS 2
Fold 2


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 78564.56 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 95473.76 examples/s]
Filter: 100%|██████████| 187/187 [00:00<00:00, 31360.85 examples/s]
Filter: 100%|██████████| 187/187 [00:00<00:00, 33088.71 examples/s]
Map: 100%|██████████| 187/187 [00:00<00:00, 1978.69 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 49371.25 examples/s]
Filter: 100%|██████████| 811/811 [00:00<00:00, 54690.43 examples/s]
Map: 100%|██████████| 811/811 [00:00<00:00, 2085.85 examples/s]


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.7558,4.150366,0.3527,0.1062,0.317,0.3171,10.1122,0.0271,0.5442,0.6217,6353,10218
2,3.5399,3.627105,0.4007,0.146,0.3597,0.3599,12.7571,0.1091,0.794,0.8126,8303,10218


  return dynamo.is_compiling()


FOLDS IN RESULTS  dict_keys([0, 1, 2])


In [20]:
########## CONVERT TO DATAFRAME

for cluster_i, (cluster_set) in enumerate(fold_results.keys()): 
    
    for i, (k, f_df) in enumerate(fold_results[cluster_i].items()): 
        
        f_df['fold'] = k
        f_df['cluster_set'] = cluster_i

        if (cluster_i==0 and i==0): 
            cv_df = f_df.copy()
        else: 
            cv_df = pd.concat([cv_df, f_df])

In [21]:
########## SAVE THE FILE

with open('cluster_cv_df_check_drift.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
########## ROUGE PER SETTING

print("Mean")
print(cv_df.groupby("cluster_set")["rouge"].mean())

print("STD")
print(cv_df.groupby("cluster_set")["rouge"].std())

Mean
cluster_set
0    0.421810
1    0.400814
2    0.401374
3    0.432673
4    0.312228
5    0.349272
6    0.400410
Name: rouge, dtype: float64
STD
cluster_set
0    0.183458
1    0.182638
2    0.186765
3    0.191407
4    0.174070
5    0.189003
6    0.182253
Name: rouge, dtype: float64
