### 1. Settings

In [5]:
#####################################
##########  DEPENDECIES ############
#####################################

import os
import numpy as np
from tqdm import tqdm # type: ignore
import pandas as pd
import copy

from datasets import load_dataset, DatasetDict
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import KFold # type: ignore
import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

tqdm.pandas()

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
#####################################
############  CONSTANTS #############
#####################################
RS = 42

MODEL = "CodeT5"
TRAIN_N = 330
BATCH_SIZE = 15
DECODER_LENGTH = 20
ENCODER_LENGTH = 15

FULL_TRAIN_ARGS = {
    "TRAIN_N": TRAIN_N,
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "MODEL": MODEL,
    "SEQ_TRAINER_ARGS": {
        "overwrite_output_dir": True,
        "num_train_epochs": [1, 4, 5 , 7],
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 5e-4,
        "warmup_steps": 100,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
    },
}
FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["output_dir"] = f'reports/upper_bound/results'
FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["logging_dir"] = f'reports/upper_bound/logs'

model_name="Salesforce/codet5-base-multi-sum"
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



### 2. Conala data. Preprocessing. Sampling as in the paper (further, random sampling)

In [2]:
dataset = load_dataset("neulab/conala")
# Cross Validation
folds = KFold(n_splits=3, random_state=RS, shuffle=True)
questions_list = np.array(list(set(dataset["train"]["question_id"])))
splits_obj = folds.split(questions_list)
splits = []
for i, (train_idxs, val_idxs) in enumerate(splits_obj):
    print(f"Fold {i}")
    splits.append([train_idxs, val_idxs])

test_data = pr.preprocess_dataset(dataset["test"], tokenizer=tokenizer)
test_df = pd.DataFrame(test_data)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Fold 0
Fold 1
Fold 2


In [3]:
fold_results = {}

for epoch_i, epoch_set in enumerate(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"]):
    TRAIN_ARGS = copy.deepcopy(FULL_TRAIN_ARGS)
    TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"][epoch_i]
    fold_results[epoch_set] = {}
    
    for i, (train_idxs, val_idxs) in enumerate(splits):
        print(f"Fold {i}")
        fold_dataset = DatasetDict({
            "train": dataset["train"].filter(lambda q_id: q_id["question_id"] in questions_list[train_idxs]),
            "validation": dataset["train"].filter(lambda q_id: q_id["question_id"] in questions_list[val_idxs]),
        })
        fold_train = pr.preprocess_dataset(fold_dataset["train"], tokenizer=tokenizer)
        fold_val = pr.preprocess_dataset(fold_dataset["validation"], tokenizer=tokenizer)
        fold_df = pd.DataFrame(fold_val)

        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        print(device)
        model.to(device)

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        compute_metrics = ev.compute_metric_with_params(tokenizer) 

        if not os.path.exists(f'reports/'): 
            os.mkdir(f'reports/')

        training_args = Seq2SeqTrainingArguments(
                **TRAIN_ARGS["SEQ_TRAINER_ARGS"],
            )
        
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=fold_train,
            eval_dataset=fold_val,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        
        text = fold_val["input_sequence"]
        summaries = infer.generate_summary(text, model, tokenizer, TRAIN_ARGS["ENCODER_LENGTH"], TRAIN_ARGS["DECODER_LENGTH"])
        fold_df["prediction"] = summaries[1]
        
        fold_results[epoch_set][i] = fold_df

Fold 0
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6679,3.329507,0.436,0.2058,0.4004,0.3998,13.665,0.1937,0.887,0.893,8734,9781


  return dynamo.is_compiling()


Fold 1




cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6505,3.326596,0.4347,0.1992,0.3928,0.3929,14.5027,0.195,0.9105,0.9143,8611,9418


  return dynamo.is_compiling()


Fold 2




cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.6586,3.312803,0.4453,0.2027,0.4085,0.4089,14.4473,0.2105,0.9338,0.9359,9252,9886


  return dynamo.is_compiling()


Fold 0


Map: 100%|██████████| 1512/1512 [00:00<00:00, 2942.44 examples/s]
Map: 100%|██████████| 788/788 [00:00<00:00, 2999.90 examples/s]


cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.7902,3.650997,0.3819,0.1674,0.3557,0.3551,13.566,0.1644,0.9014,0.9059,8861,9781
2,3.0329,3.53504,0.419,0.1881,0.3823,0.3824,14.2449,0.1816,0.9454,0.9468,9261,9781
3,2.4008,3.53446,0.4288,0.1865,0.3885,0.3879,14.5584,0.1859,0.971,0.9714,9501,9781
4,1.9169,3.624654,0.4395,0.198,0.3992,0.399,14.6764,0.1948,0.9559,0.9569,9359,9781


  return dynamo.is_compiling()


Fold 1




cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.7872,3.623544,0.3849,0.1497,0.3436,0.3441,15.485,0.1579,1.0,1.007,9484,9418
2,3.0246,3.561208,0.394,0.1616,0.3537,0.3535,15.1812,0.1689,0.9775,0.9777,9208,9418
3,2.3953,3.521907,0.4273,0.189,0.3827,0.3827,14.9087,0.188,0.9308,0.9331,8788,9418
4,1.9029,3.606307,0.4311,0.1883,0.3873,0.3874,15.1144,0.1955,0.9532,0.9542,8987,9418


  return dynamo.is_compiling()


Fold 2




cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.7856,3.643505,0.3917,0.1594,0.3555,0.3554,14.3406,0.1744,0.9397,0.9414,9307,9886
2,3.0101,3.481697,0.4177,0.1742,0.3785,0.3793,14.4357,0.1774,0.9191,0.9222,9117,9886
3,2.3819,3.502955,0.4378,0.1933,0.3958,0.3966,15.2802,0.2043,0.9947,0.9947,9834,9886
4,1.8996,3.597759,0.446,0.1937,0.3992,0.3997,15.1401,0.2068,0.9688,0.9692,9582,9886


  return dynamo.is_compiling()


Fold 0


Map: 100%|██████████| 788/788 [00:00<00:00, 2983.88 examples/s]


cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.8,3.682778,0.3922,0.1664,0.3667,0.3665,13.934,0.1619,0.9205,0.9235,9033,9781
2,3.0865,3.563044,0.4167,0.1837,0.3797,0.3792,14.5571,0.1802,0.9635,0.9641,9430,9781
3,2.4959,3.569262,0.4037,0.1731,0.3715,0.3711,14.2195,0.1743,0.9384,0.9402,9196,9781
4,2.028,3.641755,0.4358,0.1979,0.3972,0.3978,14.9112,0.1963,0.9745,0.9748,9535,9781
5,1.7176,3.735392,0.4253,0.1872,0.3836,0.3833,14.9594,0.1912,0.9787,0.9789,9575,9781


  return dynamo.is_compiling()


Fold 1




cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.7899,3.640733,0.3791,0.1474,0.3403,0.34,15.4251,0.1505,1.0,1.0205,9611,9418
2,3.0583,3.578557,0.401,0.1616,0.3576,0.3577,14.4946,0.1666,0.923,0.9258,8719,9418
3,2.4741,3.571582,0.4099,0.1764,0.368,0.368,14.2984,0.1763,0.8929,0.8983,8460,9418
4,2.0221,3.645603,0.4251,0.188,0.3843,0.3841,14.9523,0.1909,0.9222,0.925,8712,9418
5,1.714,3.709553,0.4176,0.1778,0.3738,0.3745,15.4373,0.1865,0.9794,0.9796,9226,9418


  return dynamo.is_compiling()


Fold 2




cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.7934,3.643271,0.3873,0.1577,0.3575,0.3579,13.6247,0.1644,0.8707,0.8784,8684,9886
2,3.0525,3.520022,0.4014,0.1622,0.3649,0.3654,14.2596,0.1715,0.9024,0.9068,8965,9886
3,2.4701,3.554854,0.4213,0.1801,0.3808,0.3809,15.1774,0.2011,0.9941,0.9941,9828,9886
4,2.0077,3.662021,0.4327,0.1887,0.387,0.3871,15.6144,0.1966,1.0,1.0022,9908,9886
5,1.7088,3.735873,0.4348,0.1899,0.3873,0.3874,15.3406,0.2042,0.9915,0.9915,9802,9886


  return dynamo.is_compiling()


Fold 0




cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.8138,3.742558,0.3789,0.1555,0.3499,0.3495,13.6129,0.1572,0.8664,0.8746,8554,9781
2,3.1308,3.634027,0.3995,0.1642,0.3633,0.3631,14.547,0.1629,0.9591,0.9599,9389,9781
3,2.6058,3.698906,0.399,0.1669,0.363,0.3633,13.6751,0.1658,0.8891,0.8948,8752,9781
4,2.1854,3.732364,0.4135,0.1798,0.3764,0.3764,14.3997,0.1786,0.952,0.9531,9322,9781
5,1.862,3.846827,0.4045,0.1702,0.3607,0.3599,15.4594,0.1678,1.0,1.0071,9850,9781
6,1.6642,3.862932,0.406,0.1706,0.3644,0.3641,15.0279,0.1738,0.9908,0.9908,9691,9781
7,1.542,3.885118,0.4182,0.1805,0.3766,0.3763,14.9048,0.1805,0.9785,0.9787,9573,9781


  return dynamo.is_compiling()


Fold 1




cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.8071,3.671994,0.3652,0.1475,0.3305,0.3307,14.8215,0.1474,0.9579,0.9588,9030,9418
2,3.1219,3.625331,0.3917,0.1563,0.3481,0.3484,14.9237,0.156,0.9404,0.9421,8873,9418
3,2.593,3.617825,0.4016,0.1606,0.3598,0.3606,14.47,0.163,0.9164,0.9197,8662,9418
4,2.1612,3.72381,0.4066,0.1724,0.3681,0.3683,15.1444,0.1881,0.9486,0.9499,8946,9418
5,1.8598,3.800734,0.4141,0.1743,0.3705,0.3702,15.1567,0.1905,0.9694,0.9698,9134,9418
6,1.6604,3.840283,0.4193,0.1794,0.3751,0.3753,15.609,0.1882,1.0,1.0046,9461,9418
7,1.5424,3.845845,0.4166,0.1804,0.3746,0.3748,15.2343,0.1927,0.9684,0.9689,9125,9418


  return dynamo.is_compiling()


Fold 2




cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.8046,3.701607,0.388,0.1613,0.3576,0.3583,13.4537,0.166,0.8443,0.8552,8455,9886
2,3.1216,3.548919,0.4124,0.1722,0.3767,0.3771,13.7789,0.1722,0.8586,0.8677,8578,9886
3,2.6015,3.605074,0.4022,0.1711,0.3663,0.3664,14.7828,0.1811,0.9415,0.9432,9324,9886
4,2.1724,3.694621,0.417,0.1707,0.3732,0.3739,15.3342,0.1805,0.9976,0.9976,9862,9886
5,1.8657,3.775568,0.4039,0.1716,0.3611,0.3614,15.5578,0.1845,1.0,1.0077,9962,9886
6,1.6607,3.826312,0.4186,0.1788,0.3742,0.3748,15.3368,0.1887,0.9897,0.9898,9785,9886
7,1.5455,3.845487,0.4205,0.1789,0.3754,0.3765,15.2648,0.1891,0.9755,0.9758,9647,9886


  return dynamo.is_compiling()


In [6]:
import pickle
with open('fold_results.pickle', 'wb') as handle:
    pickle.dump(fold_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
for i, (k, f_df) in enumerate(fold_results.items()): 
    
    f_df['fold'] = k

    if i==0: 
        cv_df = f_df.copy()
    else: 
        cv_df = pd.concat([cv_df, f_df])

In [5]:
rouge = evaluate.load('rouge')
cv_df["rouge"] = cv_df.progress_apply(lambda x: rouge.compute(predictions=[x["prediction"]],
                                references=[x["input_sequence"]])["rouge1"],
                                axis=1)

  8%|▊         | 175/2300 [00:17<03:07, 11.35it/s]

100%|██████████| 2300/2300 [03:31<00:00, 10.86it/s]


### Step 2. Learn performance

In [6]:
def step_two(X_train, y_train, X_val, y_val, model): 

    if model=="lr":
        reg = LinearRegression().fit(X_train, y_train)
    elif model =="svm": 
        reg = SVR().fit(X_train, y_train)
    elif model=="rf":
        reg = RandomForestRegressor.fit(X_train, y_train)
    elif model=="lgbm":
        reg = LGBMRegressor()
        reg.fit(X=X_train, y=y_train)
    elif model=="catboost":
        reg = CatBoostRegressor()
        reg.fit(X=X_train, y=y_train)

    y_pred = reg.predict(X_val)
    y_pred[y_pred<0] = 0

    mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
    rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
    return {"pred": y_pred, "mae": mae, "rmse": rmse}

In [None]:
t_models = ["lr", "svm", "lgbm", "catboost"]

results = {}

cv_df["perf_hat"] = 0

for test_fold in range(cv_df.fold.max()+1):
    print(test_fold)

    # Prepare the input data
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(cv_df.loc[cv_df.fold!=test_fold, "input_sequence"])
    y_train = cv_df.loc[cv_df.fold!=test_fold, "rouge"]

    X_val_tfidf = vectorizer.transform(cv_df.loc[cv_df.fold==test_fold, "input_sequence"])
    y_val = cv_df.loc[cv_df.fold==test_fold, "rouge"]

    results[test_fold] = {}
    for model in t_models:
        print(model)
        preds_df = step_two(X_train=X_train_tfidf,
                            y_train=y_train,
                            X_val=X_val_tfidf,
                            y_val=y_val,
                            model=model)
        cv_df.loc[cv_df.fold==test_fold, "perf_hat"] = preds_df["pred"]
        results[test_fold][model] = preds_df

cv_df = cv_df.reset_index(drop=True)

In [8]:
# rearrange the file

model_results = {}

for model in t_models:
    model_results[model]= {}
    model_results[model]["rmse"] = []
    model_results[model]["mae"] = [] 

    for fold in range(3):
    
        model_results[model]["mae"].append(results[fold][model]["mae"])
        model_results[model]["rmse"].append(results[fold][model]["rmse"])
    
    model_results[model]["rmse_avg"] = np.array(model_results[model]["rmse"]).mean()
    model_results[model]["mae_avg"] = np.array(model_results[model]["mae"]).mean()

for model in t_models:
    print(model)
    print(model_results[model]["rmse_avg"])
    print(model_results[model]["mae_avg"])
    print("\n")

lr
0.20878739304706798
0.15190509314919165


svm
0.1450162616070708
0.11518738698657867


lgbm
0.15065361317257017
0.11926828787539252


catboost
0.14631343823765694
0.11456435755497045


