### 1. Settings

In [None]:
#####################################
##########  DEPENDECIES ############
#####################################

import os
import pickle
import numpy as np
from tqdm import tqdm # type: ignore
import pandas as pd
import copy

from datasets import load_dataset, DatasetDict
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import KFold # type: ignore
import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.sparse import hstack

tqdm.pandas()

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
#####################################
############  CONSTANTS #############
#####################################
RS = 42

MODEL = "CodeT5"
TRAIN_N = 330
BATCH_SIZE = 15
DECODER_LENGTH = 20
ENCODER_LENGTH = 15

FULL_TRAIN_ARGS = {
    "TRAIN_N": TRAIN_N,
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "MODEL": MODEL,
    "SEQ_TRAINER_ARGS": {
        "overwrite_output_dir": True,
        "num_train_epochs": [1, 4, 5 , 7],
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 5e-4,
        "warmup_steps": 100,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
    },
}
FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["output_dir"] = f'reports/results'
FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["logging_dir"] = f'reports/logs'

model_name="Salesforce/codet5-base-multi-sum"
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

### 2. Conala data. Preprocessing. Sampling as in the paper (further, random sampling)

In [13]:
dataset = load_dataset("neulab/conala")
# Cross Validation
folds = KFold(n_splits=3, random_state=RS, shuffle=True)
questions_list = np.array(list(set(dataset["train"]["question_id"])))
splits_obj = folds.split(questions_list)
splits = []
for i, (train_idxs, val_idxs) in enumerate(splits_obj):
    print(f"Fold {i}")
    splits.append([train_idxs, val_idxs])

test_data = pr.preprocess_dataset(dataset["test"], tokenizer=tokenizer)
test_df = pd.DataFrame(test_data)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rouge = evaluate.load('rouge')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Fold 0
Fold 1
Fold 2


In [None]:
fold_results = {}

for epoch_i, epoch_set in enumerate(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"]):
    TRAIN_ARGS = copy.deepcopy(FULL_TRAIN_ARGS)
    TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"][epoch_i]
    fold_results[epoch_set] = {}
    
    for i, (train_idxs, val_idxs) in enumerate(splits):

        
        print(f"Fold {i}")
        fold_dataset = DatasetDict({
            "train": dataset["train"].filter(lambda q_id: q_id["question_id"] in questions_list[train_idxs]),
            "validation": dataset["train"].filter(lambda q_id: q_id["question_id"] in questions_list[val_idxs]),
        })
        fold_train = pr.preprocess_dataset(fold_dataset["train"], tokenizer=tokenizer)
        fold_val = pr.preprocess_dataset(fold_dataset["validation"], tokenizer=tokenizer)
        fold_df = pd.DataFrame(fold_val)

        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        print(device)
        model.to(device)

        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
        compute_metrics = ev.compute_metric_with_params(tokenizer) 

        if not os.path.exists(f'reports/'): 
            os.mkdir(f'reports/')

        training_args = Seq2SeqTrainingArguments(
                **TRAIN_ARGS["SEQ_TRAINER_ARGS"],
            )
        
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=fold_train,
            eval_dataset=fold_val,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        
        text = fold_val["input_sequence"]
        summaries = infer.generate_summary(text, model, tokenizer, TRAIN_ARGS["ENCODER_LENGTH"], TRAIN_ARGS["DECODER_LENGTH"])
        
        fold_df["prediction"] = summaries[1]
        fold_df["rouge"] = rouge.compute(predictions=fold_df["prediction"], 
                       references=fold_df["output_sequence"],
                       use_stemmer=True, 
                       use_aggregator=False,
                       rouge_types=["rouge1"])["rouge1"]
        
        fold_results[epoch_set][i] = fold_df

########## CONVERT TO DATAFRAME

for epoch_i, (epoch_set) in enumerate(fold_results.keys()): 
    
    for i, (k, f_df) in enumerate(fold_results[epoch_set].items()): 
        
        f_df['fold'] = k
        f_df['epoch_set'] = epoch_set

        if (epoch_i==0 and i==0): 
            cv_df = f_df.copy()
        else: 
            cv_df = pd.concat([cv_df, f_df])

########## SAVE THE FILE

with open('cv_df.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
import pickle
with open('cv_df.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

In [59]:
########## ADD ZERO SHOT

model_name="Salesforce/codet5-base-multi-sum"
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

zs_df = cv_df.copy()
zs_df = zs_df.drop_duplicates(["question_id", "input_sequence"])
zs_df["epoch_set"] = 0
zs_df["prediction"] = infer.generate_summary(list(zs_df["input_sequence"].values), model, tokenizer, FULL_TRAIN_ARGS["ENCODER_LENGTH"], FULL_TRAIN_ARGS["DECODER_LENGTH"])

cv_df = pd.concat([cv_df, zs_df])

  return dynamo.is_compiling()


In [64]:
########## ROUGE PER SETTING

cv_df.groupby("epoch_set")["rouge"].mean()

epoch_set
0    0.436602
1    0.437766
4    0.437809
5    0.425505
7    0.418277
Name: rouge, dtype: float64

### Step 2. Learn performance

In [9]:
def step_two(X_train, y_train, X_val, y_val, model): 

    if model=="lr":
        reg = LinearRegression().fit(X_train, y_train)
    elif model =="svm": 
        reg = SVR().fit(X_train, y_train)
    elif model=="rf":
        reg = RandomForestRegressor.fit(X_train, y_train)
    elif model=="lgbm":
        reg = LGBMRegressor()
        reg.fit(X=X_train, y=y_train)
    elif model=="catboost":
        reg = CatBoostRegressor()
        reg.fit(X=X_train, y=y_train)

    y_pred = reg.predict(X_val)
    y_pred[y_pred<0] = 0

    mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
    rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
    return {"pred": y_pred, "mae": mae, "rmse": rmse}

In [10]:
t_models = ["lr", "svm", "lgbm", "catboost"]

results = {}

cv_df["perf_hat"] = 0


for test_fold in range(cv_df.fold.max()+1):
    print(test_fold)

    # Prepare the input data
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(cv_df.loc[cv_df.fold!=test_fold, "input_sequence"])
    X_train_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold!=test_fold, "epoch_set"], sparse=True).sparse.to_coo().tocsr()
    X_train = hstack([X_train_column_sparse, X_train_tfidf])
    y_train = cv_df.loc[cv_df.fold!=test_fold, "rouge"]
    
    X_val_tfidf = vectorizer.transform(cv_df.loc[cv_df.fold==test_fold, "input_sequence"])
    X_val_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold==test_fold, "epoch_set"], sparse=True).sparse.to_coo().tocsr()
    X_val = hstack([X_val_column_sparse, X_val_tfidf])
    y_val = cv_df.loc[cv_df.fold==test_fold, "rouge"]

    results[test_fold] = {}
    for model in t_models:
        print(model)
        preds_df = step_two(X_train=X_train,
                            y_train=y_train,
                            X_val=X_val,
                            y_val=y_val,
                            model=model)
        cv_df.loc[cv_df.fold==test_fold, "perf_hat"] = preds_df["pred"]
        results[test_fold][model] = preds_df

cv_df = cv_df.reset_index(drop=True)

0
lr
svm


  cv_df.loc[cv_df.fold==test_fold, "perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.031548 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4597
[LightGBM] [Info] Number of data points in the train set: 6048, number of used features: 251
[LightGBM] [Info] Start training from score 0.429893


catboost
Learning rate set to 0.054409
0:	learn: 0.1871019	total: 60ms	remaining: 59.9s
1:	learn: 0.1865091	total: 64.7ms	remaining: 32.3s
2:	learn: 0.1860489	total: 68.8ms	remaining: 22.9s
3:	learn: 0.1854273	total: 71.8ms	remaining: 17.9s
4:	learn: 0.1849794	total: 74.6ms	remaining: 14.8s
5:	learn: 0.1844890	total: 77.1ms	remaining: 12.8s
6:	learn: 0.1841156	total: 79.4ms	remaining: 11.3s
7:	learn: 0.1837264	total: 81.8ms	remaining: 10.1s
8:	learn: 0.1833172	total: 84.1ms	remaining: 9.25s
9:	learn: 0.1830036	total: 86.3ms	remaining: 8.54s
10:	learn: 0.1826402	total: 88.5ms	remaining: 7.95s
11:	learn: 0.1822825	total: 90.7ms	remaining: 7.47s
12:	learn: 0.1820059	total: 92.9ms	remaining: 7.05s
13:	learn: 0.1817221	total: 95.1ms	remaining: 6.7s
14:	learn: 0.1814616	total: 97.3ms	remaining: 6.39s
15:	learn: 0.1811516	total: 99.5ms	remaining: 6.12s
16:	learn: 0.1809276	total: 102ms	remaining: 5.88s
17:	learn: 0.1807446	total: 104ms	remaining: 5.67s
18:	learn: 0.1804750	total: 106ms	remain

In [11]:
cv_df.groupby("epoch_set")["perf_hat"].mean()

epoch_set
1    0.437560
4    0.435067
5    0.427149
7    0.420366
Name: perf_hat, dtype: float64

In [12]:
# rearrange the file

model_results = {}

for model in t_models:
    model_results[model]= {}
    model_results[model]["rmse"] = []
    model_results[model]["mae"] = [] 

    for fold in range(3):
    
        model_results[model]["mae"].append(results[fold][model]["mae"])
        model_results[model]["rmse"].append(results[fold][model]["rmse"])
    
    model_results[model]["rmse_avg"] = np.array(model_results[model]["rmse"]).mean()
    model_results[model]["mae_avg"] = np.array(model_results[model]["mae"]).mean()

for model in t_models:
    print(model)
    print(model_results[model]["rmse_avg"])
    print(model_results[model]["mae_avg"])
    print("\n")

lr
0.23991892736138118
0.18434274362303518


svm
0.18678322874195594
0.1475834151646963


lgbm
0.1871979445735191
0.14734427393845723


catboost
0.18162865133585648
0.1438321929578931


