### 1. Settings

In [21]:
#####################################
#########  DEPENDENCIES  ############
#####################################

import os
import pickle
import numpy as np
from tqdm import tqdm # type: ignore
import pandas as pd
import copy

from datasets import load_dataset, DatasetDict
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import KFold # type: ignore
import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.sparse import hstack

# Enable tqdm's pandas integration.
tqdm.pandas()

# Environment switches; set here, ahead of the `import torch` that follows.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "WANDB_DISABLED": "true",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

import torch
#####################################
############  CONSTANTS #############
#####################################
RS = 42  # NOTE(review): not referenced anywhere else in the visible notebook

MODEL = "CodeT5"
TRAIN_N = 330
BATCH_SIZE = 15
DECODER_LENGTH = 20
ENCODER_LENGTH = 15

# Single source of truth for the experiment. `SEQ_TRAINER_ARGS` is unpacked
# into `Seq2SeqTrainingArguments` further down, after `num_train_epochs`
# (a list of epoch settings here) has been replaced by a scalar per run.
FULL_TRAIN_ARGS = dict(
    TRAIN_N=TRAIN_N,
    BATCH_SIZE=BATCH_SIZE,
    DECODER_LENGTH=DECODER_LENGTH,
    ENCODER_LENGTH=ENCODER_LENGTH,
    MODEL=MODEL,
    SEQ_TRAINER_ARGS=dict(
        overwrite_output_dir=True,
        num_train_epochs=[0, 1, 4, 5, 7],
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=5e-4,
        warmup_steps=100,
        weight_decay=0.1,
        label_smoothing_factor=0.1,
        predict_with_generate=True,
        logging_steps=100,
        save_total_limit=1,
        save_strategy="no",
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=False,
        output_dir="reports/results",
        logging_dir="reports/logs",
    ),
)

# Hub identifier of the base checkpoint; the tokenizer and the seq2seq model
# are both loaded from it.
model_name = "Salesforce/codet5-base-multi-sum"

# NOTE(review): `skip_special_tokens` is normally a decode-time argument;
# confirm that passing it to `from_pretrained` has the intended effect.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    skip_special_tokens=False,
)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



### 2. Conala data. Preprocessing. Sampling as in the paper (further, random sampling)

In [51]:
# Load CoNaLa and run the project preprocessing on both splits.
dataset = load_dataset("neulab/conala")

train_data, test_data = (
    pr.preprocess_dataset(dataset[split], tokenizer=tokenizer)
    for split in ("train", "test")
)

# Tabular view of the test split; `id` mirrors the row index so that samples
# can still be matched once several copies of the frame are stacked.
test_df = pd.DataFrame(test_data).assign(id=lambda frame: frame.index)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rouge = evaluate.load('rouge')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [52]:
def pred_perf(X, model):
    """Predict per-sample scores with a pickled regressor.

    Parameters
    ----------
    X
        Feature matrix passed unchanged to the stored regressor's ``predict``.
    model : str
        Name of the regressor; the object is read from
        ``./models/reg_{model}.pkl``.

    Returns
    -------
    numpy.ndarray
        The regressor's predictions with every negative value replaced by 0.
    """
    # SECURITY: unpickling executes arbitrary code -- only load regressor
    # files that were produced by this project.
    with open(f'./models/reg_{model}.pkl', 'rb') as f:
        reg = pickle.load(f)

    # Coerce to an array so list-like predictions work too, and clip into a
    # new array instead of writing into the buffer returned by `predict`.
    y_pred = np.asarray(reg.predict(X))
    return np.maximum(y_pred, 0)

In [64]:
### Step 1. PREDICT PERFORMANCE

# Score every (test sample, epoch setting) pair with the stored regressors,
# all pairs at once.

t_models = ["svm", "catboost"]
epoch_sets = sorted(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"])

# The fitted vectorizer does not depend on the epoch setting: load it once.
# SECURITY: unpickling executes arbitrary code -- only load trusted files.
with open("./models/vectorizer.pkl", "rb") as file:
    vectorizer = pickle.load(file)

# One copy of the test set per epoch setting, tagged with that setting.
# Building the list first and concatenating once does not rely on 0 being
# among the epoch settings; `ignore_index=True` yields a clean 0..n-1 index.
meta_preds_df = pd.concat(
    [test_df.assign(epoch_set=epoch_set) for epoch_set in epoch_sets],
    ignore_index=True,
)

# Features: one-hot epoch setting followed by the TF-IDF of the input text.
# NOTE(review): the one-hot columns are derived from the epoch values present
# here; they must line up with the layout the regressors were fitted on -- confirm.
X_test_tfidf = vectorizer.transform(meta_preds_df["input_sequence"])
X_test_column_sparse = (
    pd.get_dummies(meta_preds_df["epoch_set"], sparse=True).sparse.to_coo().tocsr()
)
X_test = hstack([X_test_column_sparse, X_test_tfidf])

# `reg_name` (not `model`) so the seq2seq `model` global is not overwritten
# with a string.
for reg_name in t_models:
    print(reg_name)
    meta_preds_df[f"{reg_name}_preds"] = pred_perf(X_test, reg_name)

svm
catboost


In [68]:
# Average SVM-predicted score for each epoch setting.
meta_preds_df.groupby("epoch_set")["svm_preds"].mean()


epoch_set
0    0.322575
1    0.441403
4    0.439045
5    0.428796
7    0.420598
Name: svm_preds, dtype: float64

In [108]:
# CatBoost-predicted score for every (sample, epoch setting) row.
meta_preds_df["catboost_preds"]

0       0.372039
1       0.345680
2       0.316194
3       0.276627
4       0.302049
          ...   
2380    0.258292
2381    0.419826
2382    0.578888
2383    0.435920
2384    0.393024
Name: catboost_preds, Length: 2385, dtype: float64

In [110]:
# For every test sample (`id`), find the row whose CatBoost-predicted score is
# highest across the epoch settings.
models_index = meta_preds_df.groupby("id")["catboost_preds"].idxmax()

# `idxmax` returns index *labels*, so the rows are selected with `.loc`;
# positional `.iloc` is only equivalent while labels happen to equal positions.
optimal_ensemble = meta_preds_df.loc[models_index, ["id", "epoch_set"]]

# sample id -> epoch setting predicted to perform best for that sample
optimal_ensemble_map = dict(zip(optimal_ensemble["id"], optimal_ensemble["epoch_set"]))

In [72]:
########## TRAIN EVERY EPOCH SET, GENERATE TEST PREDICTIONS AND SCORE THEM
#
# Epoch settings are processed in ascending order. Settings above 1 continue
# from the checkpoint saved for the previous setting and only train the
# missing number of epochs.
# NOTE(review): every stage builds a fresh Trainer from saved weights, so
# optimizer/scheduler state (including warmup) presumably restarts at each
# stage -- confirm this is intended.

epoch_sets = sorted(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"])
latest_run_epoch = 0

# Loop-invariant work, done once.
text = list(test_df["input_sequence"].values)
os.makedirs('reports/', exist_ok=True)

# Per-setting result frames; concatenated once after the loop so the result
# does not depend on 0 being among the epoch settings.
epoch_frames = []

for epoch_set in epoch_sets:

    print(f"TRAINING EPOCH SET {epoch_set}")

    TRAIN_ARGS = copy.deepcopy(FULL_TRAIN_ARGS)
    MODEL_PATH = f"./models/{epoch_set}_epoch_set"

    # `checkpoint` is local on purpose: overwriting the global `model_name`
    # would make a re-run of this cell start the 0/1-epoch settings from the
    # last fine-tuned checkpoint instead of the base model.
    if epoch_set > 1:
        TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = epoch_set - latest_run_epoch
        checkpoint = f"./models/{latest_run_epoch}_epoch_set"
    else:
        TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = epoch_set
        checkpoint = model_name

    print(f'TRAINING EPOCHS {TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"]}')
    print(f"LOADING MODEL {checkpoint}")

    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    print(device)
    model.to(device)

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    compute_metrics = ev.compute_metric_with_params(tokenizer)

    training_args = Seq2SeqTrainingArguments(
        **TRAIN_ARGS["SEQ_TRAINER_ARGS"],
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Setting 0 is the untouched base model: no training step.
    if epoch_set != 0:
        trainer.train()

    # NOTE(review): element [1] of the helper's return value is used as the
    # generated texts -- confirm against utils.inference.generate_summary.
    summaries = infer.generate_summary(
        text, model, tokenizer, TRAIN_ARGS["ENCODER_LENGTH"], TRAIN_ARGS["DECODER_LENGTH"]
    )

    set_df = test_df.copy()
    set_df["epoch_set"] = epoch_set
    set_df["prediction"] = summaries[1]
    # Per-sample ROUGE-1 (no aggregation), one value per test row.
    set_df["rouge"] = rouge.compute(predictions=set_df["prediction"],
                references=set_df["output_sequence"],
                use_stemmer=True,
                use_aggregator=False,
                rouge_types=["rouge1"])["rouge1"]
    epoch_frames.append(set_df)

    ########## SAVE EPOCH SET MODEL
    # makedirs also creates a missing `./models` parent and tolerates re-runs.
    os.makedirs(MODEL_PATH, exist_ok=True)
    trainer.save_model(MODEL_PATH)

    latest_run_epoch = epoch_set

# Row index is kept as-is (it repeats per epoch setting); the `id` column
# identifies the sample.
test_result_df = pd.concat(epoch_frames)

########## SAVE THE FILE

with open('test_results_df.pickle', 'wb') as handle:
    pickle.dump(test_result_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum




cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  return dynamo.is_compiling()


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum




cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.5936,3.208277,0.4633,0.2218,0.4228,0.4226,14.7883,0.2192,0.9619,0.9626,5821,6047


  return dynamo.is_compiling()


TRAINING EPOCH SET 4
TRAINING EPOCHS 3
LOADING MODEL ./models/1_epoch_set
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.8575,3.459214,0.4108,0.1752,0.3697,0.3699,14.8805,0.1697,0.9545,0.9555,5778,6047
2,2.7097,3.388699,0.4359,0.1898,0.3931,0.3931,14.87,0.1926,0.9559,0.9568,5786,6047
3,2.0885,3.424679,0.4464,0.2015,0.4026,0.4021,15.1593,0.2093,0.9694,0.9699,5865,6047


  return dynamo.is_compiling()


TRAINING EPOCH SET 5
TRAINING EPOCHS 1
LOADING MODEL ./models/4_epoch_set
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.8982,3.565015,0.4314,0.1878,0.3871,0.3868,15.26,0.1988,0.9823,0.9825,5941,6047


  return dynamo.is_compiling()


TRAINING EPOCH SET 7
TRAINING EPOCHS 2
LOADING MODEL ./models/5_epoch_set
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,1.895,3.712929,0.4091,0.1674,0.3649,0.365,14.6457,0.1705,0.9044,0.9087,5495,6047
2,1.8364,3.656302,0.4344,0.2003,0.3929,0.3926,15.4801,0.2134,0.9945,0.9945,6014,6047


  return dynamo.is_compiling()


In [85]:
########## ROUGE PER SETTING

# Per-sample ROUGE-1 grouped by epoch setting; print its mean, then its
# standard deviation.
rouge_by_epoch_set = test_result_df.groupby("epoch_set")["rouge"]

for label, stat in (
    ("Mean", rouge_by_epoch_set.mean()),
    ("STD", rouge_by_epoch_set.std()),
):
    print(label)
    print(stat)

Mean
epoch_set
0    0.319179
1    0.465227
4    0.445885
5    0.433447
7    0.434192
Name: rouge, dtype: float64
STD
epoch_set
0    0.180808
1    0.204037
4    0.201935
5    0.196731
7    0.206415
Name: rouge, dtype: float64


In [111]:
### ENSEMBLE COMPUTE
# Attach to every row the epoch setting chosen for its sample, keep only the
# rows produced by that setting, and average their ROUGE-1.
test_result_df["opt_es_id"] = test_result_df["id"].map(optimal_ensemble_map)
is_selected = test_result_df["epoch_set"].eq(test_result_df["opt_es_id"])
ensemble_preds = test_result_df[is_selected]
ensemble_preds["rouge"].mean()

0.4612297855580542