### 1. Settings

In [1]:
#####################################
##########  DEPENDECIES ############
#####################################

import os
import pickle
import numpy as np
from tqdm import tqdm # type: ignore
import pandas as pd
import copy

from datasets import load_dataset, DatasetDict, Dataset
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import KFold # type: ignore
import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from scipy.sparse import hstack

tqdm.pandas()

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import torch
#####################################
############  CONSTANTS #############
#####################################
RS = 42

MODEL = "CodeT5"
TRAIN_N = 330
BATCH_SIZE = 15
DECODER_LENGTH = 20
ENCODER_LENGTH = 15

FULL_TRAIN_ARGS = {
    "TRAIN_N": TRAIN_N,
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "MODEL": MODEL,
    "SEQ_TRAINER_ARGS": {
        "overwrite_output_dir": True,
        "num_train_epochs": [0, 1, 2, 3, 4, 5, 6, 7],
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 5e-6,
        "warmup_steps": 100,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
    },
}
FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["output_dir"] = f'reports/results'
FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["logging_dir"] = f'reports/logs'

model_name="Salesforce/codet5-base-multi-sum"
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. Conala data. Preprocessing. Sampling as in the paper (further, random sampling)

In [2]:
dataset = pd.read_csv(f"../data/processed/conala/20240327/conala_clustered.csv")
dataset = dataset.drop("time_batch", axis=1)

test_4_examples = dataset[dataset["cluster"]==4].sample(frac=0.85, random_state=RS)
print("Cluster 4 obsevations: ", test_4_examples.shape)
test_non4_examples = dataset[dataset["cluster"]!=4].sample(n=156, random_state=RS)
print("Cluster not 4 obsevations: ", test_non4_examples.shape)

test_dataset = pd.concat([test_4_examples, test_non4_examples])
train_dataset = dataset[~dataset.index.isin(test_dataset.index)]
print("Train Data: ", train_dataset.shape)
print("Test Data: ", test_dataset.shape)

train_dataset = Dataset.from_pandas(train_dataset.sample(frac=1, random_state=RS).reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_dataset.sample(frac=1, random_state=RS).reset_index(drop=True))

train_data = pr.preprocess_dataset(train_dataset, tokenizer=tokenizer)
test_data = pr.preprocess_dataset(test_dataset, tokenizer=tokenizer)
test_df = pd.DataFrame(test_data)
test_df["id"] = test_df.index

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rouge = evaluate.load('rouge')

Cluster 4 obsevations:  (344, 6)
Cluster not 4 obsevations:  (156, 6)
Train Data:  (2379, 6)
Test Data:  (500, 6)


Filter: 100%|██████████| 2379/2379 [00:00<00:00, 171226.93 examples/s]
Filter: 100%|██████████| 2379/2379 [00:00<00:00, 52166.76 examples/s]
Map:   0%|          | 0/2379 [00:00<?, ? examples/s]

Map: 100%|██████████| 2379/2379 [00:00<00:00, 2405.86 examples/s]
Filter: 100%|██████████| 500/500 [00:00<00:00, 106752.46 examples/s]
Filter: 100%|██████████| 500/500 [00:00<00:00, 53575.31 examples/s]
Map: 100%|██████████| 499/499 [00:00<00:00, 2387.35 examples/s]


In [3]:
torch.cuda.device_count()

1

In [4]:
def pred_perf(X, model): 

    with open(f'./models/reg_{model}_drift.pkl','rb') as f:
            reg = pickle.load(f)

    y_pred = reg.predict(X)
    y_pred[y_pred<0] = 0
    return y_pred

In [6]:
### Step 1. PREDICT PERFORMANCE

# TRAIN ON ALL PREDICTIONS AT ONCE

t_models = ["svm", "catboost"]

for epoch_i, epoch_set in enumerate(sorted(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"])):

    set_df = test_df.copy()
    set_df["epoch_set"] = epoch_set
    # Prepare the input data
    with open("./models/vectorizer_drift.pkl", "rb") as file:
        vectorizer = pickle.load(file)

    if epoch_set==0:
        meta_preds_df = set_df.copy()
    else: 
        meta_preds_df = pd.concat([meta_preds_df, set_df])
         
X_test_tfidf = vectorizer.transform(meta_preds_df.loc[:, "input_sequence"])
X_test_column_sparse = pd.get_dummies(meta_preds_df.loc[:, "epoch_set"], sparse=True).sparse.to_coo().tocsr()
X_test = hstack([X_test_column_sparse, X_test_tfidf])
#y_test = test_df.loc[:, "rouge"]

models_preds = []
for model in t_models:
    print(model)
    meta_preds_df[f"{model}_preds"] = pred_perf(X_test, model)

meta_preds_df = meta_preds_df.reset_index(drop=True)

svm


catboost


In [7]:
meta_preds_df.groupby("epoch_set").catboost_preds.mean()


epoch_set
0    0.306311
1    0.359166
2    0.424925
3    0.439072
4    0.439072
5    0.439072
6    0.440074
7    0.442289
Name: catboost_preds, dtype: float64

In [8]:
models_index = meta_preds_df.groupby("id")["catboost_preds"].idxmax()
optimal_ensemble = meta_preds_df.iloc[models_index][["id", "epoch_set"]]
optimal_ensemble_map = dict(zip(optimal_ensemble.id, optimal_ensemble.epoch_set))

In [11]:
results = {}
latest_run_epoch = 6

for epoch_i, epoch_set in enumerate(sorted(FULL_TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"])):

    set_df = test_df.copy()
    print(f"TRAINING EPOCH SET {epoch_set}")

    TRAIN_ARGS = copy.deepcopy(FULL_TRAIN_ARGS)
    MODEL_PATH = f"./models/{epoch_set}_epoch_set"
    PREV_MODEL_PATH = f"./models/{latest_run_epoch}_epoch_set"
    

    results[epoch_set] = {}

    if epoch_set > 1: 
        TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = epoch_set - latest_run_epoch
    else:
        TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"] = epoch_set
    
    print(f'TRAINING EPOCHS {TRAIN_ARGS["SEQ_TRAINER_ARGS"]["num_train_epochs"]}')

    if epoch_set > 1: 
        model = AutoModelForSeq2SeqLM.from_pretrained(PREV_MODEL_PATH)
        print(f"LOADING MODEL {PREV_MODEL_PATH}")
    else: 
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        print(f"LOADING MODEL {model_name}")

    print(device)
    model.to(device)

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    compute_metrics = ev.compute_metric_with_params(tokenizer) 

    if not os.path.exists(f'reports/'): 
        os.mkdir(f'reports/')

    training_args = Seq2SeqTrainingArguments(
            **TRAIN_ARGS["SEQ_TRAINER_ARGS"],
        )
    
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    if epoch_set!=0:
        trainer.train()

    text = list(test_df["input_sequence"].values)
    summaries = infer.generate_summary(text, model, tokenizer, TRAIN_ARGS["ENCODER_LENGTH"], TRAIN_ARGS["DECODER_LENGTH"])
    
    
    set_df["epoch_set"] = epoch_set
    set_df["prediction"] = summaries[1]
    set_df["rouge"] = rouge.compute(predictions=set_df["prediction"], 
                references=set_df["output_sequence"],
                use_stemmer=True, 
                use_aggregator=False,
                rouge_types=["rouge1"])["rouge1"]

    if epoch_set==0:
        test_result_df = set_df.copy()
    else: 
        test_result_df = pd.concat([test_result_df, set_df])


    
    ########## SAVE EPOCH SET MODEL
    if not os.path.exists(MODEL_PATH): 
        os.mkdir(MODEL_PATH)

    trainer.save_model(MODEL_PATH)

    latest_run_epoch = epoch_set


########## SAVE THE FILE

with open('test_results_df_drift.pickle', 'wb') as handle:
    pickle.dump(test_result_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

TRAINING EPOCH SET 7
TRAINING EPOCHS 1


LOADING MODEL ./models/6_epoch_set
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,2.8332,3.708618,0.4092,0.1732,0.3733,0.3728,12.6593,0.167,0.8565,0.8659,5037,5817


  return dynamo.is_compiling()


NameError: name 'test_result_df' is not defined

In [None]:
########## ROUGE PER SETTING

print("Mean")
print(test_result_df.groupby("epoch_set")["rouge"].mean())

print("STD")
print(test_result_df.groupby("epoch_set")["rouge"].std())

Mean
epoch_set
0    0.298044
1    0.407708
4    0.396904
5    0.382831
7    0.385440
Name: rouge, dtype: float64
STD
epoch_set
0    0.181991
1    0.220346
4    0.239847
5    0.225329
7    0.228136
Name: rouge, dtype: float64


In [None]:
### ENSEMBLE COMPUTE
test_result_df["opt_es_id"] = test_result_df.id.map(optimal_ensemble_map)
ensemble_preds = test_result_df.loc[test_result_df["epoch_set"]==test_result_df["opt_es_id"], :]
ensemble_preds["rouge"].mean()

0.4079935568907713

In [None]:
test_result_df["opt_es_id"].value_counts()

opt_es_id
1    2460
4      35
Name: count, dtype: int64