In [1]:
############################
######### SETTINGS #########
############################

# Installations
#%pip install rouge_score
#%pip install absl
#%pip install seaborn
#%pip install transformers[torch]

# Dependencies
import importlib
import os
import sys

import numpy as np
import pandas as pd
from datasets import Dataset


import torch
from transformers import (
    DataCollatorForSeq2Seq,
    RobertaTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
)
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm import tqdm
tqdm.pandas()

from sklearn.model_selection import KFold

import seaborn as sns
import evaluate


# Constants
# Environment switches for the libraries imported above.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# The recorded output of the model-settings cell reports this variable as
# deprecated in favour of the `report_to` training argument.
os.environ["WANDB_DISABLED"] = "true"
# NOTE(review): assigned after `import torch`; presumably still effective because
# CUDA is initialised lazily — confirm on the target machine.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# MODEL is a free-text label; the checkpoint that is actually loaded is `model_name` below.
MODEL = "CodeT5"
# TRAIN_N and BATCH_SIZE are only stored in TRAIN_ARGS in the cells shown here;
# the trainer arguments below hard-code a per-device batch size of 4.
TRAIN_N = 330
BATCH_SIZE = 15
# Maximum token lengths: passed as max_output_length / max_input_length when
# tokenizing and as decoder_max_length / encoder_max_length at inference.
DECODER_LENGTH = 20
ENCODER_LENGTH = 15
# Random seed used for the DataFrame shuffle in prep_for_hf and for KFold.
RS = 42

TRAIN_ARGS = {
    "TRAIN_N": TRAIN_N,
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "MODEL": MODEL,
    # Unpacked as keyword arguments into Seq2SeqTrainingArguments;
    # output_dir and logging_dir are added in the model-settings cell.
    "SEQ_TRAINER_ARGS": {
        "overwrite_output_dir": True,
        "num_train_epochs": 10,
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 5e-4,
        "warmup_steps": 100,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        # No intermediate checkpoints: models are saved explicitly via save_model().
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
    },
}

# Checkpoint identifier passed to AutoTokenizer / AutoModelForSeq2SeqLM.from_pretrained.
model_name="Salesforce/codet5-base-multi-sum"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
############################
######## FUNCTIONS #########
############################

def prep_for_hf(df: pd.DataFrame) -> "tuple[pd.DataFrame, Dataset]":
    """Select, rename and shuffle the modelling columns and wrap them for Hugging Face.

    Args:
        df: Frame holding at least the columns ``snippet``, ``rewritten_intent``
            and ``idx``. The frame passed in is not modified.

    Returns:
        A ``(frame, dataset)`` pair: the shuffled pandas frame restricted to
        ``input_sequence``, ``output_sequence`` and ``idx``, and the
        ``Dataset`` created from that very frame via ``Dataset.from_pandas``.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    df = df.rename(columns={"snippet": "input_sequence",
                            "rewritten_intent": "output_sequence"})
    df = df.loc[:, ["input_sequence", "output_sequence", "idx"]]
    # Shuffle all rows with the notebook-wide seed RS so runs are repeatable.
    df = df.sample(frac=1, random_state=RS)
    return df, Dataset.from_pandas(df)

def batch_tokenize_preprocess(batch, tokenizer, max_input_length, max_output_length):
    """Turn a batch of source/target strings into encoder features plus labels.

    Args:
        batch: Mapping with the keys ``input_sequence`` and ``output_sequence``.
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_input_length: Length the source texts are padded/truncated to.
        max_output_length: Length the target texts are padded/truncated to.

    Returns:
        A dict with every field the tokenizer produced for the sources and a
        ``labels`` entry holding the target token ids, where padding positions
        are replaced by -100.
    """
    source_texts = batch["input_sequence"]
    target_texts = batch["output_sequence"]

    def _encode(texts, limit):
        # Same padding/truncation policy for both sides; only the length differs.
        return tokenizer(texts, padding="max_length", truncation=True, max_length=limit)

    encoded_sources = _encode(source_texts, max_input_length)
    encoded_targets = _encode(target_texts, max_output_length)

    features = dict(encoded_sources.items())

    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for token_ids in encoded_targets["input_ids"]:
        # Padding positions are marked with -100 in the labels.
        masked_labels.append([-100 if tok == pad_id else tok for tok in token_ids])
    features["labels"] = masked_labels

    return features

def generate_summary(test_samples, model, tokenizer, encoder_max_length, decoder_max_length):
    """Generate and decode model outputs for all ``input_sequence`` entries at once.

    Args:
        test_samples: Mapping-like object providing ``input_sequence``.
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer used for encoding the inputs and decoding the outputs.
        encoder_max_length: Length the inputs are padded/truncated to.
        decoder_max_length: Passed to ``generate`` as ``max_new_tokens``.

    Returns:
        ``(outputs, output_str)``: whatever ``model.generate`` returned and the
        list of decoded strings (special tokens skipped).
    """
    encoded = tokenizer(
        test_samples["input_sequence"],
        max_length=encoder_max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    # Inputs follow the model: they are moved to whatever device it lives on.
    ids_on_device = encoded.input_ids.to(model.device)
    mask_on_device = encoded.attention_mask.to(model.device)
    generated = model.generate(
        ids_on_device,
        attention_mask=mask_on_device,
        max_new_tokens=decoder_max_length,
    )
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return generated, decoded

def rouge_custom(prediction, reference):
    """Unigram recall: share of reference words that also occur in the prediction.

    Both strings are lower-cased and split on whitespace. Repeated reference
    words are each counted.

    Args:
        prediction: Generated text.
        reference: Ground-truth text.

    Returns:
        A float in [0, 1]; 0.0 when the reference contains no words (the
        previous implementation raised ZeroDivisionError in that case).
    """
    reference_words = reference.lower().split()
    if not reference_words:
        return 0.0
    # Split the prediction once; a set gives constant-time membership checks.
    prediction_vocab = set(prediction.lower().split())
    matched = sum(word in prediction_vocab for word in reference_words)
    return matched / len(reference_words)

def bleu_custom(prediction, reference):
    """Unigram precision: share of predicted words that also occur in the reference.

    Both strings are lower-cased and split on whitespace. Repeated predicted
    words are each counted.

    Args:
        prediction: Generated text.
        reference: Ground-truth text.

    Returns:
        A float in [0, 1]; 0.0 when the prediction contains no words (the
        previous implementation raised ZeroDivisionError in that case).
    """
    prediction_words = prediction.lower().split()
    if not prediction_words:
        return 0.0
    # Split the reference once; a set gives constant-time membership checks.
    reference_vocab = set(reference.lower().split())
    matched = sum(word in reference_vocab for word in prediction_words)
    return matched / len(prediction_words)

# nltk.sent_tokenize is called by postprocess_text below; the 'punkt' data is
# downloaded here, presumably because that tokenizer needs it — confirm for the
# installed nltk version.
import nltk
nltk.download('punkt')


def postprocess_text(preds, labels):
    """Strip every text and place each of its sentences on a separate line.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        ``(preds, labels)`` as two lists of newline-joined sentences.
    """
    def _one_sentence_per_line(texts):
        # rougeLSum expects newline after each sentence
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

def compute_metric_with_params(tokenizer, metrics_list=['rouge', 'bleu']):
    """Build the ``compute_metrics`` callback used by the trainers in this notebook.

    Args:
        tokenizer: Tokenizer used to decode predictions and labels; must expose
            ``pad_token_id`` and ``batch_decode``.
        metrics_list: Names of the metrics to report. Only ``'rouge'`` and
            ``'bleu'`` are supported.

    Returns:
        A function ``compute_metrics(eval_preds)`` that returns one flat dict
        with every scalar of the selected metrics (BLEU ``precisions`` is
        dropped because it is a list) plus ``gen_len``, all rounded to four
        decimals.

    Raises:
        ValueError: if ``metrics_list`` names an unsupported metric. Previously
            this surfaced only during evaluation as an unbound ``result``.
    """
    supported = ("rouge", "bleu")
    unsupported = [m for m in metrics_list if m not in supported]
    if unsupported:
        raise ValueError(
            f"Unsupported metric(s) {unsupported}; expected a subset of {list(supported)}"
        )

    # Load every metric once, up front, instead of on each evaluation call.
    loaded_metrics = {m: evaluate.load(m) for m in metrics_list}

    def compute_metrics(eval_preds):
        preds, labels = eval_preds

        if isinstance(preds, tuple):
            preds = preds[0]

        # -100 cannot be decoded; map it to the pad token in predictions and labels alike.
        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # POST PROCESSING
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

        # Mean number of non-padding tokens per prediction; identical for every
        # metric, so it is computed once.
        gen_len = np.mean(
            [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        )

        results_dict = {}
        for name, metric in loaded_metrics.items():
            extra_kwargs = {"use_stemmer": True} if name == "rouge" else {}
            result = metric.compute(
                predictions=decoded_preds, references=decoded_labels, **extra_kwargs
            )
            # 'precisions' is a list and cannot be rounded like the scalar entries.
            result = {key: value for key, value in result.items() if key != 'precisions'}
            result["gen_len"] = gen_len
            results_dict.update({k: round(v, 4) for k, v in result.items()})
        return results_dict
    return compute_metrics

[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
############################
###### CV LOOP PREP ########
############################

# Snapshot of the processed CoNaLa drift data to load.
DATE_STR = "20240327"
df = pd.read_csv(f"./data/processed/conala/{DATE_STR}/all_drifts.csv")
df["t_batch"] = df["time_batch"]
# Rows without a rewritten intent all receive the same fixed description.
df["rewritten_intent"] = df["rewritten_intent"].fillna(
    "translate an ISO 8601 datetime string into a Python datetime object"
)

tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)

# Split by cluster: question ids from clusters up to 5 are used for
# training/CV, those from clusters above 5 form the test set.
full_train_idx = pd.Series(df.loc[df["cluster"].le(5), "question_id"].unique())
test_idx = pd.Series(df.loc[df["cluster"].gt(5), "question_id"].unique())


In [4]:
############################
###### MODEL SETTINGS ######
############################


# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Metric callback handed to the Seq2SeqTrainer instances created further down.
compute_metrics = compute_metric_with_params(tokenizer)

TRAIN_ARGS["SEQ_TRAINER_ARGS"]["output_dir"] = 'reports/upper_bound/results'
TRAIN_ARGS["SEQ_TRAINER_ARGS"]["logging_dir"] = 'reports/upper_bound/logs'

# exist_ok removes the check-then-create race and keeps the cell re-runnable.
os.makedirs('reports/', exist_ok=True)

training_args = Seq2SeqTrainingArguments(
    **TRAIN_ARGS["SEQ_TRAINER_ARGS"],
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [5]:
############################
######### CV TRAINING ######
############################

# Create the output folders before any training starts. os.makedirs also creates
# missing parent folders (os.mkdir does not), and a missing CSV folder would
# otherwise only surface in to_csv after every fold has been trained.
FOLD_MODEL_PATH = "./experiments/reports/drift/fold_model"
os.makedirs(FOLD_MODEL_PATH, exist_ok=True)
os.makedirs("./experiments/csv_results", exist_ok=True)

kf = KFold(n_splits=3, random_state=RS, shuffle=True)
rouge = evaluate.load('rouge')  # loop-invariant, so it is loaded once
fold_frames = []  # one result frame per fold, concatenated after the loop

for fold, (train_idx, val_idx) in enumerate(kf.split(full_train_idx.values)):
    # Every fold starts again from the pretrained checkpoint.
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    print(device)
    model.to(device)

    print(f"Fold: {fold}")

    fold_results = {}

    # The folds are drawn over question ids, so all rows sharing a question id
    # end up on the same side of the split.
    train_df = df.loc[df.question_id.isin(full_train_idx.iloc[train_idx]), :]
    val_df = df.loc[df.question_id.isin(full_train_idx.iloc[val_idx]), :]

    train_df, train_dataset = prep_for_hf(train_df)
    val_df, val_dataset = prep_for_hf(val_df)

    print("Preparing train data")

    train_data = train_dataset.map(
        lambda batch: batch_tokenize_preprocess(
            batch,
            tokenizer=tokenizer,
            max_input_length=ENCODER_LENGTH,
            max_output_length=DECODER_LENGTH,
        ),
        batch_size=4,
        batched=True,
    )

    print("Preparing val data")

    val_data = val_dataset.map(
        lambda batch: batch_tokenize_preprocess(
            batch,
            tokenizer=tokenizer,
            max_input_length=ENCODER_LENGTH,
            max_output_length=DECODER_LENGTH,
        ),
        batch_size=4,
        batched=True,
    )

    fold_trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_data,
        eval_dataset=val_data,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    print("Training")
    fold_trainer.train()
    # Each fold overwrites the model saved by the previous one.
    fold_trainer.save_model(FOLD_MODEL_PATH)

    val_ground_truths = val_data["output_sequence"]

    print("Inference")
    # NOTE(review): the models reloaded below are not moved to `device`;
    # generate_summary runs on whatever device from_pretrained put them on
    # (presumably the CPU) — confirm this is intended.
    # Fine-Tuned
    model = AutoModelForSeq2SeqLM.from_pretrained(FOLD_MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(FOLD_MODEL_PATH, skip_special_tokens=False)
    fold_predictions_ft = generate_summary(
        val_data, model, tokenizer,
        encoder_max_length=ENCODER_LENGTH, decoder_max_length=DECODER_LENGTH,
    )[1]

    # Zero-Shot
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
    fold_predictions_zs = generate_summary(
        val_data, model, tokenizer,
        encoder_max_length=ENCODER_LENGTH, decoder_max_length=DECODER_LENGTH,
    )[1]

    # Rouge
    print("Evaluation")
    # use_aggregator=False so that one ROUGE-1 value per sample can be stored.
    fold_predictions_ft_rouge = rouge.compute(
        references=val_ground_truths, predictions=fold_predictions_ft, use_aggregator=False
    )["rouge1"]
    fold_predictions_zs_rouge = rouge.compute(
        references=val_ground_truths, predictions=fold_predictions_zs, use_aggregator=False
    )["rouge1"]

    fold_results["input_sequence"] = val_data["input_sequence"]
    fold_results["output_sequence"] = val_ground_truths
    fold_results["fold_predictions_ft"] = fold_predictions_ft
    fold_results["fold_predictions_zs"] = fold_predictions_zs
    fold_results["fold_predictions_ft_rouge"] = fold_predictions_ft_rouge
    fold_results["fold_predictions_zs_rouge"] = fold_predictions_zs_rouge

    # Combine
    fold_results = pd.DataFrame(fold_results)
    fold_results["fold"] = fold
    fold_frames.append(fold_results)

# A single concat at the end instead of growing the frame inside the loop.
results = pd.concat(fold_frames)
results.to_csv("./experiments/csv_results/cv_results_drift.csv", index=False)

cuda
Fold: 0
Preparing train data


                                                                

Preparing val data


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Training


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9083,3.901278,0.3637,0.1348,0.3317,0.3332,12.4657,0.1224,0.7713,0.7938,5210,6563
2,3.2212,3.808246,0.3979,0.1627,0.3584,0.3593,13.2635,0.1463,0.8504,0.8606,5648,6563
3,2.6668,3.868564,0.3867,0.1468,0.3461,0.3475,14.1132,0.1345,0.9064,0.9106,5976,6563
4,2.2617,3.931495,0.4019,0.1617,0.3563,0.3576,14.9295,0.1412,0.9738,0.9741,6393,6563
5,1.9913,4.015207,0.3911,0.1556,0.3509,0.3523,14.4935,0.1412,0.9631,0.9637,6325,6563
6,1.7893,4.138397,0.3938,0.1484,0.3465,0.3476,15.7551,0.1281,1.0,1.0472,6873,6563
7,1.6547,4.10781,0.3875,0.1441,0.3422,0.3443,14.9221,0.1349,0.9809,0.9811,6439,6563
8,1.562,4.160426,0.3935,0.1467,0.3474,0.3495,15.3711,0.137,1.0,1.0277,6745,6563
9,1.4994,4.181761,0.3964,0.1534,0.3507,0.3524,15.2356,0.1375,1.0,1.0187,6686,6563
10,1.4669,4.188765,0.3961,0.1523,0.352,0.3539,15.1874,0.1443,1.0,1.0029,6582,6563




Inference
Evaluation
cuda
Fold: 1
Preparing train data


                                                                

Preparing val data


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Training


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9273,3.797089,0.3476,0.1331,0.3158,0.3169,12.9327,0.1061,0.7427,0.7707,5126,6651
2,3.2231,3.753729,0.3647,0.1288,0.3184,0.3184,15.1962,0.1231,0.9474,0.9487,6310,6651
3,2.6758,3.780504,0.3654,0.1311,0.3228,0.3224,14.1462,0.1266,0.8591,0.8681,5774,6651
4,2.2787,3.894175,0.355,0.1278,0.3189,0.3184,13.825,0.1295,0.8914,0.8969,5965,6651
5,1.9807,3.987505,0.3742,0.1402,0.3252,0.3255,16.0385,0.1473,1.0,1.0176,6768,6651
6,1.7883,4.040753,0.3687,0.1296,0.3197,0.3195,15.4308,0.1291,0.9968,0.9968,6630,6651
7,1.6585,4.033797,0.38,0.14,0.3281,0.3282,15.1885,0.1441,0.9615,0.9623,6400,6651
8,1.5624,4.072205,0.3742,0.1358,0.3286,0.3286,15.5481,0.1378,0.9859,0.986,6558,6651
9,1.4994,4.046926,0.3832,0.1441,0.336,0.3364,15.4673,0.1505,0.9865,0.9866,6562,6651
10,1.469,4.05629,0.3808,0.1439,0.335,0.3357,15.5827,0.1455,0.9908,0.9908,6590,6651




Inference
Evaluation
cuda
Fold: 2
Preparing train data


                                                                

Preparing val data


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Training


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9405,3.848444,0.3313,0.1247,0.2941,0.294,15.7761,0.128,1.0,1.0212,6696,6557
2,3.214,3.735567,0.3759,0.1508,0.3372,0.3368,14.4672,0.1465,0.9054,0.9096,5964,6557
3,2.68,3.84033,0.3655,0.1467,0.3284,0.3286,14.4112,0.1319,0.8734,0.8807,5775,6557
4,2.2841,3.939187,0.3676,0.1365,0.3253,0.3257,15.4131,0.1362,1.0,1.0328,6772,6557
5,1.9941,4.01242,0.3763,0.1387,0.328,0.3275,14.6255,0.1499,0.9345,0.9366,6141,6557
6,1.8039,4.070661,0.3648,0.1343,0.3201,0.3195,15.6988,0.1359,1.0,1.0297,6752,6557
7,1.6628,4.084267,0.3732,0.1452,0.329,0.3289,15.112,0.1457,0.9853,0.9854,6461,6557
8,1.5669,4.1236,0.3742,0.141,0.331,0.3304,15.3668,0.1398,0.9857,0.9858,6464,6557
9,1.5047,4.136731,0.3796,0.1527,0.3373,0.3371,15.6351,0.1496,1.0,1.0049,6589,6557
10,1.4685,4.152503,0.3802,0.1526,0.338,0.3379,15.1969,0.1498,0.9744,0.9747,6391,6557




Inference
Evaluation


In [6]:
############################
######## FULL TRAINING #####
############################

# Train once on every training question, starting from the pretrained checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
print(device)
model.to(device)

train_df = df.loc[df.question_id.isin(full_train_idx), :]
test_df = df.loc[df.question_id.isin(test_idx), :]

train_df, train_dataset = prep_for_hf(train_df)
test_df, test_dataset = prep_for_hf(test_df)

train_data = train_dataset.map(
    lambda batch: batch_tokenize_preprocess(
        batch,
        tokenizer=tokenizer,
        max_input_length=ENCODER_LENGTH,
        max_output_length=DECODER_LENGTH,
    ),
    batch_size=4,
    batched=True,
)

test_data = test_dataset.map(
    lambda batch: batch_tokenize_preprocess(
        batch,
        tokenizer=tokenizer,
        max_input_length=ENCODER_LENGTH,
        max_output_length=DECODER_LENGTH,
    ),
    batch_size=4,
    batched=True,
)

# NOTE(review): the test split is passed as eval_dataset, so the per-epoch
# metrics are test metrics. With save_strategy "no" and
# load_best_model_at_end False they do not appear to drive model selection —
# confirm that this is the intended protocol.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# os.makedirs creates missing parent folders as well; os.mkdir on this nested
# path fails with FileNotFoundError when a parent folder is absent.
MODEL_PATH = "./experiments/reports/drift/saved_model"
os.makedirs(MODEL_PATH, exist_ok=True)

trainer.train()
trainer.save_model(MODEL_PATH)

cuda


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.8989,3.667003,0.3349,0.1105,0.3058,0.3061,14.7771,0.1147,1.0,1.0176,17995,17684
2,3.2129,3.669625,0.3611,0.1288,0.3302,0.3302,12.8034,0.1353,0.8238,0.8376,14813,17684
3,2.7269,3.658848,0.3911,0.1512,0.3496,0.3499,15.4819,0.1626,1.0,1.0249,18125,17684
4,2.3322,3.750256,0.3949,0.167,0.3587,0.3586,14.22,0.1681,0.8864,0.8924,15781,17684
5,2.0451,3.814176,0.3861,0.1682,0.3522,0.3523,15.1157,0.1838,0.9795,0.9797,17325,17684
6,1.8247,3.893816,0.3884,0.1722,0.3532,0.3531,15.2122,0.1884,1.0,1.0081,17827,17684
7,1.6727,3.853261,0.4172,0.2001,0.3825,0.3827,15.2342,0.2204,0.9906,0.9907,17519,17684
8,1.5733,3.909604,0.4145,0.1951,0.3777,0.3772,15.1973,0.216,1.0,1.0017,17714,17684
9,1.5058,3.922346,0.405,0.1911,0.368,0.3677,15.3513,0.2117,0.9999,0.9999,17683,17684
10,1.469,3.925814,0.4157,0.1997,0.3791,0.379,15.2307,0.223,0.9984,0.9984,17656,17684




In [7]:
############################
######### INFERENCE ########
############################

# References for ROUGE, taken from the tokenized test dataset.
test_ground_truths = test_data["output_sequence"]

# Fine-tuned model: reloaded from the folder written by trainer.save_model.
# NOTE(review): neither reloaded model is moved to `device` in this cell;
# generate_summary uses model.device, so inference runs wherever
# from_pretrained placed the model (presumably the CPU) — confirm.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, skip_special_tokens=False)
# The list of decoded strings is assigned positionally; this presumes test_data
# and test_df share the same row order (both come from one prep_for_hf call).
test_df["predictions_ft"] = generate_summary(test_data, model, tokenizer, encoder_max_length=ENCODER_LENGTH, decoder_max_length=DECODER_LENGTH)[1] 

# Zero-shot baseline: the pretrained checkpoint without fine-tuning.
# After this cell the notebook-level `model` and `tokenizer` refer to this baseline.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
test_df["predictions_zs"] = generate_summary(test_data, model, tokenizer, encoder_max_length=ENCODER_LENGTH, decoder_max_length=DECODER_LENGTH)[1] 

############################
######## EVALUATION ########
############################

# use_aggregator=False so that the "rouge1" entry can be stored as one value per test row.
rouge = evaluate.load('rouge')
test_df["predictions_ft_rouge"] = rouge.compute(references=test_ground_truths, predictions=test_df["predictions_ft"].values, use_aggregator=False)["rouge1"]
test_df["predictions_zs_rouge"] = rouge.compute(references=test_ground_truths, predictions=test_df["predictions_zs"].values, use_aggregator=False)["rouge1"]

In [8]:
# Mean of the per-sample ROUGE-1 scores: first over all cross-validation folds
# (`results`), then over the held-out test questions (`test_df`).
print(f"Cross Validation Rouge Zero-Shot {results.fold_predictions_zs_rouge.mean()}")
print(f"Cross Validation Rouge Fine-Tuning {results.fold_predictions_ft_rouge.mean()}")

print(f"Test Rouge Zero-Shot {test_df.predictions_zs_rouge.mean()}")
print(f"Test Rouge Fine-Tuning {test_df.predictions_ft_rouge.mean()}")

Cross Validation Rouge Zero-Shot 0.2702202093946143
Cross Validation Rouge Fine-Tuning 0.3665148879570414
Test Rouge Zero-Shot 0.3069821108612329
Test Rouge Fine-Tuning 0.4005606151514323


In [9]:
# Create the target folder first so the export cannot fail on a missing directory.
os.makedirs("./experiments/csv_results", exist_ok=True)
test_df.to_csv("./experiments/csv_results/test_df_drift.csv", index=False)