In [1]:
############################
######### SETTINGS #########
############################

# Installations
#%pip install rouge_score
#%pip install absl
#%pip install seaborn
#%pip install transformers[torch]

# Dependencies
import importlib
import os
import sys

import numpy as np
import pandas as pd
from datasets import Dataset
sys.path.append("../")
from sklearn.model_selection import KFold
import src

importlib.reload(src)

from src.data_prep_utils import (  # noqa: E402
    conala_to_time_batches,
    load_time_sorted_conala,
)

importlib.reload(src.data_prep_utils)


from src.training import nd_inference, retraining, continual
importlib.reload(src.training)

import torch
from transformers import (
    DataCollatorForSeq2Seq,
    RobertaTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
)
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from src.processing_utils import compute_metric_with_params, prepare_hg_ds
from tqdm import tqdm
tqdm.pandas()

import seaborn as sns
import evaluate


# Constants
# Process-wide switches, written into the environment in one go.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "WANDB_DISABLED": "true",
        "CUDA_VISIBLE_DEVICES": "1",
    }
)

MODEL = "CodeT5"
TRAIN_N, BATCH_SIZE = 330, 15
DECODER_LENGTH, ENCODER_LENGTH = 20, 15
RS = 42  # seed shared by every sampling / splitting call in this notebook

# Everything under "SEQ_TRAINER_ARGS" is later unpacked into
# Seq2SeqTrainingArguments; the remaining keys mirror the constants above.
TRAIN_ARGS = {
    "TRAIN_N": TRAIN_N,
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "MODEL": MODEL,
    "SEQ_TRAINER_ARGS": {
        # run control
        "overwrite_output_dir": True,
        "num_train_epochs": 3,
        "do_train": True,
        "do_eval": True,
        # batching
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        # optimisation
        "learning_rate": 5e-4,
        "warmup_steps": 100,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        # evaluation / logging / checkpointing
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
    },
}

# Hub identifier of the pretrained checkpoint used throughout the notebook.
model_name = "Salesforce/codet5-base-multi-sum"

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
############################
######## FUNCTIONS #########
############################

def prep_for_hf(df: pd.DataFrame) -> "tuple[pd.DataFrame, Dataset]":
    """Rename, trim and shuffle a dataframe and wrap it as a Hugging Face dataset.

    Args:
        df: Frame containing at least the columns ``snippet``,
            ``rewritten_intent`` and ``idx``.

    Returns:
        A ``(frame, dataset)`` pair. ``frame`` is the shuffled frame reduced
        to ``input_sequence`` (from ``snippet``), ``output_sequence`` (from
        ``rewritten_intent``) and ``idx``; ``dataset`` is built from that same
        frame, so both share one row order.
    """
    df = df.rename(
        columns={
            "snippet": "input_sequence",
            "rewritten_intent": "output_sequence",
        }
    )
    df = df.loc[:, ["input_sequence", "output_sequence", "idx"]]
    # Shuffle all rows; the seed is the notebook-wide RS constant.
    df = df.sample(frac=1, random_state=RS)
    return df, Dataset.from_pandas(df)

def batch_tokenize_preprocess(batch, tokenizer, max_input_length, max_output_length):
    """Tokenize a batch of source/target pairs into seq2seq model inputs.

    Args:
        batch: Mapping with the keys ``input_sequence`` and ``output_sequence``.
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_input_length: Padding/truncation length for the sources.
        max_output_length: Padding/truncation length for the targets.

    Returns:
        A dict holding every field of the tokenized sources plus ``labels``:
        the target token ids with each pad id replaced by ``-100``.
    """
    sources = batch["input_sequence"]
    targets = batch["output_sequence"]

    shared_kwargs = {"padding": "max_length", "truncation": True}
    encoded_sources = tokenizer(sources, max_length=max_input_length, **shared_kwargs)
    encoded_targets = tokenizer(targets, max_length=max_output_length, **shared_kwargs)

    model_inputs = dict(encoded_sources.items())

    # Swap padding positions in the targets for the -100 sentinel.
    masked_labels = []
    for label_ids in encoded_targets["input_ids"]:
        masked_labels.append(
            [token if token != tokenizer.pad_token_id else -100 for token in label_ids]
        )
    model_inputs["labels"] = masked_labels

    return model_inputs

def generate_summary(test_samples, model, tokenizer, encoder_max_length, decoder_max_length):
    """Generate and decode model outputs for the given samples.

    Args:
        test_samples: Mapping whose ``input_sequence`` entry holds the texts.
        model: Object exposing ``device`` and ``generate``.
        tokenizer: Callable tokenizer that also exposes ``batch_decode``.
        encoder_max_length: Padding/truncation length for the inputs.
        decoder_max_length: Passed to ``generate`` as ``max_new_tokens``.

    Returns:
        ``(outputs, output_str)``: whatever ``model.generate`` returned and
        the strings decoded from it with special tokens skipped.
    """
    encoded = tokenizer(
        test_samples["input_sequence"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )

    # Move both encoder tensors to wherever the model lives before generating.
    ids_on_device = encoded.input_ids.to(model.device)
    mask_on_device = encoded.attention_mask.to(model.device)

    generated = model.generate(
        ids_on_device,
        attention_mask=mask_on_device,
        max_new_tokens=decoder_max_length,
    )
    return generated, tokenizer.batch_decode(generated, skip_special_tokens=True)

def rouge_custom(prediction, reference):
    """Return the share of reference words that also occur in the prediction.

    Both strings are lower-cased and split on whitespace. Every reference
    word (duplicates included) counts as matched when it appears anywhere in
    the prediction.

    Args:
        prediction: Generated text.
        reference: Ground-truth text.

    Returns:
        ``matched / number_of_reference_words`` as a float, or ``0.0`` when
        the reference contains no words (previously a ZeroDivisionError).
    """
    reference_words = reference.lower().split()
    if not reference_words:
        return 0.0
    # Split the prediction once; a set makes each membership test O(1).
    prediction_vocab = set(prediction.lower().split())
    matched = sum(word in prediction_vocab for word in reference_words)
    return matched / len(reference_words)

def bleu_custom(prediction, reference):
    """Return the share of prediction words that also occur in the reference.

    Both strings are lower-cased and split on whitespace. Every prediction
    word (duplicates included) counts as matched when it appears anywhere in
    the reference.

    Args:
        prediction: Generated text.
        reference: Ground-truth text.

    Returns:
        ``matched / number_of_prediction_words`` as a float, or ``0.0`` when
        the prediction contains no words (previously a ZeroDivisionError).
    """
    prediction_words = prediction.lower().split()
    if not prediction_words:
        return 0.0
    # Split the reference once; a set makes each membership test O(1).
    reference_vocab = set(reference.lower().split())
    matched = sum(word in reference_vocab for word in prediction_words)
    return matched / len(prediction_words)

import nltk
nltk.download('punkt')


def postprocess_text(preds, labels):
    """Strip each text and put every sentence on its own line.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        ``(preds, labels)`` as two lists of strings in which the sentences
        found by ``nltk.sent_tokenize`` are joined with newlines.
    """

    def _one_sentence_per_line(texts):
        # rougeLSum expects newline after each sentence
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

def compute_metric_with_params(tokenizer, metrics_list=('rouge', 'bleu')):
    """Build a ``compute_metrics`` callback that scores decoded predictions.

    NOTE(review): this definition replaces the ``compute_metric_with_params``
    imported from ``src.processing_utils`` in the settings cell; from here on
    the notebook uses this local version.

    Args:
        tokenizer: Tokenizer providing ``batch_decode`` and ``pad_token_id``.
        metrics_list: Names of the metrics to compute. Only ``'rouge'`` and
            ``'bleu'`` are supported.

    Returns:
        A function taking ``(preds, labels)`` and returning one flat dict with
        every metric value (BLEU's ``precisions`` list is dropped) plus
        ``gen_len``, all rounded to four decimals.

    Raises:
        ValueError: If ``metrics_list`` names an unsupported metric. Earlier
            this surfaced only during evaluation, as an unbound or stale
            ``result``.
    """
    supported_metrics = ('rouge', 'bleu')
    metric_names = list(metrics_list)
    unsupported = [name for name in metric_names if name not in supported_metrics]
    if unsupported:
        raise ValueError(
            f"Unsupported metric(s) {unsupported}; expected a subset of {list(supported_metrics)}"
        )

    # Metric modules are loaded on first use and then reused, instead of
    # calling evaluate.load again on every evaluation pass.
    loaded_metrics = {}

    def compute_metrics(eval_preds):
        preds, labels = eval_preds

        if isinstance(preds, tuple):
            preds = preds[0]

        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # POST PROCESSING
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

        # Mean number of non-pad tokens per prediction; identical for every
        # metric, so it is computed once per call.
        gen_len = np.mean(
            [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        )

        results_dict = {}
        for m in metric_names:
            if m not in loaded_metrics:
                loaded_metrics[m] = evaluate.load(m)
            metric = loaded_metrics[m]

            if m == 'bleu':
                result = metric.compute(
                    predictions=decoded_preds, references=decoded_labels
                )
            else:  # 'rouge' -- the only other name that passes validation
                result = metric.compute(
                    predictions=decoded_preds, references=decoded_labels, use_stemmer=True
                )
            # 'precisions' is dropped because round() below needs scalars.
            result = {key: value for key, value in result.items() if key != 'precisions'}
            result["gen_len"] = gen_len
            results_dict.update({k: round(v, 4) for k, v in result.items()})
        return results_dict

    return compute_metrics

[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
############################
###### CV LOOP PREP ########
############################

DATE_STR = "20240327"
df = pd.read_csv(f"../data/processed/conala/{DATE_STR}/all_drifts.csv")
df["t_batch"] = df["time_batch"]

# Rows without a rewritten intent all receive this one fixed description.
df.loc[
    df["rewritten_intent"].isna(), "rewritten_intent"
] = "translate an ISO 8601 datetime string into a Python datetime object"

tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)

# Split on question ids: 1200 sampled ids form the training pool and every
# remaining id goes to the test pool.
full_train_idx = pd.Series(df["question_id"].unique()).sample(n=1200, random_state=RS)
test_idx = pd.Series(
    df.loc[~df["question_id"].isin(full_train_idx), "question_id"].unique()
)


In [4]:
############################
###### MODEL SETTINGS ######
############################


# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

compute_metrics = compute_metric_with_params(tokenizer)

# Trainer artefacts and logs of this experiment go under reports/upper_bound.
TRAIN_ARGS["SEQ_TRAINER_ARGS"]["output_dir"] = "reports/upper_bound/results"
TRAIN_ARGS["SEQ_TRAINER_ARGS"]["logging_dir"] = "reports/upper_bound/logs"

# exist_ok makes this safe to re-run and avoids the check-then-create race
# of the os.path.exists + os.mkdir pair.
os.makedirs("reports/", exist_ok=True)

training_args = Seq2SeqTrainingArguments(
    **TRAIN_ARGS["SEQ_TRAINER_ARGS"],
)

NameError: name 'model' is not defined

In [None]:
############################
######### CV TRAINING ######
############################

kf = KFold(n_splits=3, random_state=RS, shuffle=True)

FOLD_MODEL_PATH = "reports/no_drift/fold_model"
# os.mkdir cannot create the missing parent "reports/no_drift"; makedirs
# builds the whole chain and is a no-op when the directory already exists.
os.makedirs(FOLD_MODEL_PATH, exist_ok=True)

# Neither the metric nor the zero-shot checkpoint changes between folds, so
# both are loaded once instead of once per fold.
rouge = evaluate.load('rouge')
zs_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
zs_tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)


def _tokenize_fold_batch(batch):
    """Tokenize one batch with the notebook-wide encoder/decoder lengths."""
    return batch_tokenize_preprocess(
        batch,
        tokenizer=tokenizer,
        max_input_length=ENCODER_LENGTH,
        max_output_length=DECODER_LENGTH,
    )


# One frame per fold, concatenated once after the loop.
fold_frames = []

for fold, (train_idx, val_idx) in enumerate(kf.split(full_train_idx.values)):
    # Every fold starts again from the pretrained checkpoint.
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    print(device)
    model.to(device)

    print(f"Fold: {fold}")

    # Folds are formed over question ids, so all rows of one question end up
    # on the same side of the split.
    train_df = df.loc[df.question_id.isin(full_train_idx.iloc[train_idx]), :]
    val_df = df.loc[df.question_id.isin(full_train_idx.iloc[val_idx]), :]

    train_df, train_dataset = prep_for_hf(train_df)
    val_df, val_dataset = prep_for_hf(val_df)

    # The text columns are deliberately kept on the mapped datasets:
    # "input_sequence" / "output_sequence" are read again during inference.
    print("Preparing train data")
    train_data = train_dataset.map(_tokenize_fold_batch, batch_size=4, batched=True)

    print("Preparing val data")
    val_data = val_dataset.map(_tokenize_fold_batch, batch_size=4, batched=True)

    fold_trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_data,
        eval_dataset=val_data,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    print("Training")
    fold_trainer.train()
    fold_trainer.save_model(FOLD_MODEL_PATH)

    val_ground_truths = val_data["output_sequence"]

    print("Inference")
    # Fine-Tuned: reloaded from disk under its own names so the notebook-wide
    # `model` / `tokenizer` are not overwritten in the middle of the loop.
    ft_model = AutoModelForSeq2SeqLM.from_pretrained(FOLD_MODEL_PATH)
    ft_tokenizer = AutoTokenizer.from_pretrained(FOLD_MODEL_PATH, skip_special_tokens=False)
    fold_predictions_ft = generate_summary(
        val_data,
        ft_model,
        ft_tokenizer,
        encoder_max_length=ENCODER_LENGTH,
        decoder_max_length=DECODER_LENGTH,
    )[1]

    # Zero-Shot
    fold_predictions_zs = generate_summary(
        val_data,
        zs_model,
        zs_tokenizer,
        encoder_max_length=ENCODER_LENGTH,
        decoder_max_length=DECODER_LENGTH,
    )[1]

    # Rouge
    print("Evaluation")
    fold_predictions_ft_rouge = rouge.compute(
        references=val_ground_truths,
        predictions=fold_predictions_ft,
        use_aggregator=False,
    )["rouge1"]
    fold_predictions_zs_rouge = rouge.compute(
        references=val_ground_truths,
        predictions=fold_predictions_zs,
        use_aggregator=False,
    )["rouge1"]

    # Combine
    fold_results = pd.DataFrame(
        {
            # NOTE(review): this column stores the reference outputs
            # ("output_sequence"), not the inputs; the name is kept so the
            # layout of cv_results.csv does not change.
            "input_sequence": val_ground_truths,
            "fold_predictions_ft": fold_predictions_ft,
            "fold_predictions_zs": fold_predictions_zs,
            "fold_predictions_ft_rouge": fold_predictions_ft_rouge,
            "fold_predictions_zs_rouge": fold_predictions_zs_rouge,
        }
    )
    fold_results["fold"] = fold
    fold_frames.append(fold_results)

results = pd.concat(fold_frames)
results.to_csv("cv_results.csv", index=False)

Fold: 0
Preparing train data


Map:   0%|          | 0/1133 [00:00<?, ? examples/s]

                                                                 

Preparing val data


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Training


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.8304,3.665829,0.379,0.1447,0.3446,0.3446,13.0182,0.1496,0.7968,0.8149,5581,6849


INFO:absl:Using default tokenizer.


KeyboardInterrupt: 

In [None]:
############################
######## FULL TRAINING #####
############################

# Start from the pretrained checkpoint for the final run on the full pool.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
print(device)
model.to(device)

# Select rows by question id using the split drawn earlier.
train_df = df.loc[df.question_id.isin(full_train_idx), :]
test_df = df.loc[df.question_id.isin(test_idx), :]

train_df, train_dataset = prep_for_hf(train_df)
test_df, test_dataset = prep_for_hf(test_df)

# The text columns are deliberately kept on the mapped datasets:
# "input_sequence" / "output_sequence" are read again at inference time.
train_data = train_dataset.map(
    lambda batch: batch_tokenize_preprocess(
        batch,
        tokenizer=tokenizer,
        max_input_length=ENCODER_LENGTH,
        max_output_length=DECODER_LENGTH,
    ),
    batch_size=4,
    batched=True,
)

test_data = test_dataset.map(
    lambda batch: batch_tokenize_preprocess(
        batch,
        tokenizer=tokenizer,
        max_input_length=ENCODER_LENGTH,
        max_output_length=DECODER_LENGTH,
    ),
    batch_size=4,
    batched=True,
)

# NOTE(review): eval_dataset is the held-out test split, so the per-epoch
# metrics reported during training are computed on test data.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

MODEL_PATH = "reports/upper_bound/saved_model"
# makedirs also creates missing parent directories and tolerates re-runs,
# unlike the os.path.exists + os.mkdir pair.
os.makedirs(MODEL_PATH, exist_ok=True)

trainer.train()
trainer.save_model(MODEL_PATH)

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.7774,3.518943,0.4028,0.1585,0.3652,0.3656,13.9464,0.1538,0.8621,0.8708,5992,6881
2,2.9576,3.384997,0.4375,0.1957,0.4005,0.4008,14.2495,0.1942,0.9102,0.914,6289,6881
3,2.2519,3.3903,0.4402,0.1922,0.3954,0.3963,14.9704,0.2044,0.9727,0.9731,6696,6881


INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.


In [None]:
############################
######### INFERENCE ########
############################

test_ground_truths = test_data["output_sequence"]

# Fine-tuned checkpoint saved by the training cell.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, skip_special_tokens=False)
test_df["predictions_ft"] = generate_summary(
    test_data,
    model,
    tokenizer,
    encoder_max_length=ENCODER_LENGTH,
    decoder_max_length=DECODER_LENGTH,
)[-1]

# Pretrained checkpoint without fine-tuning (zero-shot baseline).
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
test_df["predictions_zs"] = generate_summary(
    test_data,
    model,
    tokenizer,
    encoder_max_length=ENCODER_LENGTH,
    decoder_max_length=DECODER_LENGTH,
)[-1]

############################
######## EVALUATION ########
############################

# use_aggregator=False yields one rouge1 score per row instead of one mean.
rouge = evaluate.load('rouge')
test_df["predictions_ft_rouge"] = rouge.compute(
    references=test_ground_truths,
    predictions=test_df["predictions_ft"].to_numpy(),
    use_aggregator=False,
)["rouge1"]
test_df["predictions_zs_rouge"] = rouge.compute(
    references=test_ground_truths,
    predictions=test_df["predictions_zs"].to_numpy(),
    use_aggregator=False,
)["rouge1"]

INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.
INFO:absl:Using default tokenizer.


In [None]:
# Mean per-row rouge1: zero-shot is printed, fine-tuned is the cell's output.
print(test_df["predictions_zs_rouge"].mean())
test_df["predictions_ft_rouge"].mean()

0.278477188184076


0.4189334588957666

In [None]:
# Persist the test frame, predictions and scores included, without the index.
test_df.to_csv(path_or_buf="test_df.csv", index=False)