In [1]:
# Enable the autoreload extension in mode 2 so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2
# NOTE(review): `!` runs the command in a separate shell process, so variables
# sourced here presumably do not reach this kernel's environment — confirm, and
# load the .env from Python instead if later cells rely on those variables.
!source /home/murilo/RelNetCare/.env

In [2]:
from transformers import AutoTokenizer
import torch
import wandb
from tqdm.notebook import tqdm  # <--- Use notebook version for Jupyter

# Hook tqdm into pandas.
tqdm.pandas()

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

exp_group = "DialogRESredfmBARTSummarizeRelations"  # Change this for each run
data_folder = "/home/murilo/RelNetCare/data/processed/dialog-re-llama-11cls-rebalPairs-rwrtKeys-instrC-mxTrnCp3-skpTps-prepBART"
freeze_encoder = False
model_name = 'facebook/bart-base'

# Per-device (train, eval) batch sizes for each supported checkpoint.
batch_sizes_by_model = {
    'facebook/bart-base': (24, 48),  # was (32, 64)
    'facebook/bart-large': (6, 12),
}
if model_name not in batch_sizes_by_model:
    raise Exception("Model without batch size match")
train_batch_size, eval_batch_size = batch_sizes_by_model[model_name]

# Experiment configuration; it is also handed to Weights & Biases as the run config.
# Key order is kept stable because the dict is displayed later in the notebook.
args_dict = dict(
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    num_train_epochs=5,
    learning_rate=2e-5,  # good starting point: 2e-5 (min acceptable for large 2.5e-7)
    exp_group=exp_group,
    data_stem=data_folder.rsplit('/', 1)[-1],
    data_folder=data_folder,
    freeze_encoder=freeze_encoder,
    truncation=True,
    max_length=512,
    model_name=model_name,
    memorization_task=False,
    fp16=False,
    merge_train_dev=False,
    dropout_regularization_proba=None,  # default 0.10
)
# Fine-tuned weights are stored per model and per dataset stem.
args_dict['output_dir'] = f"/mnt/vdb1/murilo/models/fine-tuned/{args_dict['model_name']}/{args_dict['data_stem']}"


# The same checkpoint name is used for the tokenizer here and for the model later on.
checkpoint = args_dict['model_name']
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [3]:
from datasets import Dataset, DatasetDict
import os
import json

# Slice start used in memorization mode: -1 keeps only the last example of the file.
data_cap = -1
set_data = None
dataset_sets = {}
dict_sets = {}


def _to_records(items):
    """Remap raw ``input``/``output`` items to the text/summary/title schema."""
    return [{"text": item["input"], "summary": item["output"], "title": ""} for item in items]


for set_ in ('train', 'test', 'dev'):

    data_path = os.path.join(data_folder, f'{set_}.json')

    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if args_dict['memorization_task']:
        # Memorization mode: the records built from the first split that is read
        # are reused unchanged for every following split.
        if not set_data:
            set_data = _to_records(data[data_cap:])
    else:
        set_data = _to_records(data)

    # Merge 'dev' into 'train' if the flag is set ('train' is always read before 'dev').
    if args_dict['merge_train_dev'] and set_ == 'dev':
        dict_sets['train'] = dict_sets['train'] + set_data
    else:
        dict_sets[set_] = set_data


for set_ in ('train', 'test', 'dev'):
    # When dev was merged into train there is no separate dev split to build.
    if args_dict['merge_train_dev'] and set_ == 'dev':
        continue
    set_data = dict_sets[set_]
    dataset_sets[set_] = Dataset.from_dict(
        {column: [item[column] for item in set_data] for column in ("text", "summary", "title")}
    )


# Create DatasetDict
dataset_dict = DatasetDict(dataset_sets)
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 2304
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 841
    })
    dev: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 705
    })
})

In [4]:
def preprocess_function(examples, model_max_length=args_dict['max_length'], tokenizer_max_length=args_dict['max_length']):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Args:
        examples: batch mapping with "text" (inputs) and "summary" (targets) entries.
        model_max_length: truncation length applied to both inputs and targets.
        tokenizer_max_length: accepted for compatibility; not used by this function.

    Returns:
        The tokenizer output for the inputs, with the target token ids stored
        under the "labels" key.
    """
    source_texts = list(examples["text"])
    encoded = tokenizer(source_texts, truncation=True, max_length=model_max_length)

    # Targets go through `text_target` so they are tokenized as labels.
    target_ids = tokenizer(
        text_target=examples["summary"],
        truncation=True,
        max_length=model_max_length,
    )["input_ids"]

    encoded["labels"] = target_ids
    return encoded


# Tokenize every split, one batch of examples per call.
tokenized_data = dataset_dict.map(preprocess_function, batched=True)


Map:   0%|          | 0/2304 [00:00<?, ? examples/s]

Map:   0%|          | 0/841 [00:00<?, ? examples/s]

Map:   0%|          | 0/705 [00:00<?, ? examples/s]

In [5]:
from transformers import DataCollatorForSeq2Seq

# The collator's `model` argument is meant to receive the model object (it is
# only consulted for `prepare_decoder_input_ids_from_labels`). A checkpoint-name
# string has no such method and was never used, so it is no longer passed.
# The model itself is only loaded in a later cell.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [6]:
import evaluate

# ROUGE metric loaded through `evaluate`; `compute_metrics` calls `rouge.compute` on decoded text.
rouge = evaluate.load("rouge")


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /home/murilo/miniconda3/envs/llama-lora/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.0
CUDA SETUP: Detected CUDA version 116
CUDA SETUP: Loading binary /home/murilo/miniconda3/envs/llama-lora/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda116_nocublaslt.so...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variab

  warn(msg)


In [7]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. ``predictions``
            may also be a tuple whose first element holds the generated ids.

    Returns:
        dict mapping each metric name (ROUGE keys plus ``"gen_len"``) to its value
        rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored/padded positions. It is not a real token id, so it must be
    # replaced before decoding; this applies to predictions as well as labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generated length = number of non-pad tokens per prediction (former -100 slots count as pad).
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [8]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, BartConfig
import torch.nn as nn


try:
    # Release GPU memory held by a model left over from a previous run of this cell.
    model.cpu()
    del model
    torch.cuda.empty_cache()
except NameError:
    # Fresh kernel: `model` does not exist yet. Any other error is a real problem
    # and is allowed to surface instead of being silently swallowed.
    print("No model loaded...")

dropout_proba = args_dict['dropout_regularization_proba']
# Compare against None (not truthiness) so an explicit dropout of 0.0 is honoured.
if dropout_proba is not None:
    config_extreme = BartConfig.from_pretrained(checkpoint,
                                                # encoder_layerdrop=0.2,
                                                # decoder_layerdrop=0.5,
                                                dropout=dropout_proba,
                                                # attention_dropout=0.5,
                                                # activation_dropout=0.5
                                                )

    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, config=config_extreme)

else:
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


if args_dict['freeze_encoder']:
    # Exclude every encoder parameter from gradient updates.
    print('freezing encoder!')
    for param in model.model.encoder.parameters():
        param.requires_grad = False


# Report the regularisation settings of the loaded model.
for option in ("encoder_layerdrop", "decoder_layerdrop", "dropout",
               "attention_dropout", "activation_dropout"):
    print(f"model.config.{option}=", getattr(model.config, option))

model.to(device)
args_dict


No model loaded...
model.config.encoder_layerdrop= 0.0
model.config.decoder_layerdrop= 0.0
model.config.dropout= 0.1
model.config.attention_dropout= 0.1
model.config.activation_dropout= 0.1


{'per_device_train_batch_size': 24,
 'per_device_eval_batch_size': 48,
 'num_train_epochs': 5,
 'learning_rate': 2e-05,
 'exp_group': 'DialogRESredfmBARTSummarizeRelations',
 'data_stem': 'dialog-re-llama-11cls-rebalPairs-rwrtKeys-instrC-mxTrnCp3-skpTps-prepBART',
 'data_folder': '/home/murilo/RelNetCare/data/processed/dialog-re-llama-11cls-rebalPairs-rwrtKeys-instrC-mxTrnCp3-skpTps-prepBART',
 'freeze_encoder': False,
 'truncation': True,
 'max_length': 512,
 'model_name': 'facebook/bart-base',
 'memorization_task': False,
 'fp16': False,
 'merge_train_dev': False,
 'dropout_regularization_proba': None,
 'output_dir': '/mnt/vdb1/murilo/models/fine-tuned/facebook/bart-base/dialog-re-llama-11cls-rebalPairs-rwrtKeys-instrC-mxTrnCp3-skpTps-prepBART'}

In [9]:

# Open a W&B run (re-initialising if needed) that stores the experiment configuration.
wandb.init(project="huggingface", config=args_dict, reinit=True)

# Hyperparameters forwarded verbatim from args_dict.
forwarded_args = {
    name: args_dict[name]
    for name in (
        'output_dir',
        'learning_rate',
        'per_device_train_batch_size',
        'per_device_eval_batch_size',
        'num_train_epochs',
        'fp16',
    )
}

# Evaluate and checkpoint once per epoch and reload the best checkpoint at the end.
training_args = Seq2SeqTrainingArguments(
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=1,
    weight_decay=0.01,
    save_total_limit=1,
    predict_with_generate=True,
    load_best_model_at_end=True,
    seed=42,  # the training seed is fixed here
    # max_grad_norm=0.1,
    # push_to_hub=True,
    **forwarded_args,
)

# Train on the 'train' split and evaluate on 'dev'.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["dev"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Seq2SeqTrainer(**trainer_kwargs)
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmbellatini[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.4432,0.432354,0.5028,0.34,0.4496,0.4492,19.8794
2,0.3457,0.342355,0.4608,0.3144,0.4148,0.4143,18.3759
3,0.3261,0.347971,0.4985,0.3461,0.4474,0.4471,19.3007
4,0.3662,0.356209,0.5164,0.3642,0.4622,0.4614,19.5262
5,0.3274,0.334293,0.4955,0.3519,0.4437,0.443,18.3574


TrainOutput(global_step=480, training_loss=0.4008096211589873, metrics={'train_runtime': 190.3085, 'train_samples_per_second': 60.533, 'train_steps_per_second': 2.522, 'total_flos': 925279838945280.0, 'train_loss': 0.4008096211589873, 'epoch': 5.0})

# Inference

In [10]:
from transformers import AutoTokenizer

# Pick one training example and compare the model's output with its reference summary.
idx = -69
example = dataset_dict['train'][idx]
text = example['text']

# Make sure dropout is disabled while generating.
model.eval()

# Tokenize with the same truncation and max_length as the training preprocessing,
# so a long dialogue is handled the same way here as during fine-tuning.
inputs = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    max_length=args_dict['max_length'],
).input_ids.to(device)
# Greedy decoding (no sampling), capped at 100 new tokens.
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
# overfit_accomplished = dataset_dict['train'][0]['summary'] == summary
# print("overfit_accomplished=",overfit_accomplished)
print("input=", text)
print("ground truth =", example['summary'])
print("raw inference=", summary)


input= Speaker 1: No, I-I mean your-your old best friend, here. Lily, from high school. Remember? Speaker 2: Oh gosh, Lily, yes. Of course I remember Lily. I... Then you must be? Speaker 1: Phoebe. Phoebe. Phoebe, yeah. She named me after you I guess.
ground truth = Speaker 1 is a parent of Lily. Lily is a child of Speaker 1.
raw inference= Speaker 1 is a parent of Lily. Lily is a child of Speaker 1.


In [11]:
# @todo: create a custom loss
# first inspect this

import inspect
from transformers import Seq2SeqTrainer

# print(inspect.getsource(Seq2SeqTrainer.training_step))
# print(inspect.getsource(Seq2SeqTrainer.compute_loss))
# print(inspect.getsource(Seq2SeqTrainer.label_smoother))

# # then overwrite the methods (either compute_loss or training_step, or even both)

# from transformers import Seq2SeqTrainer

# class CustomSeq2SeqTrainer(Seq2SeqTrainer):
#     def training_step(self, model, inputs):
#         outputs = model(**inputs)
#         loss = outputs.loss
#         logits = outputs.logits

#         # Add your custom loss here, let's say L1 regularization
#         lambda_l1 = 0.01  # regularization coefficient
#         l1_norm = sum(p.abs().sum() for p in model.parameters())
#         loss = loss + lambda_l1 * l1_norm

#         self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
#         return {"loss": loss}

# # lastly the trainer

# trainer = CustomSeq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_data["train"],
#     eval_dataset=tokenized_data["test"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
# )



In [12]:

# model = AutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model")

In [13]:
# Release GPU: move the model back to the CPU, drop the notebook's `model` name,
# then ask torch to release its cached CUDA memory.
# NOTE(review): `trainer` from the training cell still references the model —
# confirm whether it should be deleted as well.
model.cpu()
del model
torch.cuda.empty_cache()
