In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_from_disk
from datasets import *
import pandas as pd
import transformers
import numpy as np
import accelerate
import torch
import nltk
# Download the 'punkt' NLTK package; nltk.sent_tokenize is called later in
# compute_metrics to split decoded text into sentences.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
!pip install -q datasets nltk bitsandbytes transformers==4.30

In [None]:
# Notebook-wide settings.
# max_input / max_target are the max_length values handed to the tokenizer for
# source texts and target texts respectively; model_checkpoints names the
# pretrained checkpoint that the tokenizer and model are loaded from.
max_input, max_target = 512, 128
model_checkpoints = "facebook/bart-base"

In [None]:
# Fetch the dataset by its Hub repository id; the result is indexed by split
# name in the following cells (data['train']).
data = load_dataset(path="notmehul/slicknotifications")

In [None]:
# Show the 'train' split; as the cell's last expression it is rendered as output
# (feature names and row count).
data['train']

Dataset({
    features: ['input', 'output'],
    num_rows: 861
})

In [None]:
# Build a pandas view of the training split for inspection:
# add a 1-based `id` column, order the columns as id / output / input,
# and convert the rows to a list of dicts.

dataset = data["train"]

df = (
    pd.DataFrame(dataset)
    .assign(id=lambda frame: range(1, len(frame) + 1))  # 1-based row ids
    .loc[:, ["id", "output", "input"]]                  # keep and order columns
)

formatted_data = df.to_dict(orient="records")  # one dict per row

# Preview the first five records.
formatted_data[:5]

[{'id': 1,
  'output': 'Dard me koi masum pyara nhi hota',
  'input': 'Dard me koi masum pyara nhi hota...  Dil ho pyasa toh pani se guzara nhi hota  Yahi toh hamari kamzori hai..  Hum sabke hojate hai pr hamara koi nhi hota...'},
 {'id': 2,
  'output': "couldn't watch them die slowly",
  'input': "Felt like becoming one among stars myself but couldn't muster up courage to burn myself the trees were long forgetful to me but couldn't watch them die slowly after burning me."},
 {'id': 3,
  'output': 'best season to be in this city?',
  'input': 'best season to be in this city?'},
 {'id': 4,
  'output': 'getting addicted to slick',
  'input': 'Why am I getting addicted to slick 🙃  people who are feeling the same ➡️'},
 {'id': 5,
  'output': 'i cheated',
  'input': 'i cheated on one of my exams and still failed'}]

In [None]:
# Load the tokenizer for the checkpoint named in model_checkpoints; it is used
# for preprocessing, metric decoding and inference in later cells.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoints)

In [None]:
def preprocess_data(data_to_process):
  """Tokenize a batch of examples into model inputs plus training labels.

  Args:
    data_to_process: mapping with an 'input' list (source texts) and an
      'output' list (target texts), as passed by `Dataset.map(batched=True)`.

  Returns:
    The tokenizer output for the source texts (input_ids, attention_mask),
    padded/truncated to `max_input`, with an added 'labels' entry holding the
    target token ids padded/truncated to `max_target`. Padding positions in
    'labels' are set to -100 so the loss ignores them.
  """
  source_texts = list(data_to_process['input'])
  target_texts = list(data_to_process['output'])

  model_inputs = tokenizer(source_texts, max_length=max_input, padding='max_length', truncation=True)

  # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
  targets = tokenizer(text_target=target_texts, max_length=max_target, padding='max_length', truncation=True)

  # Labels are padded to a fixed length here, so the pad tokens must be masked
  # with -100; otherwise the loss is also computed on padding.
  pad_id = tokenizer.pad_token_id
  model_inputs['labels'] = [
      [-100 if token_id == pad_id else token_id for token_id in label_ids]
      for label_ids in targets['input_ids']
  ]

  # input_ids, attention_mask and labels
  return model_inputs

In [None]:
# Hold out 10% of the training split for validation.
# The split is taken from data["train"] rather than from `dataset`, because this
# cell rebinds `dataset` to a DatasetDict: re-running it would otherwise call
# train_test_split on the DatasetDict and fail. A fixed seed keeps the
# train/val membership identical across kernel restarts.
split = data["train"].train_test_split(test_size=0.1, seed=42)
train_dataset, validation_dataset = split["train"], split["test"]
dataset = DatasetDict({'train': train_dataset, 'val': validation_dataset})

In [None]:
# Run preprocess_data over every split of the DatasetDict; batched=True hands
# the function batches of rows, so the tokenizer receives lists of strings.
tokenized_datasets = dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/774 [00:00<?, ? examples/s]



Map:   0%|          | 0/87 [00:00<?, ? examples/s]

In [None]:
# Load the pretrained sequence-to-sequence model for the checkpoint named in
# model_checkpoints; this is the model that gets fine-tuned below.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints)

In [None]:
# Training configuration.
# The output directory (and, with push_to_hub=True, the Hub repository) is named
# after the last path component of the checkpoint id.
batch_size = 8
model_name = model_checkpoints.rsplit("/", 1)[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-xsum",
    # schedule / optimisation
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluation: once per epoch, scoring generated text rather than logits
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # checkpointing / publishing
    save_total_limit=3,
    push_to_hub=True,
)

In [None]:
# Batch collator for seq2seq training; it is given both the tokenizer and the
# model and is passed to the trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# metrics
_rouge_metric = None  # cache so the ROUGE metric is loaded at most once per kernel


def _get_rouge_metric():
    """Return the object whose ``compute`` method scores the predictions.

    A notebook-level ``metric`` is honoured when it actually exposes a callable
    ``compute`` (e.g. a metric object created in another cell). Otherwise ROUGE
    is loaded lazily and cached. This guards against ``from datasets import *``
    binding ``metric`` to the ``datasets.metric`` *module*, which fails with
    "module 'datasets.metric' has no attribute 'compute'".

    Raises:
        ImportError: if ``datasets.load_metric`` is not available.
    """
    global _rouge_metric
    user_metric = globals().get("metric")
    if callable(getattr(user_metric, "compute", None)):
        return user_metric
    if _rouge_metric is None:
        try:
            from datasets import load_metric
        except ImportError as exc:
            raise ImportError(
                "datasets.load_metric is unavailable in this datasets version; "
                "install `datasets<3` or assign a ROUGE metric object to "
                "`metric` before training."
            ) from exc
        # NOTE: loading "rouge" needs the `rouge_score` package to be installed.
        _rouge_metric = load_metric("rouge")
    return _rouge_metric


def _to_percent(score):
    """Convert one ROUGE result to a percentage.

    Accepts either a plain number or an aggregate score object exposing
    ``.mid.fmeasure`` (in which case the mid F-measure is used).
    """
    mid = getattr(score, "mid", None)
    if mid is not None:
        score = mid.fmeasure
    return score * 100


def compute_metrics(eval_pred):
    """Compute ROUGE scores (as percentages) and the mean generated length.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays; the
            predictions may also arrive wrapped in a tuple, in which case the
            first element is used.

    Returns:
        dict mapping each metric name to its value rounded to 4 decimals, plus
        ``gen_len``: the mean number of non-padding tokens per prediction.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored/padded positions and cannot be decoded; swap it for the
    # pad token in both arrays so decoding and gen_len treat it as padding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = _get_rouge_metric().compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        use_aggregator=True,
    )
    result = {key: _to_percent(value) for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Wire the model, training arguments, tokenized splits, collator and metric
# function into a single trainer object used for training, prediction and upload.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/content/bart-base-finetuned-xsum is already a clone of https://huggingface.co/notmehul/bart-base-finetuned-xsum. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
# Force a garbage-collection pass before training starts. The number shown as
# the cell output is the return value of gc.collect().
# NOTE(review): this import lives here rather than in the top import cell.
import gc
gc.collect()

209

In [None]:
# Fine-tune the model. With evaluation_strategy="epoch" in the training
# arguments, evaluation (and therefore compute_metrics) runs at the end of each epoch.
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


AttributeError: module 'datasets.metric' has no attribute 'compute'

In [None]:
# Sample text for a quick sanity check of the fine-tuned model.
input_string = """Unleashing the Power of Generative AI: Insights from a Thrilling Hackathon 🚀🔥

Let me take you on an exhilarating journey through the Generative AI Hackathon, where innovation and collaboration converge to unravel the potential of this groundbreaking technology with my team members Shreeja Kapoor & Sujan Shahi. 💻💡
The hackathon started with an air of anticipation as teams gathered, eager to put their skills to the test. Little did we know that the dataset we received would be a true test of our ingenuity and problem-solving abilities. 🧩🧠

With a single column of text data containing 1,000 survey reviews, our team had to navigate through a sea of information within a mere six-hour timeframe. Undeterred, we rolled up our sleeves and got to work. ⌛️💻

"""
# Tokenize the text the same way as the training inputs.
model_inputs = tokenizer(input_string, max_length=max_input, padding='max_length', truncation=True)
# Generate a prediction for this single example. The result is read via
# `.predictions` instead of unpacking into `_`, which would overwrite
# IPython's last-output variable.
pred = trainer.predict([model_inputs]).predictions
# Decode without special tokens so markers such as <s> and </s> are not printed.
print(tokenizer.decode(pred[0], skip_special_tokens=True))


Epoch,Training Loss,Validation Loss


</s><s>Unleashing the Power of Generative AI: Insights from a Thrilling Hack</s>


In [None]:
# Upload the model and training artifacts to the Hugging Face Hub repository
# associated with the trainer's output directory; the commit URL is returned.
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/532M [00:00<?, ?B/s]

Upload file training_args.bin:   0%|          | 1.00/4.43k [00:00<?, ?B/s]

Upload file runs/Mar28_03-20-00_118e14740fe8/events.out.tfevents.1711596024.118e14740fe8.8539.0:   0%|        …

Upload file runs/Mar28_03-17-12_118e14740fe8/events.out.tfevents.1711595843.118e14740fe8.4193.2:   0%|        …

Upload file runs/Mar28_03-12-32_118e14740fe8/events.out.tfevents.1711595580.118e14740fe8.4193.1:   0%|        …

To https://huggingface.co/notmehul/bart-base-finetuned-xsum
   cffab77..2873794  main -> main

   cffab77..2873794  main -> main

To https://huggingface.co/notmehul/bart-base-finetuned-xsum
   2873794..f588059  main -> main

   2873794..f588059  main -> main



'https://huggingface.co/notmehul/bart-base-finetuned-xsum/commit/28737940fb3735edcf47e8352301f57e1e781d76'