In [None]:
pip install evaluate

In [None]:
# Install rouge_score; the ROUGE metric is loaded later via evaluate.load("rouge").
%pip install rouge_score

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments, Seq2SeqTrainingArguments
import evaluate
import numpy as np
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")

In [None]:
# Load the SAMSum dataset by its Hub identifier.
# NOTE: despite its name, `df` is not a DataFrame; later cells index it by split name
# ("train", "validation", "test") and iterate over it with .items().
df = load_dataset("knkarthick/samsum")
# Bare last expression: render the loaded object as the cell output.
df

In [None]:
# Materialise each split as a pandas DataFrame for inspection (same order as before:
# train, then test, then validation).
train, test, val = (
    df[split_name].to_pandas() for split_name in ("train", "test", "validation")
)

In [None]:
# Schema and missing-value report for every split.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" after each report.
for split_label, frame in (("Train", train), ("Val", val), ("Test", test)):
    print(f"--- {split_label} ---")
    frame.info()
    print(frame.isna().sum())

In [None]:
def _max_word_count(frame, column):
    """Return the largest whitespace-separated word count among the strings in ``frame[column]``."""
    return frame[column].apply(lambda text: len(text.split())).max()


# Max length per split, measured in words (str.split), not in tokenizer tokens.
max_dialogue_len = _max_word_count(train, "dialogue")
max_summary_len = _max_word_count(train, "summary")

max_val_dialogue_len = _max_word_count(val, "dialogue")
max_val_summary_len = _max_word_count(val, "summary")

max_test_dialogue_len = _max_word_count(test, "dialogue")
max_test_summary_len = _max_word_count(test, "summary")

# Printed in the order train, validation, test.
for longest_dialogue, longest_summary in (
    (max_dialogue_len, max_summary_len),
    (max_val_dialogue_len, max_val_summary_len),
    (max_test_dialogue_len, max_test_summary_len),
):
    print("Maximum dialogue length:", longest_dialogue)
    print("Maximum summary length:", longest_summary)

Maximum dialogue length: 803
Maximum summary length: 64
Maximum dialogue length: 540
Maximum summary length: 59
Maximum dialogue length: 516
Maximum summary length: 58


In [None]:
# Bare expression: render the training-split DataFrame as the cell output.
train

In [None]:
# Model
# Checkpoint identifier; the tokenizer and the model are both loaded from this same
# name so their vocabularies match.
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def preprocess(example, max_input_length=803, max_target_length=103):
    """Tokenize one dialogue/summary record for seq2seq fine-tuning.

    Args:
        example: Mapping with a "dialogue" string and a "summary" string.
        max_input_length: Length, in tokens, that the prompt is truncated or
            padded to.
        max_target_length: Length, in tokens, that the summary is truncated or
            padded to.

    Returns:
        The tokenizer output for the prompt with an extra "labels" entry: a
        plain list of target token ids in which every padding position is -100.
    """
    input_text = "summarize: " + example["dialogue"]
    target_text = example["summary"]

    # Fixed-length padding keeps every example the same size, so batches can be
    # stacked regardless of which collator the trainer ends up using.
    inputs = tokenizer(
        input_text,
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    labels = tokenizer(
        target_text,
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    # Padding positions must not contribute to the loss: mark them with -100,
    # the index the cross-entropy loss ignores. The attention mask (1 = real
    # token, 0 = padding) identifies them without guessing from token ids.
    # Labels stay a plain list, not a tensor.
    inputs["labels"] = [
        token_id if is_real else -100
        for token_id, is_real in zip(labels["input_ids"], labels["attention_mask"])
    ]
    return inputs

tokenized_data = {split_name: split.map(preprocess) for split_name, split in df.items()}

Map:   0%|          | 0/14731 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

In [None]:
# Data collator
# Builds batches as PyTorch tensors (return_tensors="pt") from tokenized examples.
# NOTE(review): the model is passed in as well, presumably so the collator can derive
# decoder inputs from the labels — confirm against the transformers documentation.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="pt")

In [None]:
# Preview the first five tokenized training records. Selecting before converting
# avoids materialising the whole tokenized split as a DataFrame just for a preview.
tokenized_data['train'].select(range(5)).to_pandas()

Unnamed: 0,id,dialogue,summary,input_ids,attention_mask,labels
0,13818513,Amanda: I baked cookies. Do you want some?\nJ...,Amanda baked cookies and will bring Jerry some...,"[21603, 10, 21542, 10, 27, 13635, 5081, 5, 531...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[21542, 13635, 5081, 11, 56, 830, 16637, 128, ..."
1,13728867,Olivia: Who are you voting for in this electio...,Olivia and Olivier are voting for liberals in ...,"[21603, 10, 25051, 10, 2645, 33, 25, 10601, 21...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[25051, 11, 20373, 5144, 33, 10601, 21, 10215,..."
2,13681000,"Tim: Hi, what's up?\nKim: Bad mood tbh, I was ...",Kim may try the pomodoro technique recommended...,"[21603, 10, 4485, 10, 2018, 6, 125, 31, 7, 95,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[6777, 164, 653, 8, 3, 17043, 7512, 32, 3317, ..."
3,13730747,"Edward: Rachel, I think I'm in ove with Bella....",Edward thinks he is in love with Bella. Rachel...,"[21603, 10, 8200, 10, 15868, 6, 27, 317, 27, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[8200, 317, 7, 3, 88, 19, 16, 333, 28, 5377, 9..."
4,13728094,Sam: hey overheard rick say something\nSam: i...,"Sam is confused, because he overheard Rick com...","[21603, 10, 3084, 10, 3, 13133, 147, 88, 986, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3084, 19, 11319, 6, 250, 3, 88, 147, 88, 986,..."


In [None]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores (as percentages) for a batch of evaluation outputs.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be token
            ids, raw logits with a trailing vocabulary axis, or a tuple whose
            first item is one of those. ``labels`` are token ids in which
            ignored positions may be marked with -100.

    Returns:
        Dict mapping each ROUGE key to its score multiplied by 100 and rounded
        to two decimals.
    """
    predictions, labels = eval_pred

    # Some evaluation loops return a tuple of model outputs; the first item is
    # the one that can be turned into text.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Raw logits (batch, sequence, vocab) are reduced to greedy token ids.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 is a loss-masking sentinel, not a vocabulary id, so it cannot be
    # decoded; swap it for the pad token, which skip_special_tokens drops.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    scores = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {name: round(value * 100, 2) for name, value in scores.items()}

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Fine-tuning hyperparameters.
# (A dangling, incomplete `dat` token after the keyword arguments made this cell a
# SyntaxError; it has been removed.)
training_args = TrainingArguments(
    output_dir="./t5_samsum_custom",  # checkpoints and logs are written here
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=3e-4,
    weight_decay=0.01,
    num_train_epochs=2,
    save_total_limit=1,  # keep at most one checkpoint on disk
    logging_steps=200,  # report the training loss every 200 optimizer steps
)

In [None]:
# Assemble the training loop. The seq2seq collator defined earlier in the notebook is
# passed explicitly; without it the Trainer falls back to a generic default collator
# and the dedicated one is never used.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the model, data and arguments held by `trainer`.
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnsingh8691[0m ([33mnsingh8691-zi-systech[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-4032920361.py", line 1, in <cell line: 0>
    trainer.train()
  File "/usr/local/lib/python3.12/dist-packages/transformers/trainer.py", line 2325, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/trainer.py", line 2674, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/trainer.py", line 4020, in training_step
    loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/d

TypeError: object of type 'NoneType' has no len()

In [None]:
# Qualitative check: summarize the first test dialogue and compare it with the reference.
test_dialogue = test.iloc[0]["dialogue"]
inputs = tokenizer("summarize: " + test_dialogue, return_tensors="pt", truncation=True)
# Follow the model's own device instead of hard-coding "cuda", so the cell also runs
# on CPU-only runtimes.
inputs = inputs.to(model.device)
# Unpacking forwards the attention mask together with the input ids.
summary_ids = model.generate(**inputs, max_length=100, num_beams=4)

print("Original Dialogue:\n", test_dialogue)
print("Generated Summary:\n", tokenizer.decode(summary_ids[0], skip_special_tokens=True))
print("Reference Summary:\n", test.iloc[0]["summary"])

Original Dialogue:
 Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
Generated Summary:
 Amanda has Betty's number. Larry called her last time.
Reference Summary:
 Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
