In [None]:
!which python

In [1]:
import torch
import transformers
from datasets import load_dataset, load_metric, load_from_disk
import numpy as np
import nltk
from transformers import AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/vazgen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Loading the data and pretrained model from the Hugging Face repository 

In [2]:

#data = load_dataset('samsum')
#data.save_to_disk('/content/samsum')
data = load_dataset("samsum")
metric = load_metric('rouge')
model_checkpoints = 'facebook/bart-large-xsum'

Found cached dataset samsum (/Users/vazgen/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)


  0%|          | 0/3 [00:00<?, ?it/s]

  metric = load_metric('rouge')


Defining model varaibles

In [3]:

max_input = 512
max_target = 128
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoints)
device = torch.device('mps')

# tokenizer.to(device)

In [4]:
#DEFINING FUCTION FOR PROCESSING DATA

def preprocess_data(data_to_process):
    #get the dialogue text
    inputs = [dialogue for dialogue in data_to_process['dialogue']]
    #tokenize text
    model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)
  
    #tokenize labels
    with tokenizer.as_target_tokenizer():
        targets = tokenizer(data_to_process['summary'], max_length=max_target, padding='max_length', truncation=True)
      
    model_inputs['labels'] = targets['input_ids']
    #reuturns input_ids, attention_masks, labels
    return model_inputs

In [5]:
#tokenizing data
tokenize_data = data.map(preprocess_data, batched = True, remove_columns=['id', 'dialogue', 'summary'])

Loading cached processed dataset at /Users/vazgen/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-274c4ece19d71c23.arrow
Loading cached processed dataset at /Users/vazgen/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-031e961de5c55c00.arrow
Loading cached processed dataset at /Users/vazgen/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-4e9917fd7977320c.arrow


In [6]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [7]:
tokenize_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

In [8]:
#splitting data for illustrative purpose

#sample the data
train_sample = tokenize_data['train'].shuffle(seed=123).select(range(1000))
validation_sample = tokenize_data['validation'].shuffle(seed=123).select(range(200))
test_sample = tokenize_data['test'].shuffle(seed=123).select(range(200))

tokenize_data['train'] = train_sample
tokenize_data['validation'] = validation_sample
tokenize_data['test'] = test_sample
     


Loading cached shuffled indices for dataset at /Users/vazgen/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-9567d0d2232a6b7a.arrow
Loading cached shuffled indices for dataset at /Users/vazgen/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-9a8841ff94abe0e0.arrow
Loading cached shuffled indices for dataset at /Users/vazgen/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e/cache-22334d949f69a92d.arrow


# Initializing Model with pretrained weights

In [9]:

model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints)

In [10]:

model.to(device)
#collator to create batches. It preprocess data with the given tokenizer
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

In [11]:

#####################
# metrics
# compute rouge for evaluation 
#####################

def compute_rouge(pred):
    predictions, labels = pred
    #decode the predictions
    decode_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    #decode labels
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
  
    #compute results
    res = metric.compute(predictions=decode_predictions, references=decode_labels, use_stemmer=True)
    #get %
    res = {key: value.mid.fmeasure * 100 for key, value in res.items()}
  
    pred_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    res['gen_len'] = np.mean(pred_lens)
  
    return {k: round(v, 4) for k, v in res.items()}

In [12]:

args = transformers.Seq2SeqTrainingArguments(
    'conversation-summ',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size= 1,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    eval_accumulation_steps=1,
#     fp16=True
    )


In [13]:

trainer = transformers.Seq2SeqTrainer(
    model, 
    args,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge
)

Fitting a model

In [14]:
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 2
  Total optimization steps = 1500


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.5419,0.358164,49.1869,24.5295,39.7862,39.7685,23.965
2,0.2163,0.367485,50.8327,24.929,40.7415,40.6611,26.41
3,0.1352,0.415083,51.0237,24.1668,39.646,39.6125,29.625


Saving model checkpoint to conversation-summ/checkpoint-500
Configuration saved in conversation-summ/checkpoint-500/config.json
Model weights saved in conversation-summ/checkpoint-500/pytorch_model.bin
tokenizer config file saved in conversation-summ/checkpoint-500/tokenizer_config.json
Special tokens file saved in conversation-summ/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [conversation-summ/checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 200
  Batch size = 1
Saving model checkpoint to conversation-summ/checkpoint-1000
Configuration saved in conversation-summ/checkpoint-1000/config.json
Model weights saved in conversation-summ/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in conversation-summ/checkpoint-1000/tokenizer_config.json
Special tokens file saved in conversation-summ/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [conversation-summ/checkpoint-1500] due to args.save_total_li

TrainOutput(global_step=1500, training_loss=0.2978282725016276, metrics={'train_runtime': 12233.3307, 'train_samples_per_second': 0.245, 'train_steps_per_second': 0.123, 'total_flos': 3250656903168000.0, 'train_loss': 0.2978282725016276, 'epoch': 3.0})

In [15]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 200
  Batch size = 1


{'eval_loss': 0.41508251428604126,
 'eval_rouge1': 51.0237,
 'eval_rouge2': 24.1668,
 'eval_rougeL': 39.646,
 'eval_rougeLsum': 39.6125,
 'eval_gen_len': 29.625,
 'eval_runtime': 848.0805,
 'eval_samples_per_second': 0.236,
 'eval_steps_per_second': 0.236,
 'epoch': 3.0}

In [16]:
print(data['test'][10]['dialogue'])

Wanda: Let's make a party!
Gina: Why?
Wanda: beacuse. I want some fun!
Gina: ok, what do u need?
Wanda: 1st I need too make a list
Gina: noted and then?
Wanda: well, could u take yours father car and go do groceries with me?
Gina: don't know if he'll agree
Wanda: I know, but u can ask :)
Gina: I'll try but theres no promisess
Wanda: I know, u r the best!
Gina: When u wanna go
Wanda: Friday?
Gina: ok, I'll ask


In [17]:
import re
conversation = """Wanda: Let's make a party!
Gina: Why?
Wanda: beacuse. I want some fun!
Gina: ok, what do u need?
Wanda: 1st I need too make a list
Gina: noted and then?
Wanda: well, could u take yours father car and go do groceries with me?
Gina: don't know if he'll agree
Wanda: I know, but u can ask :)
Gina: I'll try but theres no promisess
Wanda: I know, u r the best!
Gina: When u wanna go
Wanda: Friday?
Gina: ok, I'll ask
"""
#tokenize the conversation
model_inputs = tokenizer(conversation,  max_length=max_input, padding='max_length', truncation=True)
#make prediction
raw_pred, _, _ = trainer.predict([model_inputs])

result = tokenizer.decode(raw_pred[0]).replace("</s>","").replace("<pad>","")
#decode the output


print(result)

***** Running Prediction *****
  Num examples = 1
  Batch size = 1


Wanda wants to have a party. Wanda and Gina will go shopping on Friday. Gina will take her father's car and go shopping with Wanda.


Loading

In [18]:
args = transformers.Seq2SeqTrainingArguments(
    output_dir=f"mt5-small-finetune-sumsum",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
#     logging_steps=logging_steps,
#     push_to_hub=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [19]:
from transformers import AutoTokenizer

model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/google/mt5-small/resolve/main/config.json from cache at /Users/vazgen/.cache/huggingface/transformers/97693496c1a0cae463bd18428187f9e9924d2dfbadaa46e4d468634a0fc95a41.dadce13f8f85f4825168354a04675d4b177749f8f11b167e87676777695d4fe4
Model config MT5Config {
  "_name_or_path": "google/mt5-small",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": "4.18.0",
  "use_cache": true,
  "vocab_size": 250112
}

loa

In [20]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the median scores
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [22]:
model2 = AutoModelForSeq2SeqLM.from_pretrained('Paligonshik/mt5-small-finetune-sumsum')
trainer2 = Seq2SeqTrainer(
    model2,
    args,
    train_dataset=tokenize_data["train"],
    eval_dataset=tokenize_data["validation"],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

loading configuration file https://huggingface.co/Paligonshik/mt5-small-finetune-sumsum/resolve/main/config.json from cache at /Users/vazgen/.cache/huggingface/transformers/9633a5c3ab445d813c376ecfdaa6d0d708b24e7cf9efc04acda49bc1d409dcc4.befe39e9588513959b94c3916f8f56144e1d9cf5ec098657fc5af1fde9d5439c
Model config MT5Config {
  "_name_or_path": "Paligonshik/mt5-small-finetune-sumsum",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "torch_dtype": "float32",
  "transformers_

In [23]:
data['test'][10]

{'id': '13680771',
 'dialogue': "Wanda: Let's make a party!\r\nGina: Why?\r\nWanda: beacuse. I want some fun!\r\nGina: ok, what do u need?\r\nWanda: 1st I need too make a list\r\nGina: noted and then?\r\nWanda: well, could u take yours father car and go do groceries with me?\r\nGina: don't know if he'll agree\r\nWanda: I know, but u can ask :)\r\nGina: I'll try but theres no promisess\r\nWanda: I know, u r the best!\r\nGina: When u wanna go\r\nWanda: Friday?\r\nGina: ok, I'll ask",
 'summary': "Wanda wants to throw a party. She asks Gina to borrow her father's car and go do groceries together. They set the date for Friday. "}

In [24]:
max_input_length = 512
max_target_length = 30
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["dialogue"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["summary"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [25]:
test_sample = preprocess_function(data['test'][10])

In [26]:
from nltk.tokenize import sent_tokenize
import evaluate

rouge_score = evaluate.load("rouge")

In [27]:
pred = trainer2.predict([test_sample])

***** Running Prediction *****
  Num examples = 1
  Batch size = 8


In [28]:
pred = trainer2.predict([test_sample])
tokenizer.decode(pred[1][0])

***** Running Prediction *****
  Num examples = 1
  Batch size = 8


"Wanda wants to throw a party. She asks Gina to borrow her father's car and go do groceries</s>"