In [1]:
!pip install transformers datasets evaluate rouge_score

Collecting transformers
  Using cached transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
Collecting datasets
  Using cached datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Using cached evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting rouge_score
  Using cached rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Using cached huggingface_hub-0.19.4-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2023.10.3-cp38-cp38-macosx_11_0_arm64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp38-cp38-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp38-cp38-macosx_11_

In [2]:
!pip install torch



In [3]:
!pip install bert_score

Collecting bert_score
  Using cached bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: bert_score
Successfully installed bert_score-0.3.13


In [47]:
from datasets import load_dataset
import multiprocessing as mp
import torch

### Q1 : Use a pre-trained google/flan-t5-small as the model.

In [48]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

### Q2 : Verify if the summariza'on task works.

In [49]:
ca_test_billsum = load_dataset("billsum", split='ca_test')

In [50]:
ca_test_billsum

Dataset({
    features: ['text', 'summary', 'title'],
    num_rows: 1237
})

In [52]:
def preprocess(x):
    x['text'] = ['summarize: ' + xi for xi in x['text']]
    return x

ca_test_billsum = ca_test_billsum.map(lambda x: preprocess(x), batched=True, num_proc=mp.cpu_count())

Map (num_proc=10):   0%|          | 0/1237 [00:00<?, ? examples/s]

In [53]:
ca_test_billsum = ca_test_billsum.map(lambda x: tokenizer(x['text'], padding=True, truncation=True, max_length=1692, 
                                                          return_tensors="pt"), batched=True, num_proc=mp.cpu_count())

Map (num_proc=10):   0%|          | 0/1237 [00:00<?, ? examples/s]

In [54]:
def generate(x):
    
    x['output'] = [model.generate(torch.tensor(x_ids).reshape(1, -1), 
                                  max_new_tokens=100, 
                                  do_sample=False)[0] for x_ids in x['input_ids']]
    return x
    
ca_test_billsum = ca_test_billsum.map(lambda x: generate(x), batched=True, batch_size=8)

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [55]:
def decode(x):
    x['generated_summary'] = [tokenizer.decode(torch.tensor(xo), skip_special_tokens=True) for xo in x['output']]
    return x

ca_test_billsum = ca_test_billsum.map(lambda x: decode(x), batched=True)

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [56]:
ca_test_billsum

Dataset({
    features: ['text', 'summary', 'title', 'input_ids', 'attention_mask', 'output', 'generated_summary'],
    num_rows: 1237
})

In [57]:
from datasets import load_metric
metric = load_metric('rouge')

In [58]:
metric_dict = metric.compute(predictions=ca_test_billsum['generated_summary'], references=ca_test_billsum['summary'])

In [61]:
md = {}
for k, v in metric_dict.items():
    lp = 'L:' + str(round(v.low.precision, 3))
    mp = 'M:' + str(round(v.mid.precision, 3))
    hp = 'H:' + str(round(v.high.precision, 3))
    ps = ', '.join([lp, mp, hp])

    lr = 'L:' + str(round(v.low.recall, 3))
    mr = 'M:' + str(round(v.mid.recall, 3))
    hr = 'H:' + str(round(v.high.recall, 3))
    rs = ', '.join([lr, mr, hr])

    lf = 'L:' + str(round(v.low.fmeasure, 3))
    mf = 'M:' + str(round(v.mid.fmeasure, 3))
    hf = 'H:' + str(round(v.high.fmeasure, 3))
    fs = ', '.join([lf, mf, hf])

    md[k] = [ps, rs, fs]
pdf = pd.DataFrame(md, index=['precision', 'recall', 'f-measure']) 

In [62]:
pdf

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
precision,"L:0.535, M:0.55, H:0.566","L:0.185, M:0.197, H:0.209","L:0.429, M:0.444, H:0.46","L:0.477, M:0.493, H:0.507"
recall,"L:0.08, M:0.084, H:0.09","L:0.033, M:0.036, H:0.039","L:0.057, M:0.06, H:0.064","L:0.067, M:0.071, H:0.075"
f-measure,"L:0.124, M:0.131, H:0.138","L:0.052, M:0.056, H:0.06","L:0.089, M:0.094, H:0.098","L:0.105, M:0.111, H:0.116"


In [64]:
generated_summarys_len = [len(gs) for gs in ca_test_billsum['generated_summary']]

In [65]:
import numpy as np

np.average(generated_summarys_len), np.median(generated_summarys_len)

(237.56750202101858, 187.0)

In [66]:
summarys_len = [len(gs) for gs in ca_test_billsum['summary']]

np.average(summarys_len), np.median(summarys_len)

(2168.5828617623283, 1894.0)

### Q3: Verify if the Q&A task works.

In [249]:
from datasets import load_dataset

squad = load_dataset("squad", split='validation[0:1000]')

In [250]:
squad

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1000
})

In [251]:
from transformers import AutoModelForQuestionAnswering
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
# model = AutoModelForQuestionAnswering.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

In [252]:
def preprocess(x):
    
    x['prompt'] = ['Given a question and context, Answer the question using context. Question: ' + xi[0] + ' Context: ' + xi[1]  
                 for xi in list(zip(x['question'],  x['context']))]
    return x

squad = squad.map(lambda x: preprocess(x), batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [253]:
squad = squad.map(lambda x: tokenizer(x['prompt'], padding=True, truncation=True, 
                                      return_tensors="pt"), batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [254]:
def generate(x):
    
    x['output'] = [model.generate(torch.tensor(x_ids).reshape(1, -1), 
                                  max_new_tokens=100, 
                                  do_sample=False)[0] for x_ids in x['input_ids']]
    return x
    
squad = squad.map(lambda x: generate(x), batched=True, batch_size=8)

# def apply(x):
#     x['output'] = [model(torch.tensor(x_ids).reshape(1, -1)) for x_ids in x['input_ids']]
#     return x
    
# squad = squad.map(lambda x: apply(x), batched=True, batch_size=8)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [255]:
def decode(x):
    x['predicted_answers'] = [tokenizer.decode(torch.tensor(xo), skip_special_tokens=True) for xo in x['output']]
    return x

squad = squad.map(lambda x: decode(x), batched=True)

# def decode(x):

#     pa = []
#     for i, xo in enumerate(x['output']):
#         answer_start_index = np.argmax(xo['start_logits'])
#         answer_end_index = np.argmax(xo['end_logits'])
#         inputs = x['input_ids'][i]
#         predict_answer_tokens = inputs[answer_start_index : answer_end_index + 1]
#         pa.append(tokenizer.decode(predict_answer_tokens))   
#     x['predicted_answers'] = pa
#     return x

# squad = squad.map(lambda x: decode(x), batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [256]:
from evaluate import load
squad_metric = load("squad")

In [257]:
predictions = [{'id': t[0], 'prediction_text':t[1]} for t in list(zip(squad['id'], squad['predicted_answers']))]
references = [{'id': t[0], 'answers':t[1]} for t in list(zip(squad['id'], squad['answers']))]

squad_metric.compute(predictions=predictions, references=references)

{'exact_match': 68.0, 'f1': 73.30533849203195}

scores suggest that the Flan T5 Small pretrained model performs reasonably well on the SQuAD validation set, but there is still room for improvement, especially in achieving exact matches for a higher percentage of questions. 

### Q4: Verify if English to French transla'on task works.

In [190]:
from datasets import load_dataset

books = load_dataset("opus_books", "en-fr", split='train[0:500]')

In [191]:
books

Dataset({
    features: ['id', 'translation'],
    num_rows: 500
})

In [194]:
books['translation'][0]

{'en': 'The Wanderer', 'fr': 'Le grand Meaulnes'}

In [196]:
def preprocess(x):
    
    x['prompt'] = ['Translate given english text to french. Given English Text: ' + xi['en'] for xi in x['translation']]
    return x

books = books.map(lambda x: preprocess(x), batched=True)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [199]:
books = books.map(lambda x: tokenizer(x['prompt'], padding=True, truncation=True, 
                                      return_tensors="pt"), batched=True)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [200]:
books = books.map(lambda x: generate(x), batched=True, batch_size=8)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [201]:
books = books.map(lambda x: decode(x), batched=True)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [203]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.3.2-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Collecting tabulate>=0.8.9 (from sacrebleu)
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting lxml (from sacrebleu)
  Downloading lxml-4.9.3.tar.gz (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hDownloading sacrebleu-2.3.2-py3-none-any.whl (119 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.7/119.7 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.8.2-py3-none

In [204]:
import evaluate

metric = evaluate.load("sacrebleu")

In [205]:
books

Dataset({
    features: ['id', 'translation', 'prompt', 'input_ids', 'attention_mask', 'output', 'predicted_answers'],
    num_rows: 500
})

In [206]:
def get_actual_translations(x):
    x['actual_translation'] = [t['fr'] for t in x['translation']]
    return x
    

books = books.map(lambda x: get_actual_translations(x), batched=True)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [212]:
books['predicted_answers'][2], books['actual_translation'][2]

('Première partie', 'PREMIÈRE PARTIE')

In [220]:
metric.compute(predictions=books['predicted_answers'], references=[[at] for at in books['actual_translation']], 
               lowercase=True)

{'score': 4.012295245174718,
 'counts': [2980, 723, 217, 75],
 'totals': [10894, 10417, 9948, 9489],
 'precisions': [27.354507068110888,
  6.940577901507152,
  2.1813429835142744,
  0.7903888713246917],
 'bp': 0.943290711629967,
 'sys_len': 10894,
 'ref_len': 11530}

The BLEU score of 4.0123 suggests that the Flan T5 Small pretrained model achieves a moderate level of performance in English to French translation.

The n-gram precisions show that the model performs better at capturing individual words (unigrams) than longer phrases.

The brevity penalty is close to 1, indicating that the lengths of the generated and reference translations are reasonably well-matched

### Q5: Programma'cally print the names of all the model layers and their dimensions.

In [296]:
from transformers import AutoModelForQuestionAnswering

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

In [297]:
for name, param in model.named_parameters():
    print(f"Layer: {name}, Shape: {param.data.shape}")

Layer: shared.weight, Shape: torch.Size([32128, 512])
Layer: encoder.block.0.layer.0.SelfAttention.q.weight, Shape: torch.Size([384, 512])
Layer: encoder.block.0.layer.0.SelfAttention.k.weight, Shape: torch.Size([384, 512])
Layer: encoder.block.0.layer.0.SelfAttention.v.weight, Shape: torch.Size([384, 512])
Layer: encoder.block.0.layer.0.SelfAttention.o.weight, Shape: torch.Size([512, 384])
Layer: encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight, Shape: torch.Size([32, 6])
Layer: encoder.block.0.layer.0.layer_norm.weight, Shape: torch.Size([512])
Layer: encoder.block.0.layer.1.DenseReluDense.wi_0.weight, Shape: torch.Size([1024, 512])
Layer: encoder.block.0.layer.1.DenseReluDense.wi_1.weight, Shape: torch.Size([1024, 512])
Layer: encoder.block.0.layer.1.DenseReluDense.wo.weight, Shape: torch.Size([512, 1024])
Layer: encoder.block.0.layer.1.layer_norm.weight, Shape: torch.Size([512])
Layer: encoder.block.1.layer.0.SelfAttention.q.weight, Shape: torch.Size([384, 512])

In [298]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total Parameters: {total_params}")

Total Parameters: 76961152


In [299]:
print('Before final layer weights update:\n ', model.decoder.final_layer_norm.weight)

model.decoder.final_layer_norm.weight.data.fill_(0.0)

# Verify that the values have been set to zeros
print('After final layer weights update:\n ', model.decoder.final_layer_norm.weight)

Before final layer weights update:
  Parameter containing:
tensor([ 1.5583e-01,  1.6458e-01,  1.8197e-01,  2.0792e-01,  1.5886e-01,
         1.4222e-01,  1.5845e-01,  1.4269e-01,  1.3648e-01,  1.5702e-01,
         1.6670e-01,  1.3271e-01,  1.7980e-01,  3.2683e-01,  2.0897e-01,
         2.6234e-01,  1.8381e-01,  1.8566e-01,  1.8115e-01,  1.9588e-01,
         1.5456e-01,  2.1353e-01,  1.5126e-01,  1.6348e-01,  1.8062e-01,
         1.4414e-01,  1.7974e-01,  2.0646e-01,  1.7899e-01,  2.0434e-01,
         1.6415e-01,  1.4987e-01,  1.3866e-01,  2.2488e-01,  1.7041e-01,
         6.1698e-01,  1.8228e-01,  1.7578e-01,  1.6113e-01,  2.4024e-01,
         1.6280e-01,  2.2871e-01,  1.6127e-01,  1.8426e-01,  2.1641e-01,
         2.6774e-01,  1.8475e-01,  1.5955e-01,  2.5002e-01,  1.9592e-01,
         1.5467e-01,  2.0025e-01,  1.7020e-01,  1.4393e-01,  1.9788e-01,
         1.5900e-01,  1.4895e-01,  1.5042e-01,  2.6026e-01,  1.5933e-01,
         1.5081e-01,  2.0102e-01,  1.9843e-01,  1.5577e-01,  1.52

### Q6: Verify if the Q&A task works after resetting the weights of the above layer.

In [263]:
from datasets import load_dataset

squad = load_dataset("squad", split='validation[0:1000]')

In [264]:
def preprocess(x):
    
    x['prompt'] = ['Given a question and context, Answer the question using context. Question: ' + xi[0] + ' Context: ' + xi[1]  
                 for xi in list(zip(x['question'],  x['context']))]
    return x

squad = squad.map(lambda x: preprocess(x), batched=True)

In [265]:
squad = squad.map(lambda x: tokenizer(x['prompt'], padding=True, truncation=True, 
                                      return_tensors="pt"), batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [266]:
def generate(x):
    
    x['output'] = [model.generate(torch.tensor(x_ids).reshape(1, -1), 
                                  max_new_tokens=100, 
                                  do_sample=False)[0] for x_ids in x['input_ids']]
    return x
    
squad = squad.map(lambda x: generate(x), batched=True, batch_size=8)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [268]:
squad = squad.map(lambda x: decode(x), batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [269]:
from evaluate import load
squad_metric = load("squad")

predictions = [{'id': t[0], 'prediction_text':t[1]} for t in list(zip(squad['id'], squad['predicted_answers']))]
references = [{'id': t[0], 'answers':t[1]} for t in list(zip(squad['id'], squad['answers']))]

squad_metric.compute(predictions=predictions, references=references)

{'exact_match': 0.0, 'f1': 0.0}

Resetting the final decoder layer weights to zero has severely degradation in the model's ability to understand and generate accurate responses to questions

### Q9: Replace the decoder.final_layer_norm.weight with a layer of smaller dimensions and adjust all the dependent layers to match the dimension

In [301]:
model.decoder

T5Stack(
  (embed_tokens): Embedding(32128, 512)
  (block): ModuleList(
    (0): T5Block(
      (layer): ModuleList(
        (0): T5LayerSelfAttention(
          (SelfAttention): T5Attention(
            (q): Linear(in_features=512, out_features=384, bias=False)
            (k): Linear(in_features=512, out_features=384, bias=False)
            (v): Linear(in_features=512, out_features=384, bias=False)
            (o): Linear(in_features=384, out_features=512, bias=False)
            (relative_attention_bias): Embedding(32, 6)
          )
          (layer_norm): T5LayerNorm()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (1): T5LayerCrossAttention(
          (EncDecAttention): T5Attention(
            (q): Linear(in_features=512, out_features=384, bias=False)
            (k): Linear(in_features=512, out_features=384, bias=False)
            (v): Linear(in_features=512, out_features=384, bias=False)
            (o): Linear(in_features=384, out_features=512, bias=Fa

In [286]:
new_dimension = 256

In [306]:
model.decoder.final_layer_norm.weight.data = model.decoder.final_layer_norm.weight.data[:new_dimension]
for i, block in enumerate(model.decoder.block):
    if i == 0:
        block.layer[0].SelfAttention.o.weight.data = block.layer[0].SelfAttention.o.weight.data[:new_dimension, :]
        continue
    
    # Self-Attention Layer
    block.layer[0].SelfAttention.q.weight.data = block.layer[0].SelfAttention.q.weight.data[:, :new_dimension]
    block.layer[0].SelfAttention.k.weight.data = block.layer[0].SelfAttention.k.weight.data[:, :new_dimension]
    block.layer[0].SelfAttention.v.weight.data = block.layer[0].SelfAttention.v.weight.data[:, :new_dimension]
    block.layer[0].SelfAttention.o.weight.data = block.layer[0].SelfAttention.o.weight.data[:new_dimension, :]

    # Cross-Attention Layer
    block.layer[1].EncDecAttention.q.weight.data = block.layer[1].EncDecAttention.q.weight.data[:, :new_dimension]
    block.layer[1].EncDecAttention.k.weight.data = block.layer[1].EncDecAttention.k.weight.data[:, :new_dimension]
    block.layer[1].EncDecAttention.v.weight.data = block.layer[1].EncDecAttention.v.weight.data[:, :new_dimension]
    block.layer[1].EncDecAttention.o.weight.data = block.layer[1].EncDecAttention.o.weight.data[:new_dimension, :]

    # Dense Relu Layer
    block.layer[2].DenseReluDense.wi_0.weight.data = block.layer[2].DenseReluDense.wi_0.weight.data[:, :new_dimension]
    block.layer[2].DenseReluDense.wi_1.weight.data = block.layer[2].DenseReluDense.wi_1.weight.data[:, :new_dimension]
    block.layer[2].DenseReluDense.wo.weight.data = block.layer[2].DenseReluDense.wo.weight.data[:new_dimension, :]
    
    # Layer Norm
    block.layer[2].layer_norm.weight.data = block.layer[2].layer_norm.weight.data[:new_dimension]

### Reload the original google/flan-t5-small model.

In [319]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: accelerate
Successfully installed accelerate-0.24.1


In [25]:
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [26]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

### Q11: Train the model for a Q&A task that takes a context as addi'onal input along with the ques'on.

In [27]:
def preprocess(examples):
    questions =[q.strip() for q in examples["question"]]
    contexts = [c.strip() for c in examples["context"]]
    answers = [" or ".join(a['text']) for a in examples["answers"]]
    answers = [a.strip() for a in answers]
    
    l = list(zip(questions, contexts, answers))
    instr = "This is QA task where given a question, you need to answer it strictly using the context. "
    examples['prompt'] = [instr + " \n Question is: \n " + i[0] + " \n Context is: \n " + i[1] + " \n Answer is : \n " for i in l]

    return examples

In [57]:
from datasets import load_dataset

squad = load_dataset("squad", split='train[0:5000]')

p_squad = squad.map(preprocess, batched=True)

Using the latest cached version of the module from /Users/drpawar/.cache/huggingface/modules/datasets_modules/datasets/squad/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453 (last modified on Sat Nov 25 11:52:51 2023) since it couldn't be found locally at squad., or remotely on the Hugging Face Hub.


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [58]:
from datasets import load_dataset

squad = load_dataset("squad", split='train[5001:8000]')

p_squad_val = squad.map(preprocess, batched=True)

Map:   0%|          | 0/2999 [00:00<?, ? examples/s]

In [59]:
p_squad = p_squad.map(lambda x: tokenizer(x['prompt'], padding=True, truncation=True,
                                          return_tensors="pt"), batched=True)
p_squad_val = p_squad_val.map(lambda x: tokenizer(x['prompt'], padding=True, truncation=True, 
                                                  return_tensors="pt"), batched=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2999 [00:00<?, ? examples/s]

In [60]:
def get_labels(x):
    answers = [" or ".join(a['text']) for a in x["answers"]]
    answers = [a.strip() for a in answers]
    labels = tokenizer(text_target=answers, padding=True, truncation=True, return_tensors="pt")
    x["labels"] = labels["input_ids"]
    return x
    
p_squad = p_squad.map(lambda x: get_labels(x), batched=True)
p_squad_val = p_squad_val.map(lambda x: get_labels(x), batched=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2999 [00:00<?, ? examples/s]

In [61]:

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./finetuning_output',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=3e-5,
    num_train_epochs=3,
    # logging & evaluation strategies
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    push_to_hub=False,
    use_mps_device=True
)



In [62]:
import evaluate

# Metric
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
   return result

In [63]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [64]:
# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=p_squad,
    eval_dataset=p_squad_val,
    compute_metrics=compute_metrics
)

In [65]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,4.4117,0.079908,0.766433,0.472363,0.765357,0.765771
2,0.1222,0.039436,0.803508,0.502198,0.802617,0.803051
3,0.0522,0.038852,0.805777,0.501293,0.80493,0.805163


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=1875, training_loss=1.2329924387613933, metrics={'train_runtime': 1710.9676, 'train_samples_per_second': 8.767, 'train_steps_per_second': 1.096, 'total_flos': 2788357570560000.0, 'train_loss': 1.2329924387613933, 'epoch': 3.0})

In [66]:
finetuned_model = trainer.model

In [67]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
finetuned_model_mps = finetuned_model.to(device)

In [68]:
from datasets import load_dataset

squad = load_dataset("squad", split='validation[0:1000]')

squad = squad.map(preprocess, batched=True)

In [69]:
squad = squad.map(lambda x: tokenizer(x['prompt'], padding=True, truncation=True, 
                                      return_tensors="pt"), batched=True)

In [70]:
import torch
def generate(x):
    
    x['output'] = [finetuned_model_mps.generate(torch.tensor(x_ids, device='mps').reshape(1, -1), 
                                  max_new_tokens=100, 
                                  do_sample=False)[0] for x_ids in x['input_ids']]
    return x
    
squad = squad.map(lambda x: generate(x), batched=True, batch_size=8)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [71]:
def decode(x):
    x['predicted_answers'] = [tokenizer.decode(torch.tensor(xo), skip_special_tokens=True) for xo in x['output']]
    return x

squad = squad.map(lambda x: decode(x), batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [72]:
import evaluate

squad_metric = evaluate.load('squad')



In [73]:
predictions = [{'id': t[0], 'prediction_text':t[1]} for t in list(zip(squad['id'], squad['predicted_answers']))]
references = [{'id': t[0], 'answers':t[1]} for t in list(zip(squad['id'], squad['answers']))]

squad_metric.compute(predictions=predictions, references=references)

{'exact_match': 76.7, 'f1': 82.45654752265274}

The fine-tuned model has shown a notable enhancement in both the Exact Match and F1 Score compared to the pretrained model, indicating that the fine-tuning process has effectively improved the model's performance on the SQuAD dataset.

Achieving an EM score of 76.7% is considered quite good in the context of SQuAD, where providing the exact correct answer can be challenging due to the diversity and complexity of questions.

The F1 score of 82.46% demonstrates that the fine-tuned model is not only accurate in terms of exact matches but also excels in providing answers that closely align with the ground truth answers