# T5 Fine-Tuning

### Import Libraries

In [2]:
from copy import deepcopy
import datasets
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import pandas as pd
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, T5ForConditionalGeneration
import math

### Load Dataset Function

In [3]:
def load_data(name, inpExpOutFunc, tokenizer, max_length=1024):
    """Load a dataset and tokenize its ``train`` and ``validation`` splits.

    Args:
        name: Dataset identifier passed to ``load_dataset``.
        inpExpOutFunc: Callable ``(batch, index) -> (input_text, target_text)``
            selecting the source and target strings for one row of a batch
            (a dict mapping column name to a list of values). A dict return
            value is also accepted; its values are used in insertion order.
        tokenizer: Callable that, given a string plus ``max_length`` and
            ``truncation``, returns a mapping with ``input_ids`` and
            ``attention_mask``.
        max_length: Truncation length applied to both inputs and targets.

    Returns:
        ``(train_data, test_data)``: the ``train`` and ``validation`` splits
        after mapping, where each processed batch carries ``input_ids``,
        ``attention_mask`` and ``labels`` in addition to its original columns.
    """
    dataset = load_dataset(name)
    train_data = dataset['train']
    test_data = dataset['validation']

    def preprocess_function(batch):
        # Work on a plain-dict copy so the object handed in by `map` is never mutated.
        batch = dict(batch)
        # Row count is taken from an original column, before the new ones are added.
        num_rows = len(next(iter(batch.values()), ()))

        input_ids, attention_mask, labels = [], [], []
        for index in range(num_rows):
            pair = inpExpOutFunc(batch, index)
            if isinstance(pair, dict):
                # Unpacking a dict directly would yield its *keys*, not the texts.
                pair = tuple(pair.values())
            inp, exp_out = pair

            model_inputs = tokenizer(inp, max_length=max_length, truncation=True)
            target = tokenizer(exp_out, max_length=max_length, truncation=True)

            input_ids.append(model_inputs["input_ids"])
            attention_mask.append(model_inputs["attention_mask"])
            # The tokenized target text becomes the label sequence.
            labels.append(target["input_ids"])

        batch["input_ids"] = input_ids
        batch["attention_mask"] = attention_mask
        batch["labels"] = labels
        return batch

    train_data = train_data.map(preprocess_function, batched=True)
    test_data = test_data.map(preprocess_function, batched=True)

    return train_data, test_data

### Load Dataset

In [8]:

# TODO: try to find more datasets to add here.
# See: https://arxiv.org/pdf/2101.10421.pdf


# Load the pretrained `t5-small` tokenizer shared by every dataset below.
tokenizer = AutoTokenizer.from_pretrained('t5-small')
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): T5 tokenizers normally ship their own pad token — confirm this override is intended.
tokenizer.pad_token = tokenizer.eos_token

def sciq_extract(dataset, index):
    """Return the ``(support, question)`` pair for row ``index`` of a SciQ batch.

    ``dataset`` is a mapping of column name to a list of values; the support
    passage is the model input and the question is the expected output.
    """
    support_text = dataset['support'][index]
    question_text = dataset['question'][index]
    return support_text, question_text
# Tokenize the SciQ train/validation splits using `sciq_extract`.
train_data_sciq, test_data_sciq = load_data("sciq", sciq_extract, tokenizer)  # scientific questions and answers


def squad_extract(dataset, index):
    """Return the ``(answer, question)`` pair for row ``index`` of a SQuAD batch.

    ``dataset`` is a mapping of column name to a list of values. The first
    answer text is the model input and the question is the expected output,
    matching the ``(input, expected_output)`` tuple the other extractors return.
    A tuple is required because the caller unpacks the result into two values;
    unpacking a dict would yield its keys instead of the texts.
    """
    answer_text = dataset['answers'][index]["text"][0]
    question_text = dataset['question'][index]
    return answer_text, question_text
# Tokenize the SQuAD train/validation splits using `squad_extract`.
train_data_squad, test_data_squad = load_data('squad', squad_extract, tokenizer)  # wikipedia questions and answers


def piqa_extract(dataset, index):
    """Return the ``(solution, goal)`` pair for row ``index`` of a PIQA batch.

    ``dataset`` is a mapping of column name to a list of values. The solution
    is read from ``sol1`` when the row's ``label`` equals 0 and from ``sol2``
    for any other label; the goal is the expected output.
    """
    solution_column = "sol1" if dataset["label"][index] == 0 else "sol2"
    return dataset[solution_column][index], dataset["goal"][index]
# Tokenize the PIQA train/validation splits using `piqa_extract`.
train_data_piqa, test_data_piqa = load_data("piqa", piqa_extract, tokenizer)

# Merge the three corpora split-by-split into one training set and one evaluation set.
train_data = datasets.concatenate_datasets(
    [train_data_sciq, train_data_squad, train_data_piqa])
test_data = datasets.concatenate_datasets(
    [test_data_sciq, test_data_squad, test_data_piqa])

# keep only input_ids, attention_mask, and labels

def clean_dataset(dataset):
    """Return ``dataset`` with every column removed except the model features.

    The columns kept are ``input_ids``, ``attention_mask`` and ``labels``.
    The list returned by ``dataset.column_names`` is copied rather than edited
    in place, so the dataset's own column list is never modified.

    Raises:
        ValueError: if any of the three required columns is absent.
    """
    keep = ("input_ids", "attention_mask", "labels")
    present = list(dataset.column_names)

    missing = [column for column in keep if column not in present]
    if missing:
        raise ValueError(f"dataset is missing required column(s): {missing}")

    columns_remove = [column for column in present if column not in keep]
    return dataset.remove_columns(columns_remove)


# Strip both combined splits down to input_ids, attention_mask and labels.
test_data = clean_dataset(test_data)
train_data = clean_dataset(train_data)

Using custom data configuration default
Reusing dataset sciq (C:\Users\eshaa\.cache\huggingface\datasets\sciq\default\0.1.0\50e5c6e3795b55463819d399ec417bfd4c3c621105e00295ddb5f3633d708493)
100%|██████████| 3/3 [00:00<00:00, 599.81it/s]
100%|██████████| 12/12 [00:06<00:00,  1.80ba/s]
100%|██████████| 1/1 [00:00<00:00,  1.76ba/s]
Reusing dataset squad (C:\Users\eshaa\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████| 2/2 [00:00<00:00, 153.85it/s]
100%|██████████| 88/88 [00:20<00:00,  4.29ba/s]
100%|██████████| 11/11 [00:02<00:00,  3.97ba/s]
Reusing dataset piqa (C:\Users\eshaa\.cache\huggingface\datasets\piqa\plain_text\1.1.0\6c611c1a9bf220943c4174e117d3b660859665baf1d43156230116185312d011)
100%|██████████| 3/3 [00:00<00:00, 748.18it/s]
100%|██████████| 17/17 [00:04<00:00,  3.90ba/s]
100%|██████████| 2/2 [00:00<00:00,  4.13ba/s]


In [13]:
# Sanity check of how the tokenizer encodes a pair of texts (each segment ends with id 1 in the output below).
tokenizer("Hello There I am eshaan barkataki", "and I am not eshaan barkataki")

{'input_ids': [8774, 290, 27, 183, 3, 15, 7, 1024, 152, 21696, 144, 11259, 1, 11, 27, 183, 59, 3, 15, 7, 1024, 152, 21696, 144, 11259, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

### Visualizing the dataset

In [9]:
# Display the combined training split as a pandas DataFrame.
# NOTE(review): the saved output still lists the raw dataset columns even though `clean_dataset`
# is applied in an earlier cell — looks stale; re-run the notebook top to bottom to confirm.
train_data.to_pandas()

Unnamed: 0,question,distractor3,distractor1,distractor2,correct_answer,support,input_ids,attention_mask,labels,id,title,context,answers,goal,sol1,sol2,label
0,What type of organism is commonly used in prep...,viruses,protozoa,gymnosperms,mesophilic organisms,"Mesophiles grow best in moderate temperature, ...","[10162, 21144, 15, 7, 1604, 200, 16, 8107, 291...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[363, 686, 13, 9329, 19, 5871, 261, 16, 4537, ...",,,,,,,,
1,What phenomenon makes global winds blow northe...,tropical effect,muon effect,centrifugal effect,coriolis effect,Without Coriolis Effect the global winds would...,"[6404, 638, 27953, 159, 14247, 8, 1252, 13551,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[363, 15037, 656, 1252, 13551, 6019, 25806, 12...",,,,,,,,
2,Changes from a less-ordered state to a more-or...,endothermic,unbalanced,reactive,exothermic,Summary Changes of state are examples of phase...,"[20698, 5968, 7, 13, 538, 33, 4062, 13, 3944, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[5968, 7, 45, 3, 9, 705, 18, 9397, 15, 26, 538...",,,,,,,,
3,What is the least dangerous radioactive decay?,zeta decay,beta decay,gamma decay,alpha decay,All radioactive decay is dangerous to living t...,"[432, 2252, 6645, 18907, 19, 5107, 12, 840, 37...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[363, 19, 8, 709, 5107, 2252, 6645, 18907, 58, 1]",,,,,,,,
4,Kilauea in hawaii is the world’s most continuo...,magma,greenhouse gases,carbon and smog,smoke and ash,Example 3.5 Calculating Projectile Motion: Hot...,"[18792, 3, 9285, 18555, 1014, 2786, 699, 18833...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[12672, 402, 15, 9, 16, 3, 107, 7396, 23, 23, ...",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115386,,,,,,,"[54, 36, 169, 12, 2451, 3, 115, 3082, 12, 8, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[3, 17, 4365, 1]",,,,,tack,can be use to secure pole to the wall,can be use to secure broom to the wall,1.0
115387,,,,,,,"[3399, 3, 9, 18876, 1346, 3047, 30, 8, 420, 77...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[571, 12, 1349, 3, 9, 18876, 5, 1]",,,,,How to clean a dishwasher.,Place a dishwasher safe bowl on the top rack o...,Place a dishwasher safe bowl on the top rack o...,1.0
115388,,,,,,,"[286, 8, 4295, 30, 3, 9, 161, 1774, 6, 171, 3,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[304, 7042, 4295, 21, 1991, 6, 1]",,,,,"To seal leather for furniture,","place the leather on a work surface, pour a sm...","place the leather on a work surface, pour a sm...",1.0
115389,,,,,,,"[12607, 3, 9, 1905, 81, 16672, 423, 28, 387, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1796, 7490, 9, 221, 1]",,,,,Make lemonade,Fill a glass about 3/4 full with water. Using ...,Fill a glass about 3/4 full with water. Using ...,0.0
