<a href="https://colab.research.google.com/github/pinilDissanayaka/sinhala-gpt2-finetune/blob/main/sinhala_GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install transformers datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [4]:
from datasets import load_dataset


df = load_dataset("ihalage/sinhala-finetune-qa-eli5")

In [7]:
df

DatasetDict({
    train: Dataset({
        features: ['q_id', 'subreddit', 'url', 'sinhala_question', 'sinhala_answer', 'english_question', 'english_answer'],
        num_rows: 10000
    })
})

In [9]:
questions=list(df["train"]["sinhala_question"])

answers=list(df["train"]["sinhala_answer"])

In [12]:
from torch.utils.data import Dataset



class TextDataset(Dataset):
    def __init__(self, questions:list, answers:list, tokenizer):
        self.questions = questions
        self.answers = answers
        self.data=list()
        self.tokenizer = tokenizer

        for question, answer in zip(self.questions, self.answers):
            self.data.append("<startofstring> "+question+" <bot>:"+answer+" <endofstring>")

        self.data_tokenized = self.tokenizer(self.data, return_tensors="pt", padding=True, truncation=True)


        self.input_ids = self.data_tokenized["input_ids"]
        self.attention_mask = self.data_tokenized["attention_mask"]




    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])

In [14]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm


device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

tokenizer.add_special_tokens({"pad_token": "<pad>", "bos_token": "<startofstring>", "eos_token": "<endofstring>"})

tokenizer.add_tokens(["<bot>:"])


optimizer=Adam(params=model.parameters())

dataset=TextDataset(questions, answers, tokenizer)

data_loader=DataLoader(dataset, batch_size=16, shuffle=True)


def train(model, optimizer, data, epochs):
    model=model.to(device)

    for epoch in tqdm(range(epochs)):
        for input_id, attention_mask in data:

            input_id = input_id.to(device)
            attention_mask = attention_mask.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_id, attention_mask=attention_mask, labels=input_id)

            loss = outputs[0]

            print(f"epoch: {epoch} loss: {loss}")

            loss.backward()
            optimizer.step()

    torch.save(model.state_dict(), f"model_epoch_{epochs}.pth")


def predict(model, text):
    text="<startofstring> "+text+" <bot>:"

    text_tokens=tokenizer(text)

    return model.generale(**text_tokens)

In [17]:
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50261, 768)

In [None]:
train(model=model, optimizer=optimizer, data=dataset, epochs=10)

  0%|          | 0/10 [00:00<?, ?it/s]

epoch: 0 loss: 1.3364648818969727
epoch: 0 loss: 11.647012710571289
epoch: 0 loss: 11.129278182983398
epoch: 0 loss: 8.42028522491455
epoch: 0 loss: 3.309866428375244
epoch: 0 loss: 9.103604316711426
epoch: 0 loss: 2.21403431892395
epoch: 0 loss: 2.6414053440093994
epoch: 0 loss: 2.589963674545288
epoch: 0 loss: 3.4212234020233154
epoch: 0 loss: 1.9077298641204834
epoch: 0 loss: 1.8403658866882324
epoch: 0 loss: 1.915820837020874
epoch: 0 loss: 6.328808784484863
epoch: 0 loss: 1.78564453125
epoch: 0 loss: 2.2644808292388916
epoch: 0 loss: 1.8613187074661255
epoch: 0 loss: 1.9595396518707275
epoch: 0 loss: 1.6613502502441406
epoch: 0 loss: 3.824514389038086
epoch: 0 loss: 3.90932559967041
epoch: 0 loss: 1.5255367755889893
epoch: 0 loss: 3.148688793182373
epoch: 0 loss: 3.3191535472869873
epoch: 0 loss: 2.155150890350342
epoch: 0 loss: 1.5667481422424316
epoch: 0 loss: 1.479907512664795
epoch: 0 loss: 1.709983468055725
epoch: 0 loss: 1.1891648769378662
epoch: 0 loss: 1.6299494504928589
e