In [None]:
# NOTE(review): this installs the unpinned development head of the repository;
# pin a release tag or commit (append `@<ref>` to the URL) for reproducible runs.
# `%pip` installs into the environment of the running kernel; `-q` keeps the
# resolver log out of the saved notebook.
%pip install -q git+https://github.com/PyTorchLightning/pytorch-lightning.git

Collecting git+https://github.com/PyTorchLightning/pytorch-lightning.git
  Cloning https://github.com/PyTorchLightning/pytorch-lightning.git to /tmp/pip-req-build-d8tu1r0f
  Running command git clone --filter=blob:none --quiet https://github.com/PyTorchLightning/pytorch-lightning.git /tmp/pip-req-build-d8tu1r0f
  Resolved https://github.com/PyTorchLightning/pytorch-lightning.git to commit 896c2a656ad2db3278ec11520aed04e378f4462b
  Running command git submodule update --init --recursive -q
  Encountered 22 file(s) that should have been pointers, but weren't:
        .notebooks/course_UvA-DL/01-introduction-to-pytorch.ipynb
        .notebooks/course_UvA-DL/02-activation-functions.ipynb
        .notebooks/course_UvA-DL/03-initialization-and-optimization.ipynb
        .notebooks/course_UvA-DL/04-inception-resnet-densenet.ipynb
        .notebooks/course_UvA-DL/05-transformers-and-MH-attention.ipynb
        .notebooks/course_UvA-DL/06-graph-neural-networks.ipynb
        .notebooks/course_UvA

In [None]:
# Data Loading and Preprocessing
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl

In [None]:
# Download and unzip dataset
# The Drive file ID is passed positionally; current gdown releases no longer
# need (or accept) the old `--id` flag.
!gdown 1vzhXELAhY98RqVIpBxAOr70WBkDh04hU
# -o overwrites existing files without asking, so re-running this cell cannot
# stall on unzip's interactive "replace?" prompt.
!unzip -q -o data.zip

Downloading...
From: https://drive.google.com/uc?id=1vzhXELAhY98RqVIpBxAOr70WBkDh04hU
To: /content/data.zip
100% 13.9M/13.9M [00:00<00:00, 52.9MB/s]


In [None]:
# Load the data from a JSON file.
# The encoding is given explicitly so the text is decoded as UTF-8 whatever the
# platform's default locale is.
with open('/content/data/tydiqa-goldp-v1.1-train-ar.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Initialize lists to store extracted information
questions = []
contexts = []
answer_texts = []
answer_starts = []

# Walk the nested JSON: each entry of data['data'] holds paragraphs, and each
# paragraph holds one context plus its question/answer records.
for row in data['data']:
    for paragraph in row['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            answers = qa.get('answers')
            if not answers:
                # Nothing to train on for an unanswered question; skipping it
                # also avoids an IndexError on the first-answer lookup below.
                continue
            answer = answers[0]  # Only the first listed answer is used
            questions.append(qa['question'])
            contexts.append(context)
            answer_texts.append(answer['text'])
            answer_starts.append(answer['answer_start'])

# Compute answer_ends: start offset plus the length of the answer string.
answer_ends = [start + len(answer) for start, answer in zip(answer_starts, answer_texts)]

# Create DataFrame with one row per question.
df = pd.DataFrame({
    'question': questions,
    'context': contexts,
    'answer_text': answer_texts,
    'answer_start': answer_starts,
    'answer_end': answer_ends
})

In [None]:
# Clean-up expressed as one chain; each step returns a new frame.
df = (
    df
    .drop_duplicates(subset=["context"])  # keep the first row seen for each context
    .reset_index(drop=True)               # renumber the surviving rows from 0
    .pipe(lambda frame: frame.drop(index=frame.index[-1:]))  # Drop the last row
)

In [None]:
# Dataset and DataLoader
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import T5Tokenizer

In [None]:
class AraQADataset(Dataset):
    """Map-style dataset that tokenizes question/context/answer rows on access.

    Args:
        data: pandas DataFrame (used through ``len`` and ``.iloc``) with
            ``question``, ``context`` and ``answer_text`` columns.
        tokenizer: Callable tokenizer that returns ``input_ids`` and
            ``attention_mask`` tensors. Its ``pad_token_id`` attribute, when
            present, identifies padding in the encoded answer.
        source_max_token_len: Fixed length of the encoded question + context.
        target_max_token_len: Fixed length of the encoded answer.
    """

    # Label value written over padding positions; -100 is the default
    # ``ignore_index`` of PyTorch's cross-entropy loss, so those positions do
    # not contribute to the training loss.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, source_max_token_len=512, target_max_token_len=64):
        self.data = data
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            dict with the raw ``question``, ``context`` and ``answer_text``
            strings plus 1-D ``input_ids``, ``attention_mask`` and ``labels``
            tensors. Padding positions in ``labels`` are set to -100.
        """
        data_row = self.data.iloc[index]

        # Question and context are encoded as a pair; only the context (the
        # second sequence) may be truncated to fit the length budget.
        source_encoding = self.tokenizer(
            data_row["question"],
            data_row["context"],
            max_length=self.source_max_token_len,
            padding="max_length",
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )
        target_encoding = self.tokenizer(
            data_row["answer_text"],
            max_length=self.target_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

        # Ask the tokenizer which id it pads with rather than assuming 0;
        # 0 is kept only as a fallback for tokenizers that do not expose one.
        pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_token_id is None:
            pad_token_id = 0

        labels = target_encoding["input_ids"]
        labels[labels == pad_token_id] = self.IGNORE_INDEX

        return {
            'question': data_row["question"],
            'context': data_row["context"],
            'answer_text': data_row["answer_text"],
            # The tokenizer returns (1, seq_len) tensors; flatten to (seq_len,)
            # so the DataLoader can stack items into a batch.
            'input_ids': source_encoding["input_ids"].flatten(),
            'attention_mask': source_encoding["attention_mask"].flatten(),
            'labels': labels.flatten()
        }

In [None]:
class AraQADataModule(pl.LightningDataModule):
    """Lightning data module that wraps the train/validation frames in AraQADataset.

    Args:
        train_df: Frame of training rows.
        val_df: Frame of validation rows.
        tokenizer: Tokenizer handed to every AraQADataset.
        batch_size: Batch size used by both dataloaders.
        source_max_token_len: Encoded length of question + context.
        target_max_token_len: Encoded length of the answer.
        num_workers: Worker processes per DataLoader (defaults to the
            previously hard-coded value of 2; use 0 to load in the main process).
    """

    def __init__(self, train_df, val_df, tokenizer, batch_size=8, source_max_token_len=512,
                 target_max_token_len=64, num_workers=2):
        super().__init__()
        self.train_df = train_df
        self.val_df = val_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len
        self.num_workers = num_workers

    def setup(self, stage=None):
        """Build the train and validation datasets; ``stage`` is accepted but unused."""
        self.train_dataset = AraQADataset(self.train_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)
        self.val_dataset = AraQADataset(self.val_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)

    def train_dataloader(self):
        """Return the shuffled training DataLoader."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)

    def val_dataloader(self):
        """Return the validation DataLoader (original row order, no shuffling)."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

# Model Definition
from transformers import T5ForConditionalGeneration, AdamW
import pytorch_lightning as pl
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [None]:
class AraQAModel(pl.LightningModule):
    """LightningModule that fine-tunes a T5 conditional-generation checkpoint.

    Args:
        model_name: Checkpoint identifier passed to
            ``T5ForConditionalGeneration.from_pretrained``.
        learning_rate: Initial learning rate for AdamW.
    """

    def __init__(self, model_name="UBC-NLP/AraT5v2-base-1024", learning_rate=0.0001):
        super().__init__()
        self.learning_rate = learning_rate
        self.model = T5ForConditionalGeneration.from_pretrained(model_name, return_dict=True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        The returned loss is whatever the wrapped model reports for the given
        ``labels``.
        """
        output = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return output.loss, output.logits

    def _shared_step(self, batch, metric_name):
        """Compute the loss for one batch and log it under ``metric_name``."""
        loss, _ = self(batch["input_ids"], batch["attention_mask"], batch["labels"])
        self.log(metric_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for ``batch`` (logged as ``train_loss``)."""
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for ``batch`` (logged as ``val_loss``)."""
        return self._shared_step(batch, "val_loss")

    def configure_optimizers(self):
        """Create AdamW plus a plateau scheduler driven by ``val_loss``.

        Returns:
            ``([optimizer], [scheduler_config])`` in the list form Lightning accepts.
        """
        # torch.optim.AdamW replaces the deprecated transformers.AdamW.
        # eps and weight_decay are spelled out to keep the values the
        # transformers implementation used by default (torch's own defaults
        # are eps=1e-8 and weight_decay=0.01).
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=self.learning_rate,
            eps=1e-6,
            weight_decay=0.0,
        )
        scheduler = {
            # Multiply the LR by 0.1 once val_loss has failed to improve for
            # more than `patience` consecutive checks.
            'scheduler': ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True),
            'monitor': 'val_loss'
        }
        return [optimizer], [scheduler]

In [None]:
# Training Setup
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

In [None]:
# Number of examples per batch; passed to AraQADataModule as `batch_size`.
BATCH_SIZE = 8
# Number of training epochs; passed to pl.Trainer as `max_epochs`.
N_EPOCHS = 4

In [None]:
# A fixed seed makes the train/validation partition identical on every run;
# without it each execution validates on a different 5% of the rows, so
# val_loss values are not comparable between runs.
SPLIT_SEED = 42
train_df, val_df = train_test_split(df, test_size=0.05, random_state=SPLIT_SEED)

# Tokenizer and model are loaded from the same pretrained checkpoint.
tokenizer = T5Tokenizer.from_pretrained("UBC-NLP/AraT5v2-base-1024")
data_module = AraQADataModule(train_df, val_df, tokenizer, batch_size=BATCH_SIZE)
model = AraQAModel()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.40M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [None]:
# Retain only the single best checkpoint, where "best" means the lowest
# logged "val_loss"; it is written to checkpoints/ under the name below.
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=1,
    dirpath="checkpoints",
    filename="best-checkpoint",
    verbose=True,
)

In [None]:
# Metrics are logged for TensorBoard under the "training-logs" directory,
# in a run group named "ara-qa".
logger = TensorBoardLogger("training-logs", name="ara-qa")

# The trainer runs for N_EPOCHS epochs, reporting to the logger above and
# saving checkpoints through checkpoint_callback.
trainer = pl.Trainer(
    accelerator='auto',  # Use GPU if available
    max_epochs=N_EPOCHS,
    callbacks=[checkpoint_callback],
    logger=logger,
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Load the notebook extension that provides the %tensorboard magic.
%load_ext tensorboard


In [None]:
# Open TensorBoard on the directory the TensorBoardLogger was pointed at.
%tensorboard --logdir ./training-logs


In [None]:
# Train `model` with the dataloaders supplied by `data_module`; the run ends
# when the trainer's max_epochs limit is reached.
trainer.fit(model, data_module)


INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 367 M 
-----------------------------------------------------
367 M     Trainable params
0         Non-trainable params
367 M     Total params
1,470.035 Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 0, global step 1259: 'val_loss' reached 0.38236 (best 0.38236), saving model to '/content/checkpoints/best-checkpoint.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 1, global step 2518: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 2, global step 3777: 'val_loss' was not in top 1


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Epoch 3, global step 5036: 'val_loss' was not in top 1
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=4` reached.


In [None]:
# Mount Google Drive at /content/drive so files can be copied from the Colab
# runtime's local disk into Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the file the checkpoint callback recorded as best rather than a
# hard-coded name: when a checkpoint with the configured filename already
# exists, Lightning saves the new one with a version suffix (e.g. "-v1"), and
# the fixed path would then point at a stale file.
!cp "{checkpoint_callback.best_model_path}" "/content/drive/MyDrive/"
