<a href="https://colab.research.google.com/github/nnilayy/LLMs-And-Transformers/blob/main/Finetuning_LLMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Hugging Face Libraries

In [1]:
!pip install transformers -U
!pip install datasets -U
!pip install accelerate -U
!pip install evaluate -U



## Setting Up Device & Google Drive

In [None]:
import torch

# Report the PyTorch build available in this runtime.
print(torch.__version__)

# Use the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device
# Optional: uncomment to release cached GPU memory between experiments.
# torch.cuda.empty_cache()

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem (prompts for authorization).
drive.mount(mountpoint='/content/drive')

## Importing and Preprocessing Dataset ❌

In [None]:
from datasets import load_dataset

# Load the "mrpc" configuration of the "glue" dataset from the Hugging Face Hub.
raw_datasets = load_dataset(path="glue", name="mrpc")

# Display the loaded object as the cell output.
raw_datasets

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize a batch of MRPC sentence pairs.

    Args:
        examples: A batch from `raw_datasets` (as passed by `map(..., batched=True)`),
            read here through its "sentence1" and "sentence2" columns.

    Returns:
        The tokenizer output for the paired sentences, padded to the model's
        maximum length and truncated where necessary.
    """
    # MRPC has no "text" column; each example is a pair of sentences that must be
    # encoded together so the model sees both.
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding="max_length",
        truncation=True,
    )


# Map over the dataset loaded in the previous cell (`dataset` was never defined).
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset

# Tokenizer for the rotten_tomatoes pipeline below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Load the three splits one by one, in the order they are unpacked.
train_dataset, test_dataset, validation_dataset = (
    load_dataset("rotten_tomatoes", split=split_name)
    for split_name in ('train', 'test', 'validation')
)

# Collator that pads each batch using the tokenizer above.
data_collator = DataCollatorWithPadding(tokenizer)

In [24]:
# Small fixed-seed subsets (100 rows each) for quick experiments.
# Note: the "eval" subset is drawn from the test split.
small_train_dataset = (
    train_dataset
    .shuffle(seed=42)
    .select(range(100))
)
small_eval_dataset = (
    test_dataset
    .shuffle(seed=42)
    .select(range(100))
)

In [None]:
# Show the training split's summary as the cell output.
train_dataset

In [None]:
# Show the first example of the training split.
train_dataset[0]

In [None]:
# List the column names of the training split.
train_dataset.column_names

In [None]:
# Switch the training split to the "torch" output format in place, covering every
# current column. The extra `device` keyword is passed along with the format
# request (presumably to the tensor constructor — TODO confirm).
train_dataset.set_format(
    type="torch",
    columns=train_dataset.column_names,
    device='cpu',
)

# Confirm which format type is now active.
train_dataset.format['type']

In [25]:
# Preprocessing Dataset
def tokenization(example):
    """Tokenize the "text" field of `example` with the notebook-level `tokenizer`.

    Args:
        example: A mapping with a "text" entry (a single row or a batch of rows).

    Returns:
        The tokenizer output, padded (`padding=True`) and truncated to at most
        512 tokens.
    """
    encoded = tokenizer(
        example["text"],
        max_length=512,
        truncation=True,
        padding=True,
    )
    return encoded

In [27]:
# Tokenize every split, not just the training one: the test and validation
# loaders built later need `input_ids`/`attention_mask` as well, otherwise
# evaluation batches contain no model inputs.
# The tuple of splits on the right is evaluated before any name is rebound.
train_dataset, test_dataset, validation_dataset = (
    split.map(
        tokenization,
        batched=True,
        batch_size=1000,
        num_proc=2,
    )
    for split in (train_dataset, test_dataset, validation_dataset)
)



Map (num_proc=2):   0%|          | 0/8530 [00:00<?, ? examples/s]

## DataLoader

In [31]:
# DataLoader
from torch.utils.data import DataLoader

batch_size = 512


def _make_loader(dataset, shuffle):
    """Build a DataLoader whose batches are padded by `data_collator`.

    Args:
        dataset: A tokenized split (train/test/validation).
        shuffle: Whether to reshuffle the rows every epoch.

    Returns:
        A `DataLoader` yielding batches assembled by `data_collator`.
    """
    # The raw string column cannot be turned into a tensor by the collator,
    # so drop it when it is still present.
    if "text" in dataset.column_names:
        dataset = dataset.remove_columns("text")
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=0,
        # Rows were padded per 1000-row map batch, so lengths differ across rows;
        # the default collate cannot stack them. The collator re-pads each batch
        # to a common length.
        collate_fn=data_collator,
    )


train_loader = _make_loader(train_dataset, shuffle=True)
test_loader = _make_loader(test_dataset, shuffle=False)
validation_loader = _make_loader(validation_dataset, shuffle=False)

## Loading Model ❌

In [None]:
# Loading Model
from transformers import AutoModelForSequenceClassification

# rotten_tomatoes is a binary sentiment task, so the classification head needs
# two outputs (5 was left over from a different dataset).
num_labels = 2

# The checkpoint must match the tokenizer used to encode the data
# ("bert-base-uncased"); the cased checkpoint has a different vocabulary, so the
# token ids would not line up with its embedding table.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

# Move the model to the selected device before the optimizer is created, so the
# batches sent to `device` in the training loop meet parameters on the same device.
model = model.to(device)

## Setting Up Optimizer and LR Scheduler

In [None]:
# Optimizers
from torch.optim import AdamW

# AdamW over every model parameter with a fixed base learning rate.
optimizer = AdamW(
    params=model.parameters(),
    lr=5e-5,
)

In [None]:
# Learning Rate Scheduler
from transformers import get_scheduler

num_epochs = 3

# One scheduler step per optimizer step: (batches per epoch) x (epochs).
# The training loader is named `train_loader` in this notebook
# (`train_dataloader` was never defined).
num_training_steps = num_epochs * len(train_loader)

# Linear schedule with no warmup over the whole run.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

## Training and Testing Loops

In [None]:
# Training Loop
from tqdm.auto import tqdm

# One progress tick per optimizer step.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    # Iterate the loader built in the DataLoader section (`train_loader`);
    # `train_dataloader` was never defined.
    for batch in train_loader:
        # Move every tensor of the batch to the selected device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # Assumes the batch carries labels so the model output includes a loss.
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# Evaluation Loop
import evaluate

metric = evaluate.load("accuracy")
model.eval()

# Evaluate on the validation loader built in the DataLoader section
# (`eval_dataloader` was never defined).
for batch in validation_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    # Predicted class = index of the largest logit along the last axis.
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Final metric over all accumulated batches (displayed as the cell output).
metric.compute()

## Saving Model and Tokenizer

## Useful Functions

In [None]:
# Renaming
# Start from the MRPC data loaded earlier (`dataset` was never defined); each call
# returns a new object, so `raw_datasets` itself is left untouched.
dataset = raw_datasets.rename_column("sentence1", "sentenceA")
dataset = dataset.rename_column("sentence2", "sentenceB")

# Removing Columns
dataset = dataset.remove_columns("label")
# The sentence columns were renamed above, so remove them by their new names;
# the old names no longer exist at this point.
dataset = dataset.remove_columns(['sentenceA', 'sentenceB'])

# Display the result as the cell output.
dataset