<a href="https://colab.research.google.com/github/nnilayy/LLMs-And-Transformers/blob/main/Finetuning_LLMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Hugging Face Libraries

In [None]:
!pip install transformers -U
!pip install datasets -U
!pip install accelerate -U
!pip install evaluate -U

## Setting Up Device & Google Drive

In [None]:
import torch

# Show which PyTorch build this runtime provides.
print(torch.__version__)

# Use the GPU when the runtime exposes one; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device
# torch.cuda.empty_cache()

2.0.1+cu118


device(type='cpu')

In [None]:
# Mount Google Drive at /content/drive using the google.colab helper.
from google.colab import drive
drive.mount('/content/drive')

## Importing and Preprocessing Dataset ❌

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset

# Tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Load the whole rotten_tomatoes dataset in one call. A single split can be
# requested instead, e.g. load_dataset("rotten_tomatoes", split='train')
# (likewise for 'test' and 'validation').
dataset = load_dataset(path="rotten_tomatoes")

# Collator built from the tokenizer; it is passed to the DataLoaders as
# their collate_fn later in the notebook.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Peek at the first few training labels instead of printing hundreds of lines.
# The label column is called "label" until it is renamed to "labels" further
# down the notebook, and its values are plain Python ints (no .item()) unless
# a tensor format has been set, so pick whichever column name exists.
label_column = "labels" if "labels" in dataset["train"].column_names else "label"
dataset["train"][:10][label_column]

In [None]:
# Show the dataset's column names before any preprocessing.
dataset.column_names

In [None]:
# Sample Small Dataset
# Index the splits through `dataset`: the train_dataset / test_dataset names
# are only assigned further down the notebook, so referring to them here
# raises NameError on a fresh kernel.
small_train_dataset = dataset['train'].shuffle(seed=42).select(range(100))
small_eval_dataset = dataset['test'].shuffle(seed=42).select(range(100))
dataset['train'][0]

In [None]:
# Preprocessing Function
def tokenization(example):
    """Tokenize the "text" field of one example or of a batch of examples.

    Sequences are truncated to at most 512 tokens. Padding is deliberately
    not applied here: the DataLoaders use DataCollatorWithPadding as their
    collate_fn, which pads each mini-batch to its own longest sequence.
    Padding inside a batched ``map`` would instead pad every chunk of
    examples to that chunk's longest sequence, storing and feeding the
    model many unnecessary pad tokens.

    Args:
        example: a mapping with a "text" entry (a string, or a list of
            strings when called through ``map(..., batched=True)``).

    Returns:
        The tokenizer's output for the given text.
    """
    return tokenizer(example["text"],
                     truncation=True,
                     max_length=512)

In [None]:
# Tokenize the dataset with the batched `tokenization` function, using two
# worker processes.
# (Per-split alternative: some_split.map(tokenization, batched=True,
#  batch_size=1000, num_proc=2); remove_columns=... can also be passed to
#  drop columns during the map.)
dataset = dataset.map(
    tokenization,
    batched=True,
    num_proc=2,
)

In [None]:
# Show the column names again, now that the tokenizer outputs have been added.
dataset.column_names

In [None]:
# Dropping, Renaming Columns and Formatting dataset
# Desired Columns: ['labels', 'input_ids', 'token_type_ids', 'attention_mask']
# Each step is guarded so the cell can be re-run: once "text" has been removed
# or "label" renamed, repeating the call would raise.
if "text" in dataset["train"].column_names:
    dataset = dataset.remove_columns(["text"])
if "label" in dataset["train"].column_names:
    dataset = dataset.rename_column("label", "labels")

sets = ['train', 'test', 'validation']
for set_type in sets:
    # A bare expression inside a loop displays nothing in a notebook, so the
    # format type and column names are printed explicitly.
    print(set_type, dataset[set_type].format['type'], dataset[set_type].column_names)

In [None]:
# Pull the three splits out of `dataset` into their own names.
train_dataset, test_dataset, validation_dataset = (
    dataset[split_name] for split_name in ('train', 'test', 'validation')
)

## Custom Dataset

In [None]:
# Custom Dataset

## DataLoader

In [None]:
# DataLoader
from torch.utils.data import DataLoader

batch_size = 256

# Only the training loader shuffles; all three share the batch size, run in
# the main process (num_workers=0) and collate with `data_collator`.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=data_collator,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    collate_fn=data_collator,
)
validation_loader = DataLoader(
    validation_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    collate_fn=data_collator,
)

## Loading Model ❌

In [None]:
# Loading Model
from transformers import AutoModelForSequenceClassification

num_labels = 2
# The checkpoint must match the tokenizer used for preprocessing
# ("bert-base-uncased"). The cased checkpoint has a different vocabulary, so
# the uncased token ids would index the wrong (or non-existent) embeddings.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

# Alternative: load the bare pre-trained BERT encoder instead
# model = BertModel.from_pretrained("bert-base-uncased")

## Tuning Models

In [None]:
from torch import nn
class MyModel(nn.Module):
    """BERT encoder followed by a linear classification layer.

    Args:
        bert: optional backbone module. When omitted, the notebook-level
            ``model`` variable is used (the original behaviour).
        num_classes: size of the output layer; defaults to 2.

    The backbone is unwrapped before use:
      * an existing ``MyModel`` contributes its inner encoder, so re-running
        the instantiation cell does not nest one wrapper inside another;
      * a Hugging Face task model (e.g. a sequence-classification model)
        contributes its ``base_model``, because only the bare encoder's
        output carries ``pooler_output``. The task model's own head is
        not used.
    """

    def __init__(self, bert=None, num_classes=2):
        super(MyModel, self).__init__()
        backbone = model if bert is None else bert
        if isinstance(backbone, MyModel):
            backbone = backbone.bert
        self.bert = getattr(backbone, "base_model", backbone)
        # Take the hidden size from the encoder config when it is available;
        # fall back to 768, the value that was hard-coded before.
        hidden_size = getattr(getattr(self.bert, "config", None), "hidden_size", 768)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of token ids and its mask."""
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits = self.fc(pooled_output)
        return logits


In [None]:
# torchsummary is optional here: the notebook's install cell does not install
# it, and it is only needed for the commented-out summary() call below.
try:
    from torchsummary import summary
except ImportError:
    summary = None

# Wrap the loaded model only once, so re-running this cell does not build a
# MyModel around an existing MyModel.
if not isinstance(model, MyModel):
    model = MyModel()
model.to(device)
# summary(model,(1,28,128))

## Setting Up Optimizer and LR Scheduler

In [None]:
# Optimizers
from torch.optim import AdamW

# AdamW over every parameter of `model`, with a learning rate of 5e-5.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [None]:
# Learning Rate Scheduler
from transformers import get_scheduler

num_epochs = 3
# Total number of training steps: batches per epoch times number of epochs.
num_training_steps = len(train_loader) * num_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

## Training and Testing Loops

In [None]:
# Training Loop
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for data in train_loader:
        # The model was moved to `device`, so the batch tensors must be too.
        data = {name: tensor.to(device) for name, tensor in data.items()}

        outputs = model(input_ids=data['input_ids'], attention_mask=data['attention_mask'])
        # A custom nn.Module returns the logits tensor directly, while a
        # Hugging Face model returns an output object exposing `.logits`.
        logits = outputs.logits if hasattr(outputs, "logits") else outputs

        # Labels are not passed to the model, so it cannot produce a loss of
        # its own; compute the cross-entropy loss here.
        loss = torch.nn.functional.cross_entropy(logits, data['labels'])
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# Evaluation Loop
import evaluate

metric = evaluate.load("accuracy")
model.eval()
# Iterate the validation DataLoader built earlier in the notebook.
for batch in validation_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        # Pass only the arguments the forward pass needs. A custom module
        # whose forward takes (input_ids, attention_mask) would reject the
        # extra keys (labels, token_type_ids) that model(**batch) would send.
        outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])

    # Accept both a raw logits tensor and an output object exposing `.logits`.
    logits = outputs.logits if hasattr(outputs, "logits") else outputs
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

In [None]:
from accelerate import Accelerator
# transformers.AdamW is deprecated in favour of torch.optim.AdamW, and with
# the unpinned upgrade at the top of the notebook it may no longer be
# importable, so the PyTorch optimizer is used instead.
from torch.optim import AdamW
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, get_scheduler

accelerator = Accelerator()

# Defined here so the cell does not depend on a later cell having been run;
# this is the same checkpoint the tokenizer was loaded from.
checkpoint = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=3e-5)

# Use the DataLoaders built earlier in the notebook (train_loader /
# validation_loader); accelerate places them, the model and the optimizer on
# the right device.
train_dl, eval_dl, model, optimizer = accelerator.prepare(
    train_loader, validation_loader, model, optimizer
)

num_epochs = 3
num_training_steps = num_epochs * len(train_dl)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dl:
        # The batch includes "labels", so the model output carries a loss.
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

## Saving Model and Tokenizer

## Useful Functions

In [None]:
# Illustrative snippet: it assumes `dataset` has "sentence1", "sentence2" and
# "label" columns (as in a sentence-pair dataset) - TODO confirm before running.

# Renaming
dataset = dataset.rename_column("sentence1", "sentenceA")
dataset = dataset.rename_column("sentence2", "sentenceB")

# Removing Columns
dataset = dataset.remove_columns("label")
# After the renames above the columns are called sentenceA / sentenceB;
# removing the old names would fail because they no longer exist.
dataset = dataset.remove_columns(['sentenceA', 'sentenceB'])

# Only the last expression of a cell is displayed, so show the result here.
dataset

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
raw_datasets = load_dataset("glue", "mrpc")


def tokenize_function(example):
    """Tokenize the sentence1/sentence2 pair of an example with truncation enabled."""
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)


# Collator built from the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Apply the tokenizer to the dataset in batched mode.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [None]:
# Show the first training example after tokenization.
tokenized_datasets['train'][0]

In [None]:
# Drop the raw sentence and index columns, rename the target column to
# "labels", then switch the output format to torch.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [None]:
from transformers import BertModel, BertTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

# Load the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Define the housing dataset
housing_data = [
    {"square_feet": 1500, "num_bedrooms": 3, "num_bathrooms": 2, "label": 1},
    {"square_feet": 2000, "num_bedrooms": 4, "num_bathrooms": 3, "label": 0},
    {"square_feet": 1200, "num_bedrooms": 2, "num_bathrooms": 1, "label": 1},
]

# Preprocess the dataset: turn each record into one descriptive sentence
input_texts = []
labels = []

for record in housing_data:
    square_feet = str(record["square_feet"])
    num_bedrooms = str(record["num_bedrooms"])
    num_bathrooms = str(record["num_bathrooms"])

    input_text = f"Square Feet: {square_feet}, Bedrooms: {num_bedrooms}, Bathrooms: {num_bathrooms}"
    input_texts.append(input_text)
    labels.append(record["label"])

# Tokenize the input texts
tokenized_inputs = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")

# Prepare input tensors and labels
input_ids = tokenized_inputs["input_ids"]
attention_mask = tokenized_inputs["attention_mask"]
labels = torch.tensor(labels)

# Create a DataLoader for batching
dataset = TensorDataset(input_ids, attention_mask, labels)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Define the model architecture
class MyModel(nn.Module):
    """BERT encoder followed by a linear layer producing two logits."""

    def __init__(self):
        super(MyModel, self).__init__()
        self.bert = model
        # Read the hidden size from the encoder config instead of
        # hard-coding 768, so another checkpoint size also works.
        self.fc = nn.Linear(self.bert.config.hidden_size, 2)  # Adjust the output size according to your task

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of token ids and its mask."""
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits = self.fc(pooled_output)
        return logits

# Create an instance of the model
model = MyModel()

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
# NOTE(review): lr=0.001 is kept as in the original; the recorded losses for
# this cell oscillate rather than decrease, so a smaller rate may be worth trying.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10

# from_pretrained returns the encoder in evaluation mode; switch the whole
# model to training mode so dropout is active while fine-tuning.
model.train()

for epoch in range(epochs):
    running_loss = 0.0

    # Batch variables use their own names so the full `labels` tensor defined
    # above is not overwritten by the last mini-batch.
    for batch_inputs, batch_masks, batch_labels in dataloader:
        optimizer.zero_grad()

        # Forward pass
        logits = model(batch_inputs, batch_masks)
        loss = criterion(logits, batch_labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    epoch_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.4f}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/10 - Loss: 0.5054
Epoch 2/10 - Loss: 0.9691
Epoch 3/10 - Loss: 2.1802
Epoch 4/10 - Loss: 0.6162
Epoch 5/10 - Loss: 1.5451
Epoch 6/10 - Loss: 0.7947
Epoch 7/10 - Loss: 0.7743
Epoch 8/10 - Loss: 0.5414
Epoch 9/10 - Loss: 0.5629
Epoch 10/10 - Loss: 0.6441


In [None]:
# Inspect the tokenizer output (input_ids, token_type_ids, attention_mask).
tokenized_inputs

{'input_ids': tensor([[  101,  2675,  2519,  1024, 10347,  1010, 18390,  1024,  1017,  1010,
         28942,  1024,  1016,   102],
        [  101,  2675,  2519,  1024,  2456,  1010, 18390,  1024,  1018,  1010,
         28942,  1024,  1017,   102],
        [  101,  2675,  2519,  1024, 14840,  1010, 18390,  1024,  1016,  1010,
         28942,  1024,  1015,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}