<a href="https://colab.research.google.com/github/nnilayy/LLMs-And-Transformers/blob/main/Finetuning_LLMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Hugging Face Libraries

In [None]:
!pip install transformers -U
!pip install datasets -U
!pip install accelerate -U
!pip install evaluate -U



## Setting Up Device & Google Drive

In [None]:
import torch

# Report the installed PyTorch build, then select the GPU when one is visible.
print(torch.__version__)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

2.0.1+cu118


device(type='cpu')

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper module).
from google.colab import drive
drive.mount('/content/drive')

## Importing and Preprocessing the Dataset ❌

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset

# Tokenizer for the uncased BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# One Dataset object per Rotten Tomatoes split, loaded in train/test/validation order.
train_dataset, test_dataset, validation_dataset = (
    load_dataset("rotten_tomatoes", split=split_name)
    for split_name in ("train", "test", "validation")
)

# Collator that pads each batch using the tokenizer above.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Sample Small Dataset
# small_train_dataset = train_dataset.shuffle(seed=42).select(range(100))
# small_eval_dataset = test_dataset.shuffle(seed=42).select(range(100))

In [None]:
# Show the vocabulary size of the loaded tokenizer.
tokenizer.vocab_size

In [None]:
# Preprocessing Function
def tokenization(example):
    """Tokenize the ``text`` field of a batch of examples.

    No padding is applied here. The notebook batches with
    ``DataCollatorWithPadding``, which pads every batch to its own longest
    sequence; padding inside ``map`` would instead pad each row to the longest
    item of its map chunk and store those extra pad tokens in the dataset.

    Args:
        example: Mapping with a ``"text"`` entry (a string or a list of
            strings when called through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for ``example["text"]`` with truncation enabled.
    """
    return tokenizer(example["text"], truncation=True)

In [None]:
# Run the tokenizer over every split in batched mode.
# The tuple of inputs is built before any name is rebound, so each split is
# mapped exactly once, in train/test/validation order.
train_dataset, test_dataset, validation_dataset = (
    split.map(tokenization, batched=True)
    for split in (train_dataset, test_dataset, validation_dataset)
)

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [None]:
# List the columns present on the training split after tokenization.
train_dataset.column_names

['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask']

In [None]:
# Dropping, Renaming Columns and Formatting dataset
# Each split loses its raw "text" column and has "label" renamed to "labels".
train_dataset = train_dataset.remove_columns(["text"]).rename_column("label", "labels")
test_dataset = test_dataset.remove_columns(["text"]).rename_column("label", "labels")
validation_dataset = validation_dataset.remove_columns(["text"]).rename_column("label", "labels")

# set_format mutates the dataset in place, so it is called per split after renaming.
train_dataset.set_format('torch')
test_dataset.set_format('torch')
validation_dataset.set_format('torch')

In [None]:
# train_dataset.format['type']

In [None]:
from torch.utils.data import DataLoader

# `train_loader` is otherwise only created in the DataLoader section further
# down, so on a fresh kernel this cell would fail with a NameError. Build it
# here with the same settings so the notebook runs top to bottom.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0,
                          collate_fn=data_collator, drop_last=True)

# Pull a single batch and show the tensor shape of every field in it.
batch = next(iter(train_loader))
{k: v.shape for k, v in batch.items()}

In [None]:
from transformers import AutoModelForSequenceClassification
# Checkpoint name is stored in a variable; a later cell reuses `checkpoint`.
checkpoint = "bert-base-uncased"
# Sequence-classification model loaded from the checkpoint with two output labels.
model=AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# Run the sample batch through the model and print the loss and the logits' shape.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.7063, grad_fn=<NllLossBackward0>) torch.Size([32, 2])


## Custom Dataset

In [None]:
# Custom Dataset

## DataLoader

In [None]:
# DataLoader
from torch.utils.data import DataLoader
batch_size=32

# Training loader: reshuffled, and the incomplete final batch is dropped.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          num_workers=0, collate_fn=data_collator, drop_last=True)

# Evaluation loaders keep the final partial batch (drop_last=False) so that
# every test/validation example is scored instead of being silently discarded.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                         num_workers=0, collate_fn=data_collator, drop_last=False)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False,
                               num_workers=0, collate_fn=data_collator, drop_last=False)

## Loading Model ❌

In [None]:
# Loading Model
from transformers import AutoModel, AutoModelForSequenceClassification
num_labels=2

# Load the bare encoder rather than the sequence-classification model:
# the MyModel wrapper below reads `outputs.pooler_output`, which the
# classification model's output does not provide.
# The checkpoint must also match the tokenizer loaded earlier
# ("bert-base-uncased"); the cased checkpoint uses a different vocabulary.
bert_model = AutoModel.from_pretrained("bert-base-uncased")

## Tuning Models

In [None]:
from torch import nn  # imported here because no earlier cell brings `nn` into scope


class MyModel(nn.Module):
    """Encoder followed by a single linear classification layer.

    Args:
        bert_model: Module called as ``bert_model(input_ids, attention_mask=...)``
            whose result exposes ``pooler_output``. If it has a
            ``config.hidden_size`` attribute, that value sizes the linear
            layer; otherwise 768 is used.
        num_labels: Number of output classes (default 2).
    """

    def __init__(self, bert_model, num_labels=2):
        super().__init__()
        self.bert = bert_model
        # Read the encoder width from its config when available instead of hard-coding it.
        hidden_size = getattr(getattr(bert_model, "config", None), "hidden_size", 768)
        self.fc = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return raw logits computed from the encoder's pooled output."""
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits = self.fc(pooled_output)
        return logits


In [None]:
# `summary` is only referenced by the commented-out call at the bottom of this cell.
from torchsummary import summary
# Wrap the loaded BERT model in the custom classifier and move it to `device`.
model=MyModel(bert_model)
model.to(device)
# summary(model,(1,28,128))

## Setting Up Optimizer and LR Scheduler

In [None]:
# Optimizers
from torch.optim import AdamW
# AdamW over all parameters of `model`, learning rate 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
# Learning Rate Scheduler
from transformers import get_scheduler

# Total number of optimizer steps: one per training batch, for every epoch.
num_epochs = 3
num_training_steps = len(train_loader) * num_epochs

# "linear" schedule with no warmup steps, attached to the optimizer above.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)


## Training and Testing Loops

In [None]:
from transformers import TrainingArguments

# Default Trainer arguments; "test-trainer" is the output directory.
training_args = TrainingArguments(output_dir="test-trainer")

In [None]:
from tqdm.auto import tqdm  # kept: later cells build their progress bars from `tqdm`

# Smoke test: push ONE training batch through the model before running the
# full loop. The original version ran the forward pass twice on the same batch
# and opened a progress bar that was never updated or closed.
model.train()  # Set the model to training mode
data = next(iter(train_loader))

# The model was moved to `device` in an earlier cell, so its inputs must be
# placed on the same device or the forward pass fails when running on a GPU.
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
labels = data['labels'].to(device)

outputs = model(input_ids=input_ids, attention_mask=attention_mask)


In [None]:
# Display the `input_ids` tensor of the batch taken in the previous cell.
input_ids

In [None]:
# Loss on the raw logits returned by MyModel. It is defined here because no
# earlier cell creates `criterion`.
criterion = torch.nn.CrossEntropyLoss()

model.train()  # Set the model to training mode
for epoch in range(num_epochs):
    for data in train_loader:
        # Inputs must live on the same device as the model.
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)

        # Forward pass (MyModel returns the logits tensor directly)
        logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # Compute the loss
        loss = criterion(logits, labels)

        # Backward pass
        loss.backward()

        # Update model parameters
        optimizer.step()

        # Update the learning rate; the scheduler created above is named
        # `lr_scheduler` (there is no `scheduler` variable in this notebook).
        lr_scheduler.step()

        # Clear gradients
        optimizer.zero_grad()

torch.Size([256])

In [None]:
# Evaluation Loop
import evaluate

metric = evaluate.load("accuracy")
model.eval()

# Use the validation loader built in the DataLoader section; `eval_dataloader`
# is only created much further down, for a different dataset.
for batch in validation_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        # Pass only the arguments MyModel.forward accepts; the batch also
        # carries `labels` (and `token_type_ids`), which it does not take.
        outputs = model(input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"])

    # MyModel returns a plain logits tensor; Hugging Face models return an
    # object with a `.logits` attribute. Support both.
    logits = getattr(outputs, "logits", outputs)
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

In [None]:
from accelerate import Accelerator
from torch.optim import AdamW  # transformers.AdamW is deprecated in favour of the PyTorch optimizer
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, get_scheduler

accelerator = Accelerator()

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=3e-5)

# Use the loaders that exist at this point of the notebook. `train_dataloader`
# and `eval_dataloader` are only defined in the "Proper Way" section below,
# so referencing them here fails on a fresh kernel.
train_dl, eval_dl, model, optimizer = accelerator.prepare(
    train_loader, validation_loader, model, optimizer
)

num_epochs = 3
num_training_steps = num_epochs * len(train_dl)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dl:
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator instead of loss.backward().
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

progress_bar.close()

## Saving Model and Tokenizer

## Useful Functions

In [None]:
# Example data for the snippets below: GLUE MRPC has `sentence1`, `sentence2`,
# `label` and `idx` columns. Without this line the cell fails with a NameError
# because `dataset` is not defined at this point.
dataset = load_dataset("glue", "mrpc", split="train")

# Renaming
dataset = dataset.rename_column("sentence1", "sentenceA")
dataset = dataset.rename_column("sentence2", "sentenceB")

# Removing Columns
dataset = dataset.remove_columns("label")
# The sentence columns were renamed above, so they must be removed under their new names.
dataset = dataset.remove_columns(["sentenceA", "sentenceB"])

# Show the resulting dataset as the cell output.
dataset

NameError: ignored

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` pair of `example` with truncation enabled."""
    sentence_pair = (example["sentence1"], example["sentence2"])
    return tokenizer(*sentence_pair, truncation=True)


# The collator only needs the tokenizer, so it is built before mapping.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [None]:
# Inspect the first tokenized training example.
tokenized_datasets['train'][0]

In [None]:
# Drop the raw sentence/index columns and rename "label" to "labels".
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
# set_format works in place.
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

In [None]:
from transformers import BertModel, BertTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

# Pre-trained BERT encoder and its tokenizer, both from the same checkpoint.
model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Toy housing records; "label" is the classification target.
housing_data = [
    {"square_feet": 1500, "num_bedrooms": 3, "num_bathrooms": 2, "label": 1},
    {"square_feet": 2000, "num_bedrooms": 4, "num_bathrooms": 3, "label": 0},
    {"square_feet": 1200, "num_bedrooms": 2, "num_bathrooms": 1, "label": 1},
]

# Render each record as one sentence; collect the targets in the same order.
input_texts = [
    "Square Feet: {square_feet}, Bedrooms: {num_bedrooms}, Bathrooms: {num_bathrooms}".format(**record)
    for record in housing_data
]
labels = [record["label"] for record in housing_data]

# Tokenize all sentences at once into padded PyTorch tensors.
tokenized_inputs = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")

input_ids = tokenized_inputs["input_ids"]
attention_mask = tokenized_inputs["attention_mask"]
labels = torch.tensor(labels)

# Batches of two (ids, mask, label) triples, reshuffled on every pass.
dataset = TensorDataset(input_ids, attention_mask, labels)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Define the model architecture
class MyModel(nn.Module):
    """Encoder followed by a single linear classification layer.

    Args:
        bert_model: Encoder to wrap. When omitted, the module-level ``model``
            is used, which is what the zero-argument call in this cell relies
            on. Passing it explicitly avoids depending on that global, which
            this cell later rebinds to the wrapper itself.
        num_labels: Number of output classes (default 2).
    """

    def __init__(self, bert_model=None, num_labels=2):
        super().__init__()
        self.bert = model if bert_model is None else bert_model
        # Read the encoder width from its config when available; fall back to 768.
        hidden_size = getattr(getattr(self.bert, "config", None), "hidden_size", 768)
        self.fc = nn.Linear(hidden_size, num_labels)  # Adjust num_labels according to your task

    def forward(self, input_ids, attention_mask):
        """Return raw logits computed from the encoder's pooled output."""
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits = self.fc(pooled_output)
        return logits

# Create an instance of the model
model = MyModel()

# Define the loss function and optimizer.
# 1e-3 is too large for fine-tuning a pretrained BERT encoder; use the same
# 5e-5 as the optimizer cell earlier in this notebook.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-5)

# Training loop
epochs = 10

# `from_pretrained` returns the encoder in evaluation mode; switch the whole
# wrapper (and therefore the encoder's dropout layers) to training mode.
model.train()

for epoch in range(epochs):
    running_loss = 0.0

    # Distinct loop names so the full `labels` tensor built above is not overwritten.
    for batch_ids, batch_masks, batch_labels in dataloader:
        optimizer.zero_grad()

        # Forward pass
        logits = model(batch_ids, batch_masks)
        loss = criterion(logits, batch_labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    epoch_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.4f}")


In [None]:
# Display the tokenizer output built for the housing texts in the previous cell.
tokenized_inputs

## Proper Way: Fine-Tuning BERT on GLUE MRPC with a Custom Training Loop

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
raw_datasets = load_dataset("glue", "mrpc")


def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` pair, truncating with max_length=256."""
    encode_options = {"truncation": True, "max_length": 256}
    return tokenizer(example["sentence1"], example["sentence2"], **encode_options)


# Tokenize every split in batched mode; padding is left to the collator.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Remove the raw sentence/index columns, then rename the target column to "labels".
columns_to_drop = ["sentence1", "sentence2", "idx"]
tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop).rename_column("label", "labels")

# In-place switch to PyTorch tensors, followed by a look at the remaining columns.
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

In [None]:
from torch.utils.data import DataLoader

# Training batches are shuffled; the evaluation loader does not request shuffling.
# Both use batches of 64 and pad through `data_collator`.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=64,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=64,
    collate_fn=data_collator,
)

In [None]:
# Take the first batch from the training loader and show each field's tensor shape.
batch = next(iter(train_dataloader))
{field: tensor.shape for field, tensor in batch.items()}

In [None]:
from transformers import AutoModelForSequenceClassification
# Sequence-classification model loaded from `checkpoint` with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# Sanity check: forward the sample batch and print the loss and the logits' shape.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

In [None]:
# Use PyTorch's AdamW; the AdamW class shipped with transformers is deprecated
# in its favour.
from torch.optim import AdamW

# 5e-3 is far too large for fine-tuning a pretrained BERT: with it the training
# loss recorded below stays flat at roughly 0.64-0.69. 5e-5 matches the value
# used in the optimizer cell earlier in this notebook.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
from transformers import get_scheduler

# One scheduler step per training batch, across all epochs.
num_epochs = 30
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule without warmup, driven by the optimizer defined above.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)
print(num_training_steps)

1740


In [None]:
import torch

# Prefer CUDA when PyTorch can see a GPU, otherwise stay on the CPU,
# then move the model there and echo the chosen device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(total=num_training_steps, desc="Training", unit="step")
model.train()

for epoch in range(num_epochs):
    epoch_loss = 0.0

    for batch in train_dataloader:
        # Move every tensor of the batch onto the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the LR schedule, then reset the gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)
        epoch_loss += loss.item()

    # Mean batch loss for the epoch.
    average_loss = epoch_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}: Loss = {average_loss:.4f}")

progress_bar.close()


Training:   0%|          | 0/1740 [00:00<?, ?step/s]

Epoch 1: Loss = 0.9364
Epoch 2: Loss = 0.6810
Epoch 3: Loss = 0.6882
Epoch 4: Loss = 0.6907
Epoch 5: Loss = 0.7582
Epoch 6: Loss = 0.6733
Epoch 7: Loss = 0.6520
Epoch 8: Loss = 0.6910
Epoch 9: Loss = 0.6779
Epoch 10: Loss = 0.6749
Epoch 11: Loss = 0.6712
Epoch 12: Loss = 0.6674
Epoch 13: Loss = 0.6687
Epoch 14: Loss = 0.6539
Epoch 15: Loss = 0.6485
Epoch 16: Loss = 0.6769
Epoch 17: Loss = 0.6471
Epoch 18: Loss = 0.6466
Epoch 19: Loss = 0.6487
Epoch 20: Loss = 0.6466
Epoch 21: Loss = 0.6566
Epoch 22: Loss = 0.6488
Epoch 23: Loss = 0.6513
Epoch 24: Loss = 0.6471
Epoch 25: Loss = 0.6567
Epoch 26: Loss = 0.6394
Epoch 27: Loss = 0.6446
Epoch 28: Loss = 0.6350
Epoch 29: Loss = 0.6360
Epoch 30: Loss = 0.6354


In [None]:
import evaluate

# GLUE/MRPC metric, accumulated batch by batch over the evaluation loader.
metric = evaluate.load("glue", "mrpc")
model.eval()

# Gradients are not needed anywhere in the evaluation pass.
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        # Predicted class = index of the largest logit.
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()