# Install all the libraries

In [23]:
!pip install --upgrade transformers datasets evaluate huggingface_hub torch
!pip install --upgrade torch torchvision
!pip install --upgrade transformers
!pip install --upgrade evaluate transformers
!pip install --upgrade accelerate



In [24]:
import torch
import torchvision
import evaluate
from torchvision import models, transforms
from torch.utils.data import DataLoader

# Load the dataset

In [25]:
from datasets import load_dataset

# Fetch the "stanfordnlp/imdb" dataset; the cells below index it by split
# name (e.g. dataset["train"]).
dataset = load_dataset(path="stanfordnlp/imdb")

In [26]:
# Peek at a single raw training example (row index 2 of the "train" split).
dataset["train"][2]

{'text': "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.<br /><br />One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).<br /><br />One might better spend one's time staring out a window at a tree growing.<br /><br />",
 'label': 0}

# Load the tokenizer and create a function to tokenise your text

In [27]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the cased BERT base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Sequences are padded to the tokenizer's maximum length and truncated when
    they exceed it.

    Args:
        examples: A batch from the dataset; must provide a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# batched=True hands the function many rows per call instead of one at a time.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

# Create a small subset of the dataset

In [28]:
# Shuffle each split with a fixed seed and keep the first 1000 rows, giving a
# small, reproducible sample for quick experiments.
small_train_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)
    .select(range(1000))
)
small_eval_dataset = (
    tokenized_datasets["test"]
    .shuffle(seed=42)
    .select(range(1000))
)

# Load the model

In [29]:
from transformers import AutoModelForSequenceClassification

# IMDB sentiment is a binary task (labels 0 and 1), so the classification head
# needs exactly two outputs. A larger head would contain classes that never
# occur in the data and could still be predicted at inference time.
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-cased",
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Initialise the training arguments

In [30]:
from transformers import TrainingArguments

# Only the output directory is set here; every other training option keeps
# its library default.
training_args = TrainingArguments(
    output_dir="test_trainer",
)

# Set up the metric calculation function

In [31]:
import numpy as np
import evaluate

# Accuracy metric object shared by every call to compute_metrics.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: A two-element iterable holding the model logits followed
            by the reference labels.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids
        (argmax over the last logits axis) against the references.
    """
    scores, gold = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted_ids)

In [33]:
from huggingface_hub import login

# Authenticate interactively; the token is entered at run time rather than
# being stored in the notebook.
login()

# NOTE(review): no training call is visible between creating `model` and this
# upload -- confirm the weights being pushed are the fine-tuned ones.
model.push_to_hub(repo_id="NeuraFusionAI/Finetune-imdb-BERT")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NeuraFusionAI/Finetune-imdb-BERT/commit/96046ace3fb0a0c0f57115809cbe773150f97e04', commit_message='Upload BertForSequenceClassification', commit_description='', oid='96046ace3fb0a0c0f57115809cbe773150f97e04', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# The tokenizer comes from the base checkpoint, the classifier weights from
# the repository uploaded to the Hub.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained(
    "NeuraFusionAI/Finetune-imdb-BERT"
)

# Encode one sample sentence as PyTorch tensors ready to feed to the model.
s = "The was awesome and I loved it"
tt = tokenizer(s, truncation=True, padding=True, return_tensors="pt")

In [None]:
# Switch the model to evaluation mode before predicting.
model.eval()
# Gradients are not needed for inference, so skip tracking them.
with torch.no_grad():
    # Unpack the tokenizer output as keyword arguments for the forward pass.
    outputs=model(**tt)

In [None]:
# Raw, unnormalised class scores from the forward pass.
logits = outputs.logits
print("Logits:", logits)

# Softmax over the last axis turns the scores into probabilities.
probabilities = logits.softmax(dim=-1)
print("Probabilities:", probabilities)

# The most probable class is the prediction; .item() extracts the Python
# scalar, which requires that exactly one example was scored.
predicted_class = probabilities.argmax(dim=-1)
print("Predicted Class:", predicted_class.item())

# Fine-tuning using PyTorch

## Dropping columns

In [None]:
# Drop the raw text column and rename the target column to "labels".
# This rebinds `tokenized_datasets`, so run the cell once per kernel session:
# re-running it after the columns have changed is expected to fail.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("label", "labels")
)

# Have the datasets return PyTorch tensors.
tokenized_datasets.set_format("torch")

# Rebuild the 1000-row samples from the reformatted splits.
small_train_dataset = (
    tokenized_datasets["train"]
    .shuffle(seed=42)
    .select(range(1000))
)
small_eval_dataset = (
    tokenized_datasets["test"]
    .shuffle(seed=42)
    .select(range(1000))
)

## Create a Dataloader

In [None]:
import torch
from torch.utils.data import DataLoader

# Batches of 8 examples; only the training loader shuffles its rows.
traindataloader = DataLoader(
    small_train_dataset,
    shuffle=True,
    batch_size=8,
)
testdataloader = DataLoader(
    small_eval_dataset,
    batch_size=8,
)

## Download the model and move it to the GPU (if one is available)

In [None]:
from transformers import AutoModelForSequenceClassification

# IMDB sentiment is a binary task (labels 0 and 1), so the classification head
# needs exactly two outputs rather than five.
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-cased",
    num_labels=2,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

##  Create an optimizer and a learning rate scheduler

In [None]:
from torch.optim import AdamW,SGD
from transformers import get_scheduler

# A learning rate of 5e-5 is the usual setting for AdamW when fine-tuning
# BERT. Plain SGD without momentum at that rate barely moves the weights, so
# the already-imported AdamW is used instead.
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
# One scheduler step is taken per optimizer step, i.e. per training batch.
num_training_steps = num_epochs * len(traindataloader)

# Linear schedule with no warm-up, spanning all training steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

## Training and evaluation

In [None]:
from tqdm.auto import tqdm

# The bar is advanced by hand, once per optimizer step across all epochs.
progress_bar = tqdm(total=num_training_steps)

model.train()
for epoch in range(num_epochs):
    for batch in traindataloader:
        # Move every tensor in the batch to the device the model lives on.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass, then backpropagate the loss reported by the model.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the LR schedule, then clear gradients so
        # they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
import evaluate

# Fresh accuracy accumulator for the held-out sample.
metric = evaluate.load("accuracy")

model.eval()
for batch in testdataloader:
    # Device copy of the batch; the original `batch` is kept for its labels.
    b = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        outputs = model(**b)

    # Highest-scoring class per example is the prediction.
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    metric.add_batch(references=batch["labels"], predictions=predictions)

# Final accuracy over all accumulated batches (displayed as the cell output).
metric.compute()