# Install all the libraries

In [1]:
!pip install --upgrade transformers datasets evaluate huggingface_hub torch
!pip install --upgrade torch torchvision
!pip install --upgrade transformers
!pip install --upgrade evaluate transformers
!pip install --upgrade accelerate



In [2]:
import torch
import torchvision
import evaluate
from torchvision import models, transforms
from torch.utils.data import DataLoader

# Load the dataset

In [3]:
from datasets import load_dataset
dataset=load_dataset("yelp_review_full")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
dataset["train"][2]

{'label': 3,
 'text': "Been going to Dr. Goldberg for over 10 years. I think I was one of his 1st patients when he started at MHMG. He's been great over the years and is really all about the big picture. It is because of him, not my now former gyn Dr. Markoff, that I found out I have fibroids. He explores all options with you and is very patient and understanding. He doesn't judge and asks all the right questions. Very thorough and wants to be kept in the loop on every aspect of your medical health and your life."}

# Load the tokenizer and create a function to tokenise your text

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

# Create a small batch from the dataset

In [6]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

# Load the model

In [7]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Initialise the training arguments

In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")

# Set up the metric calculation function

In [9]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
from huggingface_hub import login
login()
model.push_to_hub("NeuraFusionAI/Finetune-BERT-yelp")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NeuraFusionAI/Finetune-BERT-yelp/commit/b76f5c6eba840f3c152af0c75bc45664a24527dd', commit_message='Upload BertForSequenceClassification', commit_description='', oid='b76f5c6eba840f3c152af0c75bc45664a24527dd', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
import torch
import torch.nn.functional as F
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("NeuraFusionAI/Finetune-BERT-yelp")
s="The was awesome and I loved it"
tt=tokenizer(s,return_tensors="pt", padding=True, truncation=True)



config.json:   0%|          | 0.00/972 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

In [15]:
model.eval()
with torch.no_grad():
    outputs=model(**tt)

In [16]:
logits = outputs.logits
print("Logits:", logits)

# Convert logits to probabilities using softmax
probabilities = F.softmax(logits, dim=-1)
print("Probabilities:", probabilities)

# Determine the predicted class
predicted_class = torch.argmax(probabilities, dim=-1)
print("Predicted Class:", predicted_class.item())

Logits: tensor([[-1.8183, -1.7290, -1.2371,  1.8548,  3.5704]])
Probabilities: tensor([[0.0038, 0.0042, 0.0068, 0.1502, 0.8350]])
Predicted Class: 4


# Fine tuning using pytorch

## Dropping columns

In [17]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

## Create a Dataloader

In [18]:
import torch
from torch.utils.data import DataLoader
traindataloader=DataLoader(small_train_dataset,batch_size=8,shuffle=True)
testdataloader=DataLoader(small_eval_dataset,batch_size=8)

## Download the model and load it onto the GPU

In [19]:
from transformers import AutoModelForSequenceClassification
model=AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

##  Create an optimizer and a learning rate scheduler

In [20]:
from torch.optim import AdamW,SGD
from transformers import get_scheduler
optimizer=SGD(model.parameters(),lr=5e-5)
num_epochs=3
num_training_steps = num_epochs * len(traindataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

## Training and evaluation

In [21]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in traindataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/375 [00:00<?, ?it/s]

In [22]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in testdataloader:
    b = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**b)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.186}