# fine tuning BERT classifier for sentiment analysis

Reference - github - https://github.com/jeffprosise/Applied-Machine-Learning/blob/main/Chapter%2013/Sentiment%20Analysis%20(BERT).ipynb#

In [10]:
import torch


print(torch.backends.mps.is_available())
print(torch.cuda.is_available())

True
False


In [11]:
from datasets import load_dataset

dataset = load_dataset("imdb")

In [12]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [14]:
from sklearn.model_selection import train_test_split

train_testvalid = tokenized_datasets['train'].train_test_split(test_size=0.2)
train_dataset = train_testvalid['train']
valid_dataset = train_testvalid['test']

In [16]:
from transformers import logging

logging.set_verbosity_warning()

In [17]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)



In [20]:
trainer.train()

  0%|          | 0/7500 [00:00<?, ?it/s]

{'loss': 0.2778, 'grad_norm': 54.86619186401367, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.2}
{'loss': 0.3206, 'grad_norm': 17.51486587524414, 'learning_rate': 1.7333333333333336e-05, 'epoch': 0.4}
{'loss': 0.2815, 'grad_norm': 7.929140090942383, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.6}
{'loss': 0.2807, 'grad_norm': 17.788843154907227, 'learning_rate': 1.4666666666666666e-05, 'epoch': 0.8}
{'loss': 0.2711, 'grad_norm': 8.982817649841309, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}


  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.3079302906990051, 'eval_runtime': 168.6941, 'eval_samples_per_second': 29.639, 'eval_steps_per_second': 3.705, 'epoch': 1.0}
{'loss': 0.1616, 'grad_norm': 0.059980329126119614, 'learning_rate': 1.2e-05, 'epoch': 1.2}
{'loss': 0.1461, 'grad_norm': 0.18193337321281433, 'learning_rate': 1.0666666666666667e-05, 'epoch': 1.4}
{'loss': 0.1473, 'grad_norm': 18.061504364013672, 'learning_rate': 9.333333333333334e-06, 'epoch': 1.6}
{'loss': 0.1557, 'grad_norm': 10.02248477935791, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.8}
{'loss': 0.1519, 'grad_norm': 0.08669919520616531, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.31526172161102295, 'eval_runtime': 245.8135, 'eval_samples_per_second': 20.341, 'eval_steps_per_second': 2.543, 'epoch': 2.0}
{'loss': 0.0699, 'grad_norm': 0.0712442472577095, 'learning_rate': 5.333333333333334e-06, 'epoch': 2.2}
{'loss': 0.0629, 'grad_norm': 0.030077064409852028, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.4}
{'loss': 0.0614, 'grad_norm': 0.0294658076018095, 'learning_rate': 2.666666666666667e-06, 'epoch': 2.6}
{'loss': 0.0653, 'grad_norm': 0.05548787862062454, 'learning_rate': 1.3333333333333334e-06, 'epoch': 2.8}
{'loss': 0.0695, 'grad_norm': 0.04963398352265358, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.38648656010627747, 'eval_runtime': 170.2372, 'eval_samples_per_second': 29.371, 'eval_steps_per_second': 3.671, 'epoch': 3.0}
{'train_runtime': 11758.1619, 'train_samples_per_second': 5.103, 'train_steps_per_second': 0.638, 'train_loss': 0.16822385813395183, 'epoch': 3.0}


TrainOutput(global_step=7500, training_loss=0.16822385813395183, metrics={'train_runtime': 11758.1619, 'train_samples_per_second': 5.103, 'train_steps_per_second': 0.638, 'total_flos': 1.627710232879104e+16, 'train_loss': 0.16822385813395183, 'epoch': 3.0})

In [21]:
metrics = trainer.evaluate()
print(metrics)

  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.38648656010627747, 'eval_runtime': 173.1256, 'eval_samples_per_second': 28.881, 'eval_steps_per_second': 3.61, 'epoch': 3.0}


In [39]:
trainer.save_model("/Users/sriyan/Documents/techjam/final-sentiment-analysis/elmo-BERT/model")

In [40]:

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


tokenizer.save_pretrained("/Users/sriyan/Documents/techjam/final-sentiment-analysis/elmo-BERT/tokeniser")

('/Users/sriyan/Documents/techjam/final-sentiment-analysis/elmo-BERT/tokeniser/tokenizer_config.json',
 '/Users/sriyan/Documents/techjam/final-sentiment-analysis/elmo-BERT/tokeniser/special_tokens_map.json',
 '/Users/sriyan/Documents/techjam/final-sentiment-analysis/elmo-BERT/tokeniser/vocab.txt',
 '/Users/sriyan/Documents/techjam/final-sentiment-analysis/elmo-BERT/tokeniser/added_tokens.json')

In [41]:
from transformers import BertForSequenceClassification, BertTokenizer

# Load the model and tokenizer from the saved directory
model = BertForSequenceClassification.from_pretrained("/Users/sriyan/Documents/techjam/final-sentiment-analysis/elmo-BERT/model")
tokenizer = BertTokenizer.from_pretrained("/Users/sriyan/Documents/techjam/final-sentiment-analysis/elmo-BERT/tokeniser")

# Push the model and tokenizer to the Hugging Face Hub
model.push_to_hub("sr1ya/elmo-BERTjam-classifier")
tokenizer.push_to_hub("sr1ya/elmo-BERT-jam-tokeniser")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sr1ya/elmo-BERT-jam-tokeniser/commit/acd1aa39d62709652773e46ace4ced5d1cc26bb6', commit_message='Upload tokenizer', commit_description='', oid='acd1aa39d62709652773e46ace4ced5d1cc26bb6', pr_url=None, pr_revision=None, pr_num=None)

In [34]:

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()

acc = trainer.history['accuracy']
val = trainer.history['val_accuracy']
epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, '-', label='Training accuracy')
plt.plot(epochs, val, ':', label='Validation accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.plot()

AttributeError: 'Trainer' object has no attribute 'history'

In [32]:
from transformers import BertTokenizer

# Load your tokenizer
model_path = "/Users/sriyan/Documents/techjam/final-sentiment-analysis/models"  # Replace with your actual model path
tokenizer = BertTokenizer.from_pretrained(model_path)

def tokenize_function(examples):
    return tokenizer(examples['Review Text'], padding="max_length", truncation=True)

# Tokenize the dataset
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

OSError: Can't load tokenizer for '/Users/sriyan/Documents/techjam/final-sentiment-analysis/models'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/Users/sriyan/Documents/techjam/final-sentiment-analysis/models' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

model_id = "/Users/sriyan/Documents/techjam/final-sentiment-analysis"  # Replace with your model path or identifier
model = BertForSequenceClassification.from_pretrained(model_id)
tokenizer = BertTokenizer.from_pretrained(model_id)

In [None]:
import torch

# Example text
text = "This movie is not really the best movie I have ever seen"
inputs = tokenizer(text, return_tensors="pt")

# Convert inputs to tuple format
dummy_input = (inputs["input_ids"], inputs["attention_mask"])