# BERT Model Fine-tuning

## Model Definition

Import required modules

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

Define Model and Tokenizer

In [None]:
# Tokenizer and classifier are loaded from the same checkpoint id.
# The classification head is newly initialized (see the warning printed below),
# so the model has to be fine-tuned before its predictions mean anything.
BERT_CHECKPOINT = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(BERT_CHECKPOINT)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Freeze every parameter of the BERT encoder except the pooler.
# Parameters that live outside `model.base_model` are not touched here.
for name, param in model.base_model.named_parameters():
    is_pooler = "pooler" in name
    param.requires_grad = is_pooler
    if is_pooler:
        print(name, "unfreezed")

pooler.dense.weight unfreezed
pooler.dense.bias unfreezed


In [None]:
tokenizer

BertTokenizerFast(name_or_path='google-bert/bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
sentence = "Congratulations! You've won a $1000 gift card!"

In [None]:
tokenizer(text=sentence, max_length=20, truncation=True, padding="max_length")

{'input_ids': [101, 23156, 999, 2017, 1005, 2310, 2180, 1037, 1002, 6694, 5592, 4003, 999, 102, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]}

In [None]:
token_ids = tokenizer.encode(sentence)
token_ids

[101,
 23156,
 999,
 2017,
 1005,
 2310,
 2180,
 1037,
 1002,
 6694,
 5592,
 4003,
 999,
 102]

In [None]:
tokenizer.convert_ids_to_tokens(token_ids)

['[CLS]',
 'congratulations',
 '!',
 'you',
 "'",
 've',
 'won',
 'a',
 '$',
 '1000',
 'gift',
 'card',
 '!',
 '[SEP]']

In [None]:
tokenizer.decode(token_ids)

"[CLS] congratulations! you ' ve won a $ 1000 gift card! [SEP]"

## Data Preparation

In [None]:
!pip install -q datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

Import required modules

In [None]:
from datasets import load_dataset, DatasetDict
import pandas as pd

Load IMDB Dataset

In [None]:
# Load the IMDB reviews dataset; it comes with 'train', 'test' and
# 'unsupervised' splits (see the DatasetDict printed in the next cell).
ds = load_dataset("stanfordnlp/imdb")

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
# Work only with the labelled 'train' split; the validation and test sets used
# later in this notebook are carved out of it.
imdb = ds['train']

Check the label distribution

In [None]:
imdb

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [None]:
# Count the reviews per label to check that the classes are balanced.
pd.DataFrame(imdb['label']).value_counts()

Unnamed: 0_level_0,count
0,Unnamed: 1_level_1
0,12500
1,12500


Split Dataset

In [None]:
# Build the 80/20 split in this cell so the preview runs on a fresh kernel;
# the fixed seed makes the split reproducible.
imdb_train_testvalid = imdb.train_test_split(test_size=0.2, seed=42)
imdb_train_testvalid

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

In [None]:
SPLIT_SEED = 42  # fixed so the train/valid/test partition is reproducible

# 80% train; the remaining 20% is split evenly into validation and test.
imdb_train_testvalid = imdb.train_test_split(test_size=0.2, seed=SPLIT_SEED)
imdb_testvalid = imdb_train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

dataset = DatasetDict({
    'train': imdb_train_testvalid['train'],
    'valid': imdb_testvalid['train'],
    'test': imdb_testvalid['test']
})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2500
    })
})

Tokenize data

In [None]:
def preprocess(example):
    """Tokenize the 'text' field, truncating and padding to exactly 512 tokens.

    Args:
        example: A mapping with a 'text' entry. It may hold a single string
            (one example) or a list of strings (a batch, as passed by
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the given text(s).
    """
    return tokenizer(text=example['text'], truncation=True, max_length=512, padding='max_length')

# batched=True hands the tokenizer many texts per call instead of one at a
# time; the resulting columns are the same.
tokenized = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [None]:
tokenized

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 20000
    })
    valid: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2500
    })
})

In [None]:
# Rename 'label' -> 'labels'. The guard keeps this cell safe to re-run:
# once the column has been renamed, 'label' no longer exists and a second
# rename would fail.
if 'label' in tokenized['train'].column_names:
    tokenized = tokenized.rename_column('label', 'labels')
tokenized

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 20000
    })
    valid: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2500
    })
})

## Fine-tuning

Install and import required modules

In [None]:
!pip install evaluate -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np

In [None]:
# Metric objects used by compute_metrics() defined in this cell.
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    """Compute accuracy and ROC AUC from raw classification logits.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is indexed as
            ``[sample, class]``; column 1 is treated as the positive class.

    Returns:
        dict: ``{"Accuracy": ..., "AUC": ...}``, both rounded to 3 decimals.
    """
    preds, labels = eval_pred

    # Numerically stable softmax: subtracting the row-wise maximum does not
    # change the result mathematically, but keeps np.exp from overflowing to
    # inf (and the ratio from turning into nan) when logits are large.
    shifted = preds - np.max(preds, axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    probs = exp_shifted / exp_shifted.sum(-1, keepdims=True)

    # Probability assigned to the positive class (column 1)
    positive_class_probs = probs[:, 1]

    # ROC AUC is computed from scores, not from hard class decisions
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs,
                                     references=labels)['roc_auc'],
                   3)

    # Hard decisions: the class with the largest logit
    predicted_classes = np.argmax(preds, axis=1)

    acc = np.round(accuracy.compute(predictions=predicted_classes,
                                    references=labels)['accuracy'],
                   3)

    return {"Accuracy": acc, "AUC": auc}


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

In [None]:
# Hyperparameters a reader is most likely to tune
num_epochs = 3
batch_size = 16
lr = 2e-5

training_args = TrainingArguments(
    output_dir="bert-imdb",
    num_train_epochs=num_epochs,
    learning_rate=lr,
    # same batch size for training and evaluation
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate, checkpoint and log on the same per-epoch cadence
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
)

In [None]:
# The tokenizer is passed via `processing_class`: the `tokenizer=` keyword is
# deprecated in recent transformers releases and emits a warning when the
# Trainer is constructed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["valid"],  # used by the per-epoch evaluation
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()

## Evaluate Model

In [None]:
trainer.evaluate()

In [None]:
# Spot-check: run prediction on four held-out test rows (indices 10..13).
preds = trainer.predict(tokenized['test'].select(range(10, 14)))

In [None]:
preds

PredictionOutput(predictions=array([[-0.24423042, -0.0832856 ],
       [-0.5016683 , -0.00799495],
       [-0.4922276 , -0.01885031],
       [-0.57780904, -0.05066704]], dtype=float32), label_ids=array([0, 1, 0, 1]), metrics={'test_loss': 0.6687073111534119, 'test_model_preparation_time': 0.0067, 'test_Accuracy': 0.5, 'test_AUC': 1.0, 'test_runtime': 10.9263, 'test_samples_per_second': 0.366, 'test_steps_per_second': 0.092})

In [None]:
# Recompute the metrics by hand from the raw prediction output.
logits, labels = preds.predictions, preds.label_ids
metrics = compute_metrics((logits, labels))
print(metrics)

{'Accuracy': 0.75, 'AUC': 0.333}


## Save Model

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import shutil

# Same directory name as the `output_dir` given to TrainingArguments above.
source_dir = "bert-imdb"
destination_dir = "/content/drive/MyDrive/bert-imdb"

# Copy the directory to Google Drive.
# dirs_exist_ok=True lets the copy proceed when the destination already exists.
shutil.copytree(source_dir, destination_dir, dirs_exist_ok=True)

Reload Model

# BERT Model Inference

## Model Definition

Import required modules

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

Define Model and Tokenizer

In [None]:
# Tokenizer and question-answering model are loaded from the same checkpoint id.
# NOTE: this rebinds `tokenizer` and `model`, replacing the objects created in
# the fine-tuning part of the notebook.
QA_CHECKPOINT = "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad"

tokenizer = AutoTokenizer.from_pretrained(QA_CHECKPOINT)
model = AutoModelForQuestionAnswering.from_pretrained(QA_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at google-bert/bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
model

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,)

## Input Preparation

In [None]:
# NOTE(review): the paragraph below contains the sentence "Tehran is in the Germany.",
# presumably planted to show that the model extracts a span from the given
# context rather than relying on world knowledge — confirm intent.
question = '''What is the capital of Germany?'''

paragraph = ''' Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to progressively improve their performance
                on a specific task. Machine learning algorithms build a mathematical model of sample data, known as "training data", in order to make predictions or
                decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in the applications of email filtering, detection
                of network intruders, and computer vision, where it is infeasible to develop an algorithm of specific instructions for performing the task. Machine learning
                is closely related to computational statistics, which focuses on making predictions using computers. Tehran is in the Germany. The study of mathematical optimization delivers methods,
                theory and application domains to the field of machine learning. Data mining is a field of study within machine learning, and focuses on exploratory
                data analysis through unsupervised learning.In its application across business problems, machine learning is also referred to as predictive analytics. '''

In [None]:
# Encode question and context as one sequence pair. Only the context (the
# second segment) is truncated, so an overly long paragraph cannot push the
# input past the model's 512 position embeddings while the question stays intact.
tokenized = tokenizer(
    text=question,
    text_pair=paragraph,
    truncation="only_second",
    max_length=512,
    return_tensors='pt',
)

tokenized

{'input_ids': tensor([[  101,  2054,  2003,  1996,  3007,  1997,  2762,  1029,   102,  3698,
          4083,  1006, 19875,  1007,  2003,  1996,  4045,  2817,  1997, 13792,
          1998,  7778,  4275,  2008,  3274,  3001,  2224,  2000, 20519,  5335,
          2037,  2836,  2006,  1037,  3563,  4708,  1012,  3698,  4083, 13792,
          3857,  1037,  8045,  2944,  1997,  7099,  2951,  1010,  2124,  2004,
          1000,  2731,  2951,  1000,  1010,  1999,  2344,  2000,  2191, 20932,
          2030,  6567,  2302,  2108, 12045, 16984,  2000,  4685,  1996,  4708,
          1012,  3698,  4083, 13792,  2024,  2109,  1999,  1996,  5097,  1997,
         10373, 22910,  1010, 10788,  1997,  2897, 22841,  2015,  1010,  1998,
          3274,  4432,  1010,  2073,  2009,  2003,  1999,  7959, 21369,  3468,
          2000,  4503,  2019,  9896,  1997,  3563,  8128,  2005,  4488,  1996,
          4708,  1012,  3698,  4083,  2003,  4876,  3141,  2000, 15078,  6747,
          1010,  2029,  7679,  2006,  

In [None]:
tokens = tokenizer.convert_ids_to_tokens(tokenized['input_ids'][0])
print(tokens)

['[CLS]', 'what', 'is', 'the', 'capital', 'of', 'germany', '?', '[SEP]', 'machine', 'learning', '(', 'ml', ')', 'is', 'the', 'scientific', 'study', 'of', 'algorithms', 'and', 'statistical', 'models', 'that', 'computer', 'systems', 'use', 'to', 'progressively', 'improve', 'their', 'performance', 'on', 'a', 'specific', 'task', '.', 'machine', 'learning', 'algorithms', 'build', 'a', 'mathematical', 'model', 'of', 'sample', 'data', ',', 'known', 'as', '"', 'training', 'data', '"', ',', 'in', 'order', 'to', 'make', 'predictions', 'or', 'decisions', 'without', 'being', 'explicitly', 'programmed', 'to', 'perform', 'the', 'task', '.', 'machine', 'learning', 'algorithms', 'are', 'used', 'in', 'the', 'applications', 'of', 'email', 'filtering', ',', 'detection', 'of', 'network', 'intruder', '##s', ',', 'and', 'computer', 'vision', ',', 'where', 'it', 'is', 'in', '##fe', '##asi', '##ble', 'to', 'develop', 'an', 'algorithm', 'of', 'specific', 'instructions', 'for', 'performing', 'the', 'task', '.',

## Model Inference

In [None]:
import torch

# Inference only: disable autograd so no computation graph is built or kept.
with torch.no_grad():
    output = model(**tokenized)

In [None]:
output.keys()

odict_keys(['start_logits', 'end_logits'])

In [None]:
output['start_logits']

tensor([[-3.7280, -1.3089, -6.8252, -4.8026, -5.9179, -6.6193, -2.7931, -8.3888,
         -3.7280, -7.3628, -8.0483, -8.3990, -7.5350, -8.3629, -8.5277, -8.2288,
         -7.3658, -7.9824, -8.4520, -7.3041, -8.6154, -6.8588, -7.4985, -8.4875,
         -7.2898, -7.9530, -8.0183, -8.2683, -7.9432, -7.7995, -8.2329, -7.6775,
         -8.5290, -8.3066, -8.1940, -7.5540, -8.5589, -7.5190, -7.9499, -8.0342,
         -8.1650, -8.1053, -7.1164, -7.9513, -8.5842, -7.5792, -7.6036, -8.5981,
         -8.0212, -8.6023, -7.8780, -7.2582, -7.7037, -8.3078, -8.2441, -8.3848,
         -8.3325, -8.1775, -8.0636, -7.1881, -8.7470, -7.5080, -8.1897, -8.3767,
         -7.8955, -7.7002, -8.5011, -8.1419, -8.6478, -7.8455, -8.6530, -7.6326,
         -8.0560, -8.1381, -8.3852, -8.0078, -8.4868, -8.0349, -7.6627, -8.6074,
         -7.0122, -8.0887, -8.5800, -7.7288, -8.4475, -7.5097, -8.2597, -8.5626,
         -8.5586, -8.4964, -7.2834, -7.9477, -8.4876, -7.9103, -7.6791, -8.5449,
         -8.0409, -8.3619, -

## Display Results

Import required modules

In [None]:
import torch

In [None]:
# Pick the highest-scoring span with start <= end. Taking the two argmaxes
# independently can give end < start, which would slice out an empty answer.
start_logits = output['start_logits'][0]  # single (question, context) pair
end_logits = output['end_logits'][0]

# span_scores[i, j] = start_logits[i] + end_logits[j]
span_scores = start_logits[:, None] + end_logits[None, :]

# Spans whose end lies before their start are invalid and must never win.
positions = torch.arange(span_scores.shape[-1], device=span_scores.device)
valid_span = positions[:, None] <= positions[None, :]
span_scores = span_scores.masked_fill(~valid_span, float('-inf'))

# argmax runs over the flattened (start, end) grid; unravel it to a pair of
# 0-dim tensors, the same kind of value torch.argmax returns.
num_tokens = span_scores.shape[-1]
best_span = torch.argmax(span_scores)
start = torch.div(best_span, num_tokens, rounding_mode='floor')
end = best_span % num_tokens

In [None]:
start, end

(tensor(129), tensor(129))

In [None]:
# Merge WordPiece pieces back into words (e.g. 'intruder', '##s' -> 'intruders').
# Joining the raw tokens with spaces would leave '##' markers in the answer.
answer = tokenizer.convert_tokens_to_string(tokens[start:end+1])
answer

'tehran'