In [1]:
# Bert(BERT (Bidirectional Encoder Representations from Transformers)) (Encoder only ) Use Cases : text classification, NER, QA, embeddings, etc.
'''
Original BERT: Base, Large (bert-base-uncased(all token convert to lower case) , cased(token case sensitive), bert-large-uncased,cased)

Domain-Specific: BioBERT, SciBERT, FinBERT

Task-Specific Fine-Tuned: Classification, NER, QA

Lightweight / Efficient: DistilBERT, ALBERT, TinyBERT, MobileBERT

Multilingual / Language-specific: mBERT, XLM-R

Advanced Pretraining Variants: RoBERTa, Electra, DeBERTa, SpanBERT'''
from transformers import BertModel,BertTokenizer

model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

text = "Transformers are amazing for NLP Tasks"

inputs = tokenizer(text,padding=True,truncation=True,return_tensors="pt")  #Some sentences are short → need padding to match the model’s input size.

# Some sentences are too long → need truncation to fit the model’s maximum length (usually 512 tokens).
output = model(**inputs)
print(output[0].shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

torch.Size([1, 9, 768])


In [2]:
'''Text Classifications '''
from transformers import BertForSequenceClassification
import torch

#for binary classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
text = "I love NLP"
inputs = tokenizer(text,padding=True,truncation=True,return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)
output = model(**inputs,labels=labels)
loss=output.loss
logits = output.logits

#Two numbers correspond to class 0 and class 1 scores.
#Higher score → predicted class.
predicted_class = torch.argmax(logits, dim=1)
print(predicted_class)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([1])


In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

sentence = "Hugging Face is based in New York City."
print(ner_pipeline(sentence))



config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


[{'entity_group': 'ORG', 'score': np.float32(0.76597357), 'word': 'Hugging Face', 'start': 0, 'end': 12}, {'entity_group': 'LOC', 'score': np.float32(0.9994878), 'word': 'New York City', 'start': 25, 'end': 38}]


In [22]:
from transformers import BertForQuestionAnswering, BertTokenizer
import torch

# Load model + tokenizer together
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Question + context
question = "Where is Hugging Face located?"
context = "Hugging Face is an AI company mostly working on recent tech. It is establish in New York."

# IMPORTANT: tokenize before calling the model
inputs = tokenizer(question, context, return_tensors="pt")

# Run model
outputs = model(**inputs)

# Get answer span
answer_start = torch.argmax(outputs.start_logits)  # start index
answer_end = torch.argmax(outputs.end_logits) + 1  # end index

# Decode tokens back to string
token_ids = inputs["input_ids"][0][answer_start:answer_end]
answer = tokenizer.decode(token_ids)
print("Answer:", answer)


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Answer: new york


In [32]:
# Feature extraction / Embedding
from transformers import BertModel,AutoModel
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
text = "bert embeddings are powerfull"
inputs = tokenizer(text,return_tensors="pt")
with torch.no_grad():
  output = model(**inputs)
token_embbending = output.last_hidden_state
sentence_embedding = output.pooler_output
print(sentence_embedding.shape)

torch.Size([1, 384])


In [33]:
#MaskModel

from transformers import BertForMaskedLM

model = BertForMaskedLM.from_pretrained("bert-base-uncased")

text = "Transformers are [MASK] for NLP tasks."
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)

predictions = torch.argmax(outputs.logits, dim=-1)
predicted_token = tokenizer.decode(predictions[0])
print(predicted_token)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


.. are used for nlp tasks..


In [35]:
# Sentence Pair Tasks (e.g., NLI / Similarity)
from transformers import BertForSequenceClassification

# Example: MNLI (3 classes)
model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-MNLI")

sentence1 = "A man inspects the uniform of a figure in some East Asian country."
sentence2 = "The man is moving in asian country."

inputs = tokenizer(sentence1, sentence2, return_tensors="pt")
outputs = model(**inputs)

logits = outputs.logits
predicted_class = torch.argmax(logits, dim=1)
print(predicted_class)

#This prints the predicted label:

# 0 → contradiction

# 1 → neutral

# 2 → entailment

tensor([2])


In [None]:
# Fine Tuning Bert

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)
'''
1. output_dir="./bert-finetuned"
What: Folder where model checkpoints and training logs will be saved.
When to adjust: Always set a unique path for each training run.

2. num_train_epochs=3
What: Number of full passes through the training dataset.
Typical Values:
Classification: 3–5
QA / NER: 2–4 (overfitting is easy on small datasets)
Large datasets: 1–2 might be enough
Tip: Use early stopping or monitor validation metrics to avoid overfitting.

3. per_device_train_batch_size=8
What: Batch size per GPU/CPU during training.
Typical Values:
16 or 32 for faster training (if GPU memory allows)
8 is safe for most 8–12 GB GPUs
Tip: Increase if you have more VRAM. If OOM (out of memory) errors occur, reduce it.

4. per_device_eval_batch_size=8
What: Batch size per device during evaluation.
Tip: Can be larger than train_batch_size since eval doesn’t do backpropagation.
Often: 16 or 32 for faster evaluation if memory allows

5. evaluation_strategy="epoch"
What: How often to run evaluation on the validation set.
Options:
"no": No eval
"steps": Every X steps (set via eval_steps)
"epoch": At the end of each epoch
When to use:
"epoch": Simple and good for small to medium datasets
"steps": Use for large datasets where epochs are too long

6. save_strategy="epoch"
What: When to save model checkpoints.
Same options as above:
"epoch" or "steps"
Tip: Match with evaluation_strategy for consistency.
Use "steps" for longer training where checkpoints are needed more frequently.

7. learning_rate=5e-5
What: How quickly the model updates its weights.
Typical Ranges:
Classification: 2e-5 to 5e-5
QA / NER: 1e-5 to 3e-5
Fine-tuning large models: lower learning rates
Tips:
Lower = safer but slower
Try warm-up and schedulers (HF handles by default)

8. weight_decay=0.01
What: Regularization — prevents overfitting by penalizing large weights.
Typical Values:
0.0 (none) to 0.1 (strong)
Common default: 0.01
Tip: Helps generalization, especially with small datasets.

9. logging_dir="./logs"
What: Where to save TensorBoard logs.
Tip: Set this if you want to monitor training with TensorBoard:

10. logging_steps=50
What: Log training loss every X steps.
Tip:
Small = more frequent logs (good for small datasets)
Too small can clutter output; 50–200 is common

11. load_best_model_at_end=True
What: Load the model checkpoint with the best eval score (not just the last one).
When to use: Always a good idea when doing model selection.
Requires: evaluation_strategy and metric_for_best_model to be set.

12. metric_for_best_model="accuracy"
What: Which validation metric to track for picking the "best" model.
Options: Depends on your custom compute_metrics() function.
Classification: "accuracy", "f1"
QA: "f1" or "exact_match"
NER: "f1"
'''

In [38]:
'''. BERT Attention Basics

BERT has 12 layers (base) or 24 layers (large).

Each layer has multi-head self-attention (e.g., 12 heads in BERT-base).

You can access each layer’s attention using output_attentions=True in the forward pass.'''

from transformers import BertConfig, BertModel

# Custom config
config = BertConfig(
    num_hidden_layers=8,
    num_attention_heads=8,
    hidden_size=512,
    intermediate_size=2048,
    hidden_dropout_prob=0.2
)

model = BertModel(config)

config = BertConfig.from_pretrained("bert-base-uncased")
print(config)


'''"attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.55.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522'''

''' | Parameter             | Default (bert-base) | Small Dataset / Lightweight | Large Dataset / High Performance | Notes
| --------------------- | ------------------- | --------------------------- | -------------------------------- | ------------------------------------------------------------------------------------- |
| `hidden_size`         | 768                 | 256–512                     | 1024+                            | Smaller hidden size → fewer parameters → less chance of overfitting on small datasets |
| `num_hidden_layers`   | 12                  | 4–8                         | 24 (like BERT-large)             | Fewer layers → faster training, less overfitting on small data                        |
| `num_attention_heads` | 12                  | 4–8                         | 16–32                            | Must divide `hidden_size` evenly. Smaller datasets → fewer heads is enough            |
| `intermediate_size`   | 3072                | 1024–2048                   | 4096+                            | Size of FFN in each layer. Smaller dataset → smaller intermediate size                |

| Parameter                      | Default | Small Dataset | Large Dataset | Notes                                                       |
| ------------------------------ | ------- | ------------- | ------------- | ----------------------------------------------------------- |
| `hidden_dropout_prob`          | 0.1     | 0.2–0.3       | 0.1           | Higher dropout for small datasets to prevent overfitting    |
| `attention_probs_dropout_prob` | 0.1     | 0.2           | 0.1           | Same as above                                               |
| `classifier_dropout`           | null    | 0.2–0.3       | 0.1           | Only for sequence classification head, helps generalization |

| Parameter                 | Default | Small Dataset / Short Sequences | Large Dataset / Long Sequences | Notes                                                                  |
| ------------------------- | ------- | ------------------------------- | ------------------------------ | ---------------------------------------------------------------------- |
| `max_position_embeddings` | 512     | 128–256                         | 512–1024                       | Smaller max length → faster training. Increase if input texts are long |
| `type_vocab_size`         | 2       | Usually keep 2                  | Keep 2                         | Only changes if you have special token types                           |
| `pad_token_id`            | 0       | Keep                            | Keep                           | Standard padding token ID                                              |


| Parameter                | Default | Small Dataset | Large Dataset           | Notes                                             |
| ------------------------ | ------- | ------------- | ----------------------- | ------------------------------------------------- |
| `initializer_range`      | 0.02    | Keep          | Keep                    | Weight initialization                             |
| `layer_norm_eps`         | 1e-12   | Keep          | Keep                    | Stable training                                   |
| `gradient_checkpointing` | False   | False         | True if memory is tight | Saves memory on large models, slows down training |
| `use_cache`              | True    | Keep          | Keep                    | Mostly relevant for generation tasks              |


'''



BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.55.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

