<a href="https://colab.research.google.com/github/mehedihasanbijoy/How-to-HuggingFace/blob/main/Efficient_Ways_of_LLMs_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- GPU must be enabled to run this notebook.
- This is based on https://github.com/jonkrohn/NLP-with-LLMs/blob/main/code/IMDB-GPU-demo.ipynb

In [1]:
%%capture
!pip install transformers==4.25.1 datasets==2.4.0 nvidia-ml-py3==7.352.0 lime torchvision

In [2]:
# PyTorch:
import torch
import torch.nn.functional as F

# Hugging Face:
from datasets import load_dataset
from transformers import (
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    GPT2ForSequenceClassification,
    GPT2Tokenizer,
)

# NVIDIA Management Library (for tracking GPU usage):
from pynvml import (
    nvmlInit,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetCount,
    nvmlDeviceGetName,
)

# Explainability:
from lime.lime_text import LimeTextExplainer
import random

### Utilities for GPU benchmarking

In [3]:
def print_gpu_utilization():
    nvmlInit()
    deviceCount = nvmlDeviceGetCount()
    for i in range(deviceCount):
        handle = nvmlDeviceGetHandleByIndex(i)
        info = nvmlDeviceGetMemoryInfo(handle)
        print("Device", i, ":", nvmlDeviceGetName(handle))
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    torch.cuda.empty_cache()

def print_summary(result):
    print(f"Training Loss: {result.training_loss:.4f}")
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

Set device

In [4]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device", DEVICE)

Using device cuda


In [5]:
print_gpu_utilization()

Device 0 : b'Tesla T4'
GPU memory occupied: 260 MB.


In [6]:
torch.ones((1, 1)).to(DEVICE) # send a tensor to the GPU
print_gpu_utilization()

Device 0 : b'Tesla T4'
GPU memory occupied: 362 MB.


## Load model

In [7]:
hf_model_name = "microsoft/DialogRPT-updown"

Tokenizer

In [8]:
tokenizer = GPT2Tokenizer.from_pretrained(
    hf_model_name,
    model_max_length=1024,
    cache_dir="./cache/"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

Model

In [9]:
model = GPT2ForSequenceClassification.from_pretrained(
    hf_model_name,
    num_labels=2, # binary outcome: positive or negative review
    ignore_mismatched_sizes=True, # allows us to have two output neurons on model pre-trained with one
    cache_dir="./cache/",
).to(DEVICE)

model.config.use_cache = False

pytorch_model.bin:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at microsoft/DialogRPT-updown and are newly initialized because the shapes did not match:
- score.weight: found shape torch.Size([1, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model.config

GPT2Config {
  "_name_or_path": "microsoft/DialogRPT-updown",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2ForSequenceClassification"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "n_special": 0,
  "pad_token_id": 50256,
  "predict_special_tokens": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.25.1",
  "use_cache": false,
  "vocab_size": 502

In [11]:
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=1024, out_features=2, bias=False)
)

In [12]:
print_gpu_utilization()

Device 0 : b'Tesla T4'
GPU memory occupied: 1740 MB.


# Load and preprocess data

In [13]:
split = ["train[:25]", "test[:25]"]

raw_train, raw_test = load_dataset(
    "imdb",
    split=split,
    cache_dir="./cache/",
)

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to ./cache/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to ./cache/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
len(raw_train)

25

In [15]:
raw_train[2]["text"]

"If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.<br /><br />One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).<br /><br />One might better spend one's time staring out a window at a tree growing.<br /><br />"

In [16]:
raw_train[2]["label"]

0

In [18]:
for i in range(25):
  print(raw_train[i]["label"])

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [19]:
tokenizer.encode(raw_train[2]["text"])

[1532,
 691,
 284,
 3368,
 1642,
 428,
 2099,
 286,
 2646,
 287,
 262,
 2003,
 13,
 770,
 2646,
 318,
 3499,
 355,
 281,
 6306,
 475,
 4952,
 645,
 43072,
 298,
 1621,
 29847,
 1671,
 1220,
 6927,
 1671,
 11037,
 3198,
 1244,
 1254,
 41276,
 329,
 5586,
 33834,
 340,
 780,
 340,
 18105,
 319,
 523,
 867,
 30023,
 9863,
 8643,
 2428,
 475,
 340,
 857,
 523,
 1231,
 597,
 22024,
 540,
 20289,
 13,
 383,
 19091,
 2058,
 1497,
 351,
 645,
 649,
 22582,
 357,
 25252,
 530,
 2058,
 510,
 351,
 530,
 981,
 530,
 338,
 2000,
 11569,
 364,
 11,
 355,
 340,
 481,
 31338,
 466,
 1141,
 428,
 27158,
 2646,
 737,
 27,
 1671,
 1220,
 6927,
 1671,
 11037,
 3198,
 1244,
 1365,
 4341,
 530,
 338,
 640,
 16143,
 503,
 257,
 4324,
 379,
 257,
 5509,
 3957,
 29847,
 1671,
 1220,
 6927,
 1671,
 11037]

In [20]:
train = raw_train.map(
    lambda x: {"input_ids": tokenizer(x["text"], truncation=True, padding="max_length")["input_ids"]},
    batched=True,
    remove_columns=['text'],
)

test = raw_test.map(
    lambda x: {"input_ids": tokenizer(x["text"], truncation=True, padding="max_length")["input_ids"]},
    batched=True,
    remove_columns=['text'],
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [21]:
train

Dataset({
    features: ['label', 'input_ids'],
    num_rows: 25
})

In [23]:
train.column_names

['label', 'input_ids']

In [22]:
test

Dataset({
    features: ['label', 'input_ids'],
    num_rows: 25
})

# Fine-tune Mode

#### Vanilla Training