In [None]:
# !pip install -q bitsandbytes datasets accelerate loralib
# !pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git
# !pip install evaluate scikit-learn

In [1]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments
)

from datasets import load_dataset
from tqdm.autonotebook import tqdm
import torch
import numpy as np
import pandas as pd
import os
import evaluate

In [2]:
os.environ['WANDB_DISABLED'] = 'true'

In [3]:
bnb_config = BitsAndBytesConfig(
		load_in_4bit=True,
		bnb_4bit_quant_type='nf4',
		bnb_4bit_compute_dtype=torch.float16,
		bnb_4bit_use_double_quant=False,
	)

In [4]:
model_name='microsoft/phi-2'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"":0},
    trust_remote_code=True,
    num_labels=3,
    low_cpu_mem_usage=True
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# inference
def predict(text):
	inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
	with torch.no_grad():
		logits = model(**inputs).logits
	return torch.argmax(logits, dim=-1).item()

In [6]:
for name, param in model.named_parameters():
    if 'lora' not in name:
        param.requires_grad = False

In [7]:
def select_samples(dataset, step):
    dataset = dataset.select(range(0, len(dataset), step))
    return dataset

dataset = load_dataset('stanfordnlp/snli')
dataset['train'] = select_samples(dataset['train'], 550)
dataset['validation'] = select_samples(dataset['validation'], 100)
dataset['test'] = select_samples(dataset['test'], 100)

In [11]:
def preprocess_dataset(dataset):

    for split in dataset.keys():

        dataset[split] = dataset[split].rename_column('label', 'labels')
        dataset[split] = dataset[split].filter(lambda example: example['labels'] != -1)

        dataset[split] = dataset[split].map(
            lambda example: tokenizer(example['premise'], example['hypothesis'], padding='max_length', truncation=True, max_length=128),
            remove_columns=['premise', 'hypothesis']
        )
    return dataset


In [12]:
dataset = preprocess_dataset(dataset)

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1001 [00:00<?, ? examples/s]

Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

In [13]:
print(model)

PhiForSequenceClassification(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )

In [14]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

config = LoraConfig(
    r=16, #Rank
    lora_alpha=64,
    bias="none",
    lora_dropout=0.05,
    task_type="SEQ_CLS",
)

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

peft_model = get_peft_model(model, config)

In [15]:
print(peft_model)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): PhiForSequenceClassification(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2560)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x PhiDecoderLayer(
            (self_attn): PhiSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2560, out_features=2560, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2560, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lor

In [16]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 18357760
all model parameters: 1408634880
percentage of trainable model parameters: 1.30%


In [17]:
metric = evaluate.load('accuracy')

def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [18]:
output_dir = './peft-snli/final-checkpoint'
import transformers

peft_training_args = TrainingArguments(
    output_dir = output_dir,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    learning_rate=0.0001,
    logging_strategy="epoch",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    overwrite_output_dir=True,
)

peft_model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    args=peft_training_args
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [19]:
peft_training_args.device

device(type='cuda', index=0)

In [20]:
peft_model.config.pad_token_id = tokenizer.pad_token_id

In [21]:
peft_trainer.train()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
1,1.181,0.941513
2,0.7609,0.684881
3,0.489,0.56348
4,0.3457,0.548573
5,0.2554,0.521142


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=80, training_loss=0.60637948513031, metrics={'train_runtime': 1878.755, 'train_samples_per_second': 2.664, 'train_steps_per_second': 0.043, 'total_flos': 9747417346867200.0, 'train_loss': 0.60637948513031, 'epoch': 5.0})

![Example Image](./image.png)

### Thus the time taken in 30 min 55 sec

In [22]:
!zip -r model.zip '/kaggle/working/peft-snli'

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/peft-snli/ (stored 0%)
  adding: kaggle/working/peft-snli/final-checkpoint/ (stored 0%)
  adding: kaggle/working/peft-snli/final-checkpoint/checkpoint-32/ (stored 0%)
  adding: kaggle/working/peft-snli/final-checkpoint/checkpoint-32/optimizer.pt (deflated 8%)
  adding: kaggle/working/peft-snli/final-checkpoint/checkpoint-32/README.md (deflated 66%)
  adding: kaggle/working/peft-snli/final-checkpoint/checkpoint-32/scheduler.pt (deflated 56%)
  adding: kaggle/working/peft-snli/final-checkpoint/checkpoint-32/rng_state.pth (deflated 25%)
  adding: kaggle/working/peft-snli/final-checkpoint/checkpoint-32/adapter_config.json (deflated 53%)
  adding: kaggle/working/peft-snli/final-checkpoint/checkpoint-32/adapter_model.safetensors (deflated 7%)
  adding: kaggle/working/peft-snli/final-checkpoint/checkpoint-32/trainer_state.json (deflated 63%)
  adding: kaggle/working/peft-snli/final-checkpoint/checkpoint-32/training_args.bin (deflated 51%)
  adding: kaggle/working/peft