### Install Requirements

In [None]:
# !pip install -q bitsandbytes datasets accelerate loralib
# !pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build w

### Model loading

We will be using [Phi-2](https://huggingface.co/microsoft/phi-2) Model

In [1]:
import os
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model_id = "microsoft/phi-2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Post-processing on the model

Then we have to apply some preprocessing to the model to prepare it for training. For that use the `prepare_model_for_kbit_training` method from PEFT.

In [2]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

### Apply LoRA

In [3]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [4]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "Wqkv",
        "fc1",
        "fc2",
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 6553600 || all params: 1527946240 || trainable%: 0.42891561420380864


In [41]:
from datasets import load_dataset

class Dataset:
    
	def __init__(self):
		self.dataset_id = "stanfordnlp/snli"
		self.dataset = load_dataset(self.dataset_id)

	def preprocess_dataset(self, sample, test=False):

		INSTRUCTION = "### Instruct: You are given premise and hypothesis. You have to predict the relationship between them. You have to give a one word answer from [entailment, contradiction, or neutral] representing the relationship between the hypothesis and premise."
		PREMISE = f"Premise: {sample['premise']}"
		HYPOTHESIS = f"Hypothesis: {sample['hypothesis']}"
		OUTPUT = f"### Output: {sample['label']}"
		if test:
			OUTPUT = "### Output: "

		return {"prompt": f"{INSTRUCTION}\n{PREMISE}\n{HYPOTHESIS}\n{OUTPUT}"}

	def get_train_dataset(self):

		dataset = self.dataset['train']
		# select every 1000th example
		dataset = dataset.select(range(0, len(dataset), 1000))
		dataset = dataset.map(lambda x: self.preprocess_dataset(x), remove_columns=['premise', 'hypothesis', 'label'])
		data = dataset.map(lambda x: tokenizer(x['prompt']), batched=True)
		return data

		# print(dataset)
	
	def get_validation_dataset(self):
		df = pd.read_parquet(f"{self.dataset_id}/{self.dataset_splits['validation']}")
		df = df.iloc[::100]
		df['prompt'] = df.map(self.preprocess_dataset)
		return df['prompt']
		
	def get_test_dataset(self):
		df = pd.read_parquet(f"{self.dataset_id}/{self.dataset_splits['test']}")
		df = df.iloc[::100]
		df['prompt'] = df.apply(self.preprocess_dataset, axis=1, test=True)
		return df['prompt']

In [45]:
train_data = Dataset().get_train_dataset()

### Training

In [46]:
import transformers

tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [47]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

  0%|          | 0/10 [00:00<?, ?it/s]

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


{'loss': 2.6956, 'grad_norm': 0.45128709077835083, 'learning_rate': 0.0001, 'epoch': 0.01}
{'loss': 2.75, 'grad_norm': 0.5430135130882263, 'learning_rate': 0.0002, 'epoch': 0.01}
{'loss': 2.6976, 'grad_norm': 0.5069280862808228, 'learning_rate': 0.000175, 'epoch': 0.02}
{'loss': 2.4346, 'grad_norm': 0.7234230041503906, 'learning_rate': 0.00015000000000000001, 'epoch': 0.03}
{'loss': 2.4771, 'grad_norm': 1.017820954322815, 'learning_rate': 0.000125, 'epoch': 0.04}
{'loss': 2.3196, 'grad_norm': 1.4494836330413818, 'learning_rate': 0.0001, 'epoch': 0.04}
{'loss': 2.1583, 'grad_norm': 1.2612639665603638, 'learning_rate': 7.500000000000001e-05, 'epoch': 0.05}
{'loss': 2.0616, 'grad_norm': 1.2225252389907837, 'learning_rate': 5e-05, 'epoch': 0.06}
{'loss': 2.0148, 'grad_norm': 1.552794337272644, 'learning_rate': 2.5e-05, 'epoch': 0.07}
{'loss': 2.0079, 'grad_norm': 1.530984878540039, 'learning_rate': 0.0, 'epoch': 0.07}
{'train_runtime': 47.9152, 'train_samples_per_second': 0.835, 'train_ste

TrainOutput(global_step=10, training_loss=2.3616942882537844, metrics={'train_runtime': 47.9152, 'train_samples_per_second': 0.835, 'train_steps_per_second': 0.209, 'total_flos': 53496273285120.0, 'train_loss': 2.3616942882537844, 'epoch': 0.07259528130671507})