<a href="https://colab.research.google.com/github/pavankumarbalijepalli/pr-phi2-vs-defog/blob/main/notebooks/fine_tune_phi_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing Packages and Imports

In [1]:
!pip install datasets
!pip install transformers
!pip install peft
!pip install huggingface_hub
!pip install trl
!pip install bitsandbytes
!pip install accelerate
!pip install einops



In [2]:
import os
from dataclasses import dataclass, field
from typing import Optional

import torch
from datasets import load_dataset
from datasets import load_from_disk
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
)
from tqdm.notebook import tqdm

from trl import SFTTrainer
from huggingface_hub import interpreter_login



In [3]:
# Interactive Hugging Face Hub login is left disabled; uncomment to authenticate.
# interpreter_login()

# Half-precision dtype kept as a notebook-level setting.
compute_dtype = torch.float16

## Inference on Base Model

In [4]:
# Baseline check (disabled): every statement in this cell is commented out.
# Uncommenting it loads the unquantized base model and its tokenizer with CUDA
# set as the default torch device.
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer

# torch.set_default_device("cuda")

# model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype="auto", trust_remote_code=True)
# tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Baseline generation (disabled): tokenizes a single prompt, generates up to
# 200 tokens and prints the decoded text.
# NOTE(review): the text shown under this cell is presumably saved output from
# an earlier run made while these lines were still active.
# inputs = tokenizer('''Write a poem on elephant''', return_tensors="pt", return_attention_mask=False)

# outputs = model.generate(**inputs, max_length=200)
# text = tokenizer.batch_decode(outputs)[0]
# print(text)

Write a poem on elephant conservation.

Answer: Elephants are gentle giants,
Their presence is truly grand.
But their numbers are dwindling,
And we must take action, hand in hand.

Exercise 3: Write a poem on the importance of recycling.

Answer: Reduce, reuse, recycle,
It's the key to a sustainable world.
We must take care of our planet,
And make sure it's not unfurled.

Exercise 4: Write a poem on the beauty of nature.

Answer: The sun sets on the horizon,
Painting the sky with hues of gold.
The trees sway in the gentle breeze,
And the birds sing their sweetest song.

Exercise 5: Write a poem on the importance of kindness.

Answer: Kindness is a gift we give,
It's a way to show we care.
It can brighten up someone's day


## Fine-tuning


In [19]:
# Map the root module ("") to device index 0, i.e. keep the whole model on one device.
device_map = {"": 0}

# bitsandbytes settings: 4-bit loading with the NF4 quantization type,
# bfloat16 as the compute dtype, and double quantization switched off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="bfloat16",
    bnb_4bit_use_double_quant=False,
)

In [20]:
# Load Phi-2 with the quantization settings from `bnb_config`, placed according
# to `device_map`.
#
# `microsoft/phi-2` is a public checkpoint, so no Hub token is requested here.
# The previous `use_auth_token=True` is deprecated and made loading fail on a
# fresh runtime, because it demands a stored token while `interpreter_login()`
# is commented out. A cached token, if one exists, is still picked up
# automatically by huggingface_hub.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
)

# NOTE(review): `pretraining_tp` looks like a leftover from Llama fine-tuning
# recipes; it is kept unchanged here. Confirm whether Phi reads it at all.
model.config.pretraining_tp = 1



ImportError: ignored

In [None]:
# LoRA adapter configuration (rank 32, alpha 16, dropout 0.1, no bias training).
#
# Adapters are attached to the attention projections (q_proj, k_proj, v_proj,
# dense) and the MLP projections (fc1, fc2) of every decoder block, which is
# the usual LoRA placement. The previous targets, 'lm_head.linear' and
# 'transformer.embd.wte', covered only the output head and the token embedding,
# and use the module names of the legacy remote-code Phi layout; PEFT raises
# "target modules not found" when those names are absent from the loaded model.
# NOTE(review): if an old checkpoint revision with the legacy layout is pinned,
# the equivalent names there are presumably Wqkv / out_proj / fc1 / fc2.
# Check with `print(model)`.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=32,
    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"],
    bias="none",
    task_type="CAUSAL_LM",
)

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
# Reuse the end-of-sequence token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Trainer hyper-parameters. Effective batch size is
# per_device_train_batch_size * gradient_accumulation_steps = 4 sequences per device.
training_arguments = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    # Checkpoint interval in optimizer steps. Raise it to save less often;
    # saving more often mainly costs disk space.
    save_steps=500,
    logging_steps=10,
    learning_rate=2e-4,
    # NOTE(review): bf16 needs a GPU with bfloat16 support; on older GPUs
    # (for example the Colab T4) switch to fp16=True / bf16=False. Confirm the
    # target hardware.
    fp16=False,
    bf16=True,
    max_grad_norm=.3,
    max_steps=10000,
    warmup_ratio=.03,
    group_by_length=True,
    # The plain "constant" schedule ignores warmup, which made `warmup_ratio`
    # above a no-op. "constant_with_warmup" ramps the learning rate up over
    # the first 3% of steps and then holds it constant.
    lr_scheduler_type="constant_with_warmup",
)

# The generation KV cache is not needed during training, so it is turned off.
model.config.use_cache = False

In [None]:
# Placeholder path: point this at your own JSON / JSON-lines training file.
DATA_FILE = "your_dataset.json"
# Name of the column that holds the training text for supervised fine-tuning.
TEXT_FIELD = "text"

dataset = load_dataset("json", data_files=DATA_FILE, split="train")

# Fail early with a readable message instead of an opaque error from inside
# the trainer when the file does not have the expected column.
if TEXT_FIELD not in dataset.column_names:
    raise ValueError(
        f"{DATA_FILE!r} has no {TEXT_FIELD!r} column; found columns: {dataset.column_names}"
    )

# Supervised fine-tuning of the quantized base model with the LoRA adapters
# described by `peft_config`; each example is trained on individually (no packing).
# NOTE(review): newer trl releases move `dataset_text_field`, `max_seq_length`
# and `packing` into `SFTConfig` and rename `tokenizer`; confirm against the
# installed trl version.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field=TEXT_FIELD,
    max_seq_length=2048,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

trainer.train()