### Installing Dependencies

In [22]:
%pip install peft==0.8.2
# %pip install bitsandbytes==0.39.0
%pip install datasets transformers trl accelerate #bitsandbytes==0.43.0
%pip install -i https://pypi.org/simple/ bitsandbytes
# %pip install git+https://github.com/Keith-Hon/bitsandbytes-windows.git

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple/, https://pypi.ngc.nvidia.com
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;

### Importing Dependencies

In [1]:
import os
import torch
from datasets import load_dataset
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

[1714073944.062194] [c89130109ddd:107306:f]        vfs_fuse.c:281  UCX  ERROR inotify_add_watch(/tmp) failed: No space left on device


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Print the installed version of each core library (one per line) for reproducibility
import transformers, peft, trl, accelerate
import datasets

for library in (transformers, peft, trl, accelerate, datasets):
    print(library.__version__)

4.39.3
0.8.2
0.8.4
0.29.3
2.18.0


### Specifying Model to Finetune and Dataset file

In [3]:
# Hugging Face hub id of the base model to fine-tune
model_name = "NousResearch/Llama-2-7b-chat-hf"

# Instruction dataset files (train split, test split)
train_data, test_data = "/workspace/train.jsonl", "/workspace/test.jsonl"

# Location under which the fine-tuned model is stored
new_model = "/workspace/Llama-2-7b-metaphor-finetune"

### Specifying parameters to be used for QLoRA, SFT, Training

In [4]:
# ------------------------------------------------------------------------------
# QLoRA parameters
# ------------------------------------------------------------------------------

lora_r = 64         # LoRA attention dimension
lora_alpha = 16     # alpha parameter for LoRA scaling
lora_dropout = 0.1  # dropout probability for the LoRA layers

In [5]:
# ------------------------------------------------------------------------------
# bitsandbytes parameters
# ------------------------------------------------------------------------------

use_4bit = True                     # load the base model in 4-bit precision
bnb_4bit_compute_dtype = "float16"  # name of the torch dtype used for compute with the 4-bit model
bnb_4bit_quant_type = "nf4"         # quantization type: "fp4" or "nf4"
use_nested_quant = False            # nested (double) quantization for the 4-bit model

In [17]:
# ------------------------------------------------------------------------------
# TrainingArguments parameters
# ------------------------------------------------------------------------------

# --- Output and schedule length ---
output_dir = "/workspace/results"  # where model predictions and checkpoints are stored
num_train_epochs = 3               # number of training epochs
max_steps = -1                     # number of training steps (overrides num_train_epochs)

# --- Precision: enable fp16 or bf16 training (set bf16 to True with an A100) ---
fp16 = True
bf16 = False

# --- Batching and memory ---
per_device_train_batch_size = 16   # batch size per GPU for training
per_device_eval_batch_size = 16    # batch size per GPU for evaluation
gradient_accumulation_steps = 1    # update steps over which gradients are accumulated
gradient_checkpointing = True      # enable gradient checkpointing
group_by_length = True             # batch sequences of similar length (saves memory, speeds up training)

# --- Optimisation ---
max_grad_norm = 0.3                # maximum gradient norm (gradient clipping)
learning_rate = 2e-4               # initial learning rate (AdamW optimizer)
weight_decay = 0.001               # weight decay for all layers except bias/LayerNorm weights
optim = "paged_adamw_32bit"        # optimizer to use
lr_scheduler_type = "cosine"       # learning rate schedule
warmup_ratio = 0.03                # ratio of steps for a linear warmup (from 0 to learning rate)

# --- Checkpointing and logging ---
save_steps = 0                     # save a checkpoint every X update steps
logging_steps = 25                 # log every X update steps

In [18]:
# ------------------------------------------------------------------------------
# SFT parameters
# ------------------------------------------------------------------------------

max_seq_length = None  # maximum sequence length to use
packing = False        # pack multiple short examples into one input sequence for efficiency
device_map = {"": 0}   # load the entire model on GPU 0

from trl import DataCollatorForCompletionOnlyLM

### Loading Tokenizer

In [19]:
# Fetch the tokenizer that belongs to the base model
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Pad with the end-of-sequence token, on the right-hand side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

### Loading dataset

In [20]:
# Load the "train" and "test" splits from their respective JSON files
dataset = load_dataset(
    "json",
    data_files={"train": train_data, "test": test_data},
)

### Initialising Model, setting PEFT and training parameters

In [21]:
# Turn the configured dtype name (e.g. "float16") into the matching torch dtype object
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings handed to the model loader
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
)

# When running 4-bit with float16 compute, point out that the GPU could use bf16 instead
# (reported for devices whose major compute capability is 8 or higher)
if use_4bit and compute_dtype == torch.float16:
    capability_major = torch.cuda.get_device_capability()[0]
    if capability_major >= 8:
        separator = "=" * 80
        print(separator)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print(separator)

# Load the base model with the quantization config, placed according to device_map
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)
# Config overrides for training: disable use_cache and set pretraining_tp to 1
model.config.use_cache = False
model.config.pretraining_tp = 1



# LoRA adapter configuration built from the QLoRA parameters
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)

# Build the TrainingArguments from the values configured in the
# "TrainingArguments parameters" cell. per_device_eval_batch_size and
# gradient_checkpointing are forwarded as well, so every configured value
# actually takes effect instead of being silently ignored.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"  # training metrics are reported to TensorBoard
)

# Supervised fine-tuning trainer: trains on the "text" field of the train split,
# using the LoRA config and the training arguments defined above
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=packing,
)

Your GPU supports bfloat16: accelerate training with bf16=True


Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.85s/it]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


### Training

In [22]:
# Run the fine-tuning loop, then write the trained model held by the trainer
# to the path in `new_model` (it is later loaded from there via PeftModel).
trainer.train()
trainer.model.save_pretrained(new_model)



Step,Training Loss
25,4.411
50,1.6888
75,1.118
100,0.8436
125,1.0258
150,0.8163
175,0.9865
200,0.7981
225,0.9466
250,0.8597


In [23]:
# Saves the trained model to `new_model` again; the training cell already performs
# the same save right after trainer.train(), so this only matters when re-saving manually.
trainer.model.save_pretrained(new_model)

### Testing Model

In [24]:
# Only let CRITICAL messages through the transformers logger
logging.set_verbosity(logging.CRITICAL)

# Load the base model again in FP16, attach the weights saved under `new_model`
# and merge them into the base model
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map=device_map,
    low_cpu_mem_usage=True,
    return_dict=True,
)
model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

# Fresh tokenizer for the merged model, configured the same way as for training
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.94s/it]


In [25]:
def format_answer(text):
    """Reduce a generated completion to a short answer.

    The text is cut at the first "." and then at the first ",". If the
    remainder contains the standalone word "and" (any casing), only its first
    word is kept, so "sweet and sour" becomes "sweet".

    Args:
        text: Raw completion string.

    Returns:
        The shortened answer string (may be empty if `text` is empty).
    """
    # split() returns the whole string when the separator is absent, so no
    # membership check is needed before cutting.
    text = text.split(".")[0].split(",")[0]

    # Compare whole words: a substring test would also fire on words such as
    # "handsome" or "grand" and wrongly truncate multi-word answers.
    words = text.split()
    if any(word.lower() == "and" for word in words):
        # Whitespace-splitting also ignores a leading space, which would
        # otherwise yield an empty first token.
        text = words[0]
    return text

def custom_test(word1,word2):
    """Ask the model which word fills the blank in "<word1> is as [blank] as <word2>.".

    `word1` is capitalized and `word2` lower-cased before they are placed in the
    prompt. The prompt is wrapped in "<s>[INST] ... [/INST]", run through a
    text-generation pipeline built from the notebook-level `model` and
    `tokenizer`, and the text following "[/INST]" is shortened by format_answer.

    Args:
        word1: Phrase placed before "is as [blank] as".
        word2: Phrase placed after "is as [blank] as".

    Returns:
        The formatted answer string.
    """
    comparison = word2.lower()
    subject = word1.capitalize()
    question = f"{subject} is as [blank] as {comparison}. Which word can replace the [blank] token?"

    # A new pipeline is created on every call
    generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
    generations = generator(f"<s>[INST] {question} [/INST]")

    # Keep what follows the instruction marker, minus its first character
    completion = generations[0]['generated_text'].split("[/INST]")[1]
    return format_answer(completion[1:])

In [26]:
# Single-example check: which word fills the [blank] for these two phrases?
custom_test("Ice cream on a summer day"," the forbidden fruit")

'tempting'

In [None]:
# Generate one formatted, capitalized prediction per test instance.
# Results are collected in `test_llama_v1`, in the order of dataset["test"].
test_llama_v1 = []
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
for index, instance in enumerate(dataset["test"]):
    text = instance['text']

    # Split at the LAST "/INST]" marker: everything up to and including it is the
    # prompt, the rest (after one separator character) is the reference answer.
    marker_pos = text.rfind("/INST]")
    if marker_pos == -1:
        # Without this check a missing marker would leave `query` unbound on the
        # first instance, or silently reuse the previous instance's prompt.
        raise ValueError(f"Test instance {index} has no '/INST]' marker: {text!r}")
    query = text[:marker_pos + 6]
    target = text[marker_pos + 7:]  # reference answer; not used for generation

    result = pipe(query)

    # Keep what follows the instruction marker in the generation, minus its first character
    output = result[0]['generated_text'].split("[/INST]")[1][1:]
    test_llama_v1.append(format_answer(output).capitalize())

In [74]:
# Persist the generated predictions to disk with pickle
import pickle

with open('/workspace/test_llama_v1', 'wb') as out_file:
    pickle.dump(test_llama_v1, out_file)

In [78]:
# Read the predictions back to verify the dump; the cell displays how many were stored.
# pickle.load can execute code embedded in the file, so only use it on files this notebook wrote.
with open('/workspace/test_llama_v1', 'rb') as in_file:
    L = pickle.load(in_file)
len(L)

850

### Clear VRAM

In [81]:
# Empty VRAM
import gc

# Drop every notebook-level reference to the GPU-resident objects. `base_model`
# is included because it was loaded onto the GPU in the merge cell and is still
# bound. pop() (instead of del) keeps the cell re-runnable when a name is
# already gone.
for _name in ("model", "base_model", "pipe", "trainer"):
    globals().pop(_name, None)

# Collect the now-unreferenced objects first ...
gc.collect()
gc.collect()
# ... then release the unoccupied memory cached by PyTorch's CUDA allocator.
torch.cuda.empty_cache()

0