In [None]:
!pip install -q accelerate==0.21.0 bitsandbytes==0.40.2 peft==0.4.0 transformers==4.31.0 trl==0.4.7

In [None]:
 import os
 import torch
 from datasets import load_dataset
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     BitsAndBytesConfig,
     HfArgumentParser,
     TrainingArguments,
     pipeline,
     logging,
 )
 from peft import LoraConfig, PeftModel
 from trl import SFTTrainer

In [None]:
#In the case of Llama 2, the following prompt template is used for the chat models:
#<s>[INST] <<SYS>> System Prompt <</SYS>> User Prompt [/INST] Model Answer </s>

In [None]:
import re

In [None]:
# Fetch the OpenAssistant Guanaco conversations (all available splits)
dataset = load_dataset("timdettmers/openassistant-guanaco")

Repo card metadata block was not found. Setting CardData to empty.


In [None]:
# Show the splits, feature names and row counts of the loaded dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 9846
    })
    test: Dataset({
        features: ['text'],
        num_rows: 518
    })
})


In [None]:
# Confirm which container type load_dataset returned
print(type(dataset))

<class 'datasets.dataset_dict.DatasetDict'>


In [None]:
# Inspect the first training record to see the raw '### Human: ... ### Assistant: ...' layout
print(dataset['train'][0])

{'text': '### Human: Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.### Assistant: "Monopsony" refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their employees. The presence of a monopsony can result in lower wages and reduced employment opportunities for workers, as the employer has little incentive to increase wages or provide better working conditions.\n\nRecent research has identified potential monopsonies in industries such as retail and fast food, where a few large companies control a significant portion of the market (Bivens & Mishel, 2013). In these industries, workers often face low wages, limited benefits, and reduced bargaining po

In [None]:
# Number of records in the training split
print(len(dataset["train"]))

9846


In [None]:
# Shuffle the training split with a fixed seed and keep the first 1000 rows.
# NOTE: `dataset` is rebound from the full DatasetDict to this single subset,
# so re-running this cell requires re-running the load cell first.
dataset = (
    dataset["train"]
    .shuffle(seed=42)
    .select(range(1000))
)

In [None]:
#Define a function to transform the data
def transform_conversation(example):
  """Rewrite one Guanaco-style record into the Llama 2 chat prompt format.

  The input text is a flat string of turns introduced by ``### Human:`` and
  ``### Assistant:``. Each human turn, together with the assistant turn that
  directly follows it (if any), becomes
  ``<s>[INST] {human} [/INST] {assistant} </s>``. A human turn with no
  following assistant turn becomes ``<s>[INST] {human} [/INST] </s>``.
  The pieces are concatenated without a separator.

  Args:
    example: mapping with a ``'text'`` key holding the raw conversation.

  Returns:
    dict with a single ``'text'`` key holding the reformatted conversation
    (an empty string when the input contains no human turn).
  """
  # Split on the role markers only, not on every '###', so a markdown heading
  # inside a message does not cut a turn in half. Because the pattern has one
  # capture group, the result is [preamble, role, body, role, body, ...].
  parts=re.split(r'###\s*(Human|Assistant):',example['text'])
  turns=[(role,body.strip()) for role,body in zip(parts[1::2],parts[2::2])]

  reformatted_segments=[]

  i=0
  while i<len(turns):
    role,human_text=turns[i]

    if role!='Human':
      # An assistant turn without a preceding human turn has no prompt to attach to.
      i+=1
      continue

    if i+1<len(turns) and turns[i+1][0]=='Assistant':
      assistance_text=turns[i+1][1]
      # Llama 2 instruction tags are [INST] ... [/INST]
      reformatted_segments.append(f'<s>[INST] {human_text} [/INST] {assistance_text} </s>')
      i+=2
    else:
      # Human turn with no corresponding assistant reply
      reformatted_segments.append(f'<s>[INST] {human_text} [/INST] </s>')
      i+=1

  return {'text':''.join(reformatted_segments)}


# Apply the prompt-template conversion to every record of the subset
transformed_dataset = dataset.map(function=transform_conversation)


In [None]:
#How to fine tune Llama 2

In [None]:
#Full fine-tuning is not possible here: we need a parameter-efficient fine-tuning (PEFT) technique like LoRA or QLoRA.
#To drastically reduce VRAM usage, we must fine-tune the model in 4-bit precision, which is why we'll use QLoRA here.

In [None]:
#Load a llama-2-7b-chat-hf model (chat model)
# Train it on the mlabonne/guanaco-llama2-1k dataset (1,000 samples), which will produce our fine-tuned model Llama-2-7b-chat-finetune
# QLoRA will use a rank of 64 with a scaling parameter of 16. We'll load the Llama 2 model directly in 4-bit precision using the NF4 type and train it for one epoch

In [None]:
# --- Model and data identifiers ---------------------------------------------

# Base model to fine-tune, pulled from the Hugging Face Hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# Instruction dataset identifier on the Hub
dataset_name = "mlabonne/guanaco-llama2-1k"

# Name under which the fine-tuned model is saved
new_model = "Llama-2-7b-chat-finetune"

# --- QLoRA parameters --------------------------------------------------------

# LoRA attention dimension (rank)
lora_r = 64

# LoRA alpha (scaling) parameter
lora_alpha = 16

# Dropout probability for the LoRA layers
lora_dropout = 0.1

# --- bitsandbytes parameters -------------------------------------------------

# Load the base model in 4-bit precision
use_4bit = True

# Name of the torch dtype used for computation with the 4-bit base model
# (looked up on the torch module by name later in the notebook)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Nested (double) quantization for the 4-bit base model
use_nested_quant = False

# --- TrainingArguments parameters --------------------------------------------

# Directory that receives model predictions and checkpoints
output_dir = "./results"

# Number of passes over the training data
num_train_epochs = 1

# Mixed-precision switches (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Per-GPU batch sizes for training and evaluation
per_device_train_batch_size = 4
per_device_eval_batch_size = 4

# Number of steps over which gradients are accumulated
gradient_accumulation_steps = 1

# Gradient checkpointing switch
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay applied to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer name
optim = "paged_adamw_32bit"

# Learning-rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Fraction of steps used for linear warmup (from 0 to learning_rate)
warmup_ratio = 0.03

# Group sequences of similar length into the same batch;
# saves memory and speeds up training considerably
group_by_length = True

# Checkpoint interval in update steps
save_steps = 0

# Logging interval in update steps
logging_steps = 25

# --- SFT parameters ----------------------------------------------------------

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples into one input sequence to increase efficiency
packing = False

# Device placement handed to from_pretrained; 'auto' leaves placement to the
# library rather than pinning the whole model to GPU 0
device_map = "auto"



In [None]:
#Load everything and start the fine-tuning process

In [None]:
# Load the instruction dataset named in the configuration cell.
# NOTE: the SFTTrainer cell in this notebook trains on `transformed_dataset`,
# not on the `dataset` loaded here.
dataset=load_dataset(dataset_name,split='train')

# Build the 4-bit quantization (QLoRA) configuration

# Resolve the dtype name from the configuration into the actual torch dtype
compute_dtype=getattr(torch, bnb_4bit_compute_dtype)

bnb_config=BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16: only a hint is printed, nothing is changed

if compute_dtype==torch.float16 and use_4bit:
  major, _ = torch.cuda.get_device_capability()
  if major >= 8:
    print('Your GPU supports bfloat16: accelerate training with bf16=True')


# Load base model.
# The checkpoint identifier is `model_name`; `model` does not exist until this
# assignment completes, so passing it here fails on a fresh kernel.

model=AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Adjust the model config for training
# (presumably: no generation cache while training, and the non-tensor-parallel
# code path - confirm against the Llama config documentation)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load the LLaMA tokenizer; the EOS token doubles as the padding token
# NOTE(review): trust_remote_code=True allows code from the Hub repo to run - confirm it is needed
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# LoRA adapter configuration
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)

# Training hyper-parameters, all taken from the configuration cell
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
)

# Supervised fine-tuning trainer, fed the locally reformatted dataset
trainer = SFTTrainer(
    model=model,
    train_dataset=transformed_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Run training
trainer.train()

# Persist the trained model under the configured name
trainer.model.save_pretrained(new_model)




Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,1.401
50,1.6566
75,1.2128
100,1.4448
125,1.1789
150,1.366
175,1.175
200,1.468
225,1.1581
250,1.5457


In [None]:
# Save the trainer's model into the `new_model` directory
trainer.save_model(output_dir=new_model)

In [None]:
# Only show CRITICAL messages from the transformers logger
logging.set_verbosity(logging.CRITICAL)

# Run a text-generation pipeline with the model we just trained
prompt = "What is Economics ?"

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)
# Wrap the question in the Llama 2 instruction tags before generating
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]["generated_text"])



<s>[INST] What is Economics ? [/INST] Economics is the social science that studies the production, distribution, exchange, and consumption of goods and services. It examines how individuals, businesses, governments, and other organizations make decisions about how to allocate resources and how to distribute goods and services.

Economics is divided into several branches, including:

1. Microeconomics: This branch of economics studies the behavior of individual consumers and firms in making decisions about how to allocate resources and how to produce goods and services.
2. Macroeconomics: This branch of economics studies the overall performance of an economy, including factors such as economic growth, inflation, and unemployment.
3. International economics: This branch of economics studies the interactions between countries and the global economy.
4. Development economics: This branch of economics studies the economic development of countries and the factors
