# Fine-tuning llama-2 for our task

In [2]:
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, PeftModel
import transformers
from datasets import load_dataset
import re

  from .autonotebook import tqdm as notebook_tqdm


# Loading the Base Model to Fine-tune

In [7]:
# Base checkpoint and local cache location, named once so the model and the
# tokenizer are always loaded from the same source.
BASE_MODEL_ID = "meta-llama/Llama-2-7b-hf"
BASE_MODEL_CACHE = "/data/base_models/"

# device_map='auto' delegates placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    cache_dir=BASE_MODEL_CACHE,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, cache_dir=BASE_MODEL_CACHE)

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.05s/it]


# Processing Dataset

In [7]:
# Load the customer-service information-extraction dataset, caching it under /data/datasets.
data = load_dataset("jonathansuru/customer_service_information_extraction", cache_dir = "/data/datasets")

In [9]:
def process_data(example):
    """Build the full training text for one dataset example.

    The completion is stripped, every run of two or more spaces is removed,
    and a newline is inserted after each comma that is directly followed by a
    double or single quote, so each extracted attribute sits on its own line.
    The stripped prompt and the cleaned completion are then joined with a
    blank line and stored under ``example['complete_prompt']``.

    Args:
        example: Mapping with string fields ``'prompt'`` and ``'completion'``.

    Returns:
        The same ``example`` object, updated in place with ``'complete_prompt'``.
    """
    cleaned = example['completion'].strip()
    # Order matters: spaces are collapsed first so that `,"` / `,'` become adjacent.
    for pattern, replacement in (('  +', ''), (',\"', ',\n\"'), (',\'', ',\n\'')):
        cleaned = re.sub(pattern, replacement, cleaned)
    example['complete_prompt'] = '\n\n'.join((example['prompt'].strip(), cleaned))
    return example

In [10]:
# Add the 'complete_prompt' column (prompt + cleaned completion) to every example.
data = data.map(process_data)

In [11]:
# Inspect the splits and columns after formatting.
data

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion', 'complete_prompt'],
        num_rows: 190
    })
})

In [12]:
# Print the first formatted training example to check how prompt and completion were joined.
print(data['train']['complete_prompt'][0])

Please extract the customer specifications from the conversation below:

###

Agent: Hello, thank you for calling [Company Name]. How may I help you today?
Customer: I'm calling to complain about a product that I purchased.
Agent: I understand. Can you please tell me what product you're referring to?
Customer: I purchased a [Product Name] from your website on August 10th. It arrived on August 15th, but it was damaged.
Agent: I'm sorry to hear that. I'll be happy to help you with this. Can you send me a picture of the damaged product?
Customer: Sure.
...
Agent: I've received the picture of the damaged product. I'm going to issue you a refund for the product. I'm also going to send you a replacement product.
Customer: Thank you for your help.
The extract is as follows:

"product name": "product name",
"issue": "damaged product"
END


In [13]:
def tokenize_dataset(example):
    """Tokenize one example's full training text.

    Passes ``example['complete_prompt']`` to the notebook-level ``tokenizer``
    and returns the tokenizer's output unchanged, so that ``Dataset.map`` can
    add its fields to the dataset as new columns.
    """
    return tokenizer(example['complete_prompt'])

In [14]:
# Tokenize every example; the tokenizer output is added as new columns.
data = data.map(tokenize_dataset)

In [15]:
# Inspect the columns again after tokenization.
data

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion', 'complete_prompt', 'input_ids', 'attention_mask'],
        num_rows: 190
    })
})

In [16]:
# Hold out 10% of the examples for testing. The seed is fixed so that re-running
# the notebook (for example after a kernel restart) reproduces the same split,
# keeping the held-out rows separate from the rows the adapter was trained on.
data = data['train'].train_test_split(test_size=0.1, seed=42)

In [17]:
# Training portion of the split.
data['train']

Dataset({
    features: ['prompt', 'completion', 'complete_prompt', 'input_ids', 'attention_mask'],
    num_rows: 171
})

In [18]:
# Held-out portion of the split.
data['test']

Dataset({
    features: ['prompt', 'completion', 'complete_prompt', 'input_ids', 'attention_mask'],
    num_rows: 19
})

# Configuring model for fine-tuning

In [18]:
def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Iterates over ``model.named_parameters()``, sums the element counts of all
    parameters and of those with ``requires_grad`` set, and prints both totals
    together with the trainable share as a percentage.

    Args:
        model: Object exposing ``named_parameters()``; each parameter must
            provide ``numel()`` and ``requires_grad``.
    """
    sizes = [(weight.numel(), weight.requires_grad) for _, weight in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [19]:
# Baseline count on the freshly loaded base model, before anything is frozen.
print_trainable_parameters(model)

trainable params: 6738415616 || all params: 6738415616 || trainable%: 100.0


In [20]:
# Freeze every base-model weight so that only parameters added afterwards can be trained.
for weight in model.parameters():
    weight.requires_grad_(False)

In [21]:
# Confirm that freezing left no trainable parameters.
print_trainable_parameters(model)

trainable params: 0 || all params: 6738415616 || trainable%: 0.0


In [22]:
# LoRA adapter settings for causal language modelling; adapters are attached
# only to the modules named "q_proj" and "v_proj".
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the frozen base model with the adapters described by `config`.
# NOTE(review): re-running this cell wraps the already-wrapped model again;
# reload the base model first if the cell needs to be re-executed.
model = get_peft_model(model=model, peft_config=config)

In [23]:
# Count again after wrapping the model with the LoRA adapters.
print_trainable_parameters(model)

trainable params: 8388608 || all params: 6746804224 || trainable%: 0.12433454005023165


We have added a small set of trainable LoRA adapter parameters (about 0.12% of all parameters) to the base model. Only these adapter parameters will be trained; the rest of the model stays frozen.

# Starting the Training Loop

In [25]:
# Reuse the EOS token for padding so the collator can pad batches
# (presumably the tokenizer defines no pad token of its own; confirm).
tokenizer.pad_token = tokenizer.eos_token

# 4 examples per device x 4 accumulation steps = 16 examples per optimizer step;
# training stops after 30 optimizer steps, with the first 15 used for warmup.
training_args = transformers.TrainingArguments(
    output_dir='outputs',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    max_steps=30,
    warmup_steps=15,
    learning_rate=1e-3,
    logging_steps=1,
    # Mixed precision (fp16=True) is left disabled.
)

# mlm=False selects causal (next-token) language modelling rather than masked LM.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data['train'],
    data_collator=causal_lm_collator,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,1.5089
2,1.4494
3,1.6433
4,1.4119
5,1.3184
6,1.2482
7,1.2712
8,1.0837
9,0.8762
10,0.9078


TrainOutput(global_step=30, training_loss=0.813673057158788, metrics={'train_runtime': 256.8793, 'train_samples_per_second': 1.869, 'train_steps_per_second': 0.117, 'total_flos': 6063517055262720.0, 'train_loss': 0.813673057158788, 'epoch': 2.79})

In [26]:
# Save the trained model to 'outputs', the directory the reload cell reads from.
# NOTE(review): trainer.model is the PEFT-wrapped model, so presumably only the adapter weights are written; confirm.
trainer.model.save_pretrained('outputs')

# Loading the fine-tuned model

In [3]:
output_dir = './outputs'

# Reload the base model in FP16 and merge the LoRA weights into it.
# The dtype is requested explicitly: without torch_dtype the checkpoint is not
# guaranteed to be loaded in FP16.
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    cache_dir="/data/base_models/",
    device_map='auto',
    torch_dtype=torch.float16,
)

# Attach the adapter saved in `output_dir`, then fold it into the base weights
# so that `model` no longer carries the PEFT wrapper.
model = PeftModel.from_pretrained(base_model, output_dir)
model = model.merge_and_unload()

# Reload the tokenizer that matches the base model; it is used for inference below.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", cache_dir="/data/base_models/")

Loading checkpoint shards: 100%|█████████████████████████████████████████| 2/2 [00:04<00:00,  2.22s/it]


# Testing model output of fine-tuned model

In [4]:
def get_llama2_reponse(prompt, max_new_tokens=50):
    """Generate a continuation of ``prompt`` with the notebook-level ``model``.

    Args:
        prompt: Text to feed to the model.
        max_new_tokens: Upper bound on the number of tokens generated beyond
            the prompt.

    Returns:
        The first generated sequence decoded to a string with special tokens
        removed. As the printed output of the test cell shows, the returned
        text begins with the prompt itself.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    # NOTE(review): the near-zero temperature is presumably meant to make the
    # output effectively deterministic; whether it has any effect depends on
    # the model's generation config (sampling on/off). Confirm.
    generated = model.generate(**encoded, max_new_tokens=max_new_tokens, temperature=0.00001)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [23]:
# Pick one held-out example (index n into the test split) and generate from its raw prompt.
# NOTE(review): relies on `data['test']` from the train/test split cell being present in the kernel.
n = 0
prompt = data['test']['prompt'][n]
print(get_llama2_reponse(prompt, max_new_tokens=100))

Please extract the customer specifications from the conversation below: 

###

Customer: I'm having trouble downloading an app from the Google Play Store.
Agent: I'm sorry to hear that. Can you tell me what the problem is?
Customer: The app is stuck at 0% download.
Agent: Okay, I can help you with that. Can you please try restarting your device?
Customer: I've tried restarting my device, but the app is still stuck at 0% download.
Agent: Okay, I can try clearing the cache and data for the Google Play Store app.
Customer: Okay, please do.
Agent: Okay, I've cleared the cache and data for the Google Play Store app. Please try downloading the app again.
Customer: The app is downloading now! Thank you so much for your help!
The extract is as follows:

"problem": "app stuck at 0% download",
"solution": "cleared cache and data for google play store app"
END OF EXTRACT
END OF CONVERSATION
END OF TRANSACTON
END OF PRODUCT/SERVICE: APP DOWNLOAD
END OF PLATFORM: GOOGLE PLAY STORE
END OF PRODUCT/SE

In [24]:
# Reference completion for the same example, to compare against the generated extract.
print(data['test']['completion'][n])

     "product": "google play store",    "problem": "app stuck at 0% download",    "solution": "cleared cache and data for google play store app"
END


In the above example, the model missed the "product" attribute, and it also looks like we need to add a stopping criterion. In our case, the word "END" is a natural stopping criterion, since every training sample in our dataset also ends with "END".

This is still a good improvement given that we only had 171 training samples in the dataset and trained for only 30 steps (about 2.8 epochs). The training parameters can be tweaked further to improve the model output.