# Fine-Tune Intent Recognition Model in LoRA way 

In [1]:
#Libraries
import os
import torch
import numpy as np
import pandas as pd
from shutil import unpack_archive

from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from peft import get_peft_model, LoraConfig, TaskType
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pd.set_option("display.max_columns", 101)
pd.set_option('display.max_colwidth', 100)

### 1. Load dataset

In [3]:
data = pd.read_csv(os.path.join("datasets","generated_inquiries.csv"))
data

Unnamed: 0,instruction,category
0,i dont have a user account can ya help me open it,Bank
1,"I'd like to close a user account, where to do it?",Bank
2,I'm trying to find informayion about the current balance of my account,Bank
3,i dotn wanna keep my fucking account help me close it,Bank
4,i got to close a fucking user account how to do it,Bank
...,...,...
15195,"I have to book a hotel Nonbank, how can I do it?",Nonbank
15196,"I have to download a file Nonbank, how can I do it?",Nonbank
15197,"I have to file my taxes Nonbank, how can I do it?",Nonbank
15198,"I have to book a hotel Nonbank, how can I do it?",Nonbank


In [4]:
# change label to binary
data.loc[data['category'] == 'Nonbank', 'category'] = 0
data.loc[data['category'] == 'Bank', 'category'] = 1
data['category'] = data['category'].astype(int)

In [5]:
data.head()

Unnamed: 0,instruction,category
0,i dont have a user account can ya help me open it,1
1,"I'd like to close a user account, where to do it?",1
2,I'm trying to find informayion about the current balance of my account,1
3,i dotn wanna keep my fucking account help me close it,1
4,i got to close a fucking user account how to do it,1


In [6]:
# samples are balanced
data['category'].value_counts()

category
0    8000
1    7200
Name: count, dtype: int64

### 2. Tokenization with Bert

In [7]:
PreTrained_Model = 'bert-base-uncased'

In [8]:
tokenizer = BertTokenizer.from_pretrained(PreTrained_Model)

### 3. Prepare Training/test dataset

In [9]:
# build dataset
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [10]:
MAX_LEN = 128
BATCH_SIZE = 16

In [11]:
train_texts, val_texts, train_labels, val_labels = train_test_split(data['instruction'].to_numpy(), data['category'].to_numpy(), test_size=0.2)

train_dataset = NewsDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = NewsDataset(val_texts, val_labels, tokenizer, MAX_LEN)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [12]:
train_labels

array([1, 1, 1, ..., 0, 0, 0])

### 4. Load Pre-Trained Model

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [14]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load the pre-trained BERT model
model = BertForSequenceClassification.from_pretrained(PreTrained_Model, num_labels=2)

model = model.to(device)

# Freeze BERT parameters
for param in model.base_model.parameters():
    param.requires_grad = False

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 5. Model Training

In [21]:
# Set up LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Define the task type
    r=8,  # Rank of low-rank matrices (you can adjust based on your needs)
    lora_alpha=16,  # Scaling factor (can be adjusted)
    lora_dropout=0.1,  # Dropout for LoRA layers
    bias="none"  # Specify whether to include bias terms in the low-rank matrices
)

# Get the LoRA-enhanced model
model_with_lora = get_peft_model(model, lora_config)

# Set up training arguments and Trainer
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
)



In [22]:
# Define Trainer for evaluation
trainer = Trainer(
    model=model_with_lora,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,0.2334,0.005439
2,0.0015,0.000646
3,0.0006,0.000388
4,0.0005,0.00012
5,0.0003,2.3e-05
6,0.0001,3.3e-05
7,0.0006,6e-06
8,0.0001,9e-06
9,0.0001,5e-06
10,0.0014,7e-06




TrainOutput(global_step=7600, training_loss=0.01629712776536747, metrics={'train_runtime': 749.0564, 'train_samples_per_second': 162.338, 'train_steps_per_second': 10.146, 'total_flos': 8026261192704000.0, 'train_loss': 0.01629712776536747, 'epoch': 10.0})

### 5. Model Evaluation

In [27]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [28]:
def evaluate(model, val_dataloader):
    model.eval()
    total_correct = 0
    total_count = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            total_correct += (predictions == labels).sum().item()
            total_count += labels.size(0)

    accuracy = total_correct / total_count
    f1 = f1_score(all_labels, all_predictions, average='binary')  # 'binary' for 2-class classification

    print(f"Validation accuracy: {accuracy * 100:.2f}%")
    print(f"Validation F1 Score: {f1:.2f}")

In [29]:
model.eval()
total_correct = 0
total_count = 0

In [33]:
with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        # attention_mask = batch['attention_ma sk'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids)
        predictions = torch.argmax(outputs.logits, dim=1)
        total_correct += (predictions == labels).sum().item()
        total_count += labels.size(0)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [34]:
accuracy = total_correct / total_count
print(f"Validation accuracy: {accuracy * 100:.2f}%")

Validation accuracy: 63.26%


### 6. Model Prediction

In [35]:
def predict(model, texts, tokenizer):
    model.eval()
    encodings = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    input_ids = encodings['input_ids'].to(device)
    attention_mask = encodings['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

    return predictions

In [36]:
train_pred_list = []

for text in train_texts:
    pred = predict(model_with_lora, text, tokenizer)
    train_pred_list.append(predictions.tolist()[0])

In [37]:
accuracy = accuracy_score(train_labels, train_pred_list)
print(f'Accuracy of training dataset: {accuracy:.4f}')

Accuracy of training dataset: 0.5260


In [38]:
f1 = f1_score(train_labels, train_pred_list, average='binary')  # 'binary' for 2-class classification
print(f"F1 Score of training dataset: {f1:.2f}")

F1 Score of training dataset: 0.00


In [80]:
train_labels

array([0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0])