### Finetuning on custom Data




In [3]:
from datasets import load_dataset
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

model_path = "fine_tuned_bert_model"

# Load the labelled queries ('query' text column, 'label' intent column).
data = pd.read_csv("intent_data.csv")

# Convert labels to numeric: 'informative' -> 0, 'transactional' -> 1
label_to_id = {'informative': 0, 'transactional': 1}
encoded_labels = data['label'].map(label_to_id)

# Series.map yields NaN for anything outside the mapping (typos, blanks, other casing).
# Stop here instead of writing NaN labels into the encoded CSV used for training.
unknown_rows = encoded_labels.isna()
if unknown_rows.any():
    unknown_values = sorted(data.loc[unknown_rows, 'label'].astype(str).unique())
    raise ValueError(
        f"intent_data.csv contains {int(unknown_rows.sum())} row(s) with labels outside "
        f"{sorted(label_to_id)}: {unknown_values}"
    )
data['label'] = encoded_labels.astype(int)

# Persist the encoded copy so it can be read back through the `datasets` CSV loader.
data.to_csv("encoded_intent_data.csv", index=False)

dataset = load_dataset('csv', data_files="encoded_intent_data.csv", split='train')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Save the tokenizer along with the model
tokenizer.save_pretrained(model_path)

def tokenize_function(examples):
    """Tokenize the 'query' column of a batch, padding/truncating every row to 128 tokens."""
    return tokenizer(examples['query'], padding='max_length', truncation=True, max_length=128)

dataset = dataset.map(tokenize_function, batched=True)

# dataset


Generating train split: 147 examples [00:00, 3317.31 examples/s]
Map: 100%|██████████| 147/147 [00:00<00:00, 1748.14 examples/s]


In [4]:
# Hold out 20% of the examples for evaluation; the fixed seed keeps the split repeatable.
split_datasets = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets['train']
test_dataset = split_datasets['test']

# Expose only the model inputs and the label, as PyTorch tensors, on both splits.
torch_columns = ['input_ids', 'attention_mask', 'label']
for split in (train_dataset, test_dataset):
    split.set_format(type='torch', columns=torch_columns)


In [5]:
# Start from the pre-trained BERT checkpoint with a 2-class classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Run configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
)

# Wire the model, the configuration and both splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune, then write the resulting weights next to the tokenizer saved earlier.
trainer.train()
trainer.save_model(model_path)

# Metrics on the held-out split (kept in `results`; printing is left disabled).
results = trainer.evaluate()

# print(results)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                              
 33%|███▎      | 8/24 [00:34<00:55,  3.47s/it]

{'eval_loss': 0.5125638246536255, 'eval_runtime': 2.3045, 'eval_samples_per_second': 13.018, 'eval_steps_per_second': 0.434, 'epoch': 1.0}


 42%|████▏     | 10/24 [00:45<01:10,  5.03s/it]

{'loss': 0.6132, 'learning_rate': 1.1666666666666668e-05, 'epoch': 1.25}


                                               
 67%|██████▋   | 16/24 [01:11<00:28,  3.60s/it]

{'eval_loss': 0.4886522591114044, 'eval_runtime': 2.2312, 'eval_samples_per_second': 13.446, 'eval_steps_per_second': 0.448, 'epoch': 2.0}


 83%|████████▎ | 20/24 [01:31<00:18,  4.68s/it]

{'loss': 0.5178, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.5}


                                               
100%|██████████| 24/24 [01:48<00:00,  3.59s/it]

{'eval_loss': 0.4822712242603302, 'eval_runtime': 2.2392, 'eval_samples_per_second': 13.398, 'eval_steps_per_second': 0.447, 'epoch': 3.0}


100%|██████████| 24/24 [01:52<00:00,  4.69s/it]


{'train_runtime': 112.5599, 'train_samples_per_second': 3.118, 'train_steps_per_second': 0.213, 'train_loss': 0.5516497393449148, 'epoch': 3.0}


100%|██████████| 1/1 [00:00<00:00, 334.26it/s]


### Prediction

In [15]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer from the directory written by the training cells
model = BertForSequenceClassification.from_pretrained("fine_tuned_bert_model")  # saved model
tokenizer = BertTokenizer.from_pretrained("fine_tuned_bert_model")  # Use the same tokenizer you used for training

def predict_intent(input_text):
    """Classify ``input_text`` and print the predicted intent.

    Relies on the notebook-level ``model`` and ``tokenizer``. Class 0 is reported as
    'informative', anything else as 'transactional'. Nothing is returned.
    """
    # Encode the text as PyTorch tensors, truncated to at most 128 tokens.
    encoded = tokenizer(input_text, padding=True, truncation=True, max_length=128, return_tensors="pt")

    # Evaluation mode (important for dropout layers during inference).
    model.eval()

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**encoded).logits

    # The predicted class is the index of the highest logit.
    predicted_class = torch.argmax(logits, dim=-1)

    intent = 'informative' if predicted_class == 0 else 'transactional'
    print(f"Query: '{input_text}' is classified as '{intent}'.")




In [21]:
# Spot-check a single query; the text is the one shown in this cell's recorded output.
predict_intent("I want new credit card")

Query: 'I want new credit card' is classified as 'transactional'.


In [12]:
# Second spot-check: an explicit request to apply for a product.
query = "I want to apply for credit card"
predict_intent(query)

Query: 'I want to apply for credit card' is classified as 'transactional'.


In [89]:
# import torch
# import torch.nn.functional as F

# def get_feedback(query, model, tokenizer):
#     # Preprocess the query and get model prediction
#     inputs = tokenizer(query, padding=True, truncation=True, max_length=128, return_tensors="pt")
#     model.eval()
    
#     with torch.no_grad():
#         outputs = model(**inputs)
#         logits = outputs.logits
    
#     # Apply softmax to get probabilities
#     probs = F.softmax(logits, dim=-1)
#     predicted_class = torch.argmax(logits, dim=-1)
    
#     print(f"Model Prediction: {'informative' if predicted_class == 0 else 'transactional'} (Confidence: {probs[0][predicted_class].item():.4f})")
    
#     # If confidence is below a threshold, ask for user feedback
#     if probs[0][predicted_class].item() < 0.7:  # Example threshold for low confidence
#         user_feedback = input("Is this correct? (yes/no): ")
#         if user_feedback.lower() != 'yes':
#             correct_label = input("Please enter the correct label (informative / transactional): ")
#             return query, correct_label  # Store this for retraining
#     return None


In [90]:
# import pandas as pd


# def save_feedback_to_csv(query, correct_label, feedback_file="intent_data.csv"):
 
#     try:
#         feedback_data = pd.read_csv(feedback_file)
#     except FileNotFoundError:
#         feedback_data = pd.DataFrame(columns=["query", "label"])
    
#     # Add new feedback to the dataset
#     new_feedback = pd.DataFrame({"query": [query], "label": [correct_label]})
#     feedback_data = pd.concat([feedback_data, new_feedback], ignore_index=True)
    
#     # Save the updated feedback data back to CSV
#     feedback_data.to_csv(feedback_file, index=False)


### Testing

In [17]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn.functional as F

# Directory the training cells saved the fine-tuned weights and tokenizer files into.
model_path = "fine_tuned_bert_model"
# Reload both from disk so this cell does not depend on the in-memory training objects.
model = BertForSequenceClassification.from_pretrained(model_path)  
tokenizer = BertTokenizer.from_pretrained(model_path)  

def preprocess_input(input_text, tokenizer):
    """Run ``tokenizer`` on ``input_text`` and return its output unchanged.

    The tokenizer is asked for padded, truncated (max 128 tokens) PyTorch tensors.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": 128,
        "return_tensors": "pt",
    }
    return tokenizer(input_text, **tokenizer_options)


def save_feedback_to_csv(query, correct_label, feedback_file):
    """Append one ``(query, label)`` row to the CSV at ``feedback_file``.

    The file is created with ``query`` and ``label`` columns when it does not exist
    yet or exists but is completely empty; otherwise the existing rows are kept and
    the new row is added after them.

    Args:
        query: Text of the user query.
        correct_label: Intent label to store for the query.
        feedback_file: Path of the CSV file to update.
    """
    new_feedback = pd.DataFrame({"query": [query], "label": [correct_label]})

    try:
        existing_feedback = pd.read_csv(feedback_file)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No file, or a zero-byte file with no header to parse: start from scratch.
        existing_feedback = None

    if existing_feedback is None:
        # Nothing to merge with; skip concatenating against an empty placeholder frame.
        feedback_data = new_feedback
    else:
        feedback_data = pd.concat([existing_feedback, new_feedback], ignore_index=True)

    feedback_data.to_csv(feedback_file, index=False)

def get_feedback(query, model , tokenizer, feedback_file, confidence_threshold=0.8): # feedback_file="intent_data.csv"
    """Predict the intent of ``query`` and collect user feedback on uncertain predictions.

    The prediction and its softmax confidence are always printed. Only when the
    confidence is below ``confidence_threshold`` is the user asked to confirm (Y/N);
    'y' stores the predicted label, 'n' stores the other label, and any other answer
    stores nothing. Rows are written through ``save_feedback_to_csv``. Returns None.
    """
    encoded = preprocess_input(query, tokenizer=tokenizer)
    model.eval()

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**encoded).logits

    # Class probabilities and the winning class index.
    probs = F.softmax(logits, dim=-1)
    predicted_class = torch.argmax(logits, dim=-1)

    # Class 0 is 'informative'; with two intents the alternative is simply the other one.
    if predicted_class == 0:
        predicted_label, other_label = 'informative', 'transactional'
    else:
        predicted_label, other_label = 'transactional', 'informative'
    confidence = probs[0][predicted_class].item()

    print(f"Model Prediction: {predicted_label} (Confidence: {confidence:.4f})")

    # Confident enough: no feedback is requested.
    if not (confidence < confidence_threshold):
        return

    answer = input("Is this prediction correct? ( Y/N ): ").lower()
    if answer == 'y':
        label_to_save = predicted_label
    elif answer == 'n':
        label_to_save = other_label
    else:
        print("Not valid input.")
        return

    save_feedback_to_csv(query, label_to_save, feedback_file)
    print(f"Thank you for the feedback! As per feedback {label_to_save.upper()} intent has been saved.")


In [18]:
## TESTING
# Interactive check: asks for a query, and for a Y/N confirmation when the model's
# confidence is below 0.8. Confirmed or corrected rows are appended to intent_data.csv,
# the same CSV the first training cell reads.
get_feedback(query = input("Enter Query : "), 
             model=model, tokenizer=tokenizer, 
             feedback_file="intent_data.csv", 
             confidence_threshold=0.8)

Model Prediction: transactional (Confidence: 0.5343)
Thank you for the feedback! As per feedback INFORMATIVE intent has been saved.


### Finetuning on feedback data

In [28]:
from datasets import Dataset
from transformers import Trainer, TrainingArguments, BertForSequenceClassification, BertTokenizer

def retrain_model_with_feedback(feedback_file, model_path="fine_tuned_bert_model"):
    """Train a fresh BERT intent classifier on ``feedback_file`` and save it.

    The CSV must have a ``query`` column and a ``label`` column. Labels may be the
    strings 'informative' / 'transactional' (as written by the feedback step) or the
    already-encoded integers 0 / 1. Training starts from ``bert-base-uncased`` and uses
    every row; no rows are held out, so no per-epoch evaluation is run.

    Args:
        feedback_file: Path of the CSV holding the queries and their labels.
        model_path: Directory the tokenizer and retrained model are written to.
            Defaults to the directory the prediction cells load from.

    Raises:
        ValueError: If the file has no rows or contains a label that is not one of
            the two known intents.
    """
    label_to_id = {'informative': 0, 'transactional': 1}

    # Load feedback data
    feedback_data = pd.read_csv(feedback_file)
    if feedback_data.empty:
        raise ValueError(f"{feedback_file} contains no rows to train on.")

    # Encode string labels; values that are already numeric pass through unchanged.
    labels = feedback_data['label'].map(lambda value: label_to_id.get(value, value))
    labels = pd.to_numeric(labels, errors='coerce')
    invalid_rows = ~labels.isin(list(label_to_id.values()))
    if invalid_rows.any():
        invalid_values = sorted(feedback_data.loc[invalid_rows, 'label'].astype(str).unique())
        raise ValueError(
            f"{feedback_file} contains labels outside {sorted(label_to_id)} / "
            f"{sorted(label_to_id.values())}: {invalid_values}"
        )
    feedback_data = feedback_data.assign(label=labels.astype(int))

    # Same tokenizer as the initial fine-tuning; store it with the model so the
    # output directory can be loaded on its own.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenizer.save_pretrained(model_path)

    def tokenize_function(examples):
        # Fixed-length padding keeps every row the same size regardless of which
        # map batch it was tokenized in.
        return tokenizer(examples['query'], padding='max_length', truncation=True, max_length=128)

    feedback_dataset = Dataset.from_pandas(feedback_data)
    feedback_dataset = feedback_dataset.map(tokenize_function, batched=True)

    # Start again from the base checkpoint with a 2-class head.
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

    # No evaluation strategy or load_best_model_at_end here: both need an eval
    # dataset, and this run trains on all of the feedback rows.
    training_args = TrainingArguments(
        output_dir='./results',
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=feedback_dataset,  # Use feedback data for training
    )

    # Retrain the model with the new data
    trainer.train()

    # Save the updated model
    trainer.save_model(model_path)
