# Dataset Preparation for Fine-Tuning

In [39]:
import os
import random
import pandas as pd

### 1. Load a Retail Bank Intent Recognition Dataset 

+ The dataset is from huggingface https://huggingface.co/datasets/bitext/Bitext-retail-banking-llm-chatbot-training-dataset 
+ 26 intents assigned to 9 categories, 25545 question/answer pairs, with approximately 1000 per intent

In [40]:
remote_file_path = "hf://datasets/bitext/Bitext-retail-banking-llm-chatbot-training-dataset/bitext-retail-banking-llm-chatbot-training-dataset.parquet"
local_cache_path = "datasets/bitext-retail-banking-llm-chatbot-training-dataset.parquet"

In [41]:
# Check whether the dataset exists
if os.path.exists(local_cache_path):
    print("Loading data from local cache...")
    df_bank = pd.read_parquet(local_cache_path)
else:
    print("Downloading data from remote and caching locally...")
    df_bank = pd.read_parquet(remote_file_path)
    df_bank.to_parquet(local_cache_path, index=False)

Loading data from local cache...


In [42]:
df_bank

Unnamed: 0,tags,instruction,category,intent,response
0,BCIPZ,"I would like to acivate a card, can you help me?",CARD,activate_card,I'm here to assist you with that! Activating y...
1,BCILZ,"I have to activate an Visa online, how can I d...",CARD,activate_card,I'm here to assist you with activating your {{...
2,BCIPQZ,I'd like to actiate a card where do i do it,CARD,activate_card,I can help you with that! Activating your card...
3,BCLPQZ,I'd likke to activate a visa on mobile i need ...,CARD,activate_card,I'm here to assist you with activating your {{...
4,BCILPZ,"I would ilke to activate a credit card online,...",CARD,activate_card,I'm here to assist you with activating your cr...
...,...,...,...,...,...
25540,BCEQZ,I'm trying to find apassword i need help,PASSWORD,set_up_password,I'm here to assist you in finding and setting ...
25541,BCIPQ,im looking for a password will uhelp me,PASSWORD,set_up_password,I'm here to assist you in setting up a passwor...
25542,BCIPZ,"I'd like to set up a password, will you help me?",PASSWORD,set_up_password,I'm here to assist you in setting up your pass...
25543,BCQ,wanna set up a password i need assistance,PASSWORD,set_up_password,I'm here to assist you in setting up your pass...


In [47]:
sampled_df_bank = df_bank.groupby('category', group_keys=False).apply(lambda x: x.sample(n=11, random_state=42))

  sampled_df_bank = df_bank.groupby('category', group_keys=False).apply(lambda x: x.sample(n=11, random_state=42))


In [48]:
sampled_df_bank = sampled_df_bank[['instruction', 'category']]
sampled_df_bank

Unnamed: 0,instruction,category
16242,i dont have a user account can ya help me open it,ACCOUNT
14964,"I'd like to close a user account, where to do it?",ACCOUNT
14197,I'm trying to find informayion about the curre...,ACCOUNT
14913,i dotn wanna keep my fucking account help me c...,ACCOUNT
15379,i got to close a fucking user account how to d...,ACCOUNT
...,...,...
23585,wanna send money to someone online where to ma...,TRANSFER
23624,wannas pay the rent i want help to perform a b...,TRANSFER
8733,I'd like ot cancel a transfer to a contact how...,TRANSFER
23241,wanna send money to someone i want help to mak...,TRANSFER


In [49]:
sampled_df_bank['category'] = 'Bank'
sampled_df_bank

Unnamed: 0,instruction,category
16242,i dont have a user account can ya help me open it,Bank
14964,"I'd like to close a user account, where to do it?",Bank
14197,I'm trying to find informayion about the curre...,Bank
14913,i dotn wanna keep my fucking account help me c...,Bank
15379,i got to close a fucking user account how to d...,Bank
...,...,...
23585,wanna send money to someone online where to ma...,Bank
23624,wannas pay the rent i want help to perform a b...,Bank
8733,I'd like ot cancel a transfer to a contact how...,Bank
23241,wanna send money to someone i want help to mak...,Bank


### 2. Generate a simulated dataset for non-bank inquires

In [50]:
# Define general non-bank inquiries
actions = [
    "set up a new email account", "book a hotel",
    "install a software program", "book a flight ticket", "register for an online course",
    "file my taxes", "update my profile information", "cancel a subscription",
    "connect to a Wi-Fi network", "find a lost item", "change my phone plan",
    "apply for a job", "schedule a meeting", "upgrade my device",
    "create a website", "learn a new language", "fix a technical issue",
    "pay my bills online", "track my order", "download a file"
]

subjects = [
    "Nonbank"
]

In [51]:
# Randomly generate 100
inquiries = []
for _ in range(100):
    action = random.choice(actions)
    subject = random.choice(subjects)
    inquiry = f"I have to {action} {subject}, how can I do it?"
    inquiries.append(inquiry)

In [52]:
df_nonbank = pd.DataFrame(inquiries, columns=["instruction"])
df_nonbank['category'] = "Nonbank"
df_nonbank

Unnamed: 0,instruction,category
0,"I have to file my taxes Nonbank, how can I do it?",Nonbank
1,"I have to set up a new email account Nonbank, ...",Nonbank
2,I have to register for an online course Nonban...,Nonbank
3,"I have to apply for a job Nonbank, how can I d...",Nonbank
4,"I have to connect to a Wi-Fi network Nonbank, ...",Nonbank
...,...,...
95,"I have to book a flight ticket Nonbank, how ca...",Nonbank
96,"I have to cancel a subscription Nonbank, how c...",Nonbank
97,"I have to set up a new email account Nonbank, ...",Nonbank
98,"I have to learn a new language Nonbank, how ca...",Nonbank


### 3. Merge two datasets together

In [54]:
dataset = pd.concat([sampled_df_bank, df_nonbank], ignore_index=True)
dataset

Unnamed: 0,instruction,category
0,i dont have a user account can ya help me open it,Bank
1,"I'd like to close a user account, where to do it?",Bank
2,I'm trying to find informayion about the curre...,Bank
3,i dotn wanna keep my fucking account help me c...,Bank
4,i got to close a fucking user account how to d...,Bank
...,...,...
194,"I have to book a flight ticket Nonbank, how ca...",Nonbank
195,"I have to cancel a subscription Nonbank, how c...",Nonbank
196,"I have to set up a new email account Nonbank, ...",Nonbank
197,"I have to learn a new language Nonbank, how ca...",Nonbank


In [56]:
# save
output_file = "datasets/generated_inquiries.csv"
dataset.to_csv(output_file, index=False)
print(f"\nInquiries saved to '{output_file}'")


Inquiries saved to 'datasets/generated_inquiries.csv'
