In [1]:
# Import libraries

import pandas as pd
import numpy as np
import re
import nltk
import inflect
from nltk import word_tokenize
import string

In [2]:
# read file

filename = 'Milestone QA Dataset.csv'
# index_col=0 uses the first CSV column as the row index instead of a data column
df = pd.read_csv(filename, index_col=0)
df.head()

Unnamed: 0,prompt (User Question),completion (Bot Answer),persona,pain_point,intent,suggested_usp,CTA
1.0,"""Hi, I need a contractor for our new corporate...","""We are structured precisely for this requirem...",B2B_Ops_Manager,Risk_Compliance,Verify_Credentials,USP_Certified_Certainty,
2.0,"""I'm a CEO. I don't just want an office; I nee...","""We share that philosophy. We act as a strateg...",B2B_CEO_Leader,Brand_Alignment,Verify_Capability,USP_Brand_Translation,
3.0,"""I'm an architect, and I'm tired of contractor...","""We position ourselves as the 'trusted executo...",B2B_Architect,Design_Compromise,Understand_Process,USP_Trusted_Executor,
4.0,"""I need my villa finished perfectly, but I'm a...","""Absolutely. We provide the luxury of complete...",B2C_Executive,Time_Stress,Verify_Service_Scope,USP_Peace_of_Mind,
5.0,"""My design for my apartment is very complex an...","""We thrive on that challenge. We act as master...",B2C_Visionary,Technical_Feasibility,Verify_Capability,USP_Master_Craftsman,


### Pre-Processing

In [3]:
# Keep only the question/answer text columns and renumber the rows from 0
qa_columns = ['prompt (User Question)', 'completion (Bot Answer)']
df2 = df[qa_columns].reset_index(drop=True)
df2.head(2)

Unnamed: 0,prompt (User Question),completion (Bot Answer)
0,"""Hi, I need a contractor for our new corporate...","""We are structured precisely for this requirem..."
1,"""I'm a CEO. I don't just want an office; I nee...","""We share that philosophy. We act as a strateg..."


In [4]:
# rename columns for easier access

# Short column names used by every later cell
column_aliases = {
    'prompt (User Question)': 'prompt',
    'completion (Bot Answer)': 'answer',
}
df3 = df2.rename(columns=column_aliases)
df3.head(1)

Unnamed: 0,prompt,answer
0,"""Hi, I need a contractor for our new corporate...","""We are structured precisely for this requirem..."


In [5]:
# Check for missing values: number of NaN entries in each column
df3.isna().sum()

prompt    0
answer    0
dtype: int64

In [6]:
# Check for duplicates: number of rows that repeat an earlier (prompt, answer) pair
df3.duplicated().sum()

0

No missing or duplicated values

In [7]:
# Check the text content of first row

# Show the raw prompt and answer of the first row, separated by a rule
first_prompt, first_answer = df3.iloc[0, 0], df3.iloc[0, 1]
print(first_prompt, '--------------------', first_answer, sep='\n')

"Hi, I need a contractor for our new corporate HQ. Our main concern is that our global board requires full compliance with international safety and quality standards. How do you handle this?"
--------------------
"We are structured precisely for this requirement. Our entire management system is audited and certified by Bureau Veritas to be in accordance with ISO 9001 (Quality), ISO 14001 (Environment), and ISO 45001 (Health & Safety)1. This provides 'certified certainty' and ensures our processes are fully compliant from day one."


The text seems to contain unneeded quotes at the beginning and end of each entry. We need to remove them.

In [8]:
# Trim the wrapping double quotes from both text columns
for text_column in ('prompt', 'answer'):
    df3[text_column] = df3[text_column].apply(lambda value: value.strip('"'))
df3.head()

Unnamed: 0,prompt,answer
0,"Hi, I need a contractor for our new corporate ...",We are structured precisely for this requireme...
1,I'm a CEO. I don't just want an office; I need...,We share that philosophy. We act as a strategi...
2,"I'm an architect, and I'm tired of contractors...",We position ourselves as the 'trusted executor...
3,"I need my villa finished perfectly, but I'm an...",Absolutely. We provide the luxury of complete ...
4,My design for my apartment is very complex and...,We thrive on that challenge. We act as master ...


### Text Pre-processing

In [9]:
# Shared inflect engine; convert_number calls p.number_to_words on numeric tokens
p = inflect.engine()

def convert_number(text):
    """Spell out every purely numeric token of ``text`` as words.

    The text is split on whitespace. Tokens for which ``str.isdigit()`` is
    true are passed to ``p.number_to_words``; every other token is kept
    unchanged. Tokens are re-joined with single spaces, so runs of whitespace
    in the input are collapsed.

    Args:
        text: The string to convert.

    Returns:
        The converted string.
    """
    converted = [
        p.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    ]
    return ' '.join(converted)

In [47]:
def text_processing(text):
    """Normalise one prompt/answer string.

    Steps, applied in order:
      1. lower-case the text;
      2. remove ``http://`` / ``https://`` URLs;
      3. remove ``<...>`` tags;
      4. collapse every run of whitespace into a single space;
      5. remove digits that directly follow ``)``, ``'`` or ``"`` while keeping
         an optional trailing ``.`` or ``,`` (e.g. ``safety)1.`` -> ``safety).``).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement, flags) triples, run one after another
    substitutions = (
        (r'https?:\/\/\S+', '', 0),
        (r'<.*?>', '', 0),
        (r"\s+", " ", re.I),
        (r'([\)\'"])\d+([\.,]?)', r'\1\2', 0),
    )

    cleaned = text.lower()
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned

In [48]:
# Let's test the function on a sample with mixed case, digits, symbols, a URL and an HTML-like tag:

some_text='text OR WiLL CHECK, and. Trys 1 and 2 and &@*#($&^!%) and http://www.aaa.com again queue and too or to html<and>'
text_processing(some_text)

'text or will check, and. trys 1 and 2 and &@*#($&^!%) and again queue and too or to html'

In [49]:
# Preview of the text before text_processing is applied
df3.head()

Unnamed: 0,prompt,answer
0,"Hi, I need a contractor for our new corporate ...",We are structured precisely for this requireme...
1,I'm a CEO. I don't just want an office; I need...,We share that philosophy. We act as a strategi...
2,"I'm an architect, and I'm tired of contractors...",We position ourselves as the 'trusted executor...
3,"I need my villa finished perfectly, but I'm an...",Absolutely. We provide the luxury of complete ...
4,My design for my apartment is very complex and...,We thrive on that challenge. We act as master ...


In [50]:
# Clean both text columns into a new frame; df3 keeps the original text
df4 = df3.assign(
    prompt=df3['prompt'].apply(text_processing),
    answer=df3['answer'].apply(text_processing),
)
df4.head()

Unnamed: 0,prompt,answer
0,"hi, i need a contractor for our new corporate ...",we are structured precisely for this requireme...
1,i'm a ceo. i don't just want an office; i need...,we share that philosophy. we act as a strategi...
2,"i'm an architect, and i'm tired of contractors...",we position ourselves as the 'trusted executor...
3,"i need my villa finished perfectly, but i'm an...",absolutely. we provide the luxury of complete ...
4,my design for my apartment is very complex and...,we thrive on that challenge. we act as master ...


In [51]:
# Compare the first answer before and after cleaning
before_cleaning, after_cleaning = df3.iloc[0, 1], df4.iloc[0, 1]
print(before_cleaning, '----------------', after_cleaning, sep='\n')

We are structured precisely for this requirement. Our entire management system is audited and certified by Bureau Veritas to be in accordance with ISO 9001 (Quality), ISO 14001 (Environment), and ISO 45001 (Health & Safety)1. This provides 'certified certainty' and ensures our processes are fully compliant from day one.
----------------
we are structured precisely for this requirement. our entire management system is audited and certified by bureau veritas to be in accordance with iso 9001 (quality), iso 14001 (environment), and iso 45001 (health & safety). this provides 'certified certainty' and ensures our processes are fully compliant from day one.


Now let's prepare the dataset for training with a Hugging Face model.

In [55]:
#!pip install torch transformers datasets scikit-learn evaluate

In [63]:
#!pip install hf_xet

In [71]:
# convert the data to a hugging face object

from datasets import Dataset

# Wrap the cleaned DataFrame in a Hugging Face Dataset so it can be tokenized with .map()
raw_dataset = Dataset.from_pandas(df4)

We will use the pre-trained GPT-2 model. Let's load its tokenizer (via AutoTokenizer).

In [59]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling

# Checkpoint name shared by the tokenizer here and the model loaded later
model_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [65]:
# Load the model (causal language-modelling variant of the same checkpoint as the tokenizer)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [64]:
# set the padding token to end of sentence (only when the tokenizer has no pad token of its own)
# NOTE(review): DataCollatorForLanguageModeling ignores pad-token positions in the
# labels; with pad == EOS this presumably also ignores the EOS separators inserted
# by tokenize_function, so the model may not learn where an answer ends — confirm.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [66]:
# initialize the data collator
# mlm=False selects causal (next-token) language modelling instead of masked-LM
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Let's prepare our text by formatting the input pairs

In [67]:
def tokenize_function(df):
    """Tokenize a batch of prompt/answer pairs as single training sequences.

    Each pair is rendered as ``prompt <eos> answer <eos>`` (space separated),
    where ``<eos>`` is ``tokenizer.eos_token``, and the resulting strings are
    passed to the module-level ``tokenizer``.

    Args:
        df: A batch exposing ``'prompt'`` and ``'answer'`` entries of equal
            length (despite the name, any mapping of column name to a
            sequence of strings works).

    Returns:
        The tokenizer output for the batch, truncated to 512 tokens per
        sequence.
    """
    prompts = df['prompt']
    responses = df['answer']

    concatenated_text = []
    for prompt, response in zip(prompts, responses):
        concatenated_text.append(
            f"{prompt} {tokenizer.eos_token} {response} {tokenizer.eos_token}"
        )

    # max_length may need adjusting for the dataset and available memory
    encoded = tokenizer(concatenated_text, truncation=True, max_length=512)
    return encoded

In [78]:
# apply the tokenization to the data
# batched=True hands tokenize_function whole batches; the original text columns
# are removed so only the tokenizer outputs remain

tokenized_dataset = raw_dataset.map(tokenize_function, batched=True, remove_columns=raw_dataset.column_names)

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

In [79]:
# Inspect one tokenized example (row 1) to sanity-check the encoding
print(tokenized_dataset[1])

{'input_ids': [72, 1101, 257, 2906, 78, 13, 1312, 836, 470, 655, 765, 281, 2607, 26, 1312, 761, 257, 2272, 326, 12497, 674, 1664, 338, 29063, 290, 4508, 5369, 13, 460, 345, 5203, 319, 257, 1029, 12, 5715, 5761, 30, 220, 50256, 356, 2648, 326, 8876, 13, 356, 719, 355, 257, 10039, 5212, 284, 15772, 534, 4508, 338, 5369, 656, 257, 3518, 2858, 17, 13, 674, 15320, 3407, 20533, 6355, 4493, 329, 7534, 588, 1976, 538, 353, 3230, 290, 3958, 4188, 2984, 81, 18, 11, 810, 262, 3061, 373, 10582, 326, 25, 284, 1382, 511, 4508, 338, 29063, 656, 262, 2272, 13, 220, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [80]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tokenized examples for validation (fixed seed for reproducibility).
# The split is taken from `tokenized_dataset`, the result of the map step; the
# plural `tokenized_datasets` is never defined and raises NameError on a fresh kernel.
# NOTE: this variable shadows sklearn's `train_test_split` imported above; the name
# is kept because the following cell reads train_test_split['train'] / ['test'].
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=0)

In [84]:
from datasets import DatasetDict

# Re-label the held-out 'test' split as 'validation' for the Trainer
train_split = train_test_split['train']
validation_split = train_test_split['test']
final_tokenized_datasets = DatasetDict({'train': train_split, 'validation': validation_split})

In [85]:
from transformers import TrainingArguments, Trainer

# --- 2.1 Define Training Arguments ---
training_args = TrainingArguments(
    output_dir="./gpt2_chatbot_results",  # Directory to save logs and checkpoints
    num_train_epochs=3,                   # Number of training epochs (adjust as needed)
    per_device_train_batch_size=4,        # Adjust based on your GPU VRAM
    per_device_eval_batch_size=4,
    # Keep warmup short relative to the run: ~712 training rows (80% of 890) at
    # batch size 4 is ~178 steps per epoch, i.e. ~534 steps over 3 epochs on one
    # device. A 500-step warmup would cover almost the entire run.
    warmup_steps=50,                      # ~10% of the total optimizer steps
    learning_rate=5e-5,                   # Standard learning rate for fine-tuning
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    # NOTE(review): newer transformers releases name this argument `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",          # Evaluate at the end of each epoch
    save_strategy="epoch",                # Save model checkpoint at the end of each epoch
    load_best_model_at_end=True,          # Load the model with the best validation loss
    fp16=False,                           # Mixed precision off: fp16=True is only supported on CUDA devices
)

# --- 2.2 Initialize the Trainer ---
# Bundles the model, the training arguments, the train/validation splits and the
# causal-LM data collator; this cell only constructs the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_tokenized_datasets["train"],
    eval_dataset=final_tokenized_datasets["validation"],
    data_collator=data_collator,  # pads each batch and builds the labels
)

ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA devices.