In [2]:
import pandas as pd
import time
import requests
import json
import os
from dotenv import load_dotenv

In [None]:
load_dotenv()
api_key=os.getenv("OPENAI_API_KEY")

# gpt-3.5-turbo: Labeling email spam (0) and or not spam (1)

In [2]:
url = "https://api.openai.com/v1/chat/completions"

#encode and decode data between Python objects and JSON (JavaScript Object Notation) format — a lightweight data format widely used for APIs,
# web applications, and data exchange
payload = json.dumps({
  "model": "gpt-3.5-turbo-0125",
  "messages": [
    {
      "content": "You are a text classification model, given an input text, you will return a JSON object containing the probability scores for the following categories: 'label'. The JASON object should have key corresponding to these category, and the values should be 0 if email is classified as spam and 1 if email is not a spam.  Please respond with only the JSON object, without any additional text or explanation.",
      "role": "system"
    },
    {
      "content": "Hi sister, how are you doing .",
      "role": "user"
    }
  ]
})
headers = {
  'Authorization': f'Bearer {api_key}',
  'Content-Type': 'application/json',
  'Cookie': '__cf_bm=FtpZ3B647tvV7GZ._lv.KIu8.LPIJ3BDiP8vri_7i2Y-1753915269-1.0.1.1-tY0KiYDDbhxrmWYAm5wI0iG1Gndd5w_dcikjVlVivAe8lxsZ2jBIXxiGyz1lWMcA0ZyCrzfEmRbDGniIMERrMe4zP0rEw_N5Fe.2V1TqbSg; _cfuvid=HLSW49QkwFYFKcQx3KgORiPpuhUDd7c51ewvhuQggc8-1753913185134-0.0.1.1-604800000'
}

response = requests.request("POST", url, headers=headers, data=payload)

print(response.text)


{
  "id": "chatcmpl-BzCRnojJ1s7li1SQMyb27VOX1xDDx",
  "object": "chat.completion",
  "created": 1753925555,
  "model": "gpt-3.5-turbo-0125",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "{\n  \"label\": 1\n}",
        "refusal": null,
        "annotations": []
      },
      "logprobs": null,
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 100,
    "completion_tokens": 9,
    "total_tokens": 109,
    "prompt_tokens_details": {
      "cached_tokens": 0,
      "audio_tokens": 0
    },
    "completion_tokens_details": {
      "reasoning_tokens": 0,
      "audio_tokens": 0,
      "accepted_prediction_tokens": 0,
      "rejected_prediction_tokens": 0
    }
  },
  "service_tier": "default",
  "system_fingerprint": null
}



# Reading data

In [3]:
df=pd.read_csv("df.csv")
df_validation=pd.read_csv("validation_data.csv")

In [4]:
df.head(10)

Unnamed: 0,label,id,sentence
0,0,0,Unusual trading pattern detected in your portf...
1,0,1,Alert: Unusual activity on your OCBC card. Ver...
2,0,2,ALERT: Your Singtel line will be terminated du...
3,0,3,Congratulations! You've won a prize in our luc...
4,1,4,Reminder: Your yoga class at True Fitness Nove...
5,0,5,Congratulations! You've won in a lucky draw. P...
6,0,6,WARNING: Your account will be terminated in 24...
7,1,7,"Hi Alex, I've reviewed your code for the new A..."
8,0,8,Fantastic deal! 70% off at Ion Orchard shops T...
9,1,9,"Hi Alex, I've reviewed your CV and I'm impress..."


# Formating data for fine tuning

## Training data

In [6]:
combined_scores=df["label"].to_frame()
scores_training=[]

In [7]:
combined_scores

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,1
...,...
221,1
222,1
223,1
224,0


In [8]:
for index,row in combined_scores.iterrows():
    scores_dict=row.to_dict()
    scores_str=json.dumps(scores_dict)
    scores_training.append(scores_str)

In [9]:
df["scores"]=scores_training

In [10]:
df.head()

Unnamed: 0,label,id,sentence,scores
0,0,0,Unusual trading pattern detected in your portf...,"{""label"": 0}"
1,0,1,Alert: Unusual activity on your OCBC card. Ver...,"{""label"": 0}"
2,0,2,ALERT: Your Singtel line will be terminated du...,"{""label"": 0}"
3,0,3,Congratulations! You've won a prize in our luc...,"{""label"": 0}"
4,1,4,Reminder: Your yoga class at True Fitness Nove...,"{""label"": 1}"


In [11]:
system_prompt="You are a text classification model, given an input text, you will return a JSON object containing the probability scores for the following categories: 'label'. The JASON object should have key corresponding to these category, and the values should be 0 if email is classified as spam and 1 if email is not a spam.  Please respond with only the JSON object, without any additional text or explanation."

In [12]:
example_prompt={
    "messages":[
       {"role":"system","content":system_prompt},
       {"role":"user","content":df["sentence"].iloc[1]},
       {"role":"assistant","content":df["scores"].iloc[1]}
        
    ]
}

In [13]:
example_prompt

{'messages': [{'role': 'system',
   'content': "You are a text classification model, given an input text, you will return a JSON object containing the probability scores for the following categories: 'label'. The JASON object should have key corresponding to these category, and the values should be 0 if email is classified as spam and 1 if email is not a spam.  Please respond with only the JSON object, without any additional text or explanation."},
  {'role': 'user',
   'content': 'Alert: Unusual activity on your OCBC card. Verify transactions now: ocbc-sg-secure.com'},
  {'role': 'assistant', 'content': '{"label": 0}'}]}

In [14]:
#Let's create a complete data structure for training data
formatted_training_data=[]
for index,row in df.iterrows():
    item={
    "messages":[
       {"role":"system","content":system_prompt},
       {"role":"user","content":row["sentence"]},
       {"role":"assistant","content":row["scores"]}
        
    ]}
    formatted_training_data.append(item)

In [15]:
formatted_training_data[1]

{'messages': [{'role': 'system',
   'content': "You are a text classification model, given an input text, you will return a JSON object containing the probability scores for the following categories: 'label'. The JASON object should have key corresponding to these category, and the values should be 0 if email is classified as spam and 1 if email is not a spam.  Please respond with only the JSON object, without any additional text or explanation."},
  {'role': 'user',
   'content': 'Alert: Unusual activity on your OCBC card. Verify transactions now: ocbc-sg-secure.com'},
  {'role': 'assistant', 'content': '{"label": 0}'}]}

## Test data

In [16]:
combined_scores=df_validation["label"].to_frame()
scores_validation=[]

for index,row in combined_scores.iterrows():
    scores_dict=row.to_dict()
    scores_str=json.dumps(scores_dict)
    scores_validation.append(scores_str)
    
df_validation["scores"]=scores_validation
df_validation.head()

Unnamed: 0,label,id,sentence,scores
0,0,0,Exclusive VIP access: Pre-sale tickets for Mar...,"{""label"": 0}"
1,1,1,"Operative Echo, mission abort. Compromised ass...","{""label"": 1}"
2,1,2,"Sis, mum asking if you're coming for dinner. S...","{""label"": 1}"
3,1,3,"Hi Lisa, I've reviewed your proposal for the A...","{""label"": 1}"
4,1,4,LinkedIn: Jon Smith has endorsed you for 'Proj...,"{""label"": 1}"


In [17]:
#Let's create a complete data structure for test data
formatted_validation_data=[]
for index,row in df_validation.iterrows():
    item={
    "messages":[
       {"role":"system","content":system_prompt},
       {"role":"user","content":row["sentence"]},
       {"role":"assistant","content":row["scores"]}
        
    ]}
    formatted_validation_data.append(item)

In [18]:
formatted_validation_data[-1]

{'messages': [{'role': 'system',
   'content': "You are a text classification model, given an input text, you will return a JSON object containing the probability scores for the following categories: 'label'. The JASON object should have key corresponding to these category, and the values should be 0 if email is classified as spam and 1 if email is not a spam.  Please respond with only the JSON object, without any additional text or explanation."},
  {'role': 'user',
   'content': "Meeting Mr. Tan for lunch at Food Republic Wisma Atria. 1pm okay? He wants to discuss the new project proposal. Bring any relevant documents you have. If you're running late, give me a call and I'll order for you."},
  {'role': 'assistant', 'content': '{"label": 1}'}]}

## OpenAI expects this messages in a JSONL file, also known as JSON Lines

In [19]:
with open("./training_dataset.jsonl","w") as f:
    for item in formatted_training_data:
        f.write(json.dumps(item))
        f.write("\n")

In [20]:
with open("./validation_dataset.jsonl","w") as f:
    for item in formatted_validation_data:
        f.write(json.dumps(item))
        f.write("\n")

## Fine tuning

In [4]:
from openai import OpenAI

In [26]:
client=OpenAI(api_key=api_key)

In [27]:
training_file=client.files.create(
    file=open("./training_dataset.jsonl","rb"),
    purpose="fine-tune"
)

training_file

FileObject(id='file-NpCXcQK7CedWpoAm53AqdH', bytes=156122, created_at=1753926212, filename='training_dataset.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)

In [28]:
validation_file=client.files.create(
    file=open("./validation_dataset.jsonl","rb"),
    purpose="fine-tune"
)

validation_file

FileObject(id='file-AWthpDqXqDFvpsQCHv3GNc', bytes=38253, created_at=1753926438, filename='validation_dataset.jsonl', object='file', purpose='fine-tune', status='processed', expires_at=None, status_details=None)

In [29]:
fine_tuning=client.fine_tuning.jobs.create(
    training_file=training_file.id,
    model="gpt-3.5-turbo-0125",
    validation_file=validation_file.id,
    hyperparameters={"n_epochs":1, "batch_size": 25}   
)

In [30]:
fine_tuning

FineTuningJob(id='ftjob-eW38TZ3H4cSceXK1AnXXwwL0', created_at=1753926694, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-JH6SpMdHPPAXKH6ayW3x68Ur', result_files=[], seed=1669942151, status='validating_files', trained_tokens=None, training_file='file-NpCXcQK7CedWpoAm53AqdH', validation_file='file-AWthpDqXqDFvpsQCHv3GNc', estimated_finish=None, integrations=[], metadata=None, method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=2)), type='supervised'), user_provided_suffix=None, usage_metrics=None, shared_with_openai=False, eval_id=None)

In [32]:
#Check status of tuning
client.fine_tuning.jobs.list(limit=1)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-eW38TZ3H4cSceXK1AnXXwwL0', created_at=1753926694, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:personal::BzD1XHjE', finished_at=1753927768, hyperparameters=Hyperparameters(batch_size=1, learning_rate_multiplier=2.0, n_epochs=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-JH6SpMdHPPAXKH6ayW3x68Ur', result_files=['file-1UFAbZFHhFgCwwtpXUBkcf'], seed=1669942151, status='succeeded', trained_tokens=61286, training_file='file-NpCXcQK7CedWpoAm53AqdH', validation_file='file-AWthpDqXqDFvpsQCHv3GNc', estimated_finish=None, integrations=[], metadata=None, method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=1, learning_rate_multiplier=2.0, n_epochs=2)), type='supervised'), user_provided_suffix=None, usage_metrics=None, shared_with_openai=False, eval_id=None)], has_more=False, object='list')

# Using custom fine tuned model

In [35]:
client= OpenAI(api_key=api_key)


response=client.chat.completions.create(
    model="ft:gpt-3.5-turbo-0125:personal::BzD1XHjE",  #name of the 
    messages= [
    {
      "content": system_prompt,
      "role": "system"
    },
    {
      "content": "Hey Boss, get your job done by yourself!",
      "role": "user"
    }
  ],
)

In [36]:
response.choices[0].message.content

'{"label": 1}'

In [37]:
#This corresponds to spam, which is reasonable