# Generating Synthetic Scam and Non-Scam Data using the OpenAI API

In [1]:
!pip install openai
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0


In [None]:
from openai import OpenAI
import os
import csv
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# The API key is read from the environment so it never appears in the notebook.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Client object used by the request cell below.
client = OpenAI(api_key=OPENAI_API_KEY)

In [204]:
# Which kind of dialogue this run generates: "scam" or "nonscam".
# Change this value instead of commenting prompts in and out.
DATA_KIND = "scam"

# Both prompts request the same layout: one dialogue per line, with turns
# prefixed by "caller:" and "receiver:". The non-scam prompt names the speakers
# caller/receiver only, so no scam framing leaks into the benign data.
PROMPTS = {
    "scam": "Generate 100 unique dialogues between a scammer and a victim. The length of each dialogue should vary in length. Minimum words per dialogue is 20 and maximum is 300 words. Only output the data, and output each dialogue on a single line without numbering the lines. The scammer's part should start with caller: and the victim's part should start with receiver: Avoid redundancy.",
    "nonscam": "Generate 100 unique authentic dialogues between a caller and a receiver that are not scam. The length of each dialogue should vary in length. Minimum words per dialogue is 20 and maximum is 300 words. Only output the data, and output each dialogue on a single line without numbering the lines. The caller's part should start with caller: and the receiver's part should start with receiver: Avoid redundancy.",
}

# An unknown DATA_KIND raises KeyError here, before any API call is made.
prompt = PROMPTS[DATA_KIND]
model = "gpt-4.1"

response = client.responses.create(
    model=model,
    input=prompt
)

# Text of the response; the following cells split it into one dialogue per line.
output = response.output_text
print(output)

caller: Hello, this is the fraud department from your bank. We noticed some unusual activity on your account. Can you confirm your card number for verification? receiver: Oh no! What kind of activity? I’m really worried. caller: It appears there were charges made in another state. We need your card info to secure your funds. receiver: Um, should I not call the bank directly? caller: For your safety, stay on the line so we can assist immediately.

caller: Good afternoon, you have won a free vacation to Hawaii! Congratulations! receiver: Seriously? I don't remember entering a contest. caller: You were randomly selected from a customer database. All we need is your address and a small fee to process. receiver: That sounds suspicious. I’ll pass.

caller: Hi, I’m calling from Microsoft. Your computer has a virus causing data leaks. receiver: Oh no! What should I do? caller: I’ll need remote access to your PC to remove the virus. Just download this software. receiver: I think I should talk t

In [205]:
# Break the raw response text on newline characters: one list entry per line.
# Blank separator lines show up as empty strings in the result.
sentences = output.split(sep="\n")
print(sentences)

['caller: Hello, this is the fraud department from your bank. We noticed some unusual activity on your account. Can you confirm your card number for verification? receiver: Oh no! What kind of activity? I’m really worried. caller: It appears there were charges made in another state. We need your card info to secure your funds. receiver: Um, should I not call the bank directly? caller: For your safety, stay on the line so we can assist immediately.', '', "caller: Good afternoon, you have won a free vacation to Hawaii! Congratulations! receiver: Seriously? I don't remember entering a contest. caller: You were randomly selected from a customer database. All we need is your address and a small fee to process. receiver: That sounds suspicious. I’ll pass.", '', 'caller: Hi, I’m calling from Microsoft. Your computer has a virus causing data leaks. receiver: Oh no! What should I do? caller: I’ll need remote access to your PC to remove the virus. Just download this software. receiver: I think I

In [207]:
# Run if the dialogues are separated by extra new lines, to get rid of the empty
# strings produced by the split. Each entry is stripped first, so whitespace-only
# lines and stray "\r" characters are removed too, not just exact empty strings.
sentences = [s.strip() for s in sentences if s.strip()]
print(sentences)

# Uncomment to remove leading "1. ", "2. ", ... if the lines came back numbered.
# import re
# sentences = [re.sub(r'^\d+\.\s*', '', s) for s in sentences]
# print(sentences)

['caller: Hello, this is the fraud department from your bank. We noticed some unusual activity on your account. Can you confirm your card number for verification? receiver: Oh no! What kind of activity? I’m really worried. caller: It appears there were charges made in another state. We need your card info to secure your funds. receiver: Um, should I not call the bank directly? caller: For your safety, stay on the line so we can assist immediately.', "caller: Good afternoon, you have won a free vacation to Hawaii! Congratulations! receiver: Seriously? I don't remember entering a contest. caller: You were randomly selected from a customer database. All we need is your address and a small fee to process. receiver: That sounds suspicious. I’ll pass.", 'caller: Hi, I’m calling from Microsoft. Your computer has a virus causing data leaks. receiver: Oh no! What should I do? caller: I’ll need remote access to your PC to remove the virus. Just download this software. receiver: I think I should 

The cell below writes the output of the request to a csv file.

In [208]:

# Define the CSV file name
csv_filename = "scam/scam_data21.csv"

# open() does not create missing folders, so make sure the target folder exists.
# The guard skips this when csv_filename has no directory component.
output_dir = os.path.dirname(csv_filename)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Open the CSV file in write mode; newline='' lets the csv module control line endings.
with open(csv_filename, mode="w", newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Write header
    writer.writerow(['dialogue'])

    # Write one single-column row per dialogue
    writer.writerows([sentence] for sentence in sentences)

In [None]:
# uncomment to remove new line characters in a row (sometimes the API call returned the dialogue with new lines between the caller and receiver)

# import csv

# with open('scam/scam_data16.csv', 'r', newline='', encoding='utf-8') as infile, \
#      open('scam_data16.csv', 'w', newline='', encoding='utf-8') as outfile:

#     reader = csv.DictReader(infile)
#     writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames)
#     writer.writeheader()

#     for row in reader:
#         # Replace embedded newlines in the 'dialogue' column with spaces
#         row['dialogue'] = row['dialogue'].replace('\n', ' ').replace('\r', ' ')
#         writer.writerow(row)


Run the cell below to concatenate the separate CSV files for the batches of data that were generated.
The cell below combines all the CSV files in the `scam` folder and writes the result to a file called `scam.csv`. Do the same for the `nonscam` folder, writing the result to `non_scam.csv`.

In [210]:
import pandas as pd
import glob

# List of all CSV files, sorted so the batches are always combined in the same
# order (glob returns matches in arbitrary order).
csv_files = sorted(glob.glob("scam/*.csv"))

# Read and concatenate all CSVs; ignore_index renumbers the rows so the combined
# frame does not repeat the 0..n-1 labels of every batch.
combined = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

# Keep only the dialogue text
df = combined["dialogue"]

# Save the combined data to a new CSV
df.to_csv("scam.csv", index=False)

In [212]:
# Zip the scam folder and download the archive (Colab only).
import shutil

# Archive the contents of /content/scam as /content/scam.zip.
shutil.make_archive(base_name='/content/scam', format='zip', root_dir='/content/scam')

from google.colab import files

# Hand the archive to the browser for download.
files.download('/content/scam.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [213]:
from sklearn.model_selection import train_test_split


def load_labeled_split(csv_path, label, n_samples=2000, test_size=0.2, seed=42):
    """Load one class of dialogues, label it, subsample it and split it.

    Args:
        csv_path: CSV file to read with pandas.
        label: Value stored in the new 'label' column for every row.
        n_samples: Number of rows to draw without replacement; pandas raises
            ValueError if the file has fewer rows than this.
        test_size: Fraction of the sampled rows placed in the test split.
        seed: Random state used for both the subsample and the split, so
            re-running the cell reproduces the same rows.

    Returns:
        A (sampled, train, test) tuple of DataFrames.
    """
    labeled = pd.read_csv(csv_path).assign(label=label)
    sampled = labeled.sample(n=n_samples, random_state=seed)
    train, test = train_test_split(sampled, test_size=test_size, random_state=seed)
    return sampled, train, test


# Scam dialogues are the positive class (label 1).
df_scam, df_scam_train, df_scam_test = load_labeled_split('scam.csv', 1)

# Non-scam dialogues are the negative class (label 0).
df_nonscam, df_nonscam_train, df_nonscam_test = load_labeled_split('non_scam.csv', 0)


In [214]:
# Stack the scam and non-scam training splits into one frame.
train_parts = [df_scam_train, df_nonscam_train]
df_train = pd.concat(train_parts)

# Shuffle all rows with a fixed seed and renumber them from 0.
df_train_shuffled = (
    df_train
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

df_train_shuffled

Unnamed: 0,dialogue,label
0,"caller: Good afternoon, is Mr. Sanders there? ...",0
1,"caller: Good afternoon, this is Brian from the...",0
2,"caller: Hello, I’m calling from Sunny Days Cam...",0
3,caller: This is Microsoft Support. Your PC is ...,1
4,caller: You’re chosen for our exclusive health...,1
...,...,...
3195,caller: You’ve reached your cloud storage limi...,1
3196,caller: You owe a small fee for your social me...,1
3197,caller: This is your financial advisor—market ...,1
3198,caller: I have a parcel for you. Pay the custo...,1


In [220]:
# Write the shuffled training set to disk without the row index.
df_train_shuffled.to_csv(path_or_buf="train_dataset.csv", index=False)

In [218]:
# Stack the scam and non-scam test splits into one frame.
test_parts = [df_scam_test, df_nonscam_test]
df_test = pd.concat(test_parts)

# Shuffle all rows with a fixed seed and renumber them from 0.
df_test_shuffled = (
    df_test
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

df_test_shuffled

Unnamed: 0,dialogue,label
0,caller: I’m calling from Valley Dental about y...,0
1,"caller: Hi, your print job is ready to be coll...",0
2,caller: I’ve found a security risk in your onl...,1
3,"caller: Hi, this is Jennifer from the art muse...",0
4,caller: You’ve been chosen for a free vacation...,1
...,...,...
795,caller: We’re calling about your recent bankin...,1
796,caller: Good news! You’re a finalist for a hom...,1
797,caller: Your online dating profile is being re...,1
798,"caller: Hi, is this the Smith family? receiv...",0


In [221]:
# Write the shuffled test set to disk without the row index.
df_test_shuffled.to_csv(path_or_buf="test_dataset.csv", index=False)