In [1]:
import os

import pandas as pd

# Dataset preparation for the pretrained OpenAI model

In [2]:

# Build the prompt/completion file for the batch: keep only the e-mail text
# and its label, rename them to the prompt/completion column names, and
# discard rows with a missing value in either column.
df = (
    pd.read_csv("../dataset/input/raw/emails_set_batch.csv", index_col=0)
    .loc[:, ["text", "is_phishing"]]
    .rename(columns={"text": "prompt", "is_phishing": "completion"})
    .dropna()
)

# One JSON object per line (JSONL), which is what the next cell feeds to the CLI.
df.to_json("../dataset/input/emails_set_batch.jsonl", orient='records', lines=True)

In [3]:
# Run the OpenAI CLI data-preparation tool on the JSONL file written by the previous cell.
# NOTE(review): -q presumably auto-accepts the tool's suggestions without prompting — confirm against the CLI help.
!openai tools fine_tunes.prepare_data -f '../dataset/input/emails_set_batch.jsonl' -q

Analyzing...

- Your file contains 326 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- There are 1 duplicated prompt-completion sets. These are rows: [140]
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See 

### Merge the files produced by the OpenAI preparation tool

In [4]:
# Fold the prepared train/validation files back into a single JSONL file.
# The pretrained model needs no prior training, so the split is not required.

prepared_valid_jsonl = '../dataset/input/emails_set_batch_prepared_valid.jsonl'
prepared_train_jsonl = '../dataset/input/emails_set_batch_prepared_train.jsonl'

# Both prepared files are deleted once merged, so the presence of the
# validation file is what tells a re-run whether there is anything left to do.
if os.path.exists(prepared_valid_jsonl):
    df_first_batch, df_second_batch = (
        pd.read_json(prepared_path, lines=True)
        for prepared_path in (prepared_valid_jsonl, prepared_train_jsonl)
    )

    # Validation rows first, then training rows, renumbered from zero.
    df = pd.concat([df_first_batch, df_second_batch], ignore_index=True)
    df.to_json('../dataset/input/emails_set_batch.jsonl', orient='records', lines=True)

    # Clean up the intermediate files now that their content is merged.
    for stale_path in (prepared_valid_jsonl, prepared_train_jsonl):
        os.remove(stale_path)


# Main dataset preparation

In [5]:
# Build the main dataset from the three raw CSV sources and export it as both
# CSV (text / is_phishing) and JSONL (prompt / completion).

MAX_ROWS = 2000      # upper bound on the number of rows kept in the final dataset
SHUFFLE_SEED = 42    # fixed seed so the shuffle (and the exported files) are reproducible

df_mixed = pd.read_csv("../dataset/input/raw/emails_set_batch.csv", index_col=0)
df_phishing = pd.read_csv("../dataset/input/raw/phishing_emails_formatted.csv", index_col=False)
df_legitimate = pd.read_csv("../dataset/input/raw/spam_ham_emails.csv", index_col=0)

# Label every row coming from the spam/ham file as non-phishing (0).
df_legitimate["is_phishing"] = 0

# NOTE(review): rows with a missing text or label are not dropped here, unlike
# the batch-preparation cell — confirm whether that is intended.
merged_df = (
    pd.concat([df_mixed, df_phishing, df_legitimate], ignore_index=True)
    .loc[:, ['text', 'is_phishing']]
    .drop_duplicates()
    # Highest is_phishing values first, so the row cap below keeps those rows
    # in preference to the rest.
    .sort_values('is_phishing', ascending=False)
    .head(MAX_ROWS)
)

# Shuffle so the rows are no longer ordered by label. Without random_state the
# order would change on every run; the seed makes re-runs write identical files.
merged_df = merged_df.sample(frac=1, random_state=SHUFFLE_SEED)
merged_df.to_csv('../dataset/input/emails_set.csv')

merged_df = merged_df.rename(columns={'text': 'prompt', 'is_phishing': 'completion'})
merged_df.to_json('../dataset/input/emails_set.jsonl', orient='records', lines=True)