In [170]:
import os

import pandas as pd

# Dataset preparation for the pretrained OpenAI model

In [171]:

# Build the prompt/completion JSONL for the batch set: keep only the e-mail
# text and its label, rename them to the names the fine-tuning tooling uses,
# and discard rows where either value is missing.
df = (
    pd.read_csv("../dataset/input/raw/emails_set_batch.csv", index_col=0)
    .loc[:, ["text", "is_phishing"]]
    .rename(columns={"text": "prompt", "is_phishing": "completion"})
    .dropna()
)

# One JSON object per line (JSON Lines).
df.to_json("../dataset/input/emails_set_batch.jsonl", orient="records", lines=True)

In [172]:
# Run the OpenAI CLI data-preparation tool on the batch JSONL written by the previous cell.
# The merge cell below looks for `emails_set_batch_prepared_train.jsonl` and
# `emails_set_batch_prepared_valid.jsonl` next to the input file afterwards.
# NOTE(review): `-q` presumably auto-accepts the tool's suggestions without prompting - confirm.
!openai tools fine_tunes.prepare_data -f '../dataset/input/emails_set_batch.jsonl' -q

Analyzing...

- Your file contains 326 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- There are 1 duplicated prompt-completion sets. These are rows: [140]
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See 

### Merge the files produced by the OpenAI data preparation tool

In [173]:
# The pretrained model is used as-is (no fine-tuning), so a train/validation
# split is unnecessary: fold the two prepared files back into a single JSONL
# and delete the intermediate files.

prepared_valid_jsonl = '../dataset/input/emails_set_batch_prepared_valid.jsonl'
prepared_train_jsonl = '../dataset/input/emails_set_batch_prepared_train.jsonl'

# NOTE(review): only the validation file is checked; the train file is assumed
# to exist whenever the validation file does.
if os.path.exists(prepared_valid_jsonl):
    # Validation rows first, then training rows.
    # NOTE(review): unlike the later loading cell, `completion` is not pinned to
    # `str` here, so pandas may infer a numeric dtype for it - confirm this is intended.
    df_first_batch, df_second_batch = (
        pd.read_json(jsonl_path, lines=True)
        for jsonl_path in (prepared_valid_jsonl, prepared_train_jsonl)
    )
    df = pd.concat((df_first_batch, df_second_batch), ignore_index=True)
    df.to_json('../dataset/input/emails_set_batch.jsonl', orient='records', lines=True)

    # The combined file replaces the two intermediates.
    for _intermediate in (prepared_valid_jsonl, prepared_train_jsonl):
        os.remove(_intermediate)


# Main dataset preparation

In [174]:
# Load the three raw sources that make up the main dataset.
df_mixed = pd.read_csv("../dataset/input/raw/emails_set_batch.csv", index_col=0)
df_phishing = pd.read_csv("../dataset/input/raw/phishing_emails_formatted.csv", index_col=False)
df_legitimate = pd.read_csv("../dataset/input/raw/spam_ham_emails.csv", index_col=0)

# Every row of this source is labelled as not phishing.
df_legitimate["is_phishing"] = 0

merged_df = pd.concat([df_mixed, df_phishing, df_legitimate], ignore_index=True)

# Keep only the text and its label, and drop exact duplicate rows.
# NOTE(review): unlike the batch preparation cell, rows with missing values are
# not dropped here - confirm that is intended.
merged_df = merged_df.loc[:, ['text', 'is_phishing']]
merged_df = merged_df.drop_duplicates()

# Use the prompt/completion column names expected by the fine-tuning tooling.
merged_df = merged_df.rename(columns={'text': 'prompt', 'is_phishing': 'completion'})
merged_df.to_json('../dataset/input/emails_set.jsonl', orient='records', lines=True)

prepared_valid_jsonl = '../dataset/input/emails_set_prepared_valid.jsonl'
prepared_train_jsonl = '../dataset/input/emails_set_prepared_train.jsonl'

# Remove outputs left over from a previous run so the following cells read
# freshly generated files. Each file is checked on its own: a lone leftover
# train file is still cleaned up, and a missing train file no longer makes
# `os.remove` raise.
# NOTE(review): the prepare_data tool appears to pick a different output name
# when either target file already exists - confirm against the CLI version in use.
for stale_path in (prepared_valid_jsonl, prepared_train_jsonl):
    if os.path.exists(stale_path):
        os.remove(stale_path)

In [175]:
# Run the OpenAI CLI data-preparation tool on the merged JSONL written by the previous cell.
# The next cell reads `emails_set_prepared_train.jsonl` and `emails_set_prepared_valid.jsonl`.
# NOTE(review): `-q` presumably auto-accepts the tool's suggestions without prompting - confirm.
!openai tools fine_tunes.prepare_data -f '../dataset/input/emails_set.jsonl' -q

Analyzing...

- Your file contains 5507 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- There are 16 examples that are very long. These are rows: [732, 1333, 1454, 2470, 2527, 3210, 3380, 3503, 3933, 3979, 4640, 4676, 5150, 5257, 5383, 5457]
For conditional generation, and for classification the examples shouldn't be longer than 2048 tokens.
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended ge

In [176]:
import tiktoken
from sklearn.model_selection import train_test_split

# Read both prepared files, forcing `completion` to stay a string so pandas
# does not infer another dtype for the labels.
df_first_batch = pd.read_json(prepared_train_jsonl, lines=True, dtype={'completion': str})
df_second_batch = pd.read_json(prepared_valid_jsonl, lines=True, dtype={'completion': str})

# Combine, order by label (descending string order) and keep the first
# 2000 rows with a clean 0..n-1 index.
df = (
    pd.concat([df_first_batch, df_second_batch], ignore_index=True)
    .sort_values('completion', ascending=False)
    .head(2000)
    .reset_index(drop=True)
)

In [177]:
# Tokenizer encoding used to measure prompt length.
ENCODING_NAME = "cl100k_base"

# Prompts with more tokens than this are dropped.
MAX_TOKENS = 8000

encoding = tiktoken.get_encoding(ENCODING_NAME)

# Token count of every prompt, stored alongside the data.
df["n_tokens"] = df["prompt"].map(lambda prompt_text: len(encoding.encode(prompt_text)))

# Keep only the prompts that fit within the limit.
within_limit = df["n_tokens"] <= MAX_TOKENS
df = df[within_limit]

In [178]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

In [179]:
# Datasets for fine-tuning
# The fine-tuning files contain only the prompt/completion pairs, so the
# helper token-count column is left out. `drop` returns new frames; `df_train`
# and `df_test` themselves are not modified.
df_fine_tune_train = df_train.drop(columns=['n_tokens'])
df_fine_tune_test = df_test.drop(columns=['n_tokens'])

# Overwrite the prepared train/validation files with the new split.
df_fine_tune_train.to_json(prepared_train_jsonl, orient='records', lines=True)
df_fine_tune_test.to_json(prepared_valid_jsonl, orient='records', lines=True)

In [180]:
# Datasets for the rest of models
# These models use a `text` column and a boolean `is_phishing` column instead
# of the prompt/completion naming used for fine-tuning.

# Accepted labels after stripping surrounding whitespace (the prepared files
# store them as " 0" / " 1").
LABEL_TO_BOOL = {"0": False, "1": True}


def _to_classifier_frame(frame):
    """Return a new frame with `text` and boolean `is_phishing` columns.

    The `completion` labels are stripped of surrounding whitespace and mapped
    through `LABEL_TO_BOOL`. Any other value raises instead of being coerced:
    a plain `astype(bool)` would turn every non-empty string (and NaN) into
    `True`, silently mislabelling the row as phishing.

    Parameters:
        frame: DataFrame with `prompt` and `completion` columns; other columns
            are kept unchanged.

    Returns:
        A new DataFrame; `frame` itself is not modified, which also avoids
        pandas' SettingWithCopyWarning on the split frames.

    Raises:
        ValueError: if a `completion` value is not "0" or "1" after stripping.
    """
    labels = frame["completion"].astype(str).str.strip().map(LABEL_TO_BOOL)
    unmapped = labels.isna()
    if unmapped.any():
        unexpected = sorted(frame.loc[unmapped, "completion"].astype(str).unique())
        raise ValueError(f"Unexpected completion labels: {unexpected}")
    return (
        frame
        .assign(completion=labels.astype(bool))
        .rename(columns={"prompt": "text", "completion": "is_phishing"})
    )


df_train = _to_classifier_frame(df_train)
df_test = _to_classifier_frame(df_test)

df_train.to_json('../dataset/input/emails_set_train.jsonl', orient='records', lines=True)
df_test.to_json('../dataset/input/emails_set_test.jsonl', orient='records', lines=True)