In [1]:
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding

In [2]:
# --- Files ---------------------------------------------------------------
# CSV read as input, and CSV that receives the rows plus their embeddings
INPUT_FILE_PATH: str = "../dataset/emails_set.csv"
OUTPUT_FILE_PATH: str = "../dataset/embeddings.csv"

# --- Embedding setup -----------------------------------------------------
# Name of the embedding model and of the tokenizer encoding used with it
MODEL_NAME: str = "text-embedding-ada-002"
ENCODING_NAME: str = "cl100k_base"

# Upper bound on the number of tokens allowed in a single text sample
MAX_TOKENS: int = 8_000

In [None]:
# Load only the two columns this notebook uses and drop rows missing either one.
# `index_col=0` is deliberately not passed: combined with `usecols`, pandas
# resolves an integer `index_col` against the *selected* columns, so one of
# "text" / "is_phishing" would become the index. `df.text` would then raise, or
# the label would sit in the index where `dropna()` does not inspect it.
df = pd.read_csv(INPUT_FILE_PATH, usecols=["text", "is_phishing"]).dropna()

# Tokenizer for the encoding named in ENCODING_NAME
encoding = tiktoken.get_encoding(ENCODING_NAME)

# Record the token count of every text sample in a new column
df["n_tokens"] = df["text"].map(lambda text: len(encoding.encode(text)))

# Keep only the samples within the token budget. The explicit copy makes the
# result an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
df = df.loc[df["n_tokens"] <= MAX_TOKENS].copy()

In [None]:
def _embed_text(sample):
    """Return the embedding of one text sample, requested from MODEL_NAME."""
    return get_embedding(sample, engine=MODEL_NAME)


# Compute one embedding per row and keep it next to the text it came from
df["embedding"] = df["text"].apply(_embed_text)

# Persist the full frame, embeddings included, as CSV
df.to_csv(OUTPUT_FILE_PATH)