In [1]:
import pandas as pd
import os

from dotenv import load_dotenv
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle, resample


# scikit llm imports
from skllm.config import SKLLMConfig
from skllm.models.gpt.classification.zero_shot import ZeroShotGPTClassifier
from skllm.models.gpt.classification.few_shot import FewShotGPTClassifier

In [2]:
# finds .env file and loads the vars
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY", "Key not found")
openai_org = os.getenv("OPENAI_ORG", "Organization not found")

# Load features

In [3]:
# If this is set to true, we use old data and pickle files that have been created
READ_FROM_PICKLE = True

In [4]:
if READ_FROM_PICKLE:
    malicious_df = pd.read_pickle("data/malicious_features_numeric.pkl")
    benign_df = pd.read_pickle("data/benign_features_numeric.pkl")

# Labeling

In [5]:
# add labels, 0 for benign, 1 for malicious
malicious_df["label"] = 1 
benign_df["label"] = 0

# Everything is text

In [6]:
malicious_df["embedding"] = malicious_df.apply(lambda row: ','.join(row.astype(str)), axis=1)
benign_df["embedding"] = benign_df.apply(lambda row: ','.join(row.astype(str)), axis=1)

In [7]:
data = pd.concat([malicious_df[["embedding", "label"]], benign_df[["embedding", "label"]]], ignore_index=True)

In [8]:
data = shuffle(data)

# Downsample

In [10]:
# Assuming you have your data loaded into a Pandas DataFrame called 'data'

# Separate majority and minority classes
majority_class = data[data["label"] == 1]
minority_class = data[data["label"] == 0]

# Downsample majority class
downsampled_majority = resample(
    majority_class,
    replace=False,  # sample without replacement
    n_samples=len(minority_class),  # to match minority class
    random_state=42,
)  # reproducible results

# Combine minority class with downsampled majority class
downsampled_data = pd.concat([downsampled_majority, minority_class])

# Shuffle the data
downsampled_data = downsampled_data.sample(frac=1, random_state=42)

# Now downsampled_data contains your downsampled dataset

In [41]:
# use a subset because this takes loooong!
data_size = 200
X_llm = downsampled_data["embedding"].head(data_size)
y_llm = downsampled_data["label"].head(data_size)
X_test = downsampled_data["embedding"].tail(int(data_size/10))
y_test = downsampled_data["label"].tail(int(data_size/10))

In [42]:
SKLLMConfig.set_openai_key(openai_api_key)
SKLLMConfig.set_openai_org(openai_org)

# Models 

## ZeroShotClassifier

In [43]:
clf = ZeroShotGPTClassifier(openai_model="gpt-3.5-turbo")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, labels):.2f}")

100%|██████████| 20/20 [00:13<00:00,  1.47it/s]

Accuracy: 0.65





In [44]:
clf = ZeroShotGPTClassifier(openai_model="gpt-4o")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, labels):.2f}")

100%|██████████| 20/20 [00:14<00:00,  1.41it/s]

Accuracy: 0.65





## FewShotClassifier

In [45]:
clf = FewShotGPTClassifier(openai_model="gpt-3.5-turbo")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, labels):.2f}")

100%|██████████| 20/20 [00:40<00:00,  2.01s/it]

Accuracy: 1.00





In [46]:
clf = FewShotGPTClassifier(openai_model="gpt-4o")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, labels):.2f}")

100%|██████████| 20/20 [01:06<00:00,  3.30s/it]

Accuracy: 1.00





# Fine Tuning

In [47]:
from skllm.models.gpt.classification.tunable import GPTClassifier

clf = GPTClassifier(
    base_model="gpt-3.5-turbo-0613",
    n_epochs=None,  # int or None. When None, will be determined automatically by OpenAI
    default_label="Random",  # optional
)

clf.fit(X_llm, y_llm)  # y_train is a list of labels
labels = clf.predict(X_test)

Created new file. FILE_ID = file-2eMLYaiNVXmKJmSUvY1ihdSu
Waiting for file to be processed ...
Created new tuning job. JOB_ID = ftjob-o2HJHHWHlKXTi3nViIMkEldk
[2024-05-14 12:35:27.696927] Waiting for tuning job to complete. Current status: validating_files
[2024-05-14 12:37:28.101050] Waiting for tuning job to complete. Current status: running
[2024-05-14 12:39:28.503517] Waiting for tuning job to complete. Current status: running
[2024-05-14 12:41:28.793175] Waiting for tuning job to complete. Current status: running
[2024-05-14 12:43:29.183955] Waiting for tuning job to complete. Current status: running
[2024-05-14 12:45:29.468900] Waiting for tuning job to complete. Current status: running
[2024-05-14 12:47:29.944076] Waiting for tuning job to complete. Current status: running
[2024-05-14 12:49:30.140971] Waiting for tuning job to complete. Current status: running
[2024-05-14 12:51:30.584813] Waiting for tuning job to complete. Current status: running
[2024-05-14 12:53:30.902029] Wa

100%|██████████| 20/20 [00:19<00:00,  1.02it/s]


In [48]:
print(f"Accuracy: {accuracy_score(y_test, labels):.2f}")

Accuracy: 1.00
