In [4]:
import pandas as pd
import os

from dotenv import load_dotenv
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle


# scikit llm imports
from skllm.config import SKLLMConfig
from skllm.models.gpt.classification.zero_shot import ZeroShotGPTClassifier
from skllm.models.gpt.classification.few_shot import FewShotGPTClassifier

In [5]:
# finds .env file and loads the vars
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY", "Key not found")
openai_org = os.getenv("OPENAI_ORG", "Organization not found")

In [6]:
# If this is set to true, we use old data and pickle files that have been created
READ_FROM_PICKLE = True

In [7]:
if READ_FROM_PICKLE:
    mirai_cleaned_df = pd.read_pickle("data/mirai_cleaned.pkl")
    benign_clean_df = pd.read_pickle("data/benign_cleaned.pkl")

In [8]:
# add labels, 0 for benign, 1 for malicious
mirai_cleaned_df["Labels"] = 1 
benign_clean_df["Labels"] = 0

In [9]:
mirai_payloads = mirai_cleaned_df[["Payload", "Labels"]]
benign_payloads = benign_clean_df[["Payload", "Labels"]]

In [10]:
payloads = pd.concat([mirai_payloads, benign_payloads], ignore_index=True)

In [11]:
payloads = shuffle(payloads)

In [12]:
# use a subset because this takes loooong!
X_llm = payloads["Payload"].head(50)
y_llm = payloads["Labels"].head(50)
X_test = payloads["Payload"].tail(10)
y_test = payloads["Labels"].tail(10)

In [13]:
SKLLMConfig.set_openai_key(openai_api_key)
SKLLMConfig.set_openai_org(openai_org)

In [14]:
clf = ZeroShotGPTClassifier(openai_model="gpt-3.5-turbo")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_llm)

print(f"Accuracy: {accuracy_score(y_llm, labels):.2f}")

100%|██████████| 50/50 [00:47<00:00,  1.06it/s]

Accuracy: 0.82





In [15]:
clf = ZeroShotGPTClassifier(openai_model="gpt-4")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_llm)

print(f"Accuracy: {accuracy_score(y_llm, labels):.2f}")

100%|██████████| 50/50 [00:41<00:00,  1.19it/s]

Accuracy: 0.84





In [16]:
clf = FewShotGPTClassifier(openai_model="gpt-3.5-turbo")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_llm)

print(f"Accuracy: {accuracy_score(y_llm, labels):.2f}")

100%|██████████| 50/50 [00:42<00:00,  1.18it/s]

Accuracy: 0.94





In [17]:
clf = FewShotGPTClassifier(openai_model="gpt-4")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_llm)

print(f"Accuracy: {accuracy_score(y_llm, labels):.2f}")

100%|██████████| 50/50 [00:43<00:00,  1.15it/s]

Accuracy: 0.94





In [18]:
from skllm.models.gpt.classification.tunable import GPTClassifier

clf = GPTClassifier(
    base_model="gpt-3.5-turbo-0613",
    n_epochs=None,  # int or None. When None, will be determined automatically by OpenAI
    default_label="Random",  # optional
)

clf.fit(X_llm, y_llm)  # y_train is a list of labels
labels = clf.predict(X_test)

Created new file. FILE_ID = file-QTaop5haWIfaOuZAQnsRSl5G
Waiting for file to be processed ...
Created new tuning job. JOB_ID = ftjob-vNoYVVcfLCkvbwhBn2eMAnZ1
[2024-04-24 10:01:01.644773] Waiting for tuning job to complete. Current status: validating_files
[2024-04-24 10:03:01.986201] Waiting for tuning job to complete. Current status: queued
[2024-04-24 10:05:02.380409] Waiting for tuning job to complete. Current status: running
[2024-04-24 10:07:02.785078] Waiting for tuning job to complete. Current status: running
[2024-04-24 10:09:03.136701] Waiting for tuning job to complete. Current status: running
[2024-04-24 10:11:03.477622] Waiting for tuning job to complete. Current status: running
Finished training.


100%|██████████| 10/10 [00:12<00:00,  1.26s/it]


In [19]:
print(f"Accuracy: {accuracy_score(y_test, labels):.2f}")

Accuracy: 0.80
