In [1]:
import pandas as pd
import os

from dotenv import load_dotenv
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle


# scikit llm imports
from skllm.config import SKLLMConfig
from skllm import (
    ZeroShotGPTClassifier,
    FewShotGPTClassifier,
)

In [2]:
# finds .env file and loads the vars
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY", "Key not found")
openai_org = os.getenv("OPENAI_ORG", "Organization not found")

In [3]:
# If this is set to true, we use old data and pickle files that have been created
READ_FROM_PICKLE = True

In [4]:
if READ_FROM_PICKLE:
    mirai_cleaned_df = pd.read_pickle("data/mirai_cleaned.pkl")
    benign_clean_df = pd.read_pickle("data/benign_cleaned.pkl")

In [5]:
# add labels, 0 for benign, 1 for malicious
mirai_cleaned_df["Labels"] = 1 
benign_clean_df["Labels"] = 0

In [6]:
mirai_payloads = mirai_cleaned_df[["Payload", "Labels"]]
benign_payloads = benign_clean_df[["Payload", "Labels"]]

In [7]:
payloads = pd.concat([mirai_payloads, benign_payloads], ignore_index=True)

In [8]:
payloads = shuffle(payloads)

In [9]:
# use a subset because this takes loooong!
X_llm = payloads["Payload"].head(50)
y_llm = payloads["Labels"].head(50)
X_test = payloads["Payload"].tail(10)
y_test = payloads["Labels"].tail(10)

In [10]:
SKLLMConfig.set_openai_key(openai_api_key)
SKLLMConfig.set_openai_org(openai_org)

In [11]:
clf = ZeroShotGPTClassifier(openai_model="gpt-3.5-turbo")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_llm)

print(f"Accuracy: {accuracy_score(y_llm, labels):.2f}")

100%|██████████| 50/50 [00:35<00:00,  1.40it/s]

Accuracy: 0.92





In [12]:
clf = ZeroShotGPTClassifier(openai_model="gpt-4")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_llm)

print(f"Accuracy: {accuracy_score(y_llm, labels):.2f}")

100%|██████████| 50/50 [01:04<00:00,  1.28s/it]

Accuracy: 0.92





In [13]:
clf = FewShotGPTClassifier(openai_model="gpt-3.5-turbo")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_llm)

print(f"Accuracy: {accuracy_score(y_llm, labels):.2f}")

100%|██████████| 50/50 [00:39<00:00,  1.26it/s]

Accuracy: 0.94





In [14]:
clf = FewShotGPTClassifier(openai_model="gpt-4")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_llm)

print(f"Accuracy: {accuracy_score(y_llm, labels):.2f}")

100%|██████████| 50/50 [01:00<00:00,  1.20s/it]

Accuracy: 0.94





In [15]:
from skllm.models.gpt import GPTClassifier

clf = GPTClassifier(
    base_model="gpt-3.5-turbo-0613",
    n_epochs=None,  # int or None. When None, will be determined automatically by OpenAI
    default_label="Random",  # optional
)

clf.fit(X_llm, y_llm)  # y_train is a list of labels
labels = clf.predict(X_test)

Created new file. FILE_ID = file-S8GD5GY5J96s97r3ojWbJMRu
Waiting for file to be processed ...
Created new tuning job. JOB_ID = ftjob-6vXArmtQliY1CP7KL47F59bR
[2023-11-30 09:51:02.454371] Waiting for tuning job to complete. Current status: validating_files
[2023-11-30 09:53:02.807526] Waiting for tuning job to complete. Current status: running
[2023-11-30 09:55:03.585185] Waiting for tuning job to complete. Current status: running
[2023-11-30 09:57:03.891077] Waiting for tuning job to complete. Current status: running
Finished training. Number of trained tokens: 26733.


100%|██████████| 10/10 [00:09<00:00,  1.05it/s]


In [16]:
print(f"Accuracy: {accuracy_score(y_test, labels):.2f}")

Accuracy: 1.00
