In [None]:
import pandas as pd
import numpy as np
import os

# pandasai imports
from pandasai.llm.openai import OpenAI
from pandasai import PandasAI

from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

# scikit llm imports
from skllm.config import SKLLMConfig
from skllm import (
    ZeroShotGPTClassifier,
    FewShotGPTClassifier,
    DynamicFewShotGPTClassifier,
)

from dotenv import load_dotenv

In [None]:
# Load variables from a local .env file (if one is found) into the process environment.
load_dotenv()

# Fail fast with a clear message instead of handing a placeholder string to the
# OpenAI clients, which would only surface later as an opaque authentication error.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file before running this notebook."
    )

# Load data

In [None]:
# Read the Mirai and benign numeric flow pickles in one pass.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
mirai_flow_df_numeric, benign_flow_df_numeric = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../data/blog_eda/mirai_flow_numeric.pkl",
        "../data/blog_eda/benign_flow_numeric.pkl",
    )
)

# EDA with Pandas AI

In [None]:
# Build the OpenAI client from the key loaded above, then wrap it in PandasAI;
# `pandas_ai` is the object every prompt in the following cells goes through.
llm = OpenAI(
    api_token=openai_api_key,
)
pandas_ai = PandasAI(llm)

In [None]:
# Prompt PandasAI to clean the Mirai flow data and keep whatever it returns.
mirai_clean_df = pandas_ai.run(
    mirai_flow_df_numeric,
    prompt="Copy the dataframe to a new variable named df_cleaned. Do data cleaning. Return df_cleaned.",
)

In [None]:
# Prompt PandasAI to clean the benign flow data and keep whatever it returns.
benign_clean_df = pandas_ai.run(
    benign_flow_df_numeric,
    prompt="Copy the dataframe to a new variable named df_cleaned. Do data cleaning. Return df_cleaned.",
)


In [None]:
# Display the value PandasAI returned for the Mirai cleaning prompt so it can be inspected.
mirai_clean_df

In [None]:
# Display the value PandasAI returned for the benign cleaning prompt so it can be inspected.
benign_clean_df

In [None]:
# Ask PandasAI which five source IP addresses occur most often in the cleaned Mirai data.
# The explicit `.run(...)` form is used so this cell matches the other PandasAI
# calls in the notebook instead of relying on the object being callable.
top_5_source_IPs = pandas_ai.run(
    mirai_clean_df, prompt="Which are the 5 most popular source IP addresses?"
)
top_5_source_IPs

In [None]:
# Ask PandasAI which five destination ports occur most often in the cleaned Mirai data.
# The explicit `.run(...)` form is used so this cell matches the other PandasAI
# calls in the notebook instead of relying on the object being callable.
top_5_dst_ports = pandas_ai.run(
    mirai_clean_df, prompt="Which are the 5 most popular destination ports?"
)
top_5_dst_ports

In [None]:
# Have PandasAI draw stream duration against packet count for the cleaned Mirai data.
pandas_ai.run(mirai_clean_df, prompt="Plot the scatter plot of stream durations and number of packets.")

In [None]:
# Have PandasAI chart the ten most common destination ports in the cleaned benign data.
pandas_ai.run(
    benign_clean_df,
    prompt="Plot a barplot of top 10 destination ports.",
)


# Feature Engineering with Pandas AI

In [None]:
# Generate additional features for the cleaned Mirai data.
# The variable produced by the cleaning cell is `mirai_clean_df`; the previous
# spelling `mirai_cleaned_df` was never defined and raised NameError on a fresh kernel.
# NOTE(review): confirm the object returned by `pandas_ai.run(...)` exposes
# `generate_features()` in the installed PandasAI version — some releases may offer
# it as `pandas_ai.generate_features(df)` instead.
mirai_clean_df.generate_features()

In [None]:
# Generate additional features for the cleaned benign data.
# NOTE(review): confirm the object returned by `pandas_ai.run(...)` exposes
# `generate_features()` in the installed PandasAI version — some releases may offer
# it as `pandas_ai.generate_features(df)` instead.
benign_clean_df.generate_features()

# Labeling
We label and concatenate the benign and malicious datasets *before* one-hot encoding because each dataset contains a different set of ports: encoding them separately would produce mismatched columns, and the two frames could not be concatenated cleanly afterwards.

In [None]:
# Read the Mirai and benign pickles used for payload labelling.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
mirai_df, benign_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../data/blog_eda/mirai.pkl",
        "../data/blog_eda/benign.pkl",
    )
)

In [None]:
# Tag every row with its class in a new "Labels" column:
#   0 -> benign traffic
#   1 -> malicious (Mirai) traffic
benign_df["Labels"] = 0
mirai_df["Labels"] = 1

In [None]:
# Keep only the payload column and its label from each source frame.
mirai_payloads = mirai_df.loc[:, ["Payload", "Labels"]]
benign_payloads = benign_df.loc[:, ["Payload", "Labels"]]

In [None]:
# Stack Mirai rows on top of benign rows and renumber the index from zero.
payloads = pd.concat(
    [mirai_payloads, benign_payloads],
    axis=0,
    ignore_index=True,
)


# AI Classifier

In [None]:
# Draw a fixed 100-row random sample (seeded, so the same rows are picked on every run).
payloads_subset = payloads.sample(n=100, random_state=1)

# Separate the sample into classifier inputs (Payload column) and targets (Labels column).
X_llm, y_llm = payloads_subset["Payload"], payloads_subset["Labels"]

In [None]:
# Give scikit-llm the OpenAI API key loaded earlier in the notebook.
SKLLMConfig.set_openai_key(openai_api_key)

# Read the organization ID from the environment so the notebook is not tied to one
# account. The previously hard-coded ID stays as the fallback, so behaviour is
# unchanged when OPENAI_ORG_ID is not set.
SKLLMConfig.set_openai_org(
    os.getenv("OPENAI_ORG_ID", "org-HlcxSARQUphcO0tUGmtJJOpD")
)

In [None]:
# Zero-shot run: fit the classifier on the 100-row sample, predict labels for the
# same payloads, and report the accuracy against the known labels.
# NOTE(review): y_llm holds the integer labels 0/1 assigned earlier; confirm that
# `predict` returns labels of the same type, otherwise accuracy_score compares
# mismatched values.
clf = ZeroShotGPTClassifier(openai_model="gpt-3.5-turbo")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_llm)

# Accuracy is printed rounded to two decimal places.
print(f"Accuracy: {accuracy_score(y_llm, labels):.2f}")

In [None]:
# Few-shot run. Fitting and scoring on the same rows lets the classifier see the
# answers for the very payloads it is asked to label (per the scikit-llm docs, the
# fitted samples serve as the few-shot examples), which inflates the accuracy.
# Instead, set aside 20% of the sample per label as examples and score on the
# remaining rows only.
few_shot_examples = payloads_subset.groupby("Labels").sample(frac=0.2, random_state=1)
held_out = payloads_subset.drop(index=few_shot_examples.index)
y_held_out = held_out["Labels"]

clf = FewShotGPTClassifier(openai_model="gpt-3.5-turbo")
clf.fit(few_shot_examples["Payload"], few_shot_examples["Labels"])
labels = clf.predict(held_out["Payload"])

# Accuracy on the held-out rows, rounded to two decimal places.
print(f"Accuracy: {accuracy_score(y_held_out, labels):.2f}")