In [None]:
import pandas as pd
import os

from dotenv import load_dotenv
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils import shuffle, resample


# scikit llm imports
from skllm.config import SKLLMConfig
from skllm.models.gpt.classification.zero_shot import ZeroShotGPTClassifier
from skllm.models.gpt.classification.few_shot import FewShotGPTClassifier
from skllm.models.vertex.classification.zero_shot import ZeroShotVertexClassifier

In [None]:
# Load variables from a .env file into the process environment, then read the
# OpenAI credentials. A placeholder string is used for any variable that is
# not set.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY", "Key not found")
openai_org = os.environ.get("OPENAI_ORG", "Organization not found")

# Load features

In [None]:
# Switch for the data-loading cell: when True, the previously created feature
# pickle files are read from disk instead of using freshly built data.
READ_FROM_PICKLE = True

In [None]:
# Load the numeric feature tables for the malicious and the benign class.
# NOTE: unpickling can execute arbitrary code; only read pickle files that were
# produced by this project.
if READ_FROM_PICKLE:
    malicious_df = pd.read_pickle("data/malicious_features_numeric.pkl")
    benign_df = pd.read_pickle("data/benign_features_numeric.pkl")
else:
    # This notebook has no other loading path, so stop here with a clear
    # message instead of failing later with a NameError on malicious_df.
    raise NotImplementedError(
        "READ_FROM_PICKLE is False, but this notebook only supports loading "
        "the features from the pickle files under data/."
    )

# Labeling

In [None]:
# Attach the class label to every row: 1 marks malicious, 0 marks benign.
for frame, class_label in ((malicious_df, 1), (benign_df, 0)):
    frame["label"] = class_label

# Everything is text

In [None]:
# Serialize each row's feature values into one comma-separated string that is
# later passed to the LLM classifiers as input text.
#
# The "label" column must not be part of that text: it was previously joined in
# as the last field, which handed the ground truth to the model. A leftover
# "embedding" column from an earlier run of this cell is skipped as well, so
# re-running the cell produces the same strings.
NON_FEATURE_COLUMNS = ["label", "embedding"]

for frame in (malicious_df, benign_df):
    # errors="ignore": "embedding" does not exist yet on the first run.
    feature_values = frame.drop(columns=NON_FEATURE_COLUMNS, errors="ignore")
    frame["embedding"] = feature_values.apply(
        lambda row: ",".join(row.astype(str)), axis=1
    )

In [None]:
# Stack both classes into one frame, keeping only the text and the label, and
# renumber the rows from 0.
model_columns = ["embedding", "label"]
class_frames = [frame[model_columns] for frame in (malicious_df, benign_df)]
data = pd.concat(class_frames, ignore_index=True)

In [None]:
# Shuffle the rows so the two classes are mixed. A fixed seed (the same one the
# resampling cell uses) keeps the later head/tail train-test subsets identical
# between runs; without it every run evaluated the models on different rows.
data = shuffle(data, random_state=42)

# Downsample

In [None]:
# Balance the two classes by downsampling the larger class to the size of the
# smaller one.
malicious_rows = data[data["label"] == 1]
benign_rows = data[data["label"] == 0]

# Decide majority/minority from the actual row counts instead of assuming that
# label 1 is the larger class: sampling without replacement raises an error
# when more rows are requested than exist. On a tie label 1 is treated as the
# majority, which matches the previous behaviour.
if len(malicious_rows) >= len(benign_rows):
    majority_class, minority_class = malicious_rows, benign_rows
else:
    majority_class, minority_class = benign_rows, malicious_rows

# Downsample majority class
downsampled_majority = resample(
    majority_class,
    replace=False,  # sample without replacement
    n_samples=len(minority_class),  # to match minority class
    random_state=42,  # reproducible results
)

# Combine minority class with downsampled majority class, then shuffle so the
# classes are interleaved.
downsampled_data = pd.concat([downsampled_majority, minority_class])
downsampled_data = downsampled_data.sample(frac=1, random_state=42)

In [None]:
# use a subset because this takes loooong!
data_size = 2000
test_size = int(data_size / 10)

# Take the subset from the class-balanced frame built in the downsampling step.
# Previously `data` was read here, which left `downsampled_data` unused and ran
# the evaluation on the non-downsampled rows.
#
# The training slice is capped so that head() and tail() can never share rows
# when fewer than data_size + test_size rows are available; otherwise test rows
# would also be used for fitting.
train_size = max(min(data_size, len(downsampled_data) - test_size), 0)

X_llm = downsampled_data["embedding"].head(train_size)
y_llm = downsampled_data["label"].head(train_size)
X_test = downsampled_data["embedding"].tail(test_size)
y_test = downsampled_data["label"].tail(test_size)

In [None]:
# Hand the OpenAI credentials read from the environment to scikit-llm.
openai_settings = (
    (SKLLMConfig.set_openai_key, openai_api_key),
    (SKLLMConfig.set_openai_org, openai_org),
)
for apply_setting, setting_value in openai_settings:
    apply_setting(setting_value)

# Models

## OpenAI

In [None]:
# all_metrics = []

### ZeroShotClassifier

In [None]:
# clf = ZeroShotGPTClassifier(openai_model="gpt-3.5-turbo")
# clf.fit(X_llm, y_llm)
# labels = clf.predict(X_test)

# accuracy = accuracy_score(y_test, labels)
# prf = precision_recall_fscore_support(y_test, labels)
# metrics_dict = {
#     "gpt-3.5-turbo-zero-shot": {
#         "Accuracy": accuracy,
#         "Precision": float(prf[0][1]),
#         "Recall": float(prf[1][1]),
#         "F1": float(prf[2][1]),
#     }
# }
# all_metrics.append(metrics_dict)

In [None]:
# clf = ZeroShotGPTClassifier(openai_model="gpt-4o")
# clf.fit(X_llm, y_llm)
# labels = clf.predict(X_test)

# accuracy = accuracy_score(y_test, labels)
# prf = precision_recall_fscore_support(y_test, labels)
# metrics_dict = {
#     "gpt-4o-zero-shot": {
#         "Accuracy": accuracy,
#         "Precision": float(prf[0][1]),
#         "Recall": float(prf[1][1]),
#         "F1": float(prf[2][1]),
#     }
# }
# all_metrics.append(metrics_dict)

### FewShotClassifier

In [None]:
# clf = FewShotGPTClassifier(openai_model="gpt-3.5-turbo")
# clf.fit(X_llm, y_llm)
# labels = clf.predict(X_test)

# accuracy = accuracy_score(y_test, labels)
# prf = precision_recall_fscore_support(y_test, labels)
# metrics_dict = {
#     "gpt-3.5-turbo-few-shot": {
#         "Accuracy": accuracy,
#         "Precision": float(prf[0][1]),
#         "Recall": float(prf[1][1]),
#         "F1": float(prf[2][1]),
#     }
# }
# all_metrics.append(metrics_dict)

In [None]:
# clf = FewShotGPTClassifier(openai_model="gpt-4o")
# clf.fit(X_llm, y_llm)
# labels = clf.predict(X_test)

# accuracy = accuracy_score(y_test, labels)
# prf = precision_recall_fscore_support(y_test, labels)
# metrics_dict = {
#     "gpt-4o-few-shot": {
#         "Accuracy": accuracy,
#         "Precision": float(prf[0][1]),
#         "Recall": float(prf[1][1]),
#         "F1": float(prf[2][1]),
#     }
# }
# all_metrics.append(metrics_dict)

In [None]:
# all_metrics

## VertexAI

In [None]:
# Result collectors, each a list of {model name: {metric: score}} dicts:
# one for the combined comparison, one per prompting strategy.
all_metrics, zero_shot_metrics, few_shot_metrics = [], [], []

In [None]:
# Google Cloud project used by the Vertex AI classifiers. It can be overridden
# through the GOOGLE_CLOUD_PROJECT environment variable (for example in .env);
# the formerly hard-coded project remains the default.
SKLLMConfig.set_google_project(
    os.getenv("GOOGLE_CLOUD_PROJECT", "expel-engineering-internal")
)

In [None]:
# Zero-shot classification with the gemini-2.0-flash-001 model via Vertex AI.
clf = ZeroShotVertexClassifier(model="gemini-2.0-flash-001")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_test)

# Score the predictions on the test subset. The per-class arrays are read at
# index 1, i.e. the second label in sorted order (label 1 = malicious, assuming
# both classes occur in y_test / labels).
accuracy = accuracy_score(y_test, labels)
prf = precision_recall_fscore_support(y_test, labels)
precision, recall, f1 = (float(per_class[1]) for per_class in prf[:3])

metrics_dict = {
    "Gemini-zero-shot": dict(
        Accuracy=accuracy, Precision=precision, Recall=recall, F1=f1
    )
}
zero_shot_metrics.append(metrics_dict)

In [None]:
# Zero-shot classification with the chat-bison model via Vertex AI; the result
# is recorded under the name "Palm-zero-shot".
clf = ZeroShotVertexClassifier(model="chat-bison")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_test)

# Score the predictions on the test subset. The per-class arrays are read at
# index 1, i.e. the second label in sorted order (label 1 = malicious, assuming
# both classes occur in y_test / labels).
accuracy = accuracy_score(y_test, labels)
prf = precision_recall_fscore_support(y_test, labels)
precision, recall, f1 = (float(per_class[1]) for per_class in prf[:3])

metrics_dict = {
    "Palm-zero-shot": dict(
        Accuracy=accuracy, Precision=precision, Recall=recall, F1=f1
    )
}
zero_shot_metrics.append(metrics_dict)

In [None]:
# Display the zero-shot results collected so far.
zero_shot_metrics

## Ollama

In [None]:
from skollama.models.ollama.classification.zero_shot import ZeroShotOllamaClassifier
from skollama.models.ollama.classification.few_shot import FewShotOllamaClassifier

# Zero-shot classification with the llama3 model served by Ollama.
clf = ZeroShotOllamaClassifier(model="llama3")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_test)

# Score the predictions on the test subset. The per-class arrays are read at
# index 1, i.e. the second label in sorted order (label 1 = malicious, assuming
# both classes occur in y_test / labels).
accuracy = accuracy_score(y_test, labels)
prf = precision_recall_fscore_support(y_test, labels)
precision, recall, f1 = (float(per_class[1]) for per_class in prf[:3])

metrics_dict = {
    "LLama-zero-shot": dict(
        Accuracy=accuracy, Precision=precision, Recall=recall, F1=f1
    )
}
# Record the result for the zero-shot chart and for the combined chart.
for collector in (zero_shot_metrics, all_metrics):
    collector.append(metrics_dict)

In [None]:
# Few-shot classification with the llama3 model served by Ollama.
clf = FewShotOllamaClassifier(model="llama3")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_test)

# Score the predictions on the test subset. The per-class arrays are read at
# index 1, i.e. the second label in sorted order (label 1 = malicious, assuming
# both classes occur in y_test / labels).
accuracy = accuracy_score(y_test, labels)
prf = precision_recall_fscore_support(y_test, labels)
precision, recall, f1 = (float(per_class[1]) for per_class in prf[:3])

metrics_dict = {
    "LLama-few-shot": dict(
        Accuracy=accuracy, Precision=precision, Recall=recall, F1=f1
    )
}
# Record the result for the few-shot chart and for the combined chart.
for collector in (few_shot_metrics, all_metrics):
    collector.append(metrics_dict)

In [None]:
# Zero-shot classification with the mistral model served by Ollama.
clf = ZeroShotOllamaClassifier(model="mistral")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_test)

# Score the predictions on the test subset. The per-class arrays are read at
# index 1, i.e. the second label in sorted order (label 1 = malicious, assuming
# both classes occur in y_test / labels).
accuracy = accuracy_score(y_test, labels)
prf = precision_recall_fscore_support(y_test, labels)
precision, recall, f1 = (float(per_class[1]) for per_class in prf[:3])

metrics_dict = {
    "Mistral-zero-shot": dict(
        Accuracy=accuracy, Precision=precision, Recall=recall, F1=f1
    )
}
# Record the result for the zero-shot chart and for the combined chart.
for collector in (zero_shot_metrics, all_metrics):
    collector.append(metrics_dict)

In [None]:
# Few-shot classification with the mistral model served by Ollama.
clf = FewShotOllamaClassifier(model="mistral")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_test)

# Score the predictions on the test subset. The per-class arrays are read at
# index 1, i.e. the second label in sorted order (label 1 = malicious, assuming
# both classes occur in y_test / labels).
accuracy = accuracy_score(y_test, labels)
prf = precision_recall_fscore_support(y_test, labels)
precision, recall, f1 = (float(per_class[1]) for per_class in prf[:3])

metrics_dict = {
    "Mistral-few-shot": dict(
        Accuracy=accuracy, Precision=precision, Recall=recall, F1=f1
    )
}
# Record the result for the few-shot chart and for the combined chart.
for collector in (few_shot_metrics, all_metrics):
    collector.append(metrics_dict)

In [None]:
# Display the combined results collected so far.
all_metrics

In [None]:
# Zero-shot classification with the gemma model served by Ollama.
clf = ZeroShotOllamaClassifier(model="gemma")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_test)

# Score the predictions on the test subset. The per-class arrays are read at
# index 1, i.e. the second label in sorted order (label 1 = malicious, assuming
# both classes occur in y_test / labels).
accuracy = accuracy_score(y_test, labels)
prf = precision_recall_fscore_support(y_test, labels)
precision, recall, f1 = (float(per_class[1]) for per_class in prf[:3])

metrics_dict = {
    "Gemma-zero-shot": dict(
        Accuracy=accuracy, Precision=precision, Recall=recall, F1=f1
    )
}
# NOTE(review): unlike the other Ollama zero-shot cells, this result is only
# added to zero_shot_metrics and not to all_metrics - confirm this is intended.
zero_shot_metrics.append(metrics_dict)

In [None]:
# clf = FewShotOllamaClassifier(model="gemma")
# clf.fit(X_llm, y_llm)
# labels = clf.predict(X_test)

# accuracy = accuracy_score(y_test, labels)
# prf = precision_recall_fscore_support(y_test, labels)
# metrics_dict = {
#     "Gemma-few-shot": {
#         "Accuracy": accuracy,
#         "Precision": float(prf[0][1]),
#         "Recall": float(prf[1][1]),
#         "F1": float(prf[2][1]),
#     }
# }
# all_metrics.append(metrics_dict)

In [None]:
# Zero-shot classification with the phi model served by Ollama.
clf = ZeroShotOllamaClassifier(model="phi")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_test)

# Score the predictions on the test subset. The per-class arrays are read at
# index 1, i.e. the second label in sorted order (label 1 = malicious, assuming
# both classes occur in y_test / labels).
accuracy = accuracy_score(y_test, labels)
prf = precision_recall_fscore_support(y_test, labels)
precision, recall, f1 = (float(per_class[1]) for per_class in prf[:3])

metrics_dict = {
    "Phi-zero-shot": dict(
        Accuracy=accuracy, Precision=precision, Recall=recall, F1=f1
    )
}
# Record the result for the zero-shot chart and for the combined chart.
for collector in (zero_shot_metrics, all_metrics):
    collector.append(metrics_dict)

In [None]:
# Few-shot classification with the phi model served by Ollama.
clf = FewShotOllamaClassifier(model="phi")
clf.fit(X_llm, y_llm)
labels = clf.predict(X_test)

# Score the predictions on the test subset. The per-class arrays are read at
# index 1, i.e. the second label in sorted order (label 1 = malicious, assuming
# both classes occur in y_test / labels).
accuracy = accuracy_score(y_test, labels)
prf = precision_recall_fscore_support(y_test, labels)
precision, recall, f1 = (float(per_class[1]) for per_class in prf[:3])

metrics_dict = {
    "Phi-few-shot": dict(
        Accuracy=accuracy, Precision=precision, Recall=recall, F1=f1
    )
}
# Record the result for the few-shot chart and for the combined chart.
for collector in (few_shot_metrics, all_metrics):
    collector.append(metrics_dict)

In [None]:
# Display the combined results that feed the zero-shot vs few-shot chart.
all_metrics

In [None]:
# Convert to long-format DataFrame
all_scores_df_long = pd.DataFrame(
    [
        {"Model": list(item.keys())[0], "Metric": metric, "Score": value}
        for item in all_metrics 
        for metric, value in list(item.values())[0].items()
    ]
)
all_scores_df_long

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Grouped bar chart: one group per metric, one bar per model.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="Metric", y="Score", hue="Model", data=all_scores_df_long, ax=ax)

# Titles, axis labels and rotated tick labels.
ax.set_title("Classification Evaluation Zero Shot vs Few Shot.", fontsize=16)
ax.set_xlabel("Metrics", fontsize=12)
ax.set_ylabel("Score", fontsize=12)
ax.tick_params(axis="x", labelrotation=45)

# Place the legend outside the plotting area, to the right.
ax.legend(title="Model", bbox_to_anchor=(1.05, 1), loc="upper left")
fig.tight_layout()

plt.show()

In [None]:
# Convert to long-format DataFrame
zero_scores_df_long = pd.DataFrame(
    [
        {"Model": list(item.keys())[0], "Metric": metric, "Score": value}
        for item in zero_shot_metrics
        for metric, value in list(item.values())[0].items()
    ]
)
zero_scores_df_long

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Grouped bar chart: one group per metric, one bar per model.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="Metric", y="Score", hue="Model", data=zero_scores_df_long, ax=ax)

# Titles, axis labels and rotated tick labels.
ax.set_title(
    "Classification Evaluation for Different Models with Zero Shot.", fontsize=16
)
ax.set_xlabel("Metrics", fontsize=12)
ax.set_ylabel("Score", fontsize=12)
ax.tick_params(axis="x", labelrotation=45)

# Place the legend outside the plotting area, to the right.
ax.legend(title="Model", bbox_to_anchor=(1.05, 1), loc="upper left")
fig.tight_layout()

plt.show()

In [None]:
# Convert to long-format DataFrame
few_scores_df_long = pd.DataFrame(
    [
        {"Model": list(item.keys())[0], "Metric": metric, "Score": value}
        for item in few_shot_metrics
        for metric, value in list(item.values())[0].items()
    ]
)
zero_scores_df_long

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Grouped bar chart: one group per metric, one bar per model.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x="Metric", y="Score", hue="Model", data=few_scores_df_long, ax=ax)

# Titles, axis labels and rotated tick labels.
ax.set_title(
    "Classification Evaluation for Different Models with Few Shot.", fontsize=16
)
ax.set_xlabel("Metrics", fontsize=12)
ax.set_ylabel("Score", fontsize=12)
ax.tick_params(axis="x", labelrotation=45)

# Place the legend outside the plotting area, to the right.
ax.legend(title="Model", bbox_to_anchor=(1.05, 1), loc="upper left")
fig.tight_layout()

plt.show()

# Fine Tuning

In [None]:
from skllm.models.gpt.classification.tunable import GPTClassifier

# Settings for the tunable GPT classifier.
tuning_options = {
    "base_model": "gpt-3.5-turbo-0613",
    "n_epochs": None,  # int or None. When None, will be determined automatically by OpenAI
    "default_label": "Random",  # optional
}
clf = GPTClassifier(**tuning_options)

# Fit on the training subset (texts in X_llm, labels in y_llm), then predict
# the test subset.
clf.fit(X_llm, y_llm)
labels = clf.predict(X_test)

In [None]:
# Report the accuracy of the most recent predictions, rounded to two decimals.
fine_tuned_accuracy = accuracy_score(y_test, labels)
print(f"Accuracy: {fine_tuned_accuracy:.2f}")