In [None]:
import pandas as pd
import numpy as np
import sweetviz as sv
import ipaddress
import os

from dotenv import load_dotenv
from pandasai.llm.openai import OpenAI
from pandasai import PandasAI
from scapy.all import PcapReader, IP, TCP, UDP, ICMP
from scipy.stats import ttest_ind, kstest, norm, skew, kurtosis, zscore
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

from skimpy import skim
from summarytools import dfSummary
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [None]:
# Locate the .env file and load its variables into the process environment.
load_dotenv()
# NOTE(review): when OPENAI_API_KEY is unset, the placeholder string
# "Key not found" is stored here and is later handed to the OpenAI client
# as if it were a real key.
openai_api_key = os.getenv("OPENAI_API_KEY", "Key not found")

In [None]:
pcap_reader_mirai = PcapReader("data/mirai.pcap")
pcap_reader_benign = PcapReader("data/benign.pcapng")

# Preprocess

- convert data to streams
- collect some numbers

In [None]:
def pcap_to_dataframe(pcap_reader: PcapReader) -> pd.DataFrame:
    """Flatten a packet capture into a DataFrame with one row per packet.

    For every packet the IP layer (if any) supplies the addresses and protocol
    number. The first layer found among TCP, UDP and ICMP (in that order)
    supplies the payload and the length; only TCP and UDP supply ports. Packets
    with none of those layers fall back to the whole packet for payload and
    length.

    Args:
        pcap_reader (PcapReader): packet capture read using scapy

    Returns:
        pd.DataFrame: one row per packet with the columns Timestamp, Source IP,
            Destination IP, Source Port, Destination Port, Payload,
            Packet Length and Protocol. Fields that a packet does not carry
            are None.
    """
    column_names = [
        "Timestamp",
        "Source IP",
        "Destination IP",
        "Source Port",
        "Destination Port",
        "Payload",
        "Packet Length",
        "Protocol",
    ]
    rows = []

    for pkt in pcap_reader:
        # Addressing information only exists when an IP layer is present.
        src_ip = dst_ip = protocol = None
        if pkt.haslayer(IP):
            ip_layer = pkt[IP]
            src_ip, dst_ip, protocol = ip_layer.src, ip_layer.dst, ip_layer.proto

        # Pick the layer that payload/length are measured on; TCP wins over UDP.
        src_port = dst_port = None
        transport = next((layer for layer in (TCP, UDP) if pkt.haslayer(layer)), None)
        if transport is not None:
            segment = pkt[transport]
            src_port, dst_port = segment.sport, segment.dport
        elif pkt.haslayer(ICMP):
            # ICMP has no ports, so they stay None.
            segment = pkt[ICMP]
        else:
            # No recognised layer: describe the packet as a whole.
            segment = pkt

        payload = str(segment.payload)
        packet_len = len(segment)

        rows.append(
            [
                pkt.time,
                src_ip,
                dst_ip,
                src_port,
                dst_port,
                payload,
                packet_len,
                protocol,
            ]
        )

    return pd.DataFrame(rows, columns=column_names)

In [None]:
# mirai_df = pcap_to_dataframe(pcap_reader_mirai)
# benign_df = pcap_to_dataframe(pcap_reader_benign)

In [None]:
# mirai_df.to_pickle("../data/bsides_aug/mirai.pkl")
# benign_df.to_pickle("../data/bsides_aug/benign.pkl")

In [None]:
mirai_df = pd.read_pickle("data/mirai.pkl")
benign_df = pd.read_pickle("data/benign.pkl")

In [None]:
mirai_df

In [None]:
benign_df

## AI
Use pandas AI to clean up the data

In [None]:
# Instantiate a LLM
llm = OpenAI(api_token=openai_api_key)
pandas_ai = PandasAI(llm)

In [None]:
mirai_cleaned_df = pandas_ai.clean_data(mirai_df)

In [None]:
benign_clean_df = pandas_ai.clean_data(benign_df)

In [None]:
mirai_cleaned_df

In [None]:
benign_clean_df

## Extract streams
Extract streams by grouping packets on the tuple (src_ip, dst_ip, src_port, dst_port, protocol).

In [None]:
def extract_streams(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate per-packet rows into one row per stream.

    A stream is the set of packets sharing the same
    (Source IP, Destination IP, Source Port, Destination Port, Protocol).
    Rows where any of those key columns is missing are excluded, because
    ``DataFrame.groupby`` drops NaN/None keys by default.

    Args:
        df (pd.DataFrame): packet dataframe with the key columns above plus
            "Packet Length" and "Timestamp".

    Returns:
        pd.DataFrame: one row per stream with the key columns followed by
            "Number of Packets", "Total Length" (sum of "Packet Length") and
            "Duration" (last timestamp minus first, as float). If no stream is
            found the result is an empty dataframe that still has these
            columns.
    """
    key_columns = [
        "Source IP",
        "Destination IP",
        "Source Port",
        "Destination Port",
        "Protocol",
    ]
    stream_columns = key_columns + ["Number of Packets", "Total Length", "Duration"]

    # Collect plain records and build the dataframe once, instead of creating a
    # one-row dataframe per stream and concatenating them all at the end.
    records = []
    for (src_ip, dst_ip, src_port, dst_port, proto), group in df.groupby(key_columns):
        timestamps = group["Timestamp"]
        records.append(
            {
                "Source IP": src_ip,
                "Destination IP": dst_ip,
                "Source Port": src_port,
                "Destination Port": dst_port,
                "Protocol": proto,
                "Number of Packets": len(group),
                "Total Length": group["Packet Length"].sum(),
                "Duration": float(timestamps.max() - timestamps.min()),
            }
        )

    # Passing the columns explicitly keeps the schema stable for empty input
    # (pd.concat on an empty list would raise instead).
    return pd.DataFrame(records, columns=stream_columns)

In [None]:
# TODO: find why the timestamp is gone in the cleaned dataframes and use cleaned dataframes here
mirai_stream_df = extract_streams(mirai_df)
benign_stream_df = extract_streams(benign_df)

# EDA

Exploratory Data Analysis approaches the dataset as a black box that we need to visualize and analyze statistically with the following goals:
- get insights about our data
- test hypotheses
- decide on models and further processing, such as feature engineering.

EDA can be performed on both benign and malicious data. Here we look at EDA only for the malicious data; however, the same functions can be applied to the benign data.

## Descriptive statistics & data

- Describe columns and data types
- Descriptive statistics
  -  count, 
  -  mean, 
  -  standard deviation, 
  -  minimum, 
  -  25th percentile, 
  -  median (50th percentile), 
  -  75th percentile, and 
  -  maximum

In [None]:
# describe, summarize etc.
mirai_stream_df.columns

In [None]:
mirai_stream_df.dtypes

In [None]:
# descriptive statistics
mirai_stream_df.describe()

In [None]:
# correlation matrix for numerical values in dataframe
# Select the numeric columns first: the IP address columns hold strings, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
mirai_stream_df.select_dtypes(include="number").corr()

## Hypothesis testing

- Is the difference between two groups or variables statistically significant?
- Use t-test to compare means of two groups
  - assumes that data follows normal distribution
- Types of variables
  - dependent: the effect of a phenomenon. For example, whether a network is compromised may depend on the number of HTTP requests.
  - independent: the cause. For example, the number of HTTP requests affects whether a network is compromised.

In [None]:
def hypothesis_testing(df, col1, col2, alpha=0.05):
    """Compare two columns with an independent two-sample t-test.

    Args:
        df (pd.DataFrame): dataframe holding both columns.
        col1 (str): name of the first column (first sample).
        col2 (str): name of the second column (second sample).
        alpha (float): significance level the p-value is compared against.
            Defaults to 0.05.

    Returns:
        str: a sentence stating whether the difference between the two columns
            is statistically significant at the given level.
    """
    # ttest_ind returns (statistic, pvalue); only the p-value is needed here.
    pvalue = ttest_ind(df[col1], df[col2])[1]

    if pvalue < alpha:
        template = "The difference between {} and {} is statistically significant (p < {})"
    else:
        # Also reached when the p-value is NaN, since NaN < alpha is False.
        template = "The difference between {} and {} is not statistically significant (p >= {})"
    return template.format(col1, col2, alpha)

In [None]:
hypothesis_testing(mirai_stream_df, "Number of Packets", "Total Length")

## Outliers

- observation that significantly differs from others in a dataset
- Causes
  - measurement errors
  - extreme rare values
- significant impact in statistical analysis
- measurements
  - z-score: `(x - mean) / std_dev`
  - IQR method: this method identifies outliers as observations that are below `Q1 - 1.5 * IQR` or above `Q3 + 1.5 * IQR`, where Q1 and Q3 are the first and third quartiles, and IQR is the interquartile range (the difference between Q3 and Q1).
  - visual inspection

In [None]:
def detect_outliers_zscore(df, column, threshold=3):
    """Return the rows whose value in ``column`` is a z-score outlier.

    Args:
        df (pd.DataFrame): dataframe to inspect.
        column (str): name of the numeric column to score.
        threshold (float): a row is an outlier when the absolute z-score of
            its value is strictly greater than this. Defaults to 3.

    Returns:
        pd.DataFrame: the subset of ``df`` flagged as outliers.
    """
    # nan_policy="omit" keeps missing values out of the mean/std; with the
    # default ("propagate") a single NaN turns every z-score into NaN and no
    # row is ever flagged. Rows that are NaN themselves are never outliers.
    zscores = np.abs(zscore(df[column], nan_policy="omit"))
    return df[zscores > threshold]

In [None]:
outliers = detect_outliers_zscore(mirai_stream_df, "Total Length", threshold=3)
print(outliers)

## AI
Explore the data with AI

In [None]:
top_5_source_IPs = pandas_ai(
    mirai_df, prompt="Which are the 5 most popular source IP addresses?"
)
top_5_source_IPs

In [None]:
top_5_dst_ports = pandas_ai(
    mirai_df, prompt="Which are the 5 most popular destination ports?"
)
top_5_dst_ports

In [None]:
pandas_ai.run(
    mirai_stream_df,
    prompt="Plot the scatter plot of stream durations and number of packets.",
)

In [None]:
pandas_ai.run(benign_stream_df, prompt="Plot a barplot of top 10 destination ports.")

# Feature Engineering

## Numerical

In [None]:
# convert ip address to numeric values
def ip_to_numeric(ip):
    """Return the integer form of the network address of ``ip``.

    ``ip`` is parsed as an interface, so a bare address is treated as a
    single-host network (the result is the address itself), while an address
    with a prefix such as "10.0.0.5/8" yields the integer of its network
    address.
    """
    interface = ipaddress.ip_interface(ip)
    network = interface.network
    return int(network.network_address)

In [None]:
# convert IPs to numeric mirai data
mirai_stream_df["Source IP Numeric"] = mirai_stream_df["Source IP"].apply(ip_to_numeric)
mirai_stream_df["Destination IP Numeric"] = mirai_stream_df["Destination IP"].apply(
    ip_to_numeric
)

In [None]:
# convert IPs to numeric benign data
benign_stream_df["Source IP Numeric"] = benign_stream_df["Source IP"].apply(
    ip_to_numeric
)
benign_stream_df["Destination IP Numeric"] = benign_stream_df["Destination IP"].apply(
    ip_to_numeric
)

In [None]:
# get rid of non numeric columns for IPs
mirai_stream_df_numeric = mirai_stream_df.drop(columns=["Source IP", "Destination IP"])
benign_stream_df_numeric = benign_stream_df.drop(
    columns=["Source IP", "Destination IP"]
)

In [None]:
# convert duration from object to float
# The result is written to the *_numeric frames, which are the ones whose
# dtypes are checked next and that feed the rest of the notebook.
mirai_stream_df_numeric["Duration"] = mirai_stream_df_numeric["Duration"].astype(float)
benign_stream_df_numeric["Duration"] = benign_stream_df_numeric["Duration"].astype(float)

In [None]:
# check if all data types are numeric now
mirai_stream_df_numeric.dtypes

## AI
Generate features using `generate_features` from `pandas_ai`, either on the cleaned data or on a subset of the original data, because requests on the full dataset take too long.

In [None]:
pandas_ai.generate_features(mirai_cleaned_df)

In [None]:
pandas_ai.generate_features(benign_clean_df)

In [None]:
# Take the first 1000 rows by position. `.loc[:1000]` slices by index label and
# includes label 1000, so it depends on the index and yields 1001 rows.
mirai_subset_df = mirai_df.iloc[:1000]
benign_subset_df = benign_df.iloc[:1000]

In [None]:
pandas_ai.generate_features(mirai_subset_df)

In [None]:
pandas_ai.generate_features(benign_subset_df)

# Summaries & Visualizations

In [None]:
skim(mirai_stream_df_numeric)

In [None]:
skim(benign_stream_df_numeric)

In [None]:
dfSummary(mirai_stream_df_numeric)

In [None]:
dfSummary(benign_stream_df_numeric)

In [None]:
my_report = sv.analyze(mirai_stream_df_numeric)
my_report.show_html()

In [None]:
my_report = sv.analyze(benign_stream_df_numeric)
my_report.show_html()

# Labeling
We label and concatenate the benign and malicious data before one-hot encoding because each dataset contains different ports: encoding them separately would produce different columns, and the two dataframes could then not be concatenated cleanly.

In [None]:
# add labels, 0 for benign, 1 for malicious
mirai_stream_df_numeric["Labels"] = 1
benign_stream_df_numeric["Labels"] = 0

In [None]:
# concatenate dataframes
data_df = pd.concat(
    [mirai_stream_df_numeric, benign_stream_df_numeric], ignore_index=True
)

# Feature engineering (cont.) categorical 
## One-hot

In [None]:
def one_hot_encode(df, feature):
    """Append one-hot indicator columns for ``feature`` to a copy of ``df``.

    The indicator columns are named ``<feature>_<value>``. The original
    ``feature`` column is kept, and ``df`` itself is not modified.
    """
    return pd.concat(
        (df, pd.get_dummies(df[feature], prefix=feature)),
        axis="columns",
    )

In [None]:
data_df = one_hot_encode(data_df, "Source Port")
data_df = one_hot_encode(data_df, "Destination Port")

# Model Training
- models
  - xgboost
  - NN
  - k-NN
  - Random Forest
- k-fold cross validation

In [None]:
k = 5
kfold = KFold(n_splits=k, shuffle=True, random_state=1)

In [None]:
# randomize the data. TODO: is this needed
# (KFold above is already created with shuffle=True.)
from sklearn.utils import shuffle

# A fixed seed keeps the row order identical across Restart & Run All.
data_df = shuffle(data_df, random_state=1)

In [None]:
# after feature engineering, drop all the columns that are unnecessary
data_df = data_df.drop(["Source Port", "Destination Port"], axis=1)

In [None]:
# store the features and labels in separate dataframes
X = data_df.drop("Labels", axis=1)
y = data_df["Labels"]

In [None]:
# convert data to array
data_array = data_df.to_numpy()

In [None]:
model = 1
for train, test in kfold.split(data_array):
    print(f"Model {model}")
    print(f"train {train}, test: {test}")
    model += 1

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# "Labels" is a binary class (0 benign, 1 malicious), so a classifier is needed.
# A regressor's .score() reports R^2 rather than accuracy, and its continuous
# predictions cannot be scored with precision or recall.
knn_model = KNeighborsClassifier(n_neighbors=3)

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(max_depth=2, random_state=0)

In [None]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    n_estimators=2, max_depth=2, learning_rate=1, objective="binary:logistic"
)

In [None]:
def k_fold_model(model, X, y):
    """Fit and score ``model`` once per fold of the notebook-level ``kfold``.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        X: features, indexable by position through ``.iloc``.
        y: targets, indexable by position through ``.iloc``.

    Returns:
        list: the value of ``model.score`` on the held-out part of each fold,
            in fold order.
    """

    def _fit_and_score(train_idx, test_idx):
        # The same model object is refitted on every fold.
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        return model.score(X.iloc[test_idx], y.iloc[test_idx])

    return [_fit_and_score(train_idx, test_idx) for train_idx, test_idx in kfold.split(X)]

## AI 
Use an LLM for classification. Only payloads that consist of words (text) can be used.

In [None]:
# add string labels: "negative" for benign, "positive" for malicious
mirai_cleaned_df["Labels"] = "positive"
benign_clean_df["Labels"] = "negative"

In [None]:
mirai_payloads = mirai_cleaned_df[["Payload", "Labels"]]
benign_payloads = benign_clean_df[["Payload", "Labels"]]

In [None]:
payloads = pd.concat([mirai_payloads, benign_payloads], ignore_index=True)

In [None]:
# TODO: use sample
from sklearn.utils import shuffle

payloads = shuffle(payloads)

In [None]:
# use a subset because this takes loooong!
X = payloads["Payload"].head(100)
y = payloads["Labels"].head(100)

In [None]:
X.shape

In [None]:
from skllm.config import SKLLMConfig

SKLLMConfig.set_openai_key(openai_api_key)

# The organization ID is account-specific, so it is read from the environment
# (for example the same .env file that holds OPENAI_API_KEY) instead of being
# hard-coded in the notebook.
openai_org_id = os.getenv("OPENAI_ORG_ID")
if not openai_org_id:
    raise RuntimeError(
        "OPENAI_ORG_ID is not set; add it to the environment or the .env file"
    )
SKLLMConfig.set_openai_org(openai_org_id)

In [None]:
# TODO: DynamicFewShotGPTClassifier, FewShotGPTClassifier
from skllm import ZeroShotGPTClassifier

clf = ZeroShotGPTClassifier(openai_model="gpt-3.5-turbo")
clf.fit(X, y)
labels = clf.predict(X)

In [None]:
from sklearn.metrics import accuracy_score

print(f"Accuracy: {accuracy_score(y, labels):.2f}")

In [None]:
from skllm import FewShotGPTClassifier

clf = FewShotGPTClassifier(openai_model="gpt-3.5-turbo")
clf.fit(X, y)
labels = clf.predict(X)

In [None]:
from sklearn.metrics import accuracy_score

print(f"Accuracy: {accuracy_score(y, labels):.2f}")

In [None]:
from skllm import DynamicFewShotGPTClassifier

clf = DynamicFewShotGPTClassifier(n_examples=3)
clf.fit(X, y)
labels = clf.predict(X)

In [None]:
clf = DynamicFewShotGPTClassifier(n_examples=3)
clf.fit(X, y)
labels = clf.predict(X)

# Model Evaluation

## Accuracy

Accuracy is the proportion of correct predictions to the total number of predictions.

In [None]:
# X and y were reassigned to the payload-text subset in the LLM section, so
# rebuild the numeric feature matrix and labels before scoring the tabular models.
X = data_df.drop("Labels", axis=1)
y = data_df["Labels"]

# Calculate the average accuracy score across all folds
accuracy_scores = k_fold_model(knn_model, X, y)
average_accuracy = sum(accuracy_scores) / k
print("Average accuracy k-nn:", average_accuracy)

In [None]:
# Calculate the average accuracy score across all folds
accuracy_scores = k_fold_model(rf_model, X, y)
average_accuracy = sum(accuracy_scores) / k
print("Average accuracy random forest:", average_accuracy)

In [None]:
# Calculate the average accuracy score across all folds
accuracy_scores = k_fold_model(xgb_model, X, y)
average_accuracy = sum(accuracy_scores) / k
print("Average accuracy XGBoost:", average_accuracy)

## Precision

Precision measures the proportion of true positives to the total predicted positives.

In [None]:
def model_evaluation(model, X, y, k, scorer):
    """Cross-validate ``model`` and return its mean score for ``scorer``.

    Args:
        model: estimator to evaluate.
        X: input features.
        y: labels.
        k (int): number of cross-validation folds (passed as ``cv``).
        scorer: scoring callable (e.g. built with ``make_scorer``) or scoring
            name, passed as ``scoring``.

    Returns:
        float: the mean of the per-fold scores.
    """
    # The scorer has to be forwarded; otherwise the estimator's default score
    # is reported regardless of which metric the caller asked for.
    model_scores = cross_val_score(model, X, y, cv=k, scoring=scorer)

    # Average across all folds
    return model_scores.mean()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, precision_score, recall_score

# Assuming model is your trained model and X, y are the input features and labels respectively
# Set the scoring metric to precision
scorer = make_scorer(precision_score)

In [None]:
# TODO: how do we calculate precision for knn?
# print("Average Precision knn:", model_evaluation(knn_model, X, y, k, scorer))

In [None]:
# print("Average Precision random forest:", model_evaluation(rf_model, X, y, k, scorer))

In [None]:
print("Average Precision xgboost", model_evaluation(xgb_model, X, y, k, scorer))

## Recall

Recall (also known as sensitivity or true positive rate) measures the proportion of true positives to the total actual positives.

In [None]:
scorer = make_scorer(recall_score)

In [None]:
print("Average Recall knn:", model_evaluation(knn_model, X, y, k, scorer))

In [None]:
print("Average Recall random forest:", model_evaluation(rf_model, X, y, k, scorer))

In [None]:
print("Average Recall xgboost:", model_evaluation(xgb_model, X, y, k, scorer))

## Confusion matrix

A confusion matrix is a table that visualizes the performance of a classification model by showing the counts of true positive, true negative, false positive, and false negative predictions.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
def plot_confusion_matrix(model, X, y):
    """Fit ``model`` on each fold of the notebook-level ``kfold`` and plot the
    confusion matrix of its predictions on the held-out part.

    One heatmap is shown per fold.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: features, indexable by position through ``.iloc``.
        y: labels, indexable by position through ``.iloc``.
    """
    for train, test in kfold.split(X):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)

        # Create a heatmap of the confusion matrix.
        # fmt="d" prints the cell counts as plain integers; the default format
        # switches to scientific notation for larger counts.
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
        plt.xlabel("Predicted labels")
        plt.ylabel("True labels")
        plt.title("Confusion Matrix")
        plt.show()

In [None]:
plot_confusion_matrix(xgb_model, X, y)

## ROC curve

Use PandasAI to plot the ROC curve.