# Imports

In [68]:
import pandas as pd
import numpy as np
import os

from dotenv import load_dotenv
from itertools import combinations

# langchain imports
from langchain.llms import Ollama, HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain_google_vertexai import VertexAI

# traditional ML imports
from scipy.stats import ttest_ind, zscore
from summarytools import dfSummary

In [2]:
# finds .env file and loads the vars
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY", "Key not found")
openai_org = os.getenv("OPENAI_ORG", "Organization not found")

In [None]:
models = {
    "Gemini": VertexAI(model="gemini-2.0-flash-001"),
    "Palm": VertexAI(model="chat-bison"),
    "Mistral": Ollama(model="mistral"),
    "Gemma": Ollama(model="gemma"),
    "Llama": Ollama(model="llama2"),
    "Phi": Ollama(model="phi"),
    # TODO: Retry gpt-4
}

Non-Gemini models are deprecated. They will be remoced starting from Dec-01-2024. 


# Initialization

In [5]:
READ_FROM_PICKLE = True

In [6]:
if READ_FROM_PICKLE:
    malicious_df = pd.read_pickle("data/malicious_features_numeric.pkl")
    benign_df = pd.read_pickle("data/benign_features_numeric.pkl")

# EDA

## Statistical

### Exploration via prompting

In [20]:
def summarize_dataset(df):
    summary = f"The dataset contains {len(df)} rows and it contains a network packet capture that was generated using Wireshark in an enterprize network. "
    summary += f"Study the first 10 rows of the dataset and try to understand what it describes:\n {df.head(10)}"
    summary += f"Below are the summary statistics of the dataset\n {df.describe()}."
    # summary += f"The correlation of the features of the dataset is given below \n {df.corr()}."
    summary += f"A summary that includes statistics, histograms is given below \n {dfSummary(df)}." 
    summary += "Identify any anomalies in this time series dataset. Justify your conclusions based on known detections and security attacks." 
    return summary

In [21]:
data_summary = summarize_dataset(malicious_df)
prompt_template = PromptTemplate(
    input_variables=["data_summary"],
    template="Analyze this dataset and identify anomalies and trends:\n{data_summary}",
)

In [37]:
import time

results = {}

for model_name, model in models.items():
    print(f"Running analysis with {model_name}...")

    start_time = time.time()

    # Create chain and execute
    response = model.invoke(prompt_template.format(data_summary=data_summary))

    end_time = time.time()

    # Store results
    results[model_name] = {
        "response": response,
        "time_taken": round(end_time - start_time, 2),
    }

    print(f"\n{model_name} Response:\n{response}\n")
    print(f"Time Taken: {round(end_time - start_time, 2)} seconds\n")

Running analysis with Palm...

Palm Response:
 Based on the provided dataset, here are some observations and potential anomalies:

1. **DNS Queries**: There are several DNS queries observed in the dataset, including requests for "xmpp.samsungsmartcam.com", "time.windows.com", and "device-abnormal.eye4.cn". These DNS queries could indicate attempts to access internal network resources or external services, which should be further investigated to determine their legitimacy.

2. **High Packet Length**: A few packets have relatively high packet lengths compared to the majority of the dataset. For instance, packet number 16 has a length of 166

Time Taken: 2.32 seconds



### Hypothesis testing
- Is the difference between two groups or variables statistically significant?
- Use t-test to compare means of two groups
  - assumes that data follows normal distribution
- Types of variables
  - dependent: the effect of a phenomenon. For example, how does number of HTTP requests mean that a network is compromised?
  - independent: the cause. The number of HTTP requests affects whether a network is compromised.

In [53]:
malicious_df.pop("Payload")

0                                                   Padding
1                                                   Padding
7         DNS Qry "b'xmpp.samsungsmartcam.com.Speedport_...
8                                                  DNS Ans 
9                                                   Padding
                                ...                        
764121                                                  Raw
764122                      DNS Qry "b'time.windows.com.'" 
764124                                              Padding
764125                                              Padding
764136    DNS Qry "b'north-america.pool.ntp.org.Speedpor...
Name: Payload, Length: 154090, dtype: object

In [54]:
def hypothesis_testing(df, col1, col2):
    group1 = df[col1]
    group2 = df[col2]
    pvalue = ttest_ind(group1, group2)[1]
    if pvalue < 0.05:
        return "The difference between {} and {} is statistically significant (p < 0.05)".format(
            col1, col2
        )
    else:
        return "The difference between {} and {} is not statistically significant (p >= 0.05)".format(
            col1, col2
        )

In [55]:
def get_column_combinations(df):
    df_columns = df.columns.tolist()
    pairs = list(combinations(df_columns, 2))

    return pairs

In [62]:
def all_hypotheses(df):
    # perform hypothesis testing for all combinations of columns
    all_column_pairs = get_column_combinations(malicious_df)
    hypotheses = ""
    for pair in all_column_pairs:
        hypotheses += hypothesis_testing(df, pair[0], pair[1])
        # ask model to explain
    explain = f"Below there is all the hypothesis testing performed with ttest for all the possible combinations of the features of the dataset. Extract logical conclusions based on the hypotheses testings. Is there a difference between two groups of variables that is statistically significant? Can you conclude if there are dependent or independent variables in the dataset? \n ** Hypotheses Tests ** {hypotheses}"
    
    return explain 

In [63]:
hypotheses = all_hypotheses(malicious_df)
prompt_template = PromptTemplate(
    input_variables=["hypotheses"],
    template="Analyze this dataset based on the hypotheses tests and identify anomalies and trends:\n{hypotheses}",
)

In [64]:
import time

results = {}

for model_name, model in models.items():
    print(f"Running analysis with {model_name}...")

    start_time = time.time()

    # Create chain and execute
    response = model.invoke(prompt_template.format(hypotheses=hypotheses))

    end_time = time.time()

    # Store results
    results[model_name] = {
        "response": response,
        "time_taken": round(end_time - start_time, 2),
    }

    print(f"\n{model_name} Response:\n{response}\n")
    print(f"Time Taken: {round(end_time - start_time, 2)} seconds\n")

Running analysis with Gemini...

Gemini Response:
Okay, let's analyze these hypothesis test results and try to extract meaningful insights.

**Core Findings & Interpretations**

1.  **Interarrival Time Stands Out:** The most striking observation is that *Interarrival time* consistently shows *no statistically significant difference* when compared to any of the other variables. This is a key finding.

2.  **Ubiquitous Significant Differences:**  Almost every variable, *except Interarrival Time*, exhibits a statistically significant difference from every other variable.

**Logical Conclusions**

*   **Interarrival Time is Different:** The Interarrival time appears to be behaving differently from the other features in the dataset. This could indicate it's governed by a different process or is less correlated with the other network characteristics.
*   **Statistical Significance Doesn't Equal Practical Significance:** While nearly all pairs show a statistically significant difference, it's

### Outliers
- observation that significantly differs from others in a dataset
- Causes
  - measurement errors
  - extreme rare values
- significant impact in statistical analysis
- measurements
  - z-score: `(x - mean) / std_dev`
  - IQR method: this method identifies outliers as observations that are below `Q1 - 1.5IQR` or above `Q3 + 1.5IQR`, where Q1 and Q3 are the first and third quartiles, and IQR is the interquartile range (the difference between Q3 and Q1).
  - visual inspection

In [65]:
def detect_outliers_zscore(df, column, threshold=3):
    zscores = np.abs(zscore(df[column]))
    return df[zscores > threshold]

In [66]:
def all_outliers(df):
    outliers = ""
    for feature in df.columns:
        outliers += f"Outlier scores for {feature} are {detect_outliers_zscore(df, feature, threshold=3)}"
        # ask model to explain
    explain = f"Below there is all the outliers scores calculated using zscore, for all the possible combinations of the features of the dataset. Extract logical conclusions based on the outliers. Do you find any interesting observations that stand out in the dataset based on the outlier calculations? What are your conclusions? \n ** Outlier Scores ** {outliers}"

    return explain

In [69]:
outliers = all_outliers(malicious_df)
prompt_template = PromptTemplate(
    input_variables=["outliers"],
    template="Analyze this dataset based on the outlier calculations and identify anomalies and trends:\n{outliers}",
)

In [70]:
import time

results = {}

for model_name, model in models.items():
    print(f"Running analysis with {model_name}...")

    start_time = time.time()

    # Create chain and execute
    response = model.invoke(prompt_template.format(outliers=outliers))

    end_time = time.time()

    # Store results
    results[model_name] = {
        "response": response,
        "time_taken": round(end_time - start_time, 2),
    }

    print(f"\n{model_name} Response:\n{response}\n")
    print(f"Time Taken: {round(end_time - start_time, 2)} seconds\n")

Running analysis with Gemini...

Gemini Response:
Okay, let's analyze these outlier scores and see what we can conclude about the dataset.

**Summary of Outlier Detections**

*   **Timestamp:** No outliers detected.
*   **Source Port:** No outliers detected.
*   **Destination Port:** Outliers detected.
*   **Packet Length:** Outliers detected.
*   **Protocol:** No outliers detected.
*   **src\_ip\_total\_bytes:** No outliers detected.
*   **dst\_ip\_total\_bytes:** Outliers detected.
*   **Numeric Source IP:** Outliers detected.
*   **Numeric Destination IP:** No outliers detected.
*   **dst\_port\_freq\_encoded:** No outliers detected.
*   **Interarrival:** No outliers detected.

**Observations and Anomalies**

1.  **Destination Port Outliers:**
    *   A large number of outliers are detected based on the `Destination Port` feature.
    *   Many of these outliers share the same `Destination Port` (32100).
    *   These packets are primarily UDP (Protocol 17).
    *   There is a concen

## Visualizations

# Compare datasets