# Imports

In [None]:
import time
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

from dotenv import load_dotenv
from itertools import combinations

# langchain imports
from langchain.llms import Ollama, HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain_google_vertexai import VertexAI

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams, LLMTestCase


# traditional ML imports
from scipy.stats import ttest_ind, zscore
from summarytools import dfSummary

In [None]:
# Look for a .env file and merge its variables into the process environment.
load_dotenv()

# Read the OpenAI credentials; a readable placeholder string is used when a
# variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY", "Key not found")
openai_org = os.environ.get("OPENAI_ORG", "Organization not found")

In [None]:
# Registry of the LLM backends exercised by the analysis loops below, keyed by
# the display name used in printed output and in the per-model result dicts.
models = {
    "Gemini": VertexAI(model="gemini-2.0-flash-001"),
    "Palm": VertexAI(model="chat-bison"),
    "Mistral": Ollama(model="mistral"),
    "Gemma": Ollama(model="gemma"),
    # TODO: check if llama3 will be fast enough and substitute?
    # NOTE(review): the entry below already uses llama3, so the TODO above may
    # be stale — confirm and remove it if so.
    "Llama": Ollama(model="llama3"),
    "Phi": Ollama(model="phi"),
}

# Initialization

In [None]:
# Switch that gates loading the feature DataFrames from pickle files.
READ_FROM_PICKLE = True

In [None]:
# Load the malicious and benign feature frames from their pickled copies.
# NOTE(review): nothing visible here defines these frames when
# READ_FROM_PICKLE is False, so later cells would hit a NameError in that
# case — confirm whether an alternative loader exists.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
if READ_FROM_PICKLE:
    malicious_df = pd.read_pickle("data/malicious_features_numeric.pkl")
    benign_df = pd.read_pickle("data/benign_features_numeric.pkl")

# EDA

## Statistical

In [None]:
def summarize_dataset(df, summary_path="data/df_summary_mal.txt"):
    """Build the free-text dataset description that is sent to the LLMs.

    The text combines the row count, the first 100 rows, the ``describe()``
    statistics, the contents of a pre-generated summary file, and a closing
    instruction asking for anomalies.

    Args:
        df: DataFrame to describe.
        summary_path: Text file whose contents are embedded in the prompt.
            Defaults to the malicious-dataset summary, which is what this
            function always read before the parameter existed; pass a
            different file when describing another dataset.

    Returns:
        The assembled prompt text as a single string.

    Raises:
        OSError: If ``summary_path`` cannot be opened.
    """
    with open(summary_path, "r") as f:
        file_summary = f.read()

    sections = [
        # The intro ends with a space, so the next sentence follows it directly.
        f"The dataset contains {len(df)} rows and it contains a network packet capture that was generated using Wireshark in an enterprize network. "
        f"Study the first 100 rows of the dataset and try to understand what it describes:\n {df.head(100)}",
        f"Below are the summary statistics of the dataset\n {df.describe()}.",
        f"A summary that includes statistics, histograms is given below \n {file_summary}.",
        "Identify any anomalies in this time series dataset. Justify your conclusions based on known detections and security attacks.",
    ]
    # Separate the sections with newlines; previously each table ran straight
    # into the first word of the following sentence.
    return "\n".join(sections)

In [None]:
# Render the summarytools overview of the malicious dataset (bare expression,
# so the result is shown as the cell output when run in a notebook).
dfSummary(malicious_df)

In [None]:
# Build the textual description of the malicious dataset and the prompt
# template that wraps it for the statistical analysis run.
# NOTE: `prompt_template` is reassigned by later cells, so the loop that uses
# it must run before those cells are executed.
malicious_data_summary = summarize_dataset(malicious_df)
prompt_template = PromptTemplate(
    input_variables=["malicious_data_summary"],
    template="Analyze this dataset and identify anomalies and trends:\n{malicious_data_summary}",
)

In [None]:
# Echo the assembled summary text for inspection (bare expression; shown as
# the cell output when run in a notebook).
malicious_data_summary

In [None]:
# Ask every configured model to analyse the dataset summary, recording each
# response together with how long the call took.
statistical_results = {}

# The prompt is the same for every model, so render it once, outside the
# timed region.
rendered_prompt = prompt_template.format(malicious_data_summary=malicious_data_summary)

for model_name, model in models.items():
    print(f"Running analysis with {model_name}...")

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    response = model.invoke(rendered_prompt)
    time_taken = round(time.perf_counter() - started, 2)

    # Store results
    statistical_results[model_name] = {
        "response": response,
        "time_taken": time_taken,
    }

    print(f"\n{model_name} Response:\n{response}\n")
    print(f"Time Taken: {time_taken} seconds\n")

### Hypothesis testing
- Is the difference between two groups or variables statistically significant?
- Use t-test to compare means of two groups
  - assumes that the data follow a normal distribution
- Types of variables
  - dependent: the effect of a phenomenon. For example, whether a network is compromised is the outcome we want to explain.
  - independent: the cause. For example, the number of HTTP requests may affect whether a network is compromised.

In [None]:
# Remove the "Payload" column in place; pop() mutates malicious_df and returns
# the removed column.
# NOTE(review): re-running this cell raises KeyError because the column is
# already gone. Presumably the column is dropped so the t-tests and z-scores
# below only see numeric columns — confirm.
malicious_df.pop("Payload")

In [None]:
def hypothesis_testing(df, col1, col2):
    """Run an independent two-sample t-test between two columns of ``df``.

    Args:
        df: DataFrame holding both columns.
        col1: Label of the first column.
        col2: Label of the second column.

    Returns:
        A one-sentence verdict: significant when the p-value is below 0.05,
        not significant when it is 0.05 or above, and "could not be tested"
        when the p-value is NaN.
    """
    pvalue = ttest_ind(df[col1], df[col2])[1]

    # A NaN p-value (e.g. when a column contains NaN) compares False against
    # 0.05, so it used to be reported as "not statistically significant",
    # which the test never established. Report it as undetermined instead.
    if np.isnan(pvalue):
        return "The difference between {} and {} could not be tested (p-value is undefined)".format(
            col1, col2
        )

    if pvalue < 0.05:
        return "The difference between {} and {} is statistically significant (p < 0.05)".format(
            col1, col2
        )
    return "The difference between {} and {} is not statistically significant (p >= 0.05)".format(
        col1, col2
    )

In [None]:
def get_column_combinations(df):
    """Return every unordered pair of column labels of ``df`` as a list of 2-tuples.

    Pairs follow the column order of the frame; a frame with fewer than two
    columns yields an empty list.
    """
    return list(combinations(df.columns.tolist(), 2))

In [None]:
def all_hypotheses(df):
    """Run a t-test on every pair of columns of ``df`` and wrap the verdicts in an LLM prompt.

    Args:
        df: DataFrame whose columns are compared pairwise.

    Returns:
        Prompt text: instructions for the model followed by one verdict
        sentence per column pair, one per line.
    """
    # Pair up the columns of the frame that was passed in. This used to read
    # the global malicious_df, so calling the function with any other frame
    # tested the wrong (or non-existent) column names.
    verdicts = [
        hypothesis_testing(df, first, second)
        for first, second in get_column_combinations(df)
    ]
    # One verdict per line; they were previously concatenated with no
    # separator, so consecutive sentences ran together.
    hypotheses = "\n".join(verdicts)

    # ask model to explain
    explain = f"Below there is all the hypothesis testing performed with ttest for all the possible combinations of the features of the dataset. Extract logical conclusions based on the hypotheses testings. Is there a difference between two groups of variables that is statistically significant? Can you conclude if there are dependent or independent variables in the dataset? \n ** Hypotheses Tests ** {hypotheses}"

    return explain

In [None]:
# Build the hypothesis-test report for the malicious dataset and the prompt
# template that wraps it.
# NOTE: this rebinds `prompt_template`, replacing the template defined by the
# earlier cell.
hypotheses = all_hypotheses(malicious_df)
prompt_template = PromptTemplate(
    input_variables=["hypotheses"],
    template="Analyze this dataset based on the hypotheses tests and identify anomalies and trends:\n{hypotheses}",
)

In [None]:
# Ask every configured model to interpret the hypothesis-test report,
# recording each response together with how long the call took.
hypotheses_results = {}

# The prompt is the same for every model, so render it once, outside the
# timed region.
rendered_prompt = prompt_template.format(hypotheses=hypotheses)

for model_name, model in models.items():
    print(f"Running analysis with {model_name}...")

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    response = model.invoke(rendered_prompt)
    time_taken = round(time.perf_counter() - started, 2)

    # Store results
    hypotheses_results[model_name] = {
        "response": response,
        "time_taken": time_taken,
    }

    print(f"\n{model_name} Response:\n{response}\n")
    print(f"Time Taken: {time_taken} seconds\n")

In [None]:
# Echo the hypothesis-test prompt text for inspection (bare expression; shown
# as the cell output when run in a notebook).
hypotheses

### Outliers
- An outlier is an observation that differs significantly from the others in a dataset
- Causes
  - measurement errors
  - extremely rare values
- Outliers can have a significant impact on statistical analysis
- Detection methods
  - z-score: `(x - mean) / std_dev`; observations whose absolute z-score exceeds a threshold (3 in the code below) are flagged
  - IQR method: this method identifies outliers as observations that are below `Q1 - 1.5 * IQR` or above `Q3 + 1.5 * IQR`, where Q1 and Q3 are the first and third quartiles, and IQR is the interquartile range (the difference between Q3 and Q1).
  - visual inspection

In [None]:
def detect_outliers_zscore(df, column, threshold=3):
    """Return the rows of ``df`` whose ``column`` value has an absolute z-score above ``threshold``."""
    abs_scores = np.abs(zscore(df[column]))
    is_outlier = abs_scores > threshold
    return df[is_outlier]

In [None]:
def all_outliers(df):
    """Collect the z-score outlier rows of every column of ``df`` and wrap them in an LLM prompt.

    Args:
        df: DataFrame whose columns are scanned one by one.

    Returns:
        Prompt text: instructions for the model followed by one entry per
        column containing the rows flagged as outliers for that column.
    """
    entries = [
        f"Outlier scores for {feature} are {detect_outliers_zscore(df, feature, threshold=3)}"
        for feature in df.columns
    ]
    # Put each column's entry on its own line; they were previously
    # concatenated with no separator, so one table ran straight into the next
    # entry's first word.
    outliers = "\n".join(entries)

    # ask model to explain
    explain = f"Below there is all the outliers scores calculated using zscore, for all the possible combinations of the features of the dataset. Extract logical conclusions based on the outliers. Do you find any interesting observations that stand out in the dataset based on the outlier calculations? What are your conclusions? \n ** Outlier Scores ** {outliers}"

    return explain

In [None]:
# Build the outlier report for the malicious dataset and the prompt template
# that wraps it.
# NOTE: this rebinds `prompt_template`, replacing the template defined by the
# earlier cell.
outliers = all_outliers(malicious_df)
prompt_template = PromptTemplate(
    input_variables=["outliers"],
    template="Analyze this dataset based on the outlier calculations and identify anomalies and trends:\n{outliers}",
)

In [None]:
# Ask every configured model to interpret the outlier report, recording each
# response together with how long the call took.
outliers_results = {}

# The prompt is the same for every model, so render it once, outside the
# timed region.
rendered_prompt = prompt_template.format(outliers=outliers)

for model_name, model in models.items():
    print(f"Running analysis with {model_name}...")

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    response = model.invoke(rendered_prompt)
    time_taken = round(time.perf_counter() - started, 2)

    # Store results
    outliers_results[model_name] = {
        "response": response,
        "time_taken": time_taken,
    }

    print(f"\n{model_name} Response:\n{response}\n")
    print(f"Time Taken: {time_taken} seconds\n")

In [None]:
# Echo the outlier prompt text for inspection (bare expression; shown as the
# cell output when run in a notebook).
outliers

## Visualizations

In [None]:
# Build a dataset summary and a prompt template asking the models to write a
# Matplotlib/Seaborn plotting script for it.
# NOTE: when cells run in file order, "Payload" has already been popped from
# malicious_df, so this summary can differ from malicious_data_summary.
# NOTE: this rebinds `prompt_template`, replacing the template defined by the
# earlier cell.
eda_summary = summarize_dataset(malicious_df)    
prompt_template = PromptTemplate(
    input_variables=["eda_summary"],
    template=""" Given the following dataset summary statistics: 
    ### Summary statistics
    {eda_summary}
    Generate a Python script using Matplotlib and Seaborn to visualize:
    1. A time series plot of value over time,
    2. Highlight anomalies in the dataset,
    3. Suggest interesting trends
    """,
)

In [None]:
# Ask every configured model to generate a visualization script, recording
# each response together with how long the call took.
model_visualizations = {}

# The prompt is the same for every model, so render it once, outside the
# timed region.
rendered_prompt = prompt_template.format(eda_summary=eda_summary)

for model_name, model in models.items():
    print(f"Running analysis with {model_name}...")

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    response = model.invoke(rendered_prompt)
    time_taken = round(time.perf_counter() - started, 2)

    # Store results
    model_visualizations[model_name] = {
        "response": response,
        "time_taken": time_taken,
    }

    print(f"\n{model_name} Response:\n{response}\n")
    print(f"Time Taken: {time_taken} seconds\n")

# Compare datasets

In [None]:
# Summarize the benign dataset and build the prompt template that asks the
# models to compare it with the malicious dataset.
# NOTE(review): summarize_dataset embeds the contents of
# data/df_summary_mal.txt unless told otherwise, so this benign summary
# carries the malicious dataset's summary text — confirm whether a
# benign-specific summary file should be used here.
# NOTE: this rebinds `prompt_template`, replacing the template defined by the
# earlier cell.
benign_data_summary = summarize_dataset(benign_df)
prompt_template = PromptTemplate(
    input_variables=["malicious_data_summary", "benign_data_summary"],
    template="Analyze the two input datasets that were taken by a network tap using wireshark, based on the summaries given below: \n Dataset1 Summary: {malicious_data_summary}\n Dataset2 Summary: {benign_data_summary}.\n What are the differences, what are the similarities between the two datasets? Can you identify if any of the two datasets exhibits malicious or benign behavior? Why? Give any other interesting observations that you extract from the two datasets.",
)

In [None]:
# Ask every configured model to compare the two dataset summaries, recording
# each response together with how long the call took.
comparison_results = {}

# The prompt is the same for every model, so render it once, outside the
# timed region.
rendered_prompt = prompt_template.format(
    malicious_data_summary=malicious_data_summary,
    benign_data_summary=benign_data_summary,
)

for model_name, model in models.items():
    print(f"Running analysis with {model_name}...")

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    response = model.invoke(rendered_prompt)
    time_taken = round(time.perf_counter() - started, 2)

    # Store results
    comparison_results[model_name] = {
        "response": response,
        "time_taken": time_taken,
    }

    print(f"\n{model_name} Response:\n{response}\n")
    print(f"Time Taken: {time_taken} seconds\n")

# Evaluate EDA with LLMs

In [None]:
# G-Eval metric used by the scoring cells below to grade each model response
# against a reference answer; the judge is given the input, the actual output
# and the expected output of every test case.
correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    # NOTE(review): this call passes both `criteria` and `evaluation_steps`,
    # which contradicts the note above — confirm which one deepeval actually
    # uses and drop the other.
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "You should give more points to the responses that are based on the logical analysis of numerical results.",
        "You should penalize responses that are lacking detailed explanations",
        "You should penalize responses that are contradictory to the ground truth",
        "Different numbers in the responses are OK, however contradicting opinions are not OK.",
    ],
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
)

In [None]:
# Reference answer passed as `expected_output` when scoring the responses to
# the statistical-summary prompt (presumably analyst-written — confirm).
summary_human_output = """1. The source ports seem to have some unusually high frequencies in higher values and the destination ports exhibit high variability. This may indicate scanning.
2. The payload indicates dns queries to domains that resemble IoT devices such as broadlink routers.
3. The packet lengths show an interesting distribution of some unusually high values that may point to exfiltration or malware download. 
4. There is unusually high activity in UDP protocol that may be suspicious of unusual services.
5. There is also some unusual high frequency in destination port 23 and 2323 that point to telnet and IoT telnet.
6. Finally the interarrival is unusually short, and that may indicate Denial of Service attack."""

In [None]:
# Reference answer passed as `expected_output` when scoring the responses to
# the hypothesis-testing prompt (presumably analyst-written — confirm).
hypotheses_human_output = """ Based on the hypothesis testing results:
1. All pairs of variables seem unrelated to statistically significant differences.
2. Logically, variables that should correlate are the bytes in/out and interarrival but there seems not to be any correlation.
3. The results are inconclusive and additional analysis may be needed.
"""

In [None]:
# Reference answer passed as `expected_output` when scoring the responses to
# the outlier prompt (presumably analyst-written — confirm).
outliers_human_output = """ Based on the outliers calculations we reach the following conclusions:
1. There are some destination port outliers that may indicate abnormal malicious behavior.
2. There are outliers in packet size that again may point to unusual activity.
"""

In [None]:
# Score every model's statistical-summary response against the reference
# answer and keep the score per model.
statistics_correctness = {}
for llm_name, llm_result in statistical_results.items():
    test_case = LLMTestCase(
        input=malicious_data_summary,
        actual_output=llm_result["response"],
        expected_output=summary_human_output,
    )

    # measure() stores its verdict on the metric object; read the score and
    # reason before the next iteration overwrites them.
    correctness_metric.measure(test_case)

    print(f"Model {llm_name} response correctness: {correctness_metric.score}")
    print(correctness_metric.reason)
    statistics_correctness[llm_name] = correctness_metric.score

In [None]:
# Score every model's hypothesis-testing response against the reference
# answer and keep the score per model.
hypothesis_correctness = {}
for llm_name, llm_result in hypotheses_results.items():
    test_case = LLMTestCase(
        # These responses were produced from the hypothesis-test report, not
        # from the dataset summary, so that report is the input the judge
        # must see (the metric's evaluation_params include INPUT).
        input=hypotheses,
        actual_output=llm_result["response"],
        expected_output=hypotheses_human_output,
    )

    # measure() stores its verdict on the metric object; read the score and
    # reason before the next iteration overwrites them.
    correctness_metric.measure(test_case)
    print(f"Model {llm_name} response correctness: {correctness_metric.score}")
    print(correctness_metric.reason)
    hypothesis_correctness[llm_name] = correctness_metric.score

In [None]:
# Score every model's outlier-analysis response against the reference answer
# and keep the score per model.
outlier_correctness = {}
for llm_name, llm_result in outliers_results.items():
    test_case = LLMTestCase(
        # These responses were produced from the outlier report, not from the
        # dataset summary, so that report is the input the judge must see
        # (the metric's evaluation_params include INPUT).
        input=outliers,
        actual_output=llm_result["response"],
        expected_output=outliers_human_output,
    )

    # measure() stores its verdict on the metric object; read the score and
    # reason before the next iteration overwrites them.
    correctness_metric.measure(test_case)
    print(f"Model {llm_name} response correctness: {correctness_metric.score}")
    print(correctness_metric.reason)
    outlier_correctness[llm_name] = correctness_metric.score

In [None]:
# Gather the three correctness scores of each model into a list of
# single-key dicts shaped as {model name: {metric name: score}}.
all_scores = []
for model_name, score in statistics_correctness.items():
    all_scores.append(
        {
            model_name: {
                "statistics_correctness": score,
                "hypothesis_correctness": hypothesis_correctness[model_name],
                "outlier_correctness": outlier_correctness[model_name],
            }
        }
    )

all_scores

In [None]:
# Flatten the per-model score dicts into one row per (model, metric) pair.
long_rows = []
for entry in all_scores:
    # Each entry holds a single model name mapped to its metric scores.
    model_label, metric_scores = next(iter(entry.items()))
    for metric_name, metric_value in metric_scores.items():
        long_rows.append(
            {"Model": model_label, "Metric": metric_name, "Score": metric_value}
        )

all_scores_df_long = pd.DataFrame(long_rows)
all_scores_df_long

In [None]:
# Grouped bar chart: one group per metric, one bar per model in each group.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=all_scores_df_long, x="Metric", y="Score", hue="Model", ax=ax)

# Titles, axis labels and tick orientation.
ax.set_title("Model Scores Across Different Metrics", fontsize=16)
ax.set_xlabel("Metrics", fontsize=12)
ax.set_ylabel("Score", fontsize=12)
plt.xticks(rotation=45)

# Place the legend outside the plotting area, then tighten the layout.
ax.legend(title="Model", bbox_to_anchor=(1.05, 1), loc="upper left")
fig.tight_layout()

# Render the figure.
plt.show()