In [1]:
import pandas as pd
import os

from dotenv import load_dotenv

# pandasai imports
from pandasai.llm.openai import OpenAI
from pandasai import SmartDataframe
 
from utils.pcap import pcap_to_dataframe, extract_streams

# Load Data

In [2]:
malicious_pcap = "./data/mirai.pcap"
benign_pcap = "./data/benign.pcapng"
malicious_pkl = "./data/mirai.pkl"
benign_pkl = "./data/benign.pkl"

In [3]:
# finds .env file and loads the vars
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY", "Key not found")
openai_org = os.getenv("OPENAI_ORG", "Organization not found")

In [4]:
# first time you run this, you should create your own pkl
# for security reasons we do not recommend to use untrusted pkl files
READ_FROM_PKL = True

In [5]:
if READ_FROM_PKL:
    malicious_df = pd.read_pickle(malicious_pkl)
    benign_df = pd.read_pickle(benign_pkl)
else:
    malicious_df = pcap_to_dataframe(malicious_pcap)
    benign_df = pcap_to_dataframe(benign_pcap)
    # save to pkl since dataframe conversion takes a long time
    malicious_df.to_pickle(malicious_pkl)
    benign_df.to_pickle(benign_pkl)

In [6]:
# Instantiate a LLM
llm = OpenAI(api_token=openai_api_key)

In [7]:
# copy the dataframes to a features dataframe while omitting the packets with NaN src/dst ips/ports
malicious_features = malicious_df.dropna(
    subset=["Source IP", "Destination IP", "Source Port", "Destination Port"]
)
benign_features = benign_df.dropna(
    subset=["Source IP", "Destination IP", "Source Port", "Destination Port"]
)

# Generate Features

In [8]:
malicious_smart = SmartDataframe(
    df=pd.DataFrame(malicious_features),
    config={"llm": llm, "verbose": True},
    name="Malicious botnet network packet capture.",
    description="A dataframe that is derived from a packet capture of the Mirai botnet network traffic.",
)
malicious_smart_features = malicious_smart.generate_features()

2024-05-06 18:17:37 [INFO] Question: 
1. Copy the dataframe to a new variable named df_features.
2. Do feature generation.
3. Return df_features.

2024-05-06 18:17:38 [INFO] Running PandasAI with openai LLM...
2024-05-06 18:17:38 [INFO] Prompt ID: bbeb7beb-1dbb-4aaa-9568-4009efcd622e
2024-05-06 18:17:38 [INFO] Executing Step 0: CacheLookup
2024-05-06 18:17:38 [INFO] Using cached response
2024-05-06 18:17:38 [INFO] Executing Step 1: PromptGeneration
2024-05-06 18:17:38 [INFO] Executing Step 2: CodeGenerator
2024-05-06 18:17:38 [INFO] Executing Step 3: CachePopulation
2024-05-06 18:17:38 [INFO] Executing Step 4: CodeExecution
2024-05-06 18:17:38 [INFO] Saving charts to /home/drx/sandbox/cyberdata-mlai/exports/charts/temp_chart.png
2024-05-06 18:17:38 [INFO] 
Code running:
```
df_features = dfs[0].copy()
df_features['Hour'] = pd.to_datetime(df_features['Timestamp'], unit='ns').dt.hour
df_features['PacketPayloadRatio'] = df_features['Packet Length'] / df_features['Payload'].str.len()
resul

In [9]:
benign_smart = SmartDataframe(
    df=pd.DataFrame(benign_features),
    config={"llm": llm, "verbose": True},
    name="Benign activity network packet capture.",
    description="A dataframe that is derived from a packet capture of benign network traffic.",
)
benign_smart_features = benign_smart.generate_features()

2024-05-06 18:17:44 [INFO] Question: 
1. Copy the dataframe to a new variable named df_features.
2. Do feature generation.
3. Return df_features.

2024-05-06 18:17:44 [INFO] Running PandasAI with openai LLM...
2024-05-06 18:17:44 [INFO] Prompt ID: caec143a-78a5-4662-847a-a53626ef739b
2024-05-06 18:17:44 [INFO] Executing Step 0: CacheLookup
2024-05-06 18:17:44 [INFO] Using cached response
2024-05-06 18:17:44 [INFO] Executing Step 1: PromptGeneration
2024-05-06 18:17:44 [INFO] Executing Step 2: CodeGenerator
2024-05-06 18:17:44 [INFO] Executing Step 3: CachePopulation
2024-05-06 18:17:44 [INFO] Executing Step 4: CodeExecution
2024-05-06 18:17:44 [INFO] Saving charts to /home/drx/sandbox/cyberdata-mlai/exports/charts/temp_chart.png
2024-05-06 18:17:44 [INFO] 
Code running:
```
df_features = dfs[0].copy()
df_features['Hour'] = pd.to_datetime(df_features['Timestamp'], unit='ns').dt.hour
df_features['PacketPayloadRatio'] = df_features['Packet Length'] / df_features['Payload'].str.len()
resul

In [10]:
malicious_smart_features

"Unfortunately, I was not able to answer your question, because of the following error:\n\n<class 'scapy.utils.EDecimal'> is not convertible to datetime\n"

In [11]:
benign_smart_features

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol,Hour,PacketPayloadRatio
4,1355253962,10.0.0.1,224.0.0.2,646.0,646.0,Raw,42,17.0,0,14.000000
5,1355253962,10.0.0.1,224.0.0.2,646.0,646.0,Raw,42,17.0,0,14.000000
6,1355253962,10.0.0.2,224.0.0.2,646.0,646.0,Raw,42,17.0,0,14.000000
7,1355253962,10.0.0.2,224.0.0.2,646.0,646.0,Raw,42,17.0,0,14.000000
26,1355253966,10.0.0.2,224.0.0.2,646.0,646.0,Raw,42,17.0,0,14.000000
...,...,...,...,...,...,...,...,...,...,...
38585,1671035938,194.247.5.27,194.247.5.1,1812.0,10048.0,Radius,40,17.0,0,6.666667
38586,1671035948,194.247.5.1,194.247.5.27,3860.0,1812.0,Radius,112,17.0,0,18.666667
38587,1671035948,194.247.5.27,194.247.5.1,1812.0,3860.0,Radius,104,17.0,0,17.333333
38588,1671035961,194.247.5.1,194.247.5.27,3373.0,1812.0,Radius,120,17.0,0,20.000000
