In [1]:
import pandas as pd
import os

from dotenv import load_dotenv

# pandasai imports
from pandasai.llm.openai import OpenAI
from pandasai import SmartDataframe
 
from utils.pcap import pcap_to_dataframe, extract_streams



# Load Data

In [2]:
# Raw packet captures that serve as input (Mirai traffic and benign traffic).
malicious_pcap, benign_pcap = "./data/mirai.pcap", "./data/benign.pcapng"
# Pickle files used to cache the DataFrames derived from the captures above.
malicious_pkl, benign_pkl = "./data/mirai.pkl", "./data/benign.pkl"

In [3]:
# Locate a .env file and load its variables into the process environment.
load_dotenv()

# The API key is mandatory. It is read WITHOUT a placeholder default: a truthy
# placeholder such as "Key not found" would be handed to the LLM client as if
# it were a real token, hiding the misconfiguration until a request is made.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

# The organization keeps its original placeholder default so any later use of
# `openai_org` sees the same value as before.
# NOTE(review): `openai_org` is not referenced in the visible cells - confirm it is needed.
openai_org = os.getenv("OPENAI_ORG", "Organization not found")

In [4]:
# Toggle between loading the cached pickles (True) and rebuilding the
# DataFrames from the pcap files (False). On a first run, build your own
# pickles from the pcaps.
# Security: unpickling can execute arbitrary code, so only load pickle files
# that you created yourself - never untrusted ones.
READ_FROM_PKL = True

In [5]:
def _load_capture(pcap_path, pkl_path, use_cache=True):
    """Return the packet DataFrame for one capture, using a pickle cache.

    Args:
        pcap_path: Path of the packet capture to convert when no cache is used.
        pkl_path: Path of the pickle file that caches the converted DataFrame.
        use_cache: When True, load ``pkl_path`` if it exists; otherwise (or
            when the file is missing) convert the pcap and refresh the cache.

    Returns:
        The DataFrame read from the cache or produced by ``pcap_to_dataframe``.
    """
    # Fall back to conversion when the cache is missing, so a first run with
    # caching enabled does not crash with FileNotFoundError.
    if use_cache and os.path.exists(pkl_path):
        # SECURITY: unpickling can execute arbitrary code. Only load pickle
        # files that were generated locally by this notebook.
        return pd.read_pickle(pkl_path)

    capture_df = pcap_to_dataframe(pcap_path)
    # Save to pkl since the dataframe conversion takes a long time.
    capture_df.to_pickle(pkl_path)
    return capture_df


malicious_df = _load_capture(malicious_pcap, malicious_pkl, use_cache=READ_FROM_PKL)
benign_df = _load_capture(benign_pcap, benign_pkl, use_cache=READ_FROM_PKL)

In [6]:
# Create the OpenAI-backed LLM object, authenticated with the key read from
# the environment; it is passed to the SmartDataframe configs further down.
llm = OpenAI(
    api_token=openai_api_key,
)

In [7]:
# Columns that must all be present for a packet to be kept: rows with a NaN
# in any source/destination IP or port are dropped.
flow_key_columns = ["Source IP", "Destination IP", "Source Port", "Destination Port"]

# dropna returns new frames, so malicious_df / benign_df stay untouched.
malicious_features = malicious_df.dropna(subset=flow_key_columns)
benign_features = benign_df.dropna(subset=flow_key_columns)

# Generate Features

In [8]:
# Wrap the filtered Mirai capture in a PandasAI SmartDataframe. The name and
# description are included in the prompt sent to the LLM; verbose=True makes
# PandasAI log each pipeline step.
malicious_smart = SmartDataframe(
    name="Malicious botnet network packet capture.",
    description="A dataframe that is derived from a packet capture of the Mirai botnet network traffic.",
    df=pd.DataFrame(malicious_features),
    config=dict(llm=llm, verbose=True),
)

# Ask the LLM to copy the frame into df_features, generate features on it and
# return the result.
malicious_smart_features = malicious_smart.generate_features()

2024-05-13 20:37:13 [INFO] Question: 
1. Copy the dataframe to a new variable named df_features.
2. Do feature generation.
3. Return df_features.

2024-05-13 20:37:13 [INFO] Running PandasAI with openai LLM...
2024-05-13 20:37:13 [INFO] Prompt ID: 6b3e36fc-bc69-4e02-8fce-e86ab7464fc3
2024-05-13 20:37:13 [INFO] Executing Step 0: CacheLookup
2024-05-13 20:37:13 [INFO] Executing Step 1: PromptGeneration
2024-05-13 20:37:13 [INFO] Using prompt: <dataframe name="Malicious botnet network packet capture." description="A dataframe that is derived from a packet capture of the Mirai botnet network traffic.">
dfs[0]:154090x8
Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol
0084838184,192.168.2.115,192.168.2.1,2440.0,53.0,"DNS Qry ""b'xmpp.samsun...",83,17.0
3107002223,192.168.2.108,52.24.43.67,21074.0,80.0,Padding...,26,6.0
3711721340,192.168.2.108,52.25.66.250,20532.0,8280.0,Padding...,26,6.0
</dataframe>




Update this initial code:
```python
# TOD

In [9]:
# Wrap the filtered benign capture in a PandasAI SmartDataframe. The name and
# description label the frame for the LLM; verbose=True makes PandasAI log
# each pipeline step.
benign_smart = SmartDataframe(
    name="Benign activity network packet capture.",
    description="A dataframe that is derived from a packet capture of benign network traffic.",
    df=pd.DataFrame(benign_features),
    config=dict(llm=llm, verbose=True),
)

# Ask the LLM to copy the frame into df_features, generate features on it and
# return the result.
benign_smart_features = benign_smart.generate_features()

2024-05-13 20:37:17 [INFO] Question: 
1. Copy the dataframe to a new variable named df_features.
2. Do feature generation.
3. Return df_features.

2024-05-13 20:37:17 [INFO] Running PandasAI with openai LLM...
2024-05-13 20:37:17 [INFO] Prompt ID: 919869f7-455a-4c90-ad3d-8ad282a18dcb
2024-05-13 20:37:17 [INFO] Executing Step 0: CacheLookup
2024-05-13 20:37:17 [INFO] Using cached response
2024-05-13 20:37:17 [INFO] Executing Step 1: PromptGeneration
2024-05-13 20:37:17 [INFO] Executing Step 2: CodeGenerator
2024-05-13 20:37:17 [INFO] Executing Step 3: CachePopulation
2024-05-13 20:37:17 [INFO] Executing Step 4: CodeExecution
2024-05-13 20:37:17 [INFO] 
Code running:
```
df_features = dfs[0].copy()
df_features['Payload_Length'] = df_features['Payload'].apply(lambda x: len(x))
df_features['DNS_Query'] = df_features['Payload'].str.extract('DNS Qry "(.*?)"')
result = {'type': 'dataframe', 'value': df_features}
        ```
2024-05-13 20:37:17 [INFO] Executing Step 5: ResultValidation
2024-05

In [10]:
# Bare last expression: the notebook renders the feature-augmented Mirai frame
# returned by generate_features().
malicious_smart_features

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol,Payload_Length,DNS_Query
0,1540446382.933899,192.168.2.108,52.24.43.67,21074.0,80.0,Padding,26,6.0,7,
1,1540446382.933904,192.168.2.108,52.25.66.250,20532.0,8280.0,Padding,26,6.0,7,
7,1540446383.391651,192.168.2.115,192.168.2.1,2440.0,53.0,"DNS Qry ""b'xmpp.samsungsmartcam.com.Speedport_...",83,17.0,72,b'xmpp.samsungsmartcam.com.Speedport_W_724V_01...
8,1540446383.393709,192.168.2.1,192.168.2.115,53.0,2440.0,DNS Ans,83,17.0,8,
9,1540446383.435821,192.168.2.108,52.24.43.67,21074.0,80.0,Padding,26,6.0,7,
...,...,...,...,...,...,...,...,...,...,...
764121,1540453519.302461,192.168.2.108,61.188.37.216,32761.0,10240.0,Raw,48,17.0,3,
764122,1540453519.455618,192.168.2.109,8.8.8.8,35284.0,53.0,"DNS Qry ""b'time.windows.com.'""",42,17.0,31,b'time.windows.com.'
764124,1540453519.756754,192.168.2.108,52.24.43.67,47157.0,80.0,Padding,26,6.0,7,
764125,1540453519.756832,192.168.2.108,52.25.66.250,46453.0,8280.0,Padding,26,6.0,7,


In [11]:
# Bare last expression: the notebook renders the feature-augmented benign frame
# returned by generate_features().
benign_smart_features

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol,Payload_Length,DNS_Query
4,1355253962.658551,10.0.0.1,224.0.0.2,646.0,646.0,Raw,42,17.0,3,
5,1355253962.658551,10.0.0.1,224.0.0.2,646.0,646.0,Raw,42,17.0,3,
6,1355253962.845551,10.0.0.2,224.0.0.2,646.0,646.0,Raw,42,17.0,3,
7,1355253962.845551,10.0.0.2,224.0.0.2,646.0,646.0,Raw,42,17.0,3,
26,1355253966.932551,10.0.0.2,224.0.0.2,646.0,646.0,Raw,42,17.0,3,
...,...,...,...,...,...,...,...,...,...,...
38585,1671035938.151976,194.247.5.27,194.247.5.1,1812.0,10048.0,Radius,40,17.0,6,
38586,1671035948.674405,194.247.5.1,194.247.5.27,3860.0,1812.0,Radius,112,17.0,6,
38587,1671035948.67495,194.247.5.27,194.247.5.1,1812.0,3860.0,Radius,104,17.0,6,
38588,1671035961.263167,194.247.5.1,194.247.5.27,3373.0,1812.0,Radius,120,17.0,6,
