In [1]:
import pandas as pd
import os

from dotenv import load_dotenv

# pandasai imports
from pandasai.llm.openai import OpenAI
from pandasai import SmartDataframe
 
from utils.pcap import pcap_to_dataframe, extract_streams

In [2]:
# Raw packet captures that serve as input to the notebook.
malicious_pcap, benign_pcap = "./data/mirai.pcap", "./data/benign.pcapng"
# Pickle files used to cache the DataFrames built from those captures.
malicious_pkl, benign_pkl = "./data/mirai.pkl", "./data/benign.pkl"

In [3]:
# Locate a .env file (if present) and load its variables into the environment.
load_dotenv()

# Fail fast when the API key is missing. The previous fallback string
# "Key not found" is truthy, so it would be handed to the LLM client as if it
# were a real token and only surface later as an authentication failure.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in your .env file or environment."
    )

# The organization keeps its original placeholder default.
openai_org = os.getenv("OPENAI_ORG", "Organization not found")

In [4]:
# When True, the loading cell reads the cached pickle files instead of
# re-parsing the pcaps. On the first run, set this to False so that the
# pickles are generated locally from the packet captures.
# Security: unpickling can execute arbitrary code, so never load pkl files
# obtained from an untrusted source.
READ_FROM_PKL = True

In [6]:
# Load the packet DataFrames, preferring the pickle cache when it is enabled
# AND both cache files actually exist. Without the existence check a first run
# with READ_FROM_PKL = True dies with FileNotFoundError before any cache has
# been created.
# SECURITY: pd.read_pickle can execute arbitrary code while unpickling; only
# load pickle files that were generated locally by this notebook.
pickles_available = os.path.exists(malicious_pkl) and os.path.exists(benign_pkl)

if READ_FROM_PKL and pickles_available:
    malicious_df = pd.read_pickle(malicious_pkl)
    benign_df = pd.read_pickle(benign_pkl)
else:
    # Either the cache is disabled or it has not been built yet: parse the
    # raw captures instead.
    malicious_df = pcap_to_dataframe(malicious_pcap)
    benign_df = pcap_to_dataframe(benign_pcap)
    # save to pkl since dataframe conversion takes a long time
    malicious_df.to_pickle(malicious_pkl)
    benign_df.to_pickle(benign_pkl)

In [7]:
# Instantiate the LLM client (the OpenAI wrapper imported from pandasai),
# authenticating with the API key read from the environment above.
llm = OpenAI(api_token=openai_api_key)

In [8]:
# Build the feature frames: each one is its source frame minus every packet
# that has a NaN in any of the source/destination IP or port columns.
required_columns = ["Source IP", "Destination IP", "Source Port", "Destination Port"]
malicious_features, benign_features = (
    packets.dropna(subset=required_columns)
    for packets in (malicious_df, benign_df)
)

In [10]:
# Wrap the filtered Mirai packets in a PandasAI SmartDataframe and ask it to
# generate additional feature columns.
mirai_frame = pd.DataFrame(malicious_features)
mirai_config = {"llm": llm, "verbose": True}
mirai_smart = SmartDataframe(
    df=mirai_frame,
    config=mirai_config,
    name="Malicious botnet network packet capture.",
    description="A dataframe that is derived from a packet capture of the Mirai botnet network traffic.",
)
mirai_smart_features = mirai_smart.generate_features()

2024-04-29 09:36:15 [INFO] Question: 
1. Copy the dataframe to a new variable named df_features.
2. Do feature generation.
3. Return df_features.

2024-04-29 09:36:15 [INFO] Running PandasAI with openai LLM...
2024-04-29 09:36:15 [INFO] Prompt ID: 8ac62f0e-414e-48bd-b001-3b46e22b0a1b


2024-04-29 09:36:15 [INFO] Executing Step 0: CacheLookup
2024-04-29 09:36:15 [INFO] Executing Step 1: PromptGeneration
2024-04-29 09:36:15 [INFO] Using prompt: <dataframe name="Malicious botnet network packet capture." description="A dataframe that is derived from a packet capture of the Mirai botnet network traffic.">
dfs[0]:154090x8
Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol
4230242377,192.168.2.115,192.168.2.1,2440.0,53.0,"DNS Qry ""b'xmpp.samsun...",83,17.0
3441302119,192.168.2.108,52.25.66.250,20532.0,8280.0,Padding...,26,6.0
1074104148,192.168.2.108,52.24.43.67,21074.0,80.0,Padding...,26,6.0
</dataframe>




Update this initial code:
```python
# TODO: import the required dependencies
import pandas as pd

# Write code here

# Declare result var: type (possible values "string", "number", "dataframe", "plot"). Examples: { "type": "string", "value": f"The highest salary is {highest_salary}." } or { "type": "number", "value": 125 } o

In [11]:
# Wrap the filtered benign packets in a PandasAI SmartDataframe and ask it to
# generate additional feature columns.
benign_frame = pd.DataFrame(benign_features)
benign_config = {"llm": llm, "verbose": True}
benign_smart = SmartDataframe(
    df=benign_frame,
    config=benign_config,
    name="Benign activity network packet capture.",
    description="A dataframe that is derived from a packet capture of benign network traffic.",
)
benign_smart_features = benign_smart.generate_features()

2024-04-29 09:36:24 [INFO] Question: 
1. Copy the dataframe to a new variable named df_features.
2. Do feature generation.
3. Return df_features.

2024-04-29 09:36:24 [INFO] Running PandasAI with openai LLM...
2024-04-29 09:36:24 [INFO] Prompt ID: 36a4edfe-c788-4d26-bfc1-ae491e725cd2
2024-04-29 09:36:24 [INFO] Executing Step 0: CacheLookup
2024-04-29 09:36:24 [INFO] Using cached response
2024-04-29 09:36:24 [INFO] Executing Step 1: PromptGeneration
2024-04-29 09:36:24 [INFO] Executing Step 2: CodeGenerator
2024-04-29 09:36:24 [INFO] Executing Step 3: CachePopulation
2024-04-29 09:36:24 [INFO] Executing Step 4: CodeExecution
2024-04-29 09:36:24 [INFO] Saving charts to /home/drx/sandbox/cyberdata-mlai/exports/charts/temp_chart.png
2024-04-29 09:36:24 [INFO] 
Code running:
```
df_features = dfs[0].copy()
df_features['Payload Length'] = df_features['Payload'].apply(lambda x: len(x))
result = {'type': 'dataframe', 'value': df_features}
        ```
2024-04-29 09:36:24 [INFO] Executing Step 5

In [12]:
# Show the column labels of the frame returned by generate_features() for the
# Mirai capture.
mirai_smart_features.columns

Index(['Timestamp', 'Source IP', 'Destination IP', 'Source Port',
       'Destination Port', 'Payload', 'Packet Length', 'Protocol',
       'Payload Length'],
      dtype='object')

In [13]:
# Display the frame returned by generate_features() for the benign capture.
benign_smart_features

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol,Payload Length
4,1355253962.658551,10.0.0.1,224.0.0.2,646.0,646.0,Raw,42,17.0,3
5,1355253962.658551,10.0.0.1,224.0.0.2,646.0,646.0,Raw,42,17.0,3
6,1355253962.845551,10.0.0.2,224.0.0.2,646.0,646.0,Raw,42,17.0,3
7,1355253962.845551,10.0.0.2,224.0.0.2,646.0,646.0,Raw,42,17.0,3
26,1355253966.932551,10.0.0.2,224.0.0.2,646.0,646.0,Raw,42,17.0,3
...,...,...,...,...,...,...,...,...,...
38585,1671035938.151976,194.247.5.27,194.247.5.1,1812.0,10048.0,Radius,40,17.0,6
38586,1671035948.674405,194.247.5.1,194.247.5.27,3860.0,1812.0,Radius,112,17.0,6
38587,1671035948.67495,194.247.5.27,194.247.5.1,1812.0,3860.0,Radius,104,17.0,6
38588,1671035961.263167,194.247.5.1,194.247.5.27,3373.0,1812.0,Radius,120,17.0,6
