In [None]:
import json
import os
import pickle

import pandas as pd

from src.dataset.dataset_info import DatasetInfo
from src.dataset.dataset_utils import (
    clean_dataset,
    one_dataset_class_num_col,
    undersample_classes,
)
from src.dataset.features_analysis import feature_analysis_pipeline

In [None]:
# Location of the original dataset files. The value is used as a plain string
# prefix for the file names in the next cell, so include a trailing path
# separator (or leave it empty to read from the current working directory).
datasets_main_path = "" # where the original dataset files are found

In [None]:
# Optional pipeline stages, both switched off by default:
# - with_sort_timestamp: parse the timestamp column and sort the rows by it.
# - with_undersample_classes: keep only half of the rows of the two most
#   frequent classes.
with_sort_timestamp = False
with_undersample_classes = False

# name = "cic_ids_2017_5_percent"
# original_path = "./testing_dfs/cic_ids_2017_5_percent.parquet"
# file_type = "parquet"

# name = "cic_ton_iot_5_percent"
# original_path = "./testing_dfs/cic_ton_iot_5_percent.parquet"
# file_type = "parquet"

# name = "cic_ton_iot"
# original_path = datasets_main_path + "cic_ton_iot.parquet"
# file_type = "parquet"

# name = "cic_ids_2017"
# original_path = datasets_main_path + "cic_ids_2017.parquet"
# file_type = "parquet"

# name = "cic_bot_iot"
# original_path = datasets_main_path + "cic_bot_iot.parquet"
# file_type = "parquet"

# name = "cic_ton_iot_modified"
# original_path = datasets_main_path + "CIC-ToN-IoT-Modified.pkl"
# file_type = "pkl"

# name = "nf_ton_iotv2_modified"
# original_path = datasets_main_path + "NF-ToN-IoT-v2-Modified.pkl"
# file_type = "pkl"

# name = "ccd_inid_modified"
# original_path = datasets_main_path + "CCD-INID-Modified.pkl"
# file_type = "pkl"

# name = "nf_uq_nids_modified"
# original_path = datasets_main_path + "NF-UQ-NIDS-Modified.pkl"
# file_type = "pkl"

# name = "edge_iiot"
# original_path = datasets_main_path + "edge_iiot.pkl"
# file_type = "pkl"

# name = "nf_cse_cic_ids2018"
# original_path = datasets_main_path + "NF-CSE-CIC-IDS2018.csv"
# file_type = "csv"

# name = "nf_bot_iotv2"
# original_path = datasets_main_path + "NF-BoT-IoT-v2.csv"
# file_type = "csv"

# name = "nf_uq_nids"
# original_path = datasets_main_path + "NF-UQ-NIDS.csv"
# file_type = "csv"

# Active dataset: exactly one name / original_path / file_type triple must be
# left uncommented in this cell.
name = "x_iiot"
# os.path.join works whether or not datasets_main_path ends with a separator
# (and returns the bare file name when the prefix is empty).
original_path = os.path.join(datasets_main_path, "X-IIoTID dataset.csv")
file_type = "csv"

# Outputs of this notebook for the dataset are written below datasets/<name>/.
folder_path = os.path.join("datasets", name)
output_path = os.path.join(folder_path, name + ".parquet")

# makedirs also creates the parent "datasets" directory when it is missing
# (os.mkdir would raise FileNotFoundError) and is a no-op if the folder exists.
os.makedirs(folder_path, exist_ok=True)

print(f"==>> original_path: {original_path}")
print(f"==>> folder_path: {folder_path}")
print(f"==>> output_path: {output_path}")

# Preparing Datasets

### Reading and Cleaning

In [None]:
# Load the raw dataset with the pandas reader that matches file_type.
if file_type == "parquet":
    df = pd.read_parquet(original_path)
elif file_type == "csv":
    df = pd.read_csv(original_path)
elif file_type == "pkl":
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(original_path)
    # df = pd.read_pickle(original_path, compression="zip")
else:
    # Fail here rather than with a NameError (or a stale df from a previous
    # run) in a later cell.
    raise ValueError(f"Unsupported file_type: {file_type!r}")

In [4]:
pd.options.display.max_columns = df.shape[1]


In [None]:
df.head()


In [6]:
# Format string handed to pd.to_datetime in the optional sorting step;
# "mixed" asks pandas to infer the format of each value individually.
timestamp_format = "mixed"
# timestamp_format="%d/%m/%Y %I:%M:%S %p"


In [None]:
df.columns


In [None]:
list(df.dtypes)

In [None]:
df.describe(include="all")

In [10]:
# Column roles for the dataset being prepared.
# NOTE(review): every value here is specific to x_iiot; it has to be edited
# together with the `name` selection when another dataset is processed.
datesetInfo = DatasetInfo(
    name="x_iiot",
    path="./datasets/x_iiot/x_iiot.parquet",
    file_type="parquet",
    # Flow endpoints (cast to strings further down in the notebook).
    src_ip_col="Scr_IP",
    src_port_col="Scr_port",
    dst_ip_col="Des_IP",
    dst_port_col="Des_port",
    flow_id_col=None,
    # Time information.
    timestamp_col="Timestamp",
    timestamp_format="mixed",
    # Targets: class3 is binarised to 0/1 in the x_iiot preprocessing cell.
    label_col="class3",
    class_col="class2",
    class_num_col="Class",
    # Handed to feature_analysis_pipeline as drop_columns.
    drop_columns=[
        "Scr_IP",
        "Scr_port",
        "Des_IP",
        "Des_port",
        "Timestamp",
        "class1",
        "class2",
    ],
)

In [None]:
df.shape


In [12]:
# import numpy as np
# df.replace([np.inf, -np.inf], np.nan, inplace=True)


In [None]:
df.describe()


In [None]:
df.isna().sum()


In [None]:
def to_categorical_codes(column):
    """Encode the values of ``column`` as integer category codes.

    The values are wrapped in a ``pd.Categorical`` and its ``codes`` array is
    returned: each distinct value is replaced by its position in the inferred
    categories (sorted when the values are orderable) and missing values are
    encoded as -1.
    """
    categorical = pd.Categorical(column)
    return categorical.codes


if name == 'x_iiot':
    # Columns that keep their original values; every other object-typed
    # column is turned into integer category codes below.
    object_columns_to_keep = ['Timestamp', 'Scr_IP', 'Scr_port',
                              'Des_IP', 'Des_port', 'class1', 'class2', 'class3']

    df.drop(['Date'], axis=1, inplace=True)

    # The timestamps are parsed as epoch seconds. They are cast to numbers
    # explicitly because passing strings together with `unit` is deprecated in
    # pandas >= 2.1. astype(str) keeps cells that were already read as numbers:
    # `.str.strip()` alone would turn them into NaN and the rows would be lost.
    timestamp_seconds = pd.to_numeric(
        df['Timestamp'].astype(str).str.strip(), errors='coerce')
    df['Timestamp'] = pd.to_datetime(
        timestamp_seconds, unit='s', errors='coerce')
    # Rows whose timestamp could not be parsed are discarded.
    df.dropna(subset=['Timestamp'], inplace=True)

    # Encode all remaining object columns as integer category codes.
    for col in df.columns:
        if df[col].dtype == 'object' and col not in object_columns_to_keep:
            df[col] = to_categorical_codes(df[col])

    # Binary label: 1 for 'Attack', 0 for everything else.
    df['class3'] = (df['class3'] == 'Attack').astype(int)

    # Replace placeholder / junk cell values with 0 in a single pass instead of
    # six element-wise applymap sweeps over the whole frame.
    # NOTE(review): the object columns outside object_columns_to_keep were
    # already encoded above, so this only changes the kept columns - confirm
    # that the placeholders are not meant to be zeroed before the encoding.
    placeholder_values = ['-', '?', '#DIV/0!', 'excel', 'aza', ' ']
    df = df.replace(placeholder_values, 0)

if name == 'edge_iiot':
    # Drop the 'tcp.options' and 'tcp.payload' columns.
    df.drop(columns=['tcp.options', 'tcp.payload'], inplace=True)

    # Columns that are replaced by integer category codes.
    columns_to_convert = [
        'mqtt.protoname', 'mqtt.topic', 'mqtt.conack.flags', 'mqtt.msg',
        'http.request.method', 'dns.qry.name.len', 'arp.src.proto_ipv4',
        'http.request.full_uri', 'http.file_data', 'http.request.version',
        'arp.dst.proto_ipv4', 'http.request.uri.query', 'tcp.srcport',
        'http.referer',
    ]

    for col in columns_to_convert:
        if col not in df.columns:
            # A missing column is reported but does not stop the notebook.
            print(f"Warning: Column '{col}' not found in the DataFrame")
            continue
        df[col] = to_categorical_codes(df[col])


if name == 'ccd_inid_modified':
    # Drop the unwanted columns first: two of them ('requested_server_name',
    # 'client_fingerprint') used to be category-encoded and then thrown away.
    # Dropping up front also fails before the frame is partially modified if
    # one of these columns is missing.
    df.drop(['requested_server_name', 'client_fingerprint',
             'server_fingerprint', 'user_agent', 'content_type'], axis=1, inplace=True)

    # Remaining columns that are replaced by integer category codes.
    columns_to_convert = ['splt_direction', 'splt_ps', 'splt_piat_ms', 'application_name',
                          'application_category_name']

    for col in columns_to_convert:
        if col in df.columns:
            df[col] = to_categorical_codes(df[col])
        else:
            print(f"Warning: Column '{col}' not found in the DataFrame")

    # Binary label: 1 for 'attack', 0 for everything else.
    df['traffic_type'] = (df['traffic_type'] == 'attack').astype(int)

if name == 'cic_ton_iot_modified':
    # The 'datetime' column is removed for this dataset.
    df.drop(columns=['datetime'], inplace=True)

In [None]:
# Run the project's cleaning helper, telling it which columns hold the flow id
# and the timestamp (see src.dataset.dataset_utils.clean_dataset for details).
df = clean_dataset(
    df,
    flow_id_col=datesetInfo.flow_id_col,
    timestamp_col=datesetInfo.timestamp_col,
)

In [17]:
# Store the flow endpoints as strings. The IP columns are always converted;
# the port columns only when the dataset defines them.
endpoint_columns = [datesetInfo.src_ip_col]
if datesetInfo.src_port_col:
    endpoint_columns.append(datesetInfo.src_port_col)
endpoint_columns.append(datesetInfo.dst_ip_col)
if datesetInfo.dst_port_col:
    endpoint_columns.append(datesetInfo.dst_port_col)

for endpoint_col in endpoint_columns:
    df[endpoint_col] = df[endpoint_col].apply(str)

In [None]:
df.head()


In [None]:
df.describe()


In [None]:
# Run the project's feature analysis; only the two lists of dropped columns
# are kept, the first returned value is ignored.
_, var_dropped, corr_dropped = feature_analysis_pipeline(
    df=df,
    label_col=datesetInfo.label_col,
    drop_columns=datesetInfo.drop_columns,
)
(var_dropped, corr_dropped)

In [None]:
print(f"==>> type(var_dropped): {type(var_dropped)}")
print(f"==>> type(corr_dropped): {type(corr_dropped)}")


In [None]:
# Every column reported by either list of the feature analysis
# (presumably the variance- and correlation-based drops - confirm).
var_dropped = set(var_dropped)
weak_columns = var_dropped | set(corr_dropped)
weak_columns


### Attack Types

In [None]:
df[datesetInfo.class_col].unique()


In [24]:
# df[class_col] = df[class_col].replace({"BENIGN": "Benign",
#                                        "DDoS": "ddos",
#                                        "Web Attack � Brute Force": "bruteforce",
#                                        "Web Attack � XSS": "xss"})


In [25]:
# Distinct values of the class column, captured before the integer encoding
# step; they are pickled together with labels_names further down.
classes = df[datesetInfo.class_col].unique()


### Sorting (optional)

In [26]:
if with_sort_timestamp and datesetInfo.timestamp_col:
    sort_col = datesetInfo.timestamp_col
    # Only parse when the column is not datetime yet: the dataset-specific
    # preprocessing may already have converted it (it does for x_iiot), and
    # the .str accessor raises AttributeError on a datetime column.
    if not pd.api.types.is_datetime64_any_dtype(df[sort_col]):
        df[sort_col] = pd.to_datetime(
            df[sort_col].str.strip(), format=timestamp_format)
    # Order the rows chronologically.
    df.sort_values(sort_col, inplace=True)

### Encoding attack classes as integers

In [None]:
# Project helper that takes the frame plus the names of the numeric class
# column and the class column; it returns the updated frame and labels_names.
df, labels_names = one_dataset_class_num_col(
    df,
    datesetInfo.class_num_col,
    datesetInfo.class_col,
)

In [None]:
df.groupby(datesetInfo.class_col).size()


### Undersampling classes (optional)

In [29]:
if with_undersample_classes:
    # Number of records per class ...
    class_counts = df.groupby(by=datesetInfo.class_col).size()

    # ... ordered from the most to the least frequent class.
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")


In [30]:
if with_undersample_classes:
    # Tunable parameters of the undersampling step.
    undersample_top_k = 2          # how many of the largest classes to shrink
    undersample_keep_frac = 0.5    # fraction of their rows that is kept
    undersample_seed = 42          # fixed seed so the saved dataset is reproducible

    # class_counts_sorted comes from the previous cell (largest class first).
    classes_to_undersample = class_counts_sorted.index[:undersample_top_k]

    # Collect one frame per class, sampling only the largest classes.
    dfs = []
    for class_label in class_counts_sorted.index:
        print(f"==>> class_label: {class_label}")
        class_df = df[df[datesetInfo.class_col] == class_label]
        if class_label in classes_to_undersample:
            class_df = class_df.sample(
                frac=undersample_keep_frac, random_state=undersample_seed)
        dfs.append(class_df)

    # Drop the reference to the full frame before concatenating, presumably
    # so it can be garbage-collected and peak memory stays lower.
    df = []
    # Recombine the classes and shuffle the rows (seeded for reproducibility).
    df = pd.concat(dfs).sample(
        frac=1, random_state=undersample_seed).reset_index(drop=True)

In [31]:
if with_undersample_classes:
    # Recount the records per class on the undersampled frame ...
    class_counts = df.groupby(by=datesetInfo.class_col).size()

    # ... again ordered from the most to the least frequent class.
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")


### Saving label encodings and dataset properties

In [32]:
# Pickle the label names returned by the encoding step together with the
# class values captured before it, as a two-element list.
labels_file = folder_path + '/labels_names.pkl'
with open(labels_file, 'wb') as f:
    pickle.dump([labels_names, classes], f)


In [None]:
df[datesetInfo.label_col].unique()


In [None]:
# Summarise the processed dataset and store the summary next to it as JSON.
total_count = len(df)

# Count by summing boolean masks instead of building filtered copies of the
# frame; int() turns the numpy integers into JSON-serialisable Python ints.
num_benign = int((df[datesetInfo.label_col] == 0).sum())
num_attack = int((df[datesetInfo.label_col] == 1).sum())

properties = {
    "name": name,
    "length": total_count,
    "num_benign": num_benign,
    # An empty frame yields 0.0 instead of a ZeroDivisionError.
    "percentage_of_benign_records": ((num_benign * 100)/total_count) if total_count else 0.0,
    "num_attack": num_attack,
    "percentage_of_attack_records": ((num_attack * 100)/total_count) if total_count else 0.0,
    # tolist() converts numpy scalars to plain Python values for json.dump.
    "attacks": df[datesetInfo.class_col].unique().tolist(),
    # weak_columns is a set; sorting makes the file identical across runs.
    "weak_columns": sorted(weak_columns, key=str),
}

with open(folder_path + '/df_properties.json', 'w') as f:
    json.dump(properties, f)

properties

In [35]:
# graphs_properties_path = folder_path + '/graphs_properties.json'
# G = nx.from_pandas_edgelist(
#     df,
#     source=src_ip_col,
#     target=dst_ip_col,
#     create_using=nx.DiGraph()
# )
# calculate_graph_measures(G, graphs_properties_path)


In [36]:
# Write the processed frame to output_path (datasets/<name>/<name>.parquet).
df.to_parquet(output_path)


In [None]:
df.shape


In [None]:
df.head()


In [None]:
df.describe(include="all")


In [None]:
df.dtypes


In [None]:
list(df.dtypes)


In [None]:
for column, dtype in df.dtypes.items():
    print(f"Column '{column}': {dtype}")
