In [1]:
import os
import pandas as pd
import pickle
import json


from src.dataset.dataset_info import datasets, DatasetInfo
from src.dataset.clean_dataset import clean_dataset
from src.dataset.create_class_num_col import one_dataset_class_num_col
from src.dataset.features_analysis import feature_analysis_pipeline
from src.dataset.undersample_classes import undersample_classes
from src.graph.graph_measures import calculate_graph_measures
from local_variables import datasets_main_path


In [2]:
# Switches for the optional stages further down; each one guards an `if` cell.
with_sort_timestamp = False  # sort rows by the dataset's timestamp column (needs a timestamp column to be configured)
with_undersample_classes = False  # sub-sample the most populated classes before saving

# name = "cic_ids_2017_5_percent"
# original_path = "./testing_dfs/cic_ids_2017_5_percent.parquet"
# file_type = "parquet"

# name = "cic_ton_iot_5_percent"
# original_path = "./testing_dfs/cic_ton_iot_5_percent.parquet"
# file_type = "parquet"

# name = "cic_ton_iot"
# original_path = datasets_main_path + "cic_ton_iot.parquet"
# file_type = "parquet"

# name = "cic_ids_2017"
# original_path = datasets_main_path + "cic_ids_2017.parquet"
# file_type = "parquet"

# name = "cic_ton_iot_modified"
# original_path = datasets_main_path + "CIC-ToN-IoT-Modified.pkl"
# file_type = "pkl"

# Active dataset. `name` determines the output folder and file names,
# `original_path` is the raw input file, and `file_type` selects which pandas
# reader the loading cell uses ("parquet", "csv" or "pkl").
name = "nf_ton_iotv2_modified"
original_path = datasets_main_path + "NF-ToN-IoT-v2-Modified.pkl"
file_type = "pkl"


# name = "ccd_inid_modified"
# original_path = datasets_main_path + "CCD-INID-Modified.pkl"
# file_type = "pkl"

# name = "nf_uq_nids_modified"
# original_path = datasets_main_path + "NF-UQ-NIDS-Modified.pkl"
# file_type = "pkl"


# name = "nf_bot_iot"

# name = "edge_iiot"
# name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
# name = "nf_uq_nids"
# name = "x_iiot"

# Everything produced for this dataset is written under datasets/<name>/.
folder_path = os.path.join("datasets", name)
output_path = os.path.join(folder_path, name + ".parquet")

# makedirs (unlike mkdir) also creates the parent "datasets" directory on a
# fresh checkout, and exist_ok=True keeps the cell safe to re-run.
os.makedirs(folder_path, exist_ok=True)

print(f"==>> original_path: {original_path}")
print(f"==>> folder_path: {folder_path}")
print(f"==>> output_path: {output_path}")


==>> original_path: c:\Users\Administrateur\Desktop\datasets\NF-ToN-IoT-v2-Modified.pkl
==>> folder_path: datasets\nf_ton_iotv2_modified
==>> output_path: datasets\nf_ton_iotv2_modified\nf_ton_iotv2_modified.parquet


# Preparing Datasets

### Reading and Cleaning

In [3]:
# Load the raw dataset with the pandas reader that matches `file_type`.
if file_type == "parquet":
    df = pd.read_parquet(original_path)
elif file_type == "csv":
    df = pd.read_csv(original_path)
elif file_type == "pkl":
    # NOTE(security): unpickling can execute arbitrary code - only load
    # pickle files that come from a trusted source.
    df = pd.read_pickle(original_path, compression="zip")
else:
    # Fail here rather than with a confusing NameError on `df` in a later cell.
    raise ValueError(
        f"Unsupported file_type {file_type!r}; expected 'parquet', 'csv' or 'pkl'"
    )

In [4]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,192.168.1.193,49235,192.168.1.33,4444,6,0.0,155392,202,34552,149,...,45555,4805,0,0,0,0,0,0,1,ransomware
1,192.168.1.193,49228,192.168.1.152,1880,6,0.0,1600,40,35741,65,...,16425,237,0,0,0,0,0,0,0,Benign
2,192.168.1.152,0,192.168.1.193,0,1,0.0,212,2,0,0,...,0,0,771,3,0,0,0,0,0,Benign
3,192.168.1.169,65317,239.255.255.250,1900,17,0.0,165,1,0,0,...,0,0,0,0,0,0,0,0,0,Benign
4,192.168.1.79,60766,192.168.1.255,15600,17,0.0,63,1,0,0,...,0,0,0,0,0,0,0,0,0,Benign


In [5]:
# Format handed to pd.to_datetime by the optional timestamp-sorting cell.
timestamp_format = "mixed"
# timestamp_format="%d/%m/%Y %I:%M:%S %p"

In [6]:
# Column names of the loaded frame (used to fill in the column roles below).
df.columns

Index(['IPV4_SRC_ADDR', 'L4_SRC_PORT', 'IPV4_DST_ADDR', 'L4_DST_PORT',
       'PROTOCOL', 'L7_PROTO', 'IN_BYTES', 'IN_PKTS', 'OUT_BYTES', 'OUT_PKTS',
       'TCP_FLAGS', 'CLIENT_TCP_FLAGS', 'SERVER_TCP_FLAGS',
       'FLOW_DURATION_MILLISECONDS', 'DURATION_IN', 'DURATION_OUT', 'MIN_TTL',
       'MAX_TTL', 'LONGEST_FLOW_PKT', 'SHORTEST_FLOW_PKT', 'MIN_IP_PKT_LEN',
       'MAX_IP_PKT_LEN', 'SRC_TO_DST_SECOND_BYTES', 'DST_TO_SRC_SECOND_BYTES',
       'RETRANSMITTED_IN_BYTES', 'RETRANSMITTED_IN_PKTS',
       'RETRANSMITTED_OUT_BYTES', 'RETRANSMITTED_OUT_PKTS',
       'SRC_TO_DST_AVG_THROUGHPUT', 'DST_TO_SRC_AVG_THROUGHPUT',
       'NUM_PKTS_UP_TO_128_BYTES', 'NUM_PKTS_128_TO_256_BYTES',
       'NUM_PKTS_256_TO_512_BYTES', 'NUM_PKTS_512_TO_1024_BYTES',
       'NUM_PKTS_1024_TO_1514_BYTES', 'TCP_WIN_MAX_IN', 'TCP_WIN_MAX_OUT',
       'ICMP_TYPE', 'ICMP_IPV4_TYPE', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE',
       'DNS_TTL_ANSWER', 'FTP_COMMAND_RET_CODE', 'Label', 'Attack'],
      dtype='object')

In [7]:
# One dtype per column, in column order.
df.dtypes.tolist()

[dtype('O'),
 dtype('int64'),
 dtype('O'),
 dtype('int64'),
 dtype('int64'),
 dtype('float64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('float64'),
 dtype('float64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('O')]

In [8]:
# Column roles for the dataset selected in the configuration cell.
# `name`, `path` and `timestamp_format` are taken from the configuration
# variables so they cannot drift out of sync with it (the path previously
# pointed at a different dataset's parquet file).
# The variable keeps its original spelling because later cells reference it.
datesetInfo = DatasetInfo(
    name=name,
    path=output_path,  # the parquet file this notebook writes at the end
    file_type="parquet",
    src_ip_col="IPV4_SRC_ADDR",
    src_port_col="L4_SRC_PORT",
    dst_ip_col="IPV4_DST_ADDR",
    dst_port_col="L4_DST_PORT",
    flow_id_col=None,
    timestamp_col=None,
    label_col="Label",
    class_col="Attack",
    class_num_col="Class",
    timestamp_format=timestamp_format,
    # Endpoint identifiers and the textual class are excluded from feature analysis.
    drop_columns=["IPV4_SRC_ADDR", "L4_SRC_PORT", "IPV4_DST_ADDR", "L4_DST_PORT", "Attack"],
)

In [9]:
# Run the project's cleaning helper (src.dataset.clean_dataset); its exact
# rules are defined there and are not visible in this notebook.
df = clean_dataset(df)

In [10]:
def to_categorical_codes(column):
    """Return the integer category codes that ``pd.Categorical`` assigns to ``column``."""
    categorical = pd.Categorical(column)
    return categorical.codes
    
def _encode_categorical_columns(frame, columns):
    """Overwrite each listed column of ``frame`` with its categorical codes.

    Columns that are missing from ``frame`` are reported and skipped.
    """
    for col in columns:
        if col in frame.columns:
            frame[col] = to_categorical_codes(frame[col])
        else:
            print(f"Warning: Column '{col}' not found in the DataFrame")


# Dataset-specific preprocessing; only the branch matching `name` runs.
if name == 'x_iiot':
    _encode_categorical_columns(df, [
        'Protocol', 'Service', 'is_syn_only', 'Is_SYN_ACK',
        'is_pure_ack', 'is_with_payload', 'FIN or RST',
        'Bad_checksum', 'is_SYN_with_RST', 'anomaly_alert',
    ])

    # Binary label: 1 where class3 equals 'Attack', 0 otherwise.
    df['class3'] = (df['class3'] == 'Attack').astype(int)

    # Cells holding one of these placeholder strings are replaced by 0.
    # A single element-wise pass gives the same result as one pass per
    # placeholder, because a cell already replaced by 0 matches none of them.
    placeholder_values = ('-', '?', '#DIV/0!', 'excel', 'aza', ' ')
    # DataFrame.map superseded the deprecated DataFrame.applymap in pandas 2.1;
    # fall back to applymap on older versions.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    df = elementwise(df, lambda value: 0 if value in placeholder_values else value)

elif name == 'edge_iiot':
    _encode_categorical_columns(df, [
        'mqtt.protoname', 'mqtt.topic', 'mqtt.conack.flags', 'mqtt.msg',
        'http.request.method', 'dns.qry.name.len', 'arp.src.proto_ipv4',
        'http.request.full_uri', 'http.file_data', 'http.request.version',
        'arp.dst.proto_ipv4', 'http.request.uri.query', 'tcp.srcport', 'http.referer',
    ])

elif name == 'ccd_inid_modified':
    _encode_categorical_columns(df, [
        'splt_direction', 'splt_ps', 'splt_piat_ms', 'application_name',
        'application_category_name', 'requested_server_name', 'client_fingerprint',
    ])
    # Binary label: 1 where traffic_type equals 'attack', 0 otherwise.
    df['traffic_type'] = (df['traffic_type'] == 'attack').astype(int)

elif name == 'cic_ton_iot_modified':
    df = df.drop(columns=['datetime'])
    

In [11]:
# Cast the flow endpoints (source/destination IP and port) to strings,
# one column at a time.
for endpoint_col in (
    datesetInfo.src_ip_col,
    datesetInfo.src_port_col,
    datesetInfo.dst_ip_col,
    datesetInfo.dst_port_col,
):
    df[endpoint_col] = df[endpoint_col].apply(str)

In [12]:
# Summary statistics of the numeric columns after cleaning.
df.describe()

  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,SERVER_TCP_FLAGS,FLOW_DURATION_MILLISECONDS,...,NUM_PKTS_1024_TO_1514_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label
count,16929780.0,16929780.0,16929780.0,16929780.0,16929780.0,16929780.0,16929780.0,16929780.0,16929780.0,16929780.0,...,16929780.0,16929780.0,16929780.0,16929780.0,16929780.0,16929780.0,16929780.0,16929780.0,16929780.0,16929780.0
mean,7.606853,13.77945,725.2034,6.340294,817.4685,3.200351,12.80621,9.49449,11.03255,792912.0,...,0.5004085,10649.71,10370.47,423.2544,1.653336,4646.23,1.519483,4109.499,1.474099,0.6403487
std,3.897187,32.02872,143361.6,509.2211,94665.81,257.1494,11.28024,10.91053,11.53527,1666324.0,...,103.5057,13545.66,18019.71,4077.885,15.92924,13484.94,8.381744,47312.96,25.80524,0.4798982
min,1.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,0.0,44.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,...,0.0,1024.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6.0,0.0,52.0,1.0,40.0,1.0,17.0,2.0,0.0,0.0,...,0.0,1024.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,6.0,7.0,276.0,4.0,180.0,3.0,22.0,18.0,20.0,0.0,...,0.0,29200.0,28960.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,17.0,248.0,301926200.0,469281.0,164468800.0,410903.0,223.0,223.0,223.0,4294966.0,...,200984.0,65535.0,65535.0,65286.0,255.0,65535.0,255.0,6048000.0,553.0,1.0


In [13]:
# Run the project's feature analysis on everything except the configured
# drop columns; only the two collections of dropped features are kept
# (presumably selected by variance and by correlation - see the helper).
_, var_dropped, corr_dropped = feature_analysis_pipeline(
    df=df,
    drop_columns=datesetInfo.drop_columns,
    label_col=datesetInfo.label_col,
)
var_dropped, corr_dropped

  sqr = np.multiply(arr, arr, out=arr, where=where)


([],
 {'CLIENT_TCP_FLAGS',
  'DURATION_IN',
  'ICMP_TYPE',
  'IN_BYTES',
  'LONGEST_FLOW_PKT',
  'MAX_TTL',
  'MIN_TTL',
  'PROTOCOL',
  'RETRANSMITTED_OUT_BYTES',
  'TCP_FLAGS',
  'TCP_WIN_MAX_IN'})

### Attack Types

In [14]:
# Distinct values of the class column (the attack categories plus the benign class).
df[datesetInfo.class_col].unique()

array(['ransomware', 'Benign', 'xss', 'scanning', 'password', 'dos',
       'ddos', 'injection', 'mitm', 'backdoor'], dtype=object)

In [15]:
# df[class_col] = df[class_col].replace({"BENIGN": "Benign",
#                                        "DDoS": "ddos",
#                                        "Web Attack � Brute Force": "bruteforce",
#                                        "Web Attack � XSS": "xss"})

In [16]:
# Keep the distinct class names; they are pickled next to the label encoding later on.
classes = df[datesetInfo.class_col].unique()

### Sorting (optional)

In [17]:
# Optional chronological ordering: runs only when the flag is set and the
# dataset declares a timestamp column.
if with_sort_timestamp and datesetInfo.timestamp_col:
    ts_col = datesetInfo.timestamp_col
    # Strip surrounding whitespace before parsing the timestamps.
    df[ts_col] = pd.to_datetime(df[ts_col].str.strip(), format=timestamp_format)
    df = df.sort_values(ts_col)

### Encoding Attacks into integers

In [18]:
# Project helper: presumably derives the integer `class_num_col` column from
# `class_col` (confirm in src.dataset.create_class_num_col). It returns the
# updated frame and `labels_names`, the class-number -> class-name mapping
# printed below.
df, labels_names = one_dataset_class_num_col(df, datesetInfo.class_num_col, datesetInfo.class_col)

==>> labels_names: {np.int64(0): np.str_('Benign'), np.int64(1): np.str_('backdoor'), np.int64(2): np.str_('ddos'), np.int64(3): np.str_('dos'), np.int64(4): np.str_('injection'), np.int64(5): np.str_('mitm'), np.int64(6): np.str_('password'), np.int64(7): np.str_('ransomware'), np.int64(8): np.str_('scanning'), np.int64(9): np.str_('xss')}


### Undersampling classes (optional)

In [19]:
if with_undersample_classes:
    # Records per class before undersampling, largest class first.
    class_counts = df.groupby(by=datesetInfo.class_col).size()
    class_counts_sorted = class_counts.sort_values(ascending=False)

    print(f"==>> class_counts_sorted: {class_counts_sorted}")

In [20]:
if with_undersample_classes:
    # Tunables for the undersampling step.
    N_CLASSES_TO_UNDERSAMPLE = 2   # how many of the largest classes are reduced
    UNDERSAMPLE_KEEP_FRACTION = 0.5  # fraction of rows kept in each reduced class
    UNDERSAMPLE_SEED = 42          # fixed seed so a re-run yields the same dataset

    # `class_counts_sorted` comes from the previous cell (largest class first).
    classes_to_undersample = class_counts_sorted.index[:N_CLASSES_TO_UNDERSAMPLE]

    # Collect one frame per class, sub-sampling only the largest classes.
    dfs = []
    for class_label in class_counts_sorted.index:
        print(f"==>> class_label: {class_label}")
        class_df = df[df[datesetInfo.class_col] == class_label]
        if class_label in classes_to_undersample:
            class_df = class_df.sample(
                frac=UNDERSAMPLE_KEEP_FRACTION, random_state=UNDERSAMPLE_SEED
            )
        dfs.append(class_df)

    # Drop the reference to the full frame before concatenating, so its memory
    # can be reclaimed; the per-class frames in `dfs` are what gets combined.
    df = None

    # Recombine and shuffle the rows, then renumber the index.
    df = (
        pd.concat(dfs)
        .sample(frac=1, random_state=UNDERSAMPLE_SEED)
        .reset_index(drop=True)
    )


In [21]:
if with_undersample_classes:
    # Records per class after undersampling, largest class first.
    class_counts = df.groupby(by=datesetInfo.class_col).size()
    class_counts_sorted = class_counts.sort_values(ascending=False)

    print(f"==>> class_counts_sorted: {class_counts_sorted}")

### Saving label encodings and dataset properties

In [22]:
# Persist the class-number -> class-name mapping together with the distinct
# class names, next to the dataset. os.path.join keeps the path construction
# consistent with how folder_path/output_path are built.
labels_names_path = os.path.join(folder_path, 'labels_names.pkl')
with open(labels_names_path, 'wb') as f:
    pickle.dump([labels_names, classes], f)

In [23]:
# Summarise the processed dataset and store the summary as JSON next to it.
total_count = len(df)

# Use the configured label/class columns rather than hard-coded names, so the
# cell also works for datasets whose columns are named differently.
# Boolean sums count rows without materialising filtered copies of the frame;
# int() converts the numpy integers, which json.dump cannot serialise.
num_benign = int((df[datesetInfo.label_col] == 0).sum())
num_attack = int((df[datesetInfo.label_col] == 1).sum())

properties = {
    "name": name,
    "length": total_count,
    "num_benign": num_benign,
    # Guard against an empty frame to avoid a ZeroDivisionError.
    "percentage_of_benign_records": (num_benign * 100) / total_count if total_count else 0.0,
    "num_attack": num_attack,
    "percentage_of_attack_records": (num_attack * 100) / total_count if total_count else 0.0,
    # tolist() yields plain Python values, which are JSON-serialisable.
    "attacks": df[datesetInfo.class_col].unique().tolist(),
}

with open(os.path.join(folder_path, 'df_properties.json'), 'w') as f:
    json.dump(properties, f)

properties


{'name': 'nf_ton_iotv2_modified',
 'length': 16929777,
 'num_benign': 6088816,
 'percentage_of_benign_records': 35.96512818804406,
 'num_attack': 10840961,
 'percentage_of_attack_records': 64.03487181195594,
 'attacks': ['ransomware',
  'Benign',
  'xss',
  'scanning',
  'password',
  'dos',
  'ddos',
  'injection',
  'mitm',
  'backdoor']}

In [24]:
# graphs_properties_path = folder_path + '/graphs_properties.json'
# G = nx.from_pandas_edgelist(
#     df,
#     source=src_ip_col,
#     target=dst_ip_col,
#     create_using=nx.DiGraph()
# )
# calculate_graph_measures(G, graphs_properties_path)

In [25]:
# Write the processed frame to datasets/<name>/<name>.parquet.
df.to_parquet(output_path)

In [26]:
# Preview the saved frame, which now also carries the integer class column.
df.head()

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack,Class
0,192.168.1.193,49235,192.168.1.33,4444,6,0.0,155392,202,34552,149,...,4805,0,0,0,0,0,0,1,ransomware,7
1,192.168.1.193,49228,192.168.1.152,1880,6,0.0,1600,40,35741,65,...,237,0,0,0,0,0,0,0,Benign,0
2,192.168.1.152,0,192.168.1.193,0,1,0.0,212,2,0,0,...,0,771,3,0,0,0,0,0,Benign,0
3,192.168.1.169,65317,239.255.255.250,1900,17,0.0,165,1,0,0,...,0,0,0,0,0,0,0,0,Benign,0
4,192.168.1.79,60766,192.168.1.255,15600,17,0.0,63,1,0,0,...,0,0,0,0,0,0,0,0,Benign,0
