# Angleichung der Spalten beider Datensätze

In [1]:
import os
import pandas as pd
import numpy as np
import logging
import dask.dataframe as dd
import dask.array as da
from dask.distributed import Client
from dask_ml.preprocessing import Categorizer, StandardScaler, LabelEncoder

In [2]:
# Show every row and every column when a DataFrame is displayed
pd.set_option('display.max_rows', None)  # rows
pd.set_option('display.max_columns', None)  # columns

# Logging configuration: INFO and above are written, with a timestamp,
# to a log file in the current working directory
logging.basicConfig(
    filename='21_preprocessing_0.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Directories holding the Parquet files of the two datasets
# (relative to the notebook's working directory)
parquet_verzeichnis_ids17 = '../01_Datensaetze/improved_cic-ids-2017/ids17_parquet'
parquet_verzeichnis_ids18 = '../01_Datensaetze/improved_cse-cic-ids-2018/ids18_parquet'

In [3]:
# Dask Client starten
client = Client()  # Dask Client starten
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 7
Total threads: 28,Total memory: 39.17 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:41969,Workers: 7
Dashboard: http://127.0.0.1:8787/status,Total threads: 28
Started: Just now,Total memory: 39.17 GiB

0,1
Comm: tcp://127.0.0.1:35395,Total threads: 4
Dashboard: http://127.0.0.1:38327/status,Memory: 5.60 GiB
Nanny: tcp://127.0.0.1:46445,
Local directory: /tmp/dask-scratch-space/worker-hvpnvgkh,Local directory: /tmp/dask-scratch-space/worker-hvpnvgkh

0,1
Comm: tcp://127.0.0.1:46155,Total threads: 4
Dashboard: http://127.0.0.1:42275/status,Memory: 5.60 GiB
Nanny: tcp://127.0.0.1:46363,
Local directory: /tmp/dask-scratch-space/worker-8obk6cii,Local directory: /tmp/dask-scratch-space/worker-8obk6cii

0,1
Comm: tcp://127.0.0.1:40607,Total threads: 4
Dashboard: http://127.0.0.1:33983/status,Memory: 5.60 GiB
Nanny: tcp://127.0.0.1:42729,
Local directory: /tmp/dask-scratch-space/worker-guji1__j,Local directory: /tmp/dask-scratch-space/worker-guji1__j

0,1
Comm: tcp://127.0.0.1:36207,Total threads: 4
Dashboard: http://127.0.0.1:45173/status,Memory: 5.60 GiB
Nanny: tcp://127.0.0.1:40143,
Local directory: /tmp/dask-scratch-space/worker-w_1vcptw,Local directory: /tmp/dask-scratch-space/worker-w_1vcptw

0,1
Comm: tcp://127.0.0.1:36057,Total threads: 4
Dashboard: http://127.0.0.1:34519/status,Memory: 5.60 GiB
Nanny: tcp://127.0.0.1:32867,
Local directory: /tmp/dask-scratch-space/worker-3g8xlrok,Local directory: /tmp/dask-scratch-space/worker-3g8xlrok

0,1
Comm: tcp://127.0.0.1:41993,Total threads: 4
Dashboard: http://127.0.0.1:41099/status,Memory: 5.60 GiB
Nanny: tcp://127.0.0.1:34577,
Local directory: /tmp/dask-scratch-space/worker-p1vbl9tn,Local directory: /tmp/dask-scratch-space/worker-p1vbl9tn

0,1
Comm: tcp://127.0.0.1:34241,Total threads: 4
Dashboard: http://127.0.0.1:46627/status,Memory: 5.60 GiB
Nanny: tcp://127.0.0.1:41155,
Local directory: /tmp/dask-scratch-space/worker-h8ff7x5u,Local directory: /tmp/dask-scratch-space/worker-h8ff7x5u


## Laden der Datensätze

In [4]:
def load_dask_dataframe(file_path):
    """Open every ``*.parquet`` file in a directory as one Dask DataFrame.

    Args:
        file_path: Directory containing the Parquet files (``str`` or
            path-like). A trailing ``/`` is tolerated.

    Returns:
        The Dask DataFrame returned by ``dd.read_parquet`` for the glob
        ``<file_path>/*.parquet``.
    """
    # Build the glob without doubling the separator when the directory
    # already ends in "/"; ``str()`` also admits ``pathlib.Path`` input.
    pattern = str(file_path).rstrip("/") + "/*.parquet"
    # ``assume_missing`` is a ``read_csv`` option. ``read_parquet`` only
    # forwards unknown keywords to the engine, where the built-in engines
    # ignore them, so it is intentionally not passed here.
    df = dd.read_parquet(pattern, blocksize='64MB')
    logging.info(f"DataFrame successfully loaded from {file_path}")
    return df

def get_shape(df):
    """Return ``(row_count, column_count)`` for a Dask DataFrame.

    The row count is obtained by calling ``.compute()`` on the first entry
    of ``df.shape``; the column count is the length of ``df.columns``.
    """
    lazy_row_count = df.shape[0]
    return lazy_row_count.compute(), len(df.columns)

In [5]:
ddf_ids17 = load_dask_dataframe(parquet_verzeichnis_ids17)

In [6]:
ddf_ids18 = load_dask_dataframe(parquet_verzeichnis_ids18)

In [7]:
print(f"Anzahl der Partitionen: {ddf_ids17.npartitions}")
print(f"Anzahl der Partitionen: {ddf_ids18.npartitions}")
ddf_ids17 = ddf_ids17.reset_index(drop=True)
ddf_ids18 = ddf_ids18.reset_index(drop=True)

Anzahl der Partitionen: 15
Anzahl der Partitionen: 557


In [8]:
ddf_ids17.head()

Unnamed: 0,id,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd RST Flags,Bwd RST Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWR Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Fwd Segment Size Avg,Bwd Segment Size Avg,Fwd Bytes/Bulk Avg,Fwd Packet/Bulk Avg,Fwd Bulk Rate Avg,Bwd Bytes/Bulk Avg,Bwd Packet/Bulk Avg,Bwd Bulk Rate Avg,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,FWD Init Win Bytes,Bwd Init Win Bytes,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label,Attempted Category
0,1.0,192.168.10.50-192.168.10.3-56108-3268-6,192.168.10.50,56108.0,192.168.10.3,3268.0,6.0,2017-07-07 11:59:50.315195,112740690.0,32.0,16.0,6448.0,1152.0,403.0,0.0,201.5,204.724205,72.0,72.0,72.0,0.0,67.411331,0.425756,2398738.0,5798698.0,16399772.0,3.0,112740690.0,3636796.0,6848761.0,16399772.0,3.0,112740348.0,7516023.0,8323385.0,16400110.0,3.0,16.0,16.0,0.0,0.0,0.0,0.0,1024.0,512.0,0.283837,0.141919,0.0,403.0,158.333333,177.341758,31450.099291,0.0,0.0,0.0,32.0,48.0,0.0,0.0,0.0,0.5,158.333333,201.5,72.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,134.0,0.0,24.0,377.0,2079.0,15.0,32.0,359.4286,11.99802,380.0,343.0,16105400.0,498804.8,16399772.0,15375229.0,-1.0,-1.0,112740690.0,BENIGN,-1.0
1,2.0,192.168.10.50-192.168.10.3-42144-389-6,192.168.10.50,42144.0,192.168.10.3,389.0,6.0,2017-07-07 11:59:50.316273,112740560.0,32.0,16.0,6448.0,5056.0,403.0,0.0,201.5,204.724205,316.0,316.0,316.0,0.0,102.039585,0.425756,2398735.0,5798710.0,16399782.0,2.0,112740560.0,3636792.0,6848777.0,16399782.0,2.0,112740242.0,7516016.0,8323376.0,16400110.0,4.0,16.0,16.0,0.0,0.0,0.0,0.0,1024.0,512.0,0.283838,0.141919,0.0,403.0,239.666667,174.984174,30619.460993,0.0,0.0,0.0,32.0,48.0,0.0,0.0,0.0,0.5,239.666667,201.5,316.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,134.0,0.0,105.0,955.0,2079.0,15.0,32.0,320.2857,15.74499,330.0,285.0,16105430.0,498793.7,16399782.0,15375263.0,-1.0,-1.0,112740560.0,BENIGN,-1.0
2,3.0,8.6.0.1-8.0.6.4-0-0-0,8.6.0.1,0.0,8.0.6.4,0.0,0.0,2017-07-07 12:00:31.388567,113757377.0,545.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.790898,209112.8,1395543.0,20757030.0,0.0,113757377.0,209112.8,1395543.0,20757030.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.790898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9361829.0,7324646.0,18851791.0,19.0,12210360.0,6935824.0,20757030.0,5504997.0,-1.0,-1.0,0.0,BENIGN,-1.0
3,4.0,192.168.10.25-224.0.0.251-5353-5353-17,192.168.10.25,5353.0,224.0.0.251,5353.0,17.0,2017-07-07 12:00:42.903850,91997219.0,388.0,0.0,37151.0,0.0,227.0,37.0,95.75,55.78532,0.0,0.0,0.0,0.0,403.827424,4.217519,237718.9,1511622.0,19776791.0,0.0,91997219.0,237718.9,1511622.0,19776791.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3104.0,0.0,4.217519,0.0,37.0,227.0,95.75,55.78532,3112.001938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,95.75,95.75,0.0,1857.0,19.0,9453.0,0.0,0.0,0.0,1.0,95.0,0.0,0.0,0.0,0.0,387.0,8.0,9801664.0,11527820.0,24721964.0,16.0,13197640.0,5826905.0,19776791.0,5817470.0,-1.0,-1.0,0.0,BENIGN,-1.0
4,5.0,192.168.10.25-17.253.14.125-123-123-17,192.168.10.25,123.0,17.253.14.125,123.0,17.0,2017-07-07 12:00:42.430758,66966070.0,6.0,6.0,288.0,288.0,48.0,48.0,48.0,0.0,48.0,48.0,48.0,0.0,8.601371,0.179195,6087825.0,19538230.0,64974431.0,191.0,66942603.0,13388520.0,28861550.0,64997739.0,23560.0,66942654.0,13388530.0,28861630.0,64997898.0,23604.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,48.0,0.089598,0.089598,48.0,48.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,48.0,48.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,24.0,0.0,0.0,5.0,8.0,1968172.0,0.0,1968172.0,1968172.0,64974430.0,0.0,64974431.0,64974431.0,-1.0,-1.0,0.0,BENIGN,-1.0


In [9]:
ddf_ids18.head()

Unnamed: 0,id,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd RST Flags,Bwd RST Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWR Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Fwd Segment Size Avg,Bwd Segment Size Avg,Fwd Bytes/Bulk Avg,Fwd Packet/Bulk Avg,Fwd Bulk Rate Avg,Bwd Bytes/Bulk Avg,Bwd Packet/Bulk Avg,Bwd Bulk Rate Avg,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,FWD Init Win Bytes,Bwd Init Win Bytes,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label,Attempted Category
0,1.0,172.31.66.5-72.21.91.29-49690-80-6,172.31.66.5,49690.0,72.21.91.29,80.0,6.0,2018-03-02 12:47:05.189575,116659398.0,17.0,16.0,883.0,1576.0,436.0,0.0,51.941176,144.555124,788.0,0.0,98.5,269.15324,21.078456,0.282875,3645606.0,4802150.0,10004184.0,30.0,116648091.0,7290506.0,4411618.0,10015474.0,136.0,116648119.0,7776541.0,4123731.0,10019699.0,1159.0,2.0,2.0,0.0,0.0,0.0,0.0,352.0,464.0,0.145723,0.137151,0.0,788.0,74.515152,212.047919,44964.320076,0.0,2.0,0.0,4.0,32.0,0.0,1.0,1.0,0.941176,74.515152,51.941176,98.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0,47.0,8192.0,290.0,13.0,20.0,22492.75,26450.571732,88105.0,11264.0,9698181.5,1022854.0,10004184.0,6451005.0,-1.0,-1.0,116659398.0,BENIGN,-1.0
1,2.0,172.31.66.5-54.192.16.165-49685-443-6,172.31.66.5,49685.0,54.192.16.165,443.0,6.0,2018-03-02 12:47:03.395301,117286103.0,52.0,85.0,1022.0,108153.0,250.0,0.0,19.653846,55.105562,1460.0,0.0,1272.388235,460.275412,930.843444,1.168084,862397.8,7056434.0,58862184.0,1.0,117286103.0,2299728.0,11448590.0,58862184.0,14.0,117223889.0,1395522.0,8968484.0,58928101.0,1.0,8.0,10.0,0.0,0.0,0.0,0.0,1052.0,1712.0,0.44336,0.724724,0.0,1460.0,796.89781,710.125976,504278.901245,0.0,2.0,0.0,18.0,136.0,0.0,1.0,1.0,1.634615,796.89781,19.653846,1272.388235,602.0,4.0,467754.0,104288.0,75.0,3661669.0,0.0,7.0,0.0,789.0,8192.0,131.0,8.0,20.0,204789.0,184907.009067,335538.0,74040.0,58407106.5,643576.8,58862184.0,57952029.0,-1.0,-1.0,117286103.0,BENIGN,-1.0
2,3.0,172.31.66.5-52.179.17.38-123-123-17,172.31.66.5,123.0,52.179.17.38,123.0,17.0,2018-03-02 12:47:45.823759,64032780.0,2.0,2.0,96.0,96.0,48.0,48.0,48.0,0.0,48.0,48.0,48.0,0.0,2.998464,0.062468,21344260.0,36937230.0,63995700.0,18485.0,64014185.0,64014180.0,0.0,64014185.0,64014185.0,64014295.0,64014300.0,0.0,64014295.0,64014295.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,16.0,0.031234,0.031234,48.0,48.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,48.0,48.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,24.0,0.0,0.0,1.0,8.0,18485.0,0.0,18485.0,18485.0,63995700.0,0.0,63995700.0,63995700.0,-1.0,-1.0,0.0,BENIGN,-1.0
3,4.0,172.31.66.5-13.89.187.212-49671-443-6,172.31.66.5,49671.0,13.89.187.212,443.0,6.0,2018-03-02 12:47:41.542805,118622137.0,6.0,3.0,216.0,372.0,72.0,0.0,36.0,39.436024,124.0,124.0,124.0,0.0,4.956916,0.075871,14827770.0,27386760.0,59936464.0,30000.0,118622137.0,23724430.0,32384060.0,59936464.0,76978.0,118543468.0,59271730.0,1048096.0,60012850.0,58530618.0,3.0,3.0,0.0,0.0,0.0,0.0,120.0,60.0,0.050581,0.02529,0.0,124.0,65.333333,53.925875,2908.0,0.0,0.0,0.0,6.0,9.0,0.0,0.0,0.0,0.5,65.333333,36.0,124.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,41.0,254.0,7556.0,2.0,20.0,77407.0,606.697618,77836.0,76978.0,59194738.0,1048959.0,59936464.0,58453012.0,-1.0,-1.0,118622137.0,BENIGN,-1.0
4,5.0,172.31.66.5-65.55.44.108-49718-443-6,172.31.66.5,49718.0,65.55.44.108,443.0,6.0,2018-03-02 12:49:36.999592,149794.0,7.0,6.0,1088.0,3852.0,785.0,0.0,155.428571,288.636944,1460.0,0.0,642.0,683.68677,32978.623977,86.785853,12482.83,17470.56,56657.0,7.0,149794.0,24965.67,28523.27,79121.0,339.0,71017.0,14203.4,12935.89,23942.0,19.0,3.0,3.0,0.0,0.0,0.0,0.0,152.0,132.0,46.730844,40.055009,0.0,1460.0,380.0,547.866772,300158.0,0.0,2.0,0.0,6.0,12.0,0.0,1.0,2.0,0.857143,380.0,155.428571,642.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,83.0,0.0,296.0,8192.0,1023.0,3.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,149794.0,BENIGN,-1.0


In [10]:
print(get_shape(ddf_ids17))
print(get_shape(ddf_ids18))

(2099976, 91)
(63195145, 91)


## Entfernen nicht relevanter Spalten

In [11]:
def drop_columns_from_dask_dataframe(df, columns_to_drop):
    """Return ``df`` without the columns named in ``columns_to_drop``.

    The removal is delegated to ``df.drop(columns=...)`` and recorded in the
    log at INFO level.
    """
    reduced = df.drop(columns=columns_to_drop)
    logging.info(f"Columns {columns_to_drop} successfully dropped from DataFrame")
    return reduced

In [12]:
columns_to_remove = ['id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Timestamp', 'Attempted Category']

In [13]:
ddf_ids17 = drop_columns_from_dask_dataframe(ddf_ids17, columns_to_remove)

In [14]:
ddf_ids18 = drop_columns_from_dask_dataframe(ddf_ids18, columns_to_remove)

In [15]:
shape_ids17 = get_shape(ddf_ids17)
shape_ids18 = get_shape(ddf_ids18)

print(shape_ids17)
print(shape_ids18)

(2099976, 84)
(63195145, 84)


## Dtype Festlegung

In [16]:
def enforce_dtypes(df, dtypes):
    """Cast selected columns to the requested dtypes without mutating ``df``.

    Args:
        df: DataFrame whose columns should be converted.
        dtypes: Mapping of column name to target dtype.

    Returns:
        A new DataFrame in which every successfully converted column replaces
        the original one. A column whose conversion raises here (for example
        because it does not exist) is left as it was; the error is logged and
        not re-raised. If nothing could be converted, ``df`` itself is
        returned.
    """
    converted = {}
    for column, dtype in dtypes.items():
        try:
            converted[column] = df[column].astype(dtype)
            logging.info(f"Column '{column}' successfully converted to dtype '{dtype}'")
        except Exception as e:
            # Deliberate best effort: one bad column must not abort the rest.
            # NOTE(review): with a lazy backend, errors that only surface at
            # compute time are presumably not caught here -- confirm.
            logging.error(f"Error converting column '{column}' to dtype '{dtype}': {e}")
    if not converted:
        return df
    # A single ``assign`` builds the result in one step and leaves the
    # caller's frame untouched (the previous per-column ``df[col] = ...``
    # modified the argument in place).
    return df.assign(**converted)

# Manually chosen target dtype for each listed column.
# enforce_dtypes only touches the columns named here; every other column
# keeps the dtype it was loaded with.
# NOTE(review): several float32 columns hold values above 2**24 in the sample
# output (e.g. 'Fwd IAT Total' = 112740690.0), which float32 cannot represent
# exactly -- confirm that this precision loss is acceptable.
column_dtypes = {
    'Dst Port': 'int32',
    'Protocol': 'category',
    'Flow Duration': 'int64',
    'Total Fwd Packet': 'int64',
    'Total Bwd packets': 'int64',
    'Total Length of Fwd Packet': 'float64',
    'Total Length of Bwd Packet': 'float64',
    'Fwd Packet Length Max': 'float32',
    'Fwd Packet Length Min': 'float32',
    'Fwd Packet Length Mean': 'float32',
    'Fwd Packet Length Std': 'float32',
    'Bwd Packet Length Max': 'float32',
    'Bwd Packet Length Min': 'float32',
    'Bwd Packet Length Mean': 'float32',
    'Bwd Packet Length Std': 'float32',
    'Flow Bytes/s': 'float32',
    'Flow Packets/s': 'float32',
    'Flow IAT Mean': 'float32',
    'Flow IAT Std': 'float32',
    'Flow IAT Max': 'float32',
    'Flow IAT Min': 'float32',
    'Fwd IAT Total': 'float32',
    'Fwd IAT Mean': 'float32',
    'Fwd IAT Std': 'float32',
    'Fwd IAT Max': 'float32',
    'Fwd IAT Min': 'float32',
    'Bwd IAT Total': 'float32',
    'Bwd IAT Mean': 'float32',
    'Bwd IAT Std': 'float32',
    'Bwd IAT Max': 'float32',
    'Bwd IAT Min': 'float32',
    'Fwd PSH Flags': 'int32',
    'Bwd PSH Flags': 'int32',
    'Fwd URG Flags': 'int32',
    'Bwd URG Flags': 'int32',
    'Fwd RST Flags': 'int32',
    'Bwd RST Flags': 'int32',
    'Fwd Header Length': 'int64',
    'Bwd Header Length': 'int64',
    'Packet Length Min': 'float32',
    'Packet Length Max': 'float32',
    'Packet Length Mean': 'float32',
    'Packet Length Std': 'float32',
    'FIN Flag Count': 'int32',
    'SYN Flag Count': 'int32',
    'RST Flag Count': 'int32',
    'PSH Flag Count': 'int32',
    'ACK Flag Count': 'int32',
    'URG Flag Count': 'int32',
    'Label': 'category'
}

In [17]:
ddf_ids17 = enforce_dtypes(ddf_ids17, column_dtypes)

In [18]:
ddf_ids17.dtypes

Dst Port                         int32
Protocol                      category
Flow Duration                    int64
Total Fwd Packet                 int64
Total Bwd packets                int64
Total Length of Fwd Packet     float64
Total Length of Bwd Packet     float64
Fwd Packet Length Max          float32
Fwd Packet Length Min          float32
Fwd Packet Length Mean         float32
Fwd Packet Length Std          float32
Bwd Packet Length Max          float32
Bwd Packet Length Min          float32
Bwd Packet Length Mean         float32
Bwd Packet Length Std          float32
Flow Bytes/s                   float32
Flow Packets/s                 float32
Flow IAT Mean                  float32
Flow IAT Std                   float32
Flow IAT Max                   float32
Flow IAT Min                   float32
Fwd IAT Total                  float32
Fwd IAT Mean                   float32
Fwd IAT Std                    float32
Fwd IAT Max                    float32
Fwd IAT Min              

In [19]:
ddf_ids18 = enforce_dtypes(ddf_ids18, column_dtypes)

In [20]:
ddf_ids18.dtypes

Dst Port                         int32
Protocol                      category
Flow Duration                    int64
Total Fwd Packet                 int64
Total Bwd packets                int64
Total Length of Fwd Packet     float64
Total Length of Bwd Packet     float64
Fwd Packet Length Max          float32
Fwd Packet Length Min          float32
Fwd Packet Length Mean         float32
Fwd Packet Length Std          float32
Bwd Packet Length Max          float32
Bwd Packet Length Min          float32
Bwd Packet Length Mean         float32
Bwd Packet Length Std          float32
Flow Bytes/s                   float32
Flow Packets/s                 float32
Flow IAT Mean                  float32
Flow IAT Std                   float32
Flow IAT Max                   float32
Flow IAT Min                   float32
Fwd IAT Total                  float32
Fwd IAT Mean                   float32
Fwd IAT Std                    float32
Fwd IAT Max                    float32
Fwd IAT Min              

# Angleichung der Label-Bezeichnungen beider Datensätze

Die Angriffe unterscheiden sich teilweise. Nach genauer Analyse werden die Datensätze aneinander angepasst. Dabei werden Angriffe entfernt, zusammengefasst oder Bezeichnungen angepasst. IDS18 hat zwei Angriffe weniger, weil FTP-BruteForce nur als Attempted vorhanden ist und Heartbleed gar nicht vorhanden ist. 

## Vereinheitlichung der Bezeichnungen der Labels
Weil die genaue Bezeichnung/Schreibweise der Labels zwischen IDS17 und IDS18 nicht einheitlich ist, werden zunächst die Bezeichnungen der Labels vereinheitlicht. Dabei werden die Labels von IDS18 für IDS17 übernommen.

In [21]:
def get_unique_values_in_column(df, column_name):
    """Compute the distinct values of one column and return them as a list.

    The computed values are also written to the log at INFO level.
    """
    distinct = df[column_name].unique().compute()
    logging.info(f"Unique values in column '{column_name}': {distinct}")
    return list(distinct)

def replace_labels(df, column_name, mapping):
    """Rename the values of one column according to ``mapping``.

    Values that have no entry in ``mapping`` are kept: the mapped series is
    filled back from the original column. The column is reassigned on ``df``
    itself, and that same object is returned.
    """
    original_labels = df[column_name]
    # ``meta`` declares the mapped column as object dtype for Dask
    remapped = original_labels.map(mapping, meta=(column_name, 'object'))
    df[column_name] = remapped.fillna(original_labels)
    logging.info(f"Values in column '{column_name}' replaced according to mapping")
    return df

In [22]:
# Maps CICIDS2017 label names (keys) to their CICIDS2018 counterparts (values)
mapping_2017_2018_labels = {
    'Botnet': 'Botnet Ares',
    'Botnet - Attempted': 'Botnet Ares - Attempted',
    'DDoS': 'DDoS-LOIC-HTTP',
    'Infiltration - Portscan': 'Infiltration - NMAP Portscan',
    'FTP-Patator - Attempted': 'FTP-BruteForce - Attempted',
    'SSH-Patator': 'SSH-BruteForce',
    'Web Attack - SQL Injection': 'Web Attack - SQL',
    'Web Attack - SQL Injection - Attempted': 'Web Attack - SQL - Attempted'
}

In [23]:
labels_ids17 = get_unique_values_in_column(ddf_ids17, 'Label')



In [24]:
labels_ids18 = get_unique_values_in_column(ddf_ids18, 'Label')

In [25]:
ddf_ids17 = replace_labels(ddf_ids17, 'Label', mapping_2017_2018_labels)

In [26]:
labels_ids17.sort()
labels_ids17

['BENIGN',
 'Botnet',
 'Botnet - Attempted',
 'DDoS',
 'DoS GoldenEye',
 'DoS GoldenEye - Attempted',
 'DoS Hulk',
 'DoS Hulk - Attempted',
 'DoS Slowhttptest',
 'DoS Slowhttptest - Attempted',
 'DoS Slowloris',
 'DoS Slowloris - Attempted',
 'FTP-Patator',
 'FTP-Patator - Attempted',
 'Heartbleed',
 'Infiltration',
 'Infiltration - Attempted',
 'Infiltration - Portscan',
 'Portscan',
 'SSH-Patator',
 'SSH-Patator - Attempted',
 'Web Attack - Brute Force',
 'Web Attack - Brute Force - Attempted',
 'Web Attack - SQL Injection',
 'Web Attack - SQL Injection - Attempted',
 'Web Attack - XSS',
 'Web Attack - XSS - Attempted']

In [27]:
labels_ids18.sort()
labels_ids18

['BENIGN',
 'Botnet Ares',
 'Botnet Ares - Attempted',
 'DDoS-HOIC',
 'DDoS-LOIC-HTTP',
 'DDoS-LOIC-UDP',
 'DDoS-LOIC-UDP - Attempted',
 'DoS GoldenEye',
 'DoS GoldenEye - Attempted',
 'DoS Hulk',
 'DoS Hulk - Attempted',
 'DoS Slowloris',
 'DoS Slowloris - Attempted',
 'FTP-BruteForce - Attempted',
 'Infiltration - Communication Victim Attacker',
 'Infiltration - Dropbox Download',
 'Infiltration - Dropbox Download - Attempted',
 'Infiltration - NMAP Portscan',
 'SSH-BruteForce',
 'Web Attack - Brute Force',
 'Web Attack - Brute Force - Attempted',
 'Web Attack - SQL',
 'Web Attack - SQL - Attempted',
 'Web Attack - XSS',
 'Web Attack - XSS - Attempted']

Die Bezeichnungen der Label sind jetzt angepasst und einheitlich.

## Bestimmte Angriffe entfernen
Folgende Angriffe werden entfernt:
- Angriffe, die nicht in beiden Datensätzen vorhanden sind
- Angriffe, die nur als Angriffsversuch gelabelt sind

In [28]:
def remove_rows_with_labels(df, column_name, labels_to_remove):
    """Return only the rows whose ``column_name`` value is not in ``labels_to_remove``."""
    unwanted = df[column_name].isin(labels_to_remove)
    kept = df[~unwanted]
    logging.info(f"Rows with labels {labels_to_remove} in column '{column_name}' successfully removed from DataFrame")
    return kept

In [29]:
# Labels to drop from IDS17. The tuple is applied after the 2017 -> 2018
# renaming, so renamed labels appear under their new name
# (e.g. 'Botnet Ares - Attempted'), while labels without a mapping entry
# keep their 2017 name (e.g. 'SSH-Patator - Attempted').
remove_in_ids17 = (
    'DoS Slowhttptest', 
    'DoS Slowhttptest - Attempted', 
    'FTP-Patator', 
    'Heartbleed', 
    'Infiltration', 
    'Infiltration - Attempted', 
    'Portscan', 
    'SSH-Patator - Attempted',
    'Botnet Ares - Attempted',
    'DoS GoldenEye - Attempted',
    'DoS Hulk - Attempted',
    'DoS Slowloris - Attempted',
    'FTP-BruteForce - Attempted',
    'Web Attack - Brute Force - Attempted',
    'Web Attack - SQL - Attempted',
    'Web Attack - XSS - Attempted'
    )

# Labels to drop from IDS18; the '... - Attempted' entries from
# 'Botnet Ares - Attempted' onward are the same as in remove_in_ids17.
remove_in_ids18 = (
    'DDoS-HOIC', 
    'DDoS-LOIC-UDP', 
    'DDoS-LOIC-UDP - Attempted', 
    'Infiltration - Communication Victim Attacker', 
    'Infiltration - Dropbox Download', 
    'Infiltration - Dropbox Download - Attempted',
    'Botnet Ares - Attempted',
    'DoS GoldenEye - Attempted',
    'DoS Hulk - Attempted',
    'DoS Slowloris - Attempted',
    'FTP-BruteForce - Attempted',
    'Web Attack - Brute Force - Attempted',
    'Web Attack - SQL - Attempted',
    'Web Attack - XSS - Attempted'
    )

In [30]:
ddf_ids17 = remove_rows_with_labels(ddf_ids17, 'Label', remove_in_ids17)

In [31]:
ddf_ids18 = remove_rows_with_labels(ddf_ids18, 'Label', remove_in_ids18)

In [32]:
labels_ids17 = get_unique_values_in_column(ddf_ids17, 'Label')
labels_ids18 = get_unique_values_in_column(ddf_ids18, 'Label')
labels_ids17.sort()
labels_ids18.sort()
print(labels_ids17)
print(labels_ids18)
labels_ids17 == labels_ids18

['BENIGN', 'Botnet Ares', 'DDoS-LOIC-HTTP', 'DoS GoldenEye', 'DoS Hulk', 'DoS Slowloris', 'Infiltration - NMAP Portscan', 'SSH-BruteForce', 'Web Attack - Brute Force', 'Web Attack - SQL', 'Web Attack - XSS']
['BENIGN', 'Botnet Ares', 'DDoS-LOIC-HTTP', 'DoS GoldenEye', 'DoS Hulk', 'DoS Slowloris', 'Infiltration - NMAP Portscan', 'SSH-BruteForce', 'Web Attack - Brute Force', 'Web Attack - SQL', 'Web Attack - XSS']


True

Die verbliebenen Angriffe in beiden Datensätzen sind jetzt die gleichen.

In [33]:
new_shape_ids17 = get_shape(ddf_ids17)
new_shape_ids18 = get_shape(ddf_ids18)
print(new_shape_ids17)
print(new_shape_ids18)

(1923172, 84)
(61803799, 84)


In [34]:
lost_rows_ids17 = shape_ids17[0] - new_shape_ids17[0]
lost_rows_ids18 = shape_ids18[0] - new_shape_ids18[0]

print(f'Anzahl der entfernten Zeilen in ids17: {lost_rows_ids17}')
print(f'Anzahl der entfernten Zeilen in ids18: {lost_rows_ids18}')

Anzahl der entfernten Zeilen in ids17: 176804
Anzahl der entfernten Zeilen in ids18: 1391346


## Inf-Werte entfernen
Da keine NaN-Werte, aber einige wenige Inf-Werte in den Datensätzen vorhanden sind, werden im Folgenden die Zeilen mit Inf-Werten entfernt.

In [35]:
def remove_rows_with_inf_values(df):
    """Return ``df`` without any row that contains ``inf`` or ``-inf``."""
    infinite_cells = df.isin([np.inf, -np.inf])
    # A row is dropped as soon as one of its cells is infinite
    rows_with_inf = infinite_cells.any(axis=1)
    cleaned = df[~rows_with_inf]
    logging.info("Rows with infinite values successfully removed from DataFrame")
    return cleaned

In [36]:
ddf_ids17 = remove_rows_with_inf_values(ddf_ids17)

In [37]:
ddf_ids18 = remove_rows_with_inf_values(ddf_ids18)

In [38]:
ddf_ids18.shape[0].compute() #57 Zeilen mit inf entfernt

61803742

## One Hot Encoding

In [39]:
def one_hot_encode_column(df, column_name):
    """One-hot encode the column ``column_name`` of ``df`` with Dask tooling.

    The column is first converted to a categorical by dask_ml's
    ``Categorizer`` and then expanded by ``dd.get_dummies``.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Name of the single column to encode.

    Returns:
        The frame produced by ``dd.get_dummies`` for that column.
    """
    logging.info(f"One-Hot-Encoding column '{column_name}' using Dask tools")
    # Step 1: turn the column into a categorical via dask_ml's Categorizer
    categorized = Categorizer(columns=[column_name]).fit_transform(df)
    # Step 2: get_dummies performs the actual one-hot expansion
    encoded = dd.get_dummies(categorized, columns=[column_name])
    logging.info(f"Column '{column_name}' successfully one-hot encoded and added to DataFrame")
    return encoded

In [40]:
ddf_ids17_encoded = one_hot_encode_column(ddf_ids17, 'Protocol')

In [41]:
ddf_ids17_encoded

Unnamed: 0_level_0,Dst Port,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd RST Flags,Bwd RST Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWR Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Fwd Segment Size Avg,Bwd Segment Size Avg,Fwd Bytes/Bulk Avg,Fwd Packet/Bulk Avg,Fwd Bulk Rate Avg,Bwd Bytes/Bulk Avg,Bwd Packet/Bulk Avg,Bwd Bulk Rate Avg,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,FWD Init Win Bytes,Bwd Init Win Bytes,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label,Protocol_0.0,Protocol_1.0,Protocol_6.0,Protocol_17.0
npartitions=15,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1
,int32,int64,int64,int64,float64,float64,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,int32,int32,int32,int32,int32,int32,int64,int64,float64,float64,float32,float32,float32,float32,float64,int32,int32,int32,int32,int32,int32,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,object,bool,bool,bool,bool
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [42]:
ddf_ids18_encoded = one_hot_encode_column(ddf_ids18, 'Protocol')

In [43]:
ddf_ids18_encoded

Unnamed: 0_level_0,Dst Port,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd RST Flags,Bwd RST Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWR Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Fwd Segment Size Avg,Bwd Segment Size Avg,Fwd Bytes/Bulk Avg,Fwd Packet/Bulk Avg,Fwd Bulk Rate Avg,Bwd Bytes/Bulk Avg,Bwd Packet/Bulk Avg,Bwd Bulk Rate Avg,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,FWD Init Win Bytes,Bwd Init Win Bytes,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label,Protocol_0.0,Protocol_1.0,Protocol_6.0,Protocol_17.0
npartitions=557,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1
,int32,int64,int64,int64,float64,float64,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,float32,int32,int32,int32,int32,int32,int32,int64,int64,float64,float64,float32,float32,float32,float32,float64,int32,int32,int32,int32,int32,int32,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,category[unknown],bool,bool,bool,bool
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [44]:
# ids17: compare known_divisions and partition count before and after one-hot encoding.
print(ddf_ids17.known_divisions)
print(ddf_ids17_encoded.known_divisions)
print(ddf_ids17.npartitions)
print(ddf_ids17_encoded.npartitions)

# ids18: same comparison.
print(ddf_ids18.known_divisions)
print(ddf_ids18_encoded.known_divisions)
print(ddf_ids18.npartitions)
print(ddf_ids18_encoded.npartitions)

False
False
15
15
False
False
557
557


In [47]:
# Reduce the partition count from the 15 reported earlier to 8.
ddf_ids17_encoded = ddf_ids17_encoded.repartition(npartitions=8)

In [48]:
# Reduce the partition count from the 557 reported earlier to 279.
ddf_ids18_encoded = ddf_ids18_encoded.repartition(npartitions=279)

## Label Encoding für die Label Spalte

In [45]:
def label_encode_column(df, column_name):
    """Label-encode one column and return the encoded frame plus the encoder.

    A fresh dask_ml ``LabelEncoder(use_categorical=False)`` is fitted on
    ``df[column_name]``; its output replaces that column in the returned
    frame.

    The caller's ``df`` is NOT modified: the encoded column is attached with
    ``assign()``, which returns a new frame. Item assignment
    (``df[column_name] = ...``) would rewrite the object passed in, making a
    re-run of the calling cell encode already-encoded data.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Name of the column to encode (must be a string).

    Returns:
        tuple: ``(encoded_df, encoder)`` where ``encoder`` is the fitted
        ``LabelEncoder`` (e.g. for ``classes_`` / ``inverse_transform``).
    """
    logging.info(f"Label-Encoding column '{column_name}' using Dask tools")
    encoder = LabelEncoder(use_categorical=False)
    encoded_values = encoder.fit_transform(df[column_name])
    # assign() builds a new frame instead of mutating the caller's object
    df = df.assign(**{column_name: encoded_values})
    logging.info(f"Column '{column_name}' successfully label encoded")
    return df, encoder

In [49]:
ddf_ids17_encoded, ids17_encoder = label_encode_column(ddf_ids17_encoded, 'Label')



In [50]:
ddf_ids18_encoded, ids18_encoder = label_encode_column(ddf_ids18_encoded, 'Label')



In [51]:
# Materialise the classes learned by each encoder as plain Python lists for inspection.
ids17_classes_list = list(ids17_encoder.classes_.compute())
ids18_classes_list = list(ids18_encoder.classes_.compute())

In [52]:
ids17_classes_list

['BENIGN',
 'Botnet Ares',
 'DDoS-LOIC-HTTP',
 'DoS GoldenEye',
 'DoS Hulk',
 'DoS Slowloris',
 'Infiltration - NMAP Portscan',
 'SSH-BruteForce',
 'Web Attack - Brute Force',
 'Web Attack - SQL',
 'Web Attack - XSS']

In [53]:
ids18_classes_list

['BENIGN',
 'Botnet Ares',
 'DDoS-LOIC-HTTP',
 'DoS GoldenEye',
 'DoS Hulk',
 'DoS Slowloris',
 'Infiltration - NMAP Portscan',
 'SSH-BruteForce',
 'Web Attack - Brute Force',
 'Web Attack - SQL',
 'Web Attack - XSS']

### Encoder Funktionsweise prüfen

In [None]:
# Sanity check: map the integer codes 0..10 back through the ids17 encoder.
ids17_encoder.inverse_transform([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).compute()

In [None]:
# Sanity check: map the integer codes 0..10 back through the ids18 encoder.
ids18_encoder.inverse_transform([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).compute()

In [None]:
da_ids17_classes_list = da.from_array(ids17_classes_list) # Build a Dask array because (per the original author) the encoder only works with Dask arrays
# Forward check: encode the class names themselves with the ids17 encoder.
ids17_encoder.transform(da_ids17_classes_list).compute()

In [None]:
da_ids18_classes_list = da.from_array(ids18_classes_list) # Build a Dask array because (per the original author) the encoder only works with Dask arrays
# Forward check: encode the class names themselves with the ids18 encoder.
ids18_encoder.transform(da_ids18_classes_list).compute()

## Skalierung - Fehler beim Anzeigen, scheint aber zu laufen
- Sorgt bei IDS18 dafür, dass selbst einfachste Methoden wie head() zu gigantischen Berechnungen führen, weil zur Ausführung ein riesiger Graph benötigt wird
- repartition und index drop helfen nicht

- Die Spalte Destination Port sollte eigentlich nicht skaliert werden
- Bei den anderen Spalten vielleicht auch nicht alle? Was ist mit den Flag-Spalten? Soll man die skalieren? -> Die Flag-Spalten sind Counts der Flags im Flow, also ja
- Generell prüfen, ob überhaupt skaliert werden muss

In [65]:
# columns = list(ddf_ids17_encoded.columns)
# columns_not_to_scale = ['Dst Port', 'Label', 'Protocol_0.0', 'Protocol_1.0', 'Protocol_6.0', 'Protocol_17.0']
# columns_to_scale = [col for col in columns if col not in columns_not_to_scale]
# columns_to_scale

# def scale_dataframe(df, columns_to_scale):
#     logging.info(f"Scaling the specified numerical columns of the DataFrame: {columns_to_scale} using StandardScaler")
#     scaler = StandardScaler()
#     df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])
#     logging.info("Specified columns successfully scaled")
#     return df

In [68]:
# ddf_ids17_enc_sca = scale_dataframe(ddf_ids17_encoded, columns_to_scale)

In [None]:
# ddf_ids17_enc_sca.head()

In [70]:
# ddf_ids18_enc_sca = scale_dataframe(ddf_ids18_encoded, columns_to_scale)

In [None]:
# ddf_ids18_enc_sca.head()

## Speichern der Datensätze als Parquet

In [54]:
def save_dask_dataframe(df, file_path):
    """Write ``df`` to ``file_path`` as Parquet (without the index) and log the target.

    Args:
        df: Frame exposing ``to_parquet`` (a Dask DataFrame in this notebook).
        file_path: Destination passed straight through to ``to_parquet``.
    """
    # Parquet is used so the data can be reloaded efficiently later on
    df.to_parquet(file_path, write_index=False)
    success_message = f"DataFrame successfully saved to {file_path}"
    logging.info(success_message)

In [55]:
# ddf_ids17_enc_sca = ddf_ids17_encoded.reset_index(drop=True)
# ddf_ids18_enc_sca = ddf_ids18_encoded.reset_index(drop=True)
# Report the partition counts of the frames about to be written (first line: ids17, second line: ids18).
print(f"Anzahl der Partitionen: {ddf_ids17_encoded.npartitions}")
print(f"Anzahl der Partitionen: {ddf_ids18_encoded.npartitions}")
# ddf_ids17_enc_sca.repartition(npartitions=8)
# ddf_ids18_enc_sca.repartition(npartitions=279)


Anzahl der Partitionen: 8
Anzahl der Partitionen: 279


In [56]:
# Target is the ids17 Parquet directory path with the suffix '_prep_0' appended.
save_dask_dataframe(ddf_ids17_encoded , parquet_verzeichnis_ids17 + '_prep_0')

In [57]:
# Target is the ids18 Parquet directory path with the suffix '_prep_0' appended.
save_dask_dataframe(ddf_ids18_encoded, parquet_verzeichnis_ids18 + '_prep_0')

## Dask Client beenden

In [58]:
client.close()