# Untersuchung der Datensätze

In [2]:
import os
import pandas as pd
import logging
import multiprocessing as mp

In [3]:
# Logging Parameter
logging.basicConfig(
    filename='20_eda.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Verzeichnis der CSV-Dateien
verzeichnis_ids17 = '../01_Datensaetze/improved_cic-ids-2017'
verzeichnis_ids18 = '../01_Datensaetze/improved_cse-cic-ids-2018'

nrows = None
chunksize = 500000

In [4]:
def get_data_info(verzeichnis, datei):
    if datei.endswith('.csv'):
        pfad_zur_datei = os.path.join(verzeichnis, datei)
        try:
            df = pd.read_csv(pfad_zur_datei, nrows=nrows)
            logging.info(f"Successfully read {pfad_zur_datei}.")
            return df.info()
        except Exception as e:
            logging.error(f"Error reading {pfad_zur_datei}: {e}")
            return []
    return []

In [5]:
get_data_info(verzeichnis_ids18, 'Friday-02-03-2018.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6311371 entries, 0 to 6311370
Data columns (total 84 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   Dst Port                    int64  
 1   Protocol                    int64  
 2   Flow Duration               int64  
 3   Total Fwd Packet            int64  
 4   Total Bwd packets           int64  
 5   Total Length of Fwd Packet  int64  
 6   Total Length of Bwd Packet  int64  
 7   Fwd Packet Length Max       int64  
 8   Fwd Packet Length Min       int64  
 9   Fwd Packet Length Mean      float64
 10  Fwd Packet Length Std       float64
 11  Bwd Packet Length Max       int64  
 12  Bwd Packet Length Min       int64  
 13  Bwd Packet Length Mean      float64
 14  Bwd Packet Length Std       float64
 15  Flow Bytes/s                float64
 16  Flow Packets/s              float64
 17  Flow IAT Mean               float64
 18  Flow IAT Std                float64
 19  Flow IAT Max         

## Fehlende Werte

In [8]:
df = pd.read_csv(os.path.join(verzeichnis_ids18, 'Friday-02-03-2018.csv'), nrows=nrows)

In [17]:
df.shape

(6311371, 84)

In [10]:
duplicates = df[df.duplicated()]
print(f"Number of duplicates: {len(duplicates)}")

Number of duplicates: 1215757


In [18]:
duplicates.to_csv('duplicates.csv', index=False)

In [19]:
df.value_counts('Label')

Label
BENIGN                     6168188
Botnet Ares                 142921
Botnet Ares - Attempted        262
Name: count, dtype: int64

In [20]:
labels_counts = []

for file in os.listdir(verzeichnis_ids18):
    if file.endswith('.csv'):
        df = pd.read_csv(os.path.join(verzeichnis_ids18, file), nrows=nrows)
        labels_counts.append(df.value_counts('Label'))

labels_counts

[Label
 BENIGN            5764497
 DDoS-LOIC-HTTP     289328
 Name: count, dtype: int64,
 Label
 BENIGN                       5372471
 DoS GoldenEye                  22560
 DoS Slowloris                   8490
 DoS GoldenEye - Attempted       4301
 DoS Slowloris - Attempted       2280
 Name: count, dtype: int64,
 Label
 BENIGN                          6511554
 Infiltration - NMAP Portscan      39634
 Name: count, dtype: int64,
 Label
 BENIGN                                  6070945
 Web Attack - Brute Force - Attempted         76
 Web Attack - Brute Force                     69
 Web Attack - XSS                             40
 Web Attack - SQL                             16
 Web Attack - SQL - Attempted                  4
 Web Attack - XSS - Attempted                  3
 Name: count, dtype: int64,
 Label
 BENIGN    5878399
 Name: count, dtype: int64,
 Label
 BENIGN                        5610799
 FTP-BruteForce - Attempted     193354
 SSH-BruteForce                  94197
 Name: count,