In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

from typing import List

In [2]:
# Drop the columns that are not needed.

def get_usecols(exclude_indices: List[int], total_cols: int) -> List[int]:
    """Return the column positions to keep, in ascending order.

    Args:
        exclude_indices: Zero-based column positions to leave out. Duplicates
            and positions outside ``range(total_cols)`` are harmless.
        total_cols: Number of column positions to consider (``0 .. total_cols - 1``).

    Returns:
        Every position in ``range(total_cols)`` that is not listed in
        ``exclude_indices``.
    """
    # Build the lookup once as a set so each membership test is O(1)
    # instead of a linear scan of the exclusion list per column.
    excluded = set(exclude_indices)
    return [i for i in range(total_cols) if i not in excluded]

# Column positions handed to get_usecols: the range size and the positions to leave out.
total_columns = 88
exclude_indices_definite = [0, 1, 7, 85]
exclude_indices_probable = []  # alternative value left by the author: [2, 4]

# Merge both exclusion lists, then compute the positions that remain.
exclude_final = [*exclude_indices_definite, *exclude_indices_probable]
usecols = get_usecols(exclude_final, total_columns)

In [3]:
# Read the source files and combine them into a single dataframe.

csv_files = [
    '../UnfilteredData/LDAP.csv',
    '../UnfilteredData/MSSQL.csv',
    '../UnfilteredData/NetBIOS.csv',
    '../UnfilteredData/Portmap.csv',
    '../UnfilteredData/Syn.csv',
    '../UnfilteredData/UDP.csv',
    '../UnfilteredData/UDPLag.csv',
]

# Every file is read with the same positional column selection;
# the 'utf-8-sig' codec skips a leading byte-order mark when one is present.
dfs = [
    pd.read_csv(csv_path, usecols=usecols, encoding='utf-8-sig')
    for csv_path in csv_files
]

# ignore_index=True gives the stacked frame a fresh 0..n-1 row index.
combined_df = pd.concat(dfs, ignore_index=True)

In [4]:
# Remove leading and trailing whitespace from the column names.

combined_df.columns = combined_df.columns.map(str.strip)

In [14]:
# Extract the required amount of data for every traffic class.

label_col = combined_df.columns[-1]  # the label is taken to be the last column

NUM_OF_SAMPLES = 25000  # upper bound on the number of rows kept per class
SAMPLE_SEED = 1         # fixed seed so re-running the cell draws the same rows


def _rows_with_label(label: str) -> pd.DataFrame:
    """Return the rows of ``combined_df`` whose label column equals ``label``."""
    return combined_df[combined_df[label_col] == label]


def _cap_rows(frame: pd.DataFrame, n: int = NUM_OF_SAMPLES, seed: int = SAMPLE_SEED) -> pd.DataFrame:
    """Return ``n`` rows drawn without replacement, or ``frame`` itself when it has fewer than ``n`` rows.

    Args:
        frame: Rows belonging to a single class.
        n: Number of rows to draw.
        seed: ``random_state`` passed to ``DataFrame.sample``.

    Returns:
        A random sample of exactly ``n`` rows if ``frame`` has at least ``n``
        rows; otherwise ``frame`` unchanged (all available rows, not shuffled).
    """
    if len(frame) < n:
        # Too few rows to sample without replacement: keep everything.
        return frame
    return frame.sample(n=n, replace=False, random_state=seed)


# One frame per class label.
df_benign = _rows_with_label('BENIGN')
df_syn = _rows_with_label('Syn')
df_portmap = _rows_with_label('Portmap')
df_netbios = _rows_with_label('NetBIOS')
df_ldap = _rows_with_label('LDAP')
df_mssql = _rows_with_label('MSSQL')
df_udp = _rows_with_label('UDP')
df_udplag = _rows_with_label('UDPLag')

# At most NUM_OF_SAMPLES rows of each class.
df_benign_sampled = _cap_rows(df_benign)
df_syn_sampled = _cap_rows(df_syn)
df_portmap_sampled = _cap_rows(df_portmap)
df_netbios_sampled = _cap_rows(df_netbios)
df_ldap_sampled = _cap_rows(df_ldap)
df_mssql_sampled = _cap_rows(df_mssql)
df_udp_sampled = _cap_rows(df_udp)
df_udplag_sampled = _cap_rows(df_udplag)



In [15]:
# Stitch the per-class samples together into a single dataframe.
# No ignore_index is passed, so each row keeps the index it had in its source frame.

df_sampled = pd.concat(
    (
        df_syn_sampled,
        df_benign_sampled,
        df_portmap_sampled,
        df_netbios_sampled,
        df_ldap_sampled,
        df_mssql_sampled,
        df_udp_sampled,
        df_udplag_sampled,
    )
)

In [16]:
# Check how many rows each class ended up with.
# Group on label_col (the same column the sampling step filtered on) rather
# than a hard-coded name, so the check always matches the sampling.

df_sampled.groupby(label_col).count()

Unnamed: 0_level_0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Inbound
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BENIGN,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000,...,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000
LDAP,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000,...,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000
MSSQL,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000,...,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000
NetBIOS,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000,...,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000
Portmap,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000,...,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000
Syn,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000,...,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000
UDP,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000,...,25000,25000,25000,25000,25000,25000,25000,25000,25000,25000
UDPLag,1873,1873,1873,1873,1873,1873,1873,1873,1873,1873,...,1873,1873,1873,1873,1873,1873,1873,1873,1873,1873


In [17]:
# Write the reduced dataset to a file; the per-class sample size is part of the file name.
# index=False leaves the dataframe's row index out of the CSV.

df_sampled.to_csv(f'../FilteredData/Filtered_{NUM_OF_SAMPLES}.csv', index=False)