# Trening

## Wczytanie danych treningowych


In [None]:
import pandas as pd
import os
from google.colab import drive
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Attack labels recognised in the CSV file names. Order matters: the loading
# loop takes the first entry that is a substring of the file name, so every
# "DDoS-*" label has to come before its "DoS-*" counterpart.
typelist = [
    "DDoS-ICMP",
    "MQTT-DoS-Connect",
    "MQTT-DDoS-Publish",
    "MQTT-DDoS-Connect",
    "MQTT-Malformed",
    "MQTT-DoS-Publish",
    "DDoS-SYN",
    "DDoS-TCP",
    "DDoS-UDP",
    "DoS-ICMP",
    "DoS-SYN",
    "DoS-TCP",
    "DoS-UDP",
    "OS_Scan",
    "Ping_Sweep",
    "Port_Scan",
    "VulScan",
]

# Mount Google Drive
drive.mount('/content/drive')

# Folder with the training CSV files.
# A single leading slash is used: a path starting with "//" is
# implementation-defined under POSIX and only happens to work on Linux.
folder_path = '/content/drive/MyDrive/csv_files/train'

# Load one CSV file and turn it into a model-ready frame
def preprocess_data(file_path):
    """Read a CSV file and apply the per-file preprocessing steps.

    Steps, each applied only when the relevant columns exist:
      1. drop the 'No.' and 'Info' columns,
      2. label-encode 'Source' and 'Destination',
      3. one-hot encode 'Protocol' (first category dropped),
      4. standardise 'IAT', 'AVG', 'Tot size', 'Duration', 'Rate', 'Srate'.

    Args:
        file_path: path of the CSV file to read.

    Returns:
        Tuple ``(frame, label_encoders, scaler)``: the processed DataFrame,
        a dict mapping column name to its fitted LabelEncoder, and the
        StandardScaler (left unfitted when none of the numeric columns exist).

    NOTE(review): the encoders and the scaler are created and fitted on this
    single file, so encoded/scaled values are not on a common scale across
    files - confirm this is intended.
    """
    frame = pd.read_csv(file_path)

    # Remove bookkeeping columns that are present in this file
    present_to_drop = [name for name in ('No.', 'Info') if name in frame.columns]
    frame = frame.drop(columns=present_to_drop, errors='ignore')

    # Integer-encode the categorical address columns
    label_encoders = {}
    for name in ('Source', 'Destination'):
        if name not in frame.columns:
            continue
        encoder = LabelEncoder()
        frame[name] = encoder.fit_transform(frame[name])
        label_encoders[name] = encoder

    # One-hot encode the 'Protocol' column
    if 'Protocol' in frame.columns:
        frame = pd.get_dummies(frame, columns=['Protocol'], drop_first=True)

    # Standardise the numeric columns found in this file
    scaler = StandardScaler()
    candidates = ('IAT', 'AVG', 'Tot size', 'Duration', 'Rate', 'Srate')
    numeric_columns = [name for name in candidates if name in frame.columns]
    if numeric_columns:
        frame[numeric_columns] = scaler.fit_transform(frame[numeric_columns])

    return frame, label_encoders, scaler

# Walk through every CSV file in the folder and keep one labelled frame per file
processed_data = {}
# os.listdir() returns names in arbitrary order; sort them so every run
# processes (and later concatenates) the files in the same order.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.csv'):
        continue
    file_path = os.path.join(folder_path, file_name)
    dataframe_name = os.path.splitext(file_name)[0]
    dataframe_name = dataframe_name.replace("TCP_IP-", "")
    dataframe_name = dataframe_name.replace(".pcap", "")
    try:
        # preprocess_data() reads the CSV itself, so the file is loaded only once
        df_cleaned, le, scaler = preprocess_data(file_path)

        # First label from typelist contained in the file name, or None
        attack_type = next((typ for typ in typelist if typ in dataframe_name), None)
        if attack_type is None:
            # Files without a recognised label are left out; report it
            # instead of claiming the file was processed.
            print(f"Plik {dataframe_name} pominięty: brak pasującego typu ataku w nazwie")
            continue

        print(dataframe_name)
        # Shape of the preprocessed frame, before the label column is added
        print(df_cleaned.shape)
        df_cleaned['Attack'] = pd.Series(attack_type, index=df_cleaned.index)
        processed_data[dataframe_name] = df_cleaned
        print(f"Plik {dataframe_name} przetworzony!")
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest
        print(f"Błąd przetwarzania pliku {dataframe_name}: {e}")




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
MQTT-DoS-Connect_Flood_train
(12773, 45)
Plik MQTT-DoS-Connect_Flood_train przetworzony!
MQTT-DDoS-Publish_Flood_train
(27623, 45)
Plik MQTT-DDoS-Publish_Flood_train przetworzony!
MQTT-DDoS-Connect_Flood_train
(173036, 45)
Plik MQTT-DDoS-Connect_Flood_train przetworzony!
MQTT-Malformed_Data_train
(5130, 45)
Plik MQTT-Malformed_Data_train przetworzony!
Recon-Ping_Sweep_train
(740, 45)
Plik Recon-Ping_Sweep_train przetworzony!
Recon-OS_Scan_train
(16832, 45)
Plik Recon-OS_Scan_train przetworzony!
Recon-VulScan_train
(2173, 45)
Plik Recon-VulScan_train przetworzony!
MQTT-DoS-Publish_Flood_train
(44376, 45)
Plik MQTT-DoS-Publish_Flood_train przetworzony!
Recon-Port_Scan_train
(83981, 45)
Plik Recon-Port_Scan_train przetworzony!
DDoS-ICMP1_train
(194938, 45)
Plik DDoS-ICMP1_train przetworzony!
DDoS-ICMP2_train
(194818, 45)
Plik DDoS-ICMP2_train przetworzony!
DDoS-

In [None]:
# Spot-check one preprocessed file: first rows of the 'DoS-UDP2_train' frame
cleaned_data = processed_data['DoS-UDP2_train']
print(cleaned_data.head())

   Header_Length  Protocol Type  Duration      Rate     Srate  Drate  \
0         2525.0           17.0 -0.083312 -0.166451 -0.166451    0.0   
1         7525.0           17.0 -0.083312 -0.126466 -0.126466    0.0   
2        12525.0           17.0 -0.083312 -0.125381 -0.125381    0.0   
3        17525.0           17.0 -0.083312 -0.113300 -0.113300    0.0   
4        22525.0           17.0 -0.083312 -0.178409 -0.178409    0.0   

   fin_flag_number  syn_flag_number  rst_flag_number  psh_flag_number  ...  \
0              0.0              0.0              0.0              0.0  ...   
1              0.0              0.0              0.0              0.0  ...   
2              0.0              0.0              0.0              0.0  ...   
3              0.0              0.0              0.0              0.0  ...   
4              0.0              0.0              0.0              0.0  ...   

   Std  Tot size        IAT  Number  Magnitue  Radius  Covariance  Variance  \
0  0.0 -0.093196  1

In [None]:
# List the columns of the preprocessed frame (including the added 'Attack' label)
print(cleaned_data.columns)

Index(['Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'Drate',
       'fin_flag_number', 'syn_flag_number', 'rst_flag_number',
       'psh_flag_number', 'ack_flag_number', 'ece_flag_number',
       'cwr_flag_number', 'ack_count', 'syn_count', 'fin_count', 'rst_count',
       'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP', 'UDP',
       'DHCP', 'ARP', 'ICMP', 'IGMP', 'IPv', 'LLC', 'Tot sum', 'Min', 'Max',
       'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue', 'Radius',
       'Covariance', 'Variance', 'Weight', 'Attack'],
      dtype='object')


In [None]:
# Empty placeholder for the combined training set; the next cell fills it from processed_data
train_dataset=pd.DataFrame()

In [None]:
# Build one dataset from all per-file frames with a single concat call.
# Concatenating inside a loop re-copies the accumulated frame on every step,
# and appending to the previous train_dataset duplicated rows on a re-run.
# Row labels restart at 0 for each file, as before (no index reset).
if processed_data:
    train_dataset = pd.concat(list(processed_data.values()))

In [None]:
# Preview the combined training set
train_dataset

Unnamed: 0,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,Attack
0,108.04,6.0,-0.068647,2.208988,2.208988,0.0,0.00,0.25,0.09,0.26,...,5.100001,0.328721,23.995741,9.500000,11.963216,7.214759,29.675934,0.90,141.550000,MQTT-DoS-Connect
1,153.48,6.0,-0.068647,-0.022753,-0.022753,0.0,0.03,0.17,0.21,0.27,...,4.996967,0.261181,-0.003672,9.500000,11.883601,7.076315,30.676690,0.87,141.550000,MQTT-DoS-Connect
2,197.02,6.0,-0.068647,2.202698,2.202698,0.0,0.04,0.18,0.14,0.26,...,4.749291,-0.200346,-0.003672,9.500000,11.741353,6.718575,28.271456,0.87,141.550000,MQTT-DoS-Connect
3,172.76,6.0,-0.068647,0.036225,0.036225,0.0,0.03,0.14,0.17,0.33,...,5.595000,0.309960,-0.003672,9.500000,11.945562,7.916080,38.082462,0.91,141.550000,MQTT-DoS-Connect
4,200.76,6.0,-0.068647,4.493931,4.493931,0.0,0.03,0.20,0.11,0.27,...,3.935597,-0.181585,-0.003672,9.500000,11.732178,5.540030,18.947666,0.92,141.550000,MQTT-DoS-Connect
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186017,0.00,1.0,-0.076816,-0.252885,-0.252885,0.0,0.00,0.00,0.00,0.00,...,0.000000,-0.083432,0.007955,9.500000,9.165151,0.000000,0.000000,0.00,141.550000,DDoS-ICMP
186018,0.00,1.0,-0.076816,-0.252885,-0.252885,0.0,0.00,0.00,0.00,0.00,...,0.000000,-0.083432,0.007955,9.500000,9.165151,0.000000,0.000000,0.00,141.550000,DDoS-ICMP
186019,0.00,1.0,-0.076816,-0.252885,-0.252885,0.0,0.00,0.00,0.00,0.00,...,0.000000,-0.083432,0.007955,9.500000,9.165151,0.000000,0.000000,0.00,141.550000,DDoS-ICMP
186020,0.00,1.0,-0.076816,-0.252885,-0.252885,0.0,0.00,0.00,0.00,0.00,...,0.000000,-0.083432,0.007955,9.500000,9.165151,0.000000,0.000000,0.00,141.550000,DDoS-ICMP


In [None]:
# Feature columns fed to the network (18 in total) and the target label
X = train_dataset[[
    'Protocol Type', 'Duration', 'Rate', 'Srate',
    'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
    'syn_count', 'fin_count', 'rst_count',
    'ICMP', 'TCP', 'UDP', 'ARP',
    'IAT', 'AVG', 'Tot size',
]]

Y = train_dataset['Attack']

In [None]:
# Preview the feature matrix
X

Unnamed: 0,Protocol Type,Duration,Rate,Srate,syn_flag_number,rst_flag_number,psh_flag_number,ack_flag_number,syn_count,fin_count,rst_count,ICMP,TCP,UDP,ARP,IAT,AVG,Tot size
0,6.0,-0.068647,2.208988,2.208988,0.25,0.09,0.26,0.83,0.35,0.11,1.26,0.0,1.0,0.0,0.0,23.995741,0.680263,0.328721
1,6.0,-0.068647,-0.022753,-0.022753,0.17,0.21,0.27,0.91,0.40,0.26,1.79,0.0,1.0,0.0,0.0,-0.003672,0.339305,0.261181
2,6.0,-0.068647,2.202698,2.202698,0.18,0.14,0.26,0.90,0.37,0.31,2.33,0.0,1.0,0.0,0.0,-0.003672,-0.269145,-0.200346
3,6.0,-0.068647,0.036225,0.036225,0.14,0.17,0.33,0.87,0.29,0.26,2.08,0.0,1.0,0.0,0.0,-0.003672,0.608221,0.309960
4,6.0,-0.068647,4.493931,4.493931,0.20,0.11,0.27,0.87,0.45,0.27,2.34,0.0,1.0,0.0,0.0,-0.003672,-0.315870,-0.181585
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186017,1.0,-0.076816,-0.252885,-0.252885,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.0,0.0,0.0,0.0,0.007955,-0.083043,-0.083432
186018,1.0,-0.076816,-0.252885,-0.252885,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.0,0.0,0.0,0.0,0.007955,-0.083043,-0.083432
186019,1.0,-0.076816,-0.252885,-0.252885,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.0,0.0,0.0,0.0,0.007955,-0.083043,-0.083432
186020,1.0,-0.076816,-0.252885,-0.252885,0.00,0.00,0.00,0.00,0.00,0.00,0.00,1.0,0.0,0.0,0.0,0.007955,-0.083043,-0.083432


In [None]:
# Preview the label column
Y

Unnamed: 0,Attack
0,MQTT-DoS-Connect
1,MQTT-DoS-Connect
2,MQTT-DoS-Connect
3,MQTT-DoS-Connect
4,MQTT-DoS-Connect
...,...
186017,DDoS-ICMP
186018,DDoS-ICMP
186019,DDoS-ICMP
186020,DDoS-ICMP


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler, OneHotEncoder




# Fully connected network (FCNN) definition.
# The input shape is declared with an explicit Input layer: passing
# `input_dim` to the first Dense layer of a Sequential model is deprecated
# in Keras 3 and raises a UserWarning.
model = Sequential([
    Input(shape=(18,)),  # 18 feature columns
    Dense(256, activation='relu'),  # first hidden layer
    Dropout(0.3),  # regularisation
    Dense(128, activation='relu'),  # second hidden layer
    Dropout(0.3),
    Dense(64, activation='relu'),  # third hidden layer
    Dense(17, activation='softmax')  # output layer, 17 classes
])


# Focal cross-entropy on one-hot targets; accuracy tracked as the metric
model.compile(optimizer='adam', loss='categorical_focal_crossentropy', metrics=['accuracy'])


model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
import tensorflow as tf
# Report how many GPUs TensorFlow can see, then list every physical device
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
tf.config.list_physical_devices()

Num GPUs Available:  0


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [None]:
import tensorflow as tf
# Record the TensorFlow version used for this run
print(tf.__version__)

2.17.1


In [None]:

# Encode the attack names as integer class ids
label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(Y)

# Show the class-name -> class-id mapping
print({name: idx for idx, name in enumerate(label_encoder.classes_)})

{'DDoS-ICMP': 0, 'DDoS-SYN': 1, 'DDoS-TCP': 2, 'DDoS-UDP': 3, 'DoS-ICMP': 4, 'DoS-SYN': 5, 'DoS-TCP': 6, 'DoS-UDP': 7, 'MQTT-DDoS-Connect': 8, 'MQTT-DDoS-Publish': 9, 'MQTT-DoS-Connect': 10, 'MQTT-DoS-Publish': 11, 'MQTT-Malformed': 12, 'OS_Scan': 13, 'Ping_Sweep': 14, 'Port_Scan': 15, 'VulScan': 16}


In [None]:
from tensorflow.keras.utils import to_categorical

# Number of classes
num_classes = len(label_encoder.classes_)
# One-hot encode the labels because the categorical cross-entropy loss expects that format
# Convert to one-hot
Y_one_hot = to_categorical(Y_encoded, num_classes=num_classes)

In [None]:
# Preview the one-hot label matrix
Y_one_hot

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Total number of elements in the one-hot matrix (rows * classes)
Y_one_hot.size

118184884

In [None]:
# (number of samples, number of classes)
Y_one_hot.shape

(6952052, 17)

In [None]:
# Same preview of the one-hot label matrix as in the earlier cell
(Y_one_hot)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

## Trening modelu

In [None]:
import numpy as np
from tensorflow.keras.utils import Sequence

class DataGenerator(Sequence):
    """Keras Sequence that serves (features, labels) mini-batches from memory.

    Args:
        X: feature table; a pandas DataFrame or anything ``np.asarray`` accepts.
        Y: label array aligned row-for-row with ``X``.
        batch_size: number of samples per batch (the last one may be smaller).
        shuffle: when True (default) the sample order is randomised once at
            construction and again after every epoch.
        **kwargs: forwarded to ``Sequence.__init__``.

    Raises:
        ValueError: if ``X`` and ``Y`` do not have the same number of rows.
    """

    def __init__(self, X, Y, batch_size, shuffle=True, **kwargs):
        # Keras 3 expects subclasses to call the base constructor
        super().__init__(**kwargs)
        # np.asarray handles both DataFrames and plain arrays
        self.X = np.asarray(X)
        self.Y = np.asarray(Y)
        if len(self.X) != len(self.Y):
            raise ValueError(
                f"X and Y must have the same length, got {len(self.X)} and {len(self.Y)}"
            )
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indexes = np.arange(len(self.X))  # sample order shared by X and Y
        if self.shuffle:
            # Shuffle before the first epoch too: the training frame is a
            # concatenation of single-attack files, so without this every
            # batch of the first epoch would contain a single class.
            self.indexes = np.random.permutation(len(self.X))

    def __len__(self):
        # Number of batches per epoch
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, idx):
        # Slice the index array for batch `idx`; the last batch is clipped
        start_idx = idx * self.batch_size
        end_idx = min(start_idx + self.batch_size, len(self.X))

        batch_indexes = self.indexes[start_idx:end_idx]
        return self.X[batch_indexes], self.Y[batch_indexes]

    def on_epoch_end(self):
        # Draw a new random sample order for the next epoch
        if self.shuffle:
            self.indexes = np.random.permutation(len(self.X))

# Prepare the batch generator
batch_size = 1024
data_gen = DataGenerator(X, Y_one_hot, batch_size)

# Train the model from the generator.
# NOTE(review): no validation data is passed; the captured log shows training
# accuracy peaking around epoch 9 and dropping afterwards - consider a
# validation split with early stopping.
history = model.fit(data_gen, epochs=50, verbose=2)

Epoch 1/50
6790/6790 - 118s - 17ms/step - accuracy: 0.9011 - loss: 0.0187
Epoch 2/50
6790/6790 - 122s - 18ms/step - accuracy: 0.9780 - loss: 0.0057
Epoch 3/50
6790/6790 - 143s - 21ms/step - accuracy: 0.9812 - loss: 0.0051
Epoch 4/50
6790/6790 - 139s - 21ms/step - accuracy: 0.9821 - loss: 0.0048
Epoch 5/50
6790/6790 - 142s - 21ms/step - accuracy: 0.9823 - loss: 0.0049
Epoch 6/50
6790/6790 - 141s - 21ms/step - accuracy: 0.9837 - loss: 0.0044
Epoch 7/50
6790/6790 - 141s - 21ms/step - accuracy: 0.9836 - loss: 0.0045
Epoch 8/50
6790/6790 - 140s - 21ms/step - accuracy: 0.9838 - loss: 0.0045
Epoch 9/50
6790/6790 - 121s - 18ms/step - accuracy: 0.9851 - loss: 0.0041
Epoch 10/50
6790/6790 - 118s - 17ms/step - accuracy: 0.9841 - loss: 0.0042
Epoch 11/50
6790/6790 - 141s - 21ms/step - accuracy: 0.9813 - loss: 0.0051
Epoch 12/50
6790/6790 - 145s - 21ms/step - accuracy: 0.9779 - loss: 0.0053
Epoch 13/50
6790/6790 - 142s - 21ms/step - accuracy: 0.9720 - loss: 0.0054
Epoch 14/50
6790/6790 - 141s - 21m

In [None]:
# Persist the trained weights so they can be reloaded into a fresh model
model.save_weights('model.weights.h5')

## Ewaluacja modelu

## Określenie, które wartości są dobrze przewidziane

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda, Flatten, Dropout,LeakyReLU
from tensorflow.keras.models import Model
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Rebuild the same FCNN architecture and load the trained weights into it.
# The input shape is declared with an explicit Input layer: passing
# `input_dim` to the first Dense layer of a Sequential model is deprecated
# in Keras 3 and raises a UserWarning.
model2 = Sequential([
    Input(shape=(18,)),  # 18 feature columns
    Dense(256, activation='relu'),  # first hidden layer
    Dropout(0.3),  # regularisation
    Dense(128, activation='relu'),  # second hidden layer
    Dropout(0.3),
    Dense(64, activation='relu'),  # third hidden layer
    Dense(17, activation='softmax')  # output layer, 17 classes
])
# NOTE(review): the training model is compiled with
# 'categorical_focal_crossentropy', so the loss reported by model2.evaluate()
# is not comparable with the loss in the training log; accuracy is.
model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model2.load_weights('model.weights.h5')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
# Evaluate the reloaded model on the training generator (the data it was trained on)
train_loss, train_accuracy = model2.evaluate(data_gen)
print(f"Zbiór treningowy: Strata: {train_loss}, Dokładność: {train_accuracy}")

[1m6790/6790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 7ms/step - accuracy: 0.9566 - loss: 0.1034
Zbiór treningowy: Strata: 0.10340826213359833, Dokładność: 0.9565373063087463


In [None]:
class PredictionDataGenerator(Sequence):
    """Keras Sequence that serves feature-only batches, in row order.

    Args:
        X: feature table; a pandas DataFrame or anything ``np.asarray`` accepts.
        batch_size: number of rows per batch (the last one may be smaller).
        **kwargs: forwarded to ``Sequence.__init__``.
    """

    def __init__(self, X, batch_size, **kwargs):
        # Keras 3 expects subclasses to call the base constructor
        super().__init__(**kwargs)
        # np.asarray handles both DataFrames and plain arrays
        self.X = np.asarray(X)
        self.batch_size = batch_size
        # Identity order, never shuffled, so batch rows follow the rows of X
        self.indexes = np.arange(len(self.X))

    def __len__(self):
        # Number of batches needed to cover X
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, idx):
        # Slice the index array for batch `idx`; the last batch is clipped
        start_idx = idx * self.batch_size
        end_idx = min(start_idx + self.batch_size, len(self.X))

        batch_indexes = self.indexes[start_idx:end_idx]
        return self.X[batch_indexes]  # features only, no labels

In [None]:
# Create the prediction generator in this cell so it runs on a fresh kernel;
# `dt_gen` is not defined by any earlier cell in the notebook.
dt_gen = PredictionDataGenerator(X, batch_size)
kolumna = model.predict(dt_gen)

In [None]:
# Model output for the sample at index 1
kolumna[1]

In [None]:
# Recover the integer class id of every sample from its one-hot row
y_class=np.argmax(Y_one_hot,axis=1)

In [None]:
# Preview the true class ids
y_class

In [None]:


# Convert the model outputs to predicted class ids (index of the largest value per row).
# Despite the variable name, these are multi-class ids, not binary labels.
binary_predictions = np.argmax(kolumna, axis=1)

In [None]:
# Preview the predicted class ids
binary_predictions

In [None]:
# Element-wise difference; non-zero entries mark misclassified samples
y_class-binary_predictions

In [None]:
# Positions where the predicted class differs from the true class.
# Vectorised comparison instead of a Python loop over every sample;
# .tolist() keeps the result a plain list of ints, as before.
bad_predictions = np.flatnonzero(y_class != binary_predictions).tolist()

In [None]:
# Number of misclassified samples
len(bad_predictions)

In [None]:
# Positions where the predicted class equals the true class.
# Vectorised comparison instead of a Python loop over every sample;
# .tolist() keeps the result a plain list of ints, as before.
god_predictions = np.flatnonzero(y_class == binary_predictions).tolist()


In [None]:

    # Number of correctly classified samples
    len(god_predictions)


$$\frac{6656093}{6656093 + 295959} \cdot 100\% \approx 95{,}7\%$$

co równa się około 95,7% skuteczności