# Импорты

In [7]:
import os
import random
import numpy as np
import pandas as pd
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from tqdm.notebook import tqdm

Зафиксируем random_seed

In [2]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

set_seed(42)

Random seed set as 42


# 1 Загрузка датасета

In [3]:
df = pd.read_csv('cybersequrity.csv', sep=',')
df.head()

Unnamed: 0,svcscan.fs_drivers,callbacks.ngeneric,psxview.not_in_eprocess_pool_false_avg,psxview.not_in_eprocess_pool,callbacks.nanonymous,psxview.not_in_session,psxview.not_in_pslist,psxview.not_in_pspcid_list,psxview.not_in_ethread_pool,psxview.not_in_csrss_handles,psxview.not_in_pslist_false_avg,psxview.not_in_pspcid_list_false_avg,psxview.not_in_deskthrd,psxview.not_in_ethread_pool_false_avg,psxview.not_in_session_false_avg,Class
0,26,8,0.0,0,0,2,0,0,0,4,0.0,0.0,6,0.0,0.044444,1
1,26,8,0.0,0,0,5,3,3,3,7,0.073171,0.073171,9,0.073171,0.121951,1
2,26,8,0.0,0,0,9,7,7,7,11,0.152174,0.152174,13,0.152174,0.195652,1
3,26,8,0.0,0,0,3,1,1,2,6,0.022222,0.022222,9,0.044444,0.066667,1
4,26,8,0.0,0,0,2,0,0,0,4,0.0,0.0,6,0.0,0.04878,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 16 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   svcscan.fs_drivers                      10000 non-null  int64  
 1   callbacks.ngeneric                      10000 non-null  int64  
 2   psxview.not_in_eprocess_pool_false_avg  10000 non-null  float64
 3   psxview.not_in_eprocess_pool            10000 non-null  int64  
 4   callbacks.nanonymous                    10000 non-null  int64  
 5   psxview.not_in_session                  10000 non-null  int64  
 6   psxview.not_in_pslist                   10000 non-null  int64  
 7   psxview.not_in_pspcid_list              10000 non-null  int64  
 8   psxview.not_in_ethread_pool             10000 non-null  int64  
 9   psxview.not_in_csrss_handles            10000 non-null  int64  
 10  psxview.not_in_pslist_false_avg         10000 non-null  flo

In [5]:
df.shape

(10000, 16)

# 2 Подготовка данных к обучению

Разделим датасет на обучающую, валидационную и тестовую выборку со стратификацией. Целевая переменная — `air_quality`

In [6]:
# Размеры выборок
train_size = 0.6
val_size = 0.2
test_size = 0.2

# Разделение признаков и целевой переменной
X = df.drop(columns=['Class'])
y = df['Class']

# Разбивка
X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=train_size, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=(test_size / (test_size + val_size)), random_state=42, stratify=y_temp)

# Выведем результат
print(f'Входное количество ({1.0*100}%): {df.shape[0]}')
print(f'Тренировачный размер ({train_size*100}%): {X_train.shape[0]}')
print(f'Валидационный размер ({val_size*100}%): {X_val.shape[0]}')
print(f'Тестовый размер ({test_size*100}%): {X_test.shape[0]}')

Входное количество (100.0%): 10000
Тренировачный размер (60.0%): 6000
Валидационный размер (20.0%): 2000
Тестовый размер (20.0%): 2000


Стандартизируем признаки

In [8]:
# Инициализируем StandardScaler
scaler = StandardScaler()

# Обучаем на обучающей выборке и применяем к обучающим данным
X_train_scaled = scaler.fit_transform(X_train)

# Применяем тот же scaler к валидационным и тестовым данным
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

Создадим свой `Dateset`

In [9]:
class CustomDataset(Dataset):
    def __init__(self, X, y): 
        """Преобразование из DataFrames в Tensors"""
        self.X = torch.Tensor(X.values)
        self.y = torch.from_numpy(y.values).reshape(-1, 1).float()
    
    def __len__(self):
        """Возвращает длину тензора"""
        return self.X.shape[0]
    
    def __getitem__(self, index):
        """Возвращает 1 строку данных признаки-таргет"""
        return (self.X[index], self.y[index])

Инициализируем датасеты

In [10]:
train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)
test_dataset = CustomDataset(X_test, y_test)

Инициализируем `DataLoader`'s. Пока примем `batch_size` за 32

In [11]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)
val_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 3 BaselineModel MLP

Создадим BaselineModel

In [15]:
class BaselineModel(nn.Module):
    def __init__(self, inputs_size, hidden_size, out_size, hidden_count):
        super(BaselineModel, self).__init__()
        self.linear_in = nn.Linear(inputs_size, hidden_size)
        self.linear_hiddens = nn.ModuleList()
        for _ in range(hidden_count):
            self.linear_hiddens.append(nn.Linear(hidden_size, hidden_size))
        self.linear_out = nn.Linear(hidden_size, out_size)

    def forward(self, x):
        x = self.linear_in(x)
        x = torch.relu(x)
        for linear_hidden in self.linear_hiddens:
            x = linear_hidden(x)
            x = torch.relu(x)
        x = self.linear_out(x)
        y_pred = torch.sigmoid(x)
        return y_pred

Зададим метод переброса на доступный device

In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Создадим объект модели

In [28]:
model = BaselineModel(inputs_size=15, hidden_size=200, out_size=1, hidden_count=4)
model.to(device)

BaselineModel(
  (linear_in): Linear(in_features=15, out_features=200, bias=True)
  (linear_hiddens): ModuleList(
    (0-3): 4 x Linear(in_features=200, out_features=200, bias=True)
  )
  (linear_out): Linear(in_features=200, out_features=1, bias=True)
)

# 4 Метод обучения

In [17]:
def train_loop(model, train_dataloader, val_dataloader, epochs=10, lr=0.001):
    model.to(device)
    criterion = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        # Обучающая часть цикла
        for batch in (tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs} [Training]') 
                      if ((epoch+1) % 100 == 0 or epoch == 0) else train_dataloader):
            X_batch, y_batch = batch
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()  # Обнуляем градиенты
            y_pred = model(X_batch)  # Предсказание модели
            loss = criterion(y_pred, y_batch)  # Вычисляем ошибку
            loss.backward()  # Обратное распространение ошибки
            optimizer.step()  # Шаг оптимизации
            train_loss += loss.item() * X_batch.size(0)  # Учитываем потерю для этой мини-партии
        train_loss /= len(train_dataloader.dataset)  # Средняя ошибка по всему набору
        
        # Валидационная часть цикла
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in (tqdm(val_dataloader, desc=f'Epoch {epoch+1}/{epochs} [Training]') 
                          if ((epoch+1) % 100 == 0 or epoch == 0) else val_dataloader):
                X_batch, y_batch = batch
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                y_pred = model(X_batch)  # Предсказание модели
                loss = criterion(y_pred, y_batch)  # Вычисляем ошибку
                val_loss += loss.item() * X_batch.size(0)  # Учитываем потерю для этой мини-партии
        val_loss /= len(val_dataloader.dataset)  # Средняя ошибка по всему набору
        if (epoch+1) % 100 == 0  or epoch == 0:
            print(f'Epoch [{epoch+1}/{epochs}] - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}')
    print('Обучение завершено!')

# 5 Обучение и качество BaselineModel

In [29]:
%%time
train_loop(model, train_dataloader, val_dataloader, 200, 0.001)

Epoch 1/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 1/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [1/200] - Train Loss: 0.6925 - Val Loss: 0.6919


Epoch 100/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 100/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [100/200] - Train Loss: 0.6434 - Val Loss: 0.6419


Epoch 200/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 200/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [200/200] - Train Loss: 0.6273 - Val Loss: 0.6246
Обучение завершено!
CPU times: total: 9.72 s
Wall time: 1min 21s


In [32]:
def evaluate_model(model, test_dataloader, threshold=0.5):
    model.to(device)
    model.eval()
    all_preds = []  # Для хранения всех предсказанных меток
    all_labels = []  # Для хранения всех истинных меток
    with torch.no_grad():
        for batch in test_dataloader:
            X_batch, y_batch = batch
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred_probs = model(X_batch)  
            y_pred_labels = (y_pred_probs >= threshold).float()
            all_preds.extend(y_pred_labels.cpu().numpy().flatten())
            all_labels.extend(y_batch.cpu().numpy().flatten())
    report = classification_report(all_labels, all_preds, target_names=['class 0', 'class 1'])
    print(Counter(all_preds))
    print("\nClassification Report:\n")
    print(report)

In [33]:
evaluate_model(model, val_dataloader)

Counter({np.float32(0.0): 1088, np.float32(1.0): 912})

Classification Report:

              precision    recall  f1-score   support

     class 0       0.62      0.68      0.65       981
     class 1       0.66      0.59      0.62      1019

    accuracy                           0.64      2000
   macro avg       0.64      0.64      0.64      2000
weighted avg       0.64      0.64      0.63      2000



# 6 Улучшенная модель

Соберём новую модель с Dropout и Batchnorm

In [36]:
class BaselineModelUpgrade(nn.Module):
    def __init__(self, inputs_size, hidden_size, out_size, hidden_count, dropout_p=0.5):
        super(BaselineModelUpgrade, self).__init__()
        self.linear_in = nn.Linear(inputs_size, hidden_size)
        self.batchnorm_in = nn.BatchNorm1d(hidden_size)  # BatchNorm после первого линейного слоя
        self.dropout_in = nn.Dropout(dropout_p)  # Dropout после BatchNorm

        self.linear_hiddens = nn.ModuleList()
        self.batchnorm_hiddens = nn.ModuleList()
        self.dropout_hiddens = nn.ModuleList()

        for _ in range(hidden_count):
            self.linear_hiddens.append(nn.Linear(hidden_size, hidden_size))
            self.batchnorm_hiddens.append(nn.BatchNorm1d(hidden_size))
            self.dropout_hiddens.append(nn.Dropout(dropout_p))
            
        self.linear_out = nn.Linear(hidden_size, out_size)

    def forward(self, x):
        x = self.linear_in(x)
        x = self.batchnorm_in(x)
        x = torch.relu(x)
        x = self.dropout_in(x)
        for linear_hidden, batchnorm_hidden, dropout_hidden in zip(self.linear_hiddens, self.batchnorm_hiddens, self.dropout_hiddens):
            x = linear_hidden(x)
            x = batchnorm_hidden(x)
            x = torch.relu(x)
            x = dropout_hidden(x)
        x = self.linear_out(x)
        y_pred = torch.sigmoid(x)
        return y_pred

Инициализируем новую модель

In [37]:
model_upgrade = BaselineModelUpgrade(inputs_size=15, hidden_size=200, out_size=1, hidden_count=4)
model_upgrade.to(device)

BaselineModelUpgrade(
  (linear_in): Linear(in_features=15, out_features=200, bias=True)
  (batchnorm_in): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout_in): Dropout(p=0.5, inplace=False)
  (linear_hiddens): ModuleList(
    (0-3): 4 x Linear(in_features=200, out_features=200, bias=True)
  )
  (batchnorm_hiddens): ModuleList(
    (0-3): 4 x BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (dropout_hiddens): ModuleList(
    (0-3): 4 x Dropout(p=0.5, inplace=False)
  )
  (linear_out): Linear(in_features=200, out_features=1, bias=True)
)

Обучим новую модель

In [39]:
%%time
train_loop(model_upgrade, train_dataloader, val_dataloader, 200, 0.001)

Epoch 1/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 1/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [1/200] - Train Loss: 0.7270 - Val Loss: 0.6958


Epoch 100/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 100/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [100/200] - Train Loss: 0.6699 - Val Loss: 0.6697


Epoch 200/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 200/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [200/200] - Train Loss: 0.6606 - Val Loss: 0.6551
Обучение завершено!
CPU times: total: 23.2 s
Wall time: 2min 54s


Проверим качество новой модели

In [40]:
evaluate_model(model_upgrade, val_dataloader)

Counter({np.float32(0.0): 1011, np.float32(1.0): 989})

Classification Report:

              precision    recall  f1-score   support

     class 0       0.59      0.61      0.60       981
     class 1       0.61      0.59      0.60      1019

    accuracy                           0.60      2000
   macro avg       0.60      0.60      0.60      2000
weighted avg       0.60      0.60      0.60      2000



Заметим, что f1-score стал поменьше

## Эксперименты с параметром p

Поробуем изменить параметр p

In [41]:
for p in [0.1, 0.3, 0.5, 0.7, 0.9]:
    print(f'Проверяем с p={p}')
    model_upgrade = BaselineModelUpgrade(inputs_size=15, hidden_size=200, out_size=1, hidden_count=4, dropout_p=p)
    model_upgrade.to(device)
    train_loop(model_upgrade, train_dataloader, val_dataloader, 200, 0.001)
    evaluate_model(model_upgrade, val_dataloader)

Проверяем с p=0.1


Epoch 1/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 1/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [1/200] - Train Loss: 0.6932 - Val Loss: 0.6706


Epoch 100/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 100/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [100/200] - Train Loss: 0.6198 - Val Loss: 0.6077


Epoch 200/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 200/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [200/200] - Train Loss: 0.6113 - Val Loss: 0.6313
Обучение завершено!
Counter({np.float32(0.0): 1007, np.float32(1.0): 993})

Classification Report:

              precision    recall  f1-score   support

     class 0       0.65      0.67      0.66       981
     class 1       0.67      0.66      0.66      1019

    accuracy                           0.66      2000
   macro avg       0.66      0.66      0.66      2000
weighted avg       0.66      0.66      0.66      2000

Проверяем с p=0.3


Epoch 1/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 1/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [1/200] - Train Loss: 0.7094 - Val Loss: 0.6868


Epoch 100/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 100/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [100/200] - Train Loss: 0.6465 - Val Loss: 0.6306


Epoch 200/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 200/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [200/200] - Train Loss: 0.6361 - Val Loss: 0.6292
Обучение завершено!
Counter({np.float32(0.0): 1121, np.float32(1.0): 879})

Classification Report:

              precision    recall  f1-score   support

     class 0       0.62      0.70      0.66       981
     class 1       0.67      0.58      0.62      1019

    accuracy                           0.64      2000
   macro avg       0.64      0.64      0.64      2000
weighted avg       0.64      0.64      0.64      2000

Проверяем с p=0.5


Epoch 1/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 1/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [1/200] - Train Loss: 0.7254 - Val Loss: 0.6923


Epoch 100/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 100/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [100/200] - Train Loss: 0.6674 - Val Loss: 0.6709


Epoch 200/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 200/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [200/200] - Train Loss: 0.6570 - Val Loss: 0.6565
Обучение завершено!
Counter({np.float32(0.0): 1205, np.float32(1.0): 795})

Classification Report:

              precision    recall  f1-score   support

     class 0       0.58      0.71      0.63       981
     class 1       0.64      0.50      0.56      1019

    accuracy                           0.60      2000
   macro avg       0.61      0.60      0.60      2000
weighted avg       0.61      0.60      0.60      2000

Проверяем с p=0.7


Epoch 1/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 1/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [1/200] - Train Loss: 0.7528 - Val Loss: 0.6932


Epoch 100/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 100/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [100/200] - Train Loss: 0.6908 - Val Loss: 0.6956


Epoch 200/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 200/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [200/200] - Train Loss: 0.6872 - Val Loss: 0.7035
Обучение завершено!
Counter({np.float32(1.0): 1997, np.float32(0.0): 3})

Classification Report:

              precision    recall  f1-score   support

     class 0       0.00      0.00      0.00       981
     class 1       0.51      1.00      0.67      1019

    accuracy                           0.51      2000
   macro avg       0.25      0.50      0.34      2000
weighted avg       0.26      0.51      0.34      2000

Проверяем с p=0.9


Epoch 1/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 1/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [1/200] - Train Loss: 0.8328 - Val Loss: 0.6935


Epoch 100/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 100/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [100/200] - Train Loss: 0.6949 - Val Loss: 0.6932


Epoch 200/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch 200/200 [Training]:   0%|          | 0/63 [00:00<?, ?it/s]

Epoch [200/200] - Train Loss: 0.6953 - Val Loss: 0.6937
Обучение завершено!
Counter({np.float32(1.0): 2000})

Classification Report:

              precision    recall  f1-score   support

     class 0       0.00      0.00      0.00       981
     class 1       0.51      1.00      0.68      1019

    accuracy                           0.51      2000
   macro avg       0.25      0.50      0.34      2000
weighted avg       0.26      0.51      0.34      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Видно, что с увеличением параметра `p` f1-score для `class 0` вовсе не считается. Делаем вывод, что для текущей задачи подходит небольшое значение `p`.