# Импорты

In [1]:
import os
import random
import numpy as np
import pandas as pd
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tqdm.notebook import tqdm

Зафиксируем random_seed

In [2]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

set_seed(42)

Random seed set as 42


# 1 Загрузка данных

In [3]:
df = pd.read_csv('air_quality.csv', sep=',')
df.head()

Unnamed: 0,Temperature,Humidity,PM2.5,PM10,NO2,SO2,CO,Proximity_to_Industrial_Areas,Population_Density,air_quality
0,29.8,59.1,5.2,17.9,18.9,9.2,1.72,6.3,319,1
1,28.3,75.6,2.3,12.2,30.8,9.7,1.64,6.0,611,1
2,23.1,74.7,26.7,33.8,24.4,12.6,1.63,5.2,619,1
3,27.1,39.1,6.1,6.3,13.5,5.3,1.15,11.1,551,1
4,26.5,70.7,6.9,16.0,21.9,5.6,1.01,12.7,303,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Temperature                    5000 non-null   float64
 1   Humidity                       5000 non-null   float64
 2   PM2.5                          5000 non-null   float64
 3   PM10                           5000 non-null   float64
 4   NO2                            5000 non-null   float64
 5   SO2                            5000 non-null   float64
 6   CO                             5000 non-null   float64
 7   Proximity_to_Industrial_Areas  5000 non-null   float64
 8   Population_Density             5000 non-null   int64  
 9   air_quality                    5000 non-null   int64  
dtypes: float64(8), int64(2)
memory usage: 390.8 KB


In [5]:
df.shape

(5000, 10)

# 2 Подготовка данных

Разделим датасет на обучающую, валидационную и тестовую выборку со стратификацией. Целевая переменная — `air_quality`

In [6]:
# Размеры выборок
train_size = 0.6
val_size = 0.2
test_size = 0.2

# Разделение признаков и целевой переменной
X = df.drop(columns=['air_quality'])
y = df['air_quality']

# Разбивка
X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=train_size, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=(test_size / (test_size + val_size)), random_state=42, stratify=y_temp)

# Выведем результат
print(f'Входное количество ({1.0*100}%): {df.shape[0]}')
print(f'Тренировачный размер ({train_size*100}%): {X_train.shape[0]}')
print(f'Валидационный размер ({val_size*100}%): {X_val.shape[0]}')
print(f'Тестовый размер ({test_size*100}%): {X_test.shape[0]}')

Входное количество (100.0%): 5000
Тренировачный размер (60.0%): 3000
Валидационный размер (20.0%): 1000
Тестовый размер (20.0%): 1000


Создадим свой `Dateset`

In [7]:
class CustomDataset(Dataset):
    def __init__(self, X, y): 
        """Преобразование из DataFrames в Tensors"""
        self.X = torch.Tensor(X.values)
        self.y = torch.from_numpy(y.values).reshape(-1, 1).float()
    
    def __len__(self):
        """Возвращает длину тензора"""
        return self.X.shape[0]
    
    def __getitem__(self, index):
        """Возвращает 1 строку данных признаки-таргет"""
        return (self.X[index], self.y[index])

Инициализируем датасеты

In [8]:
train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)
test_dataset = CustomDataset(X_test, y_test)

Инициализируем `DataLoader`'s. Пока примем `batch_size` за 32

In [9]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)
val_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 3 Нейросетевая модель

Создадим шаблон нашей кастомной модели

In [10]:
class CustomModel(nn.Module):
    def __init__(self, inputs_size, hidden_size, out_size, hidden_count):
        super(CustomModel, self).__init__()
        self.linear_in = nn.Linear(inputs_size, hidden_size)
        self.linear_hiddens = nn.ModuleList()
        for _ in range(hidden_count):
            self.linear_hiddens.append(nn.Linear(hidden_size, hidden_size))
        self.linear_out = nn.Linear(hidden_size, out_size)

    def forward(self, x):
        x = self.linear_in(x)
        x = torch.relu(x)
        for linear_hidden in self.linear_hiddens:
            x = linear_hidden(x)
            x = torch.relu(x)
        x = self.linear_out(x)
        y_pred = torch.sigmoid(x)
        return y_pred

Зададим метод переброса на доступный device

In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Создадим объект модели

In [66]:
model = CustomModel(inputs_size=9, hidden_size=100, out_size=1, hidden_count=4)
model.to(device)

CustomModel(
  (linear_in): Linear(in_features=9, out_features=100, bias=True)
  (linear_hiddens): ModuleList(
    (0-3): 4 x Linear(in_features=100, out_features=100, bias=True)
  )
  (linear_out): Linear(in_features=100, out_features=1, bias=True)
)

# 4 Шаблон обучения модели

In [61]:
def train_loop(model, train_dataloader, val_dataloader, epochs=10, lr=0.001):
    model.to(device)
    criterion = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        # Обучающая часть цикла
        for batch in (tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs} [Training]') 
                      if ((epoch+1) % 100 == 0 or epoch == 0) else train_dataloader):
            X_batch, y_batch = batch
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()  # Обнуляем градиенты
            y_pred = model(X_batch)  # Предсказание модели
            loss = criterion(y_pred, y_batch)  # Вычисляем ошибку
            loss.backward()  # Обратное распространение ошибки
            optimizer.step()  # Шаг оптимизации
            train_loss += loss.item() * X_batch.size(0)  # Учитываем потерю для этой мини-партии
        train_loss /= len(train_dataloader.dataset)  # Средняя ошибка по всему набору
        
        # Валидационная часть цикла
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in (tqdm(val_dataloader, desc=f'Epoch {epoch+1}/{epochs} [Training]') 
                          if ((epoch+1) % 100 == 0 or epoch == 0) else val_dataloader):
                X_batch, y_batch = batch
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                y_pred = model(X_batch)  # Предсказание модели
                loss = criterion(y_pred, y_batch)  # Вычисляем ошибку
                val_loss += loss.item() * X_batch.size(0)  # Учитываем потерю для этой мини-партии
        val_loss /= len(val_dataloader.dataset)  # Средняя ошибка по всему набору
        if (epoch+1) % 100 == 0  or epoch == 0:
            print(f'Epoch [{epoch+1}/{epochs}] - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}')
    print('Обучение завершено!')

# 5 Обучение модели

В качестве оптимизатора был выбран SGD

In [67]:
%%time
train_loop(model, train_dataloader, val_dataloader, 2000, 0.001)

Epoch 1/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 1/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [1/2000] - Train Loss: 0.6446 - Val Loss: 0.6326


Epoch 100/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 100/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [100/2000] - Train Loss: 0.3244 - Val Loss: 0.2866


Epoch 200/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 200/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [200/2000] - Train Loss: 0.2608 - Val Loss: 0.2343


Epoch 300/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 300/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [300/2000] - Train Loss: 0.2470 - Val Loss: 0.2175


Epoch 400/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 400/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [400/2000] - Train Loss: 0.2316 - Val Loss: 0.3249


Epoch 500/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 500/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [500/2000] - Train Loss: 0.2118 - Val Loss: 0.1936


Epoch 600/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 600/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [600/2000] - Train Loss: 0.1872 - Val Loss: 0.1850


Epoch 700/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 700/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [700/2000] - Train Loss: 0.1762 - Val Loss: 0.1709


Epoch 800/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 800/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [800/2000] - Train Loss: 0.1513 - Val Loss: 0.1946


Epoch 900/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 900/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [900/2000] - Train Loss: 0.1454 - Val Loss: 0.1706


Epoch 1000/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 1000/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [1000/2000] - Train Loss: 0.1492 - Val Loss: 0.1427


Epoch 1100/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 1100/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [1100/2000] - Train Loss: 0.1302 - Val Loss: 0.1810


Epoch 1200/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 1200/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [1200/2000] - Train Loss: 0.1051 - Val Loss: 0.5229


Epoch 1300/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 1300/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [1300/2000] - Train Loss: 0.1035 - Val Loss: 0.1372


Epoch 1400/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 1400/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [1400/2000] - Train Loss: 0.1394 - Val Loss: 0.1813


Epoch 1500/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 1500/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [1500/2000] - Train Loss: 0.0960 - Val Loss: 0.1742


Epoch 1600/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 1600/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [1600/2000] - Train Loss: 0.0839 - Val Loss: 0.1340


Epoch 1700/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 1700/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [1700/2000] - Train Loss: 0.0780 - Val Loss: 0.1663


Epoch 1800/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 1800/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [1800/2000] - Train Loss: 0.0593 - Val Loss: 0.1342


Epoch 1900/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 1900/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [1900/2000] - Train Loss: 0.0552 - Val Loss: 0.1461


Epoch 2000/2000 [Training]:   0%|          | 0/94 [00:00<?, ?it/s]

Epoch 2000/2000 [Training]:   0%|          | 0/32 [00:00<?, ?it/s]

Epoch [2000/2000] - Train Loss: 0.0616 - Val Loss: 0.1372
Обучение завершено!
CPU times: total: 45.3 s
Wall time: 5min 30s


In [30]:
def evaluate_model(model, test_dataloader, threshold=0.5):
    model.to(device)
    model.eval()
    all_preds = []  # Для хранения всех предсказанных меток
    all_labels = []  # Для хранения всех истинных меток
    with torch.no_grad():
        for batch in test_dataloader:
            X_batch, y_batch = batch
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred_probs = model(X_batch)  
            y_pred_labels = (y_pred_probs >= threshold).float()
            all_preds.extend(y_pred_labels.cpu().numpy().flatten())
            all_labels.extend(y_batch.cpu().numpy().flatten())
    report = classification_report(all_labels, all_preds, target_names=['class 0', 'class 1'])
    print(Counter(all_preds))
    print("\nClassification Report:\n")
    print(report)

In [68]:
evaluate_model(model, val_dataloader)

Counter({np.float32(1.0): 721, np.float32(0.0): 279})

Classification Report:

              precision    recall  f1-score   support

     class 0       0.94      0.87      0.91       300
     class 1       0.95      0.98      0.96       700

    accuracy                           0.94      1000
   macro avg       0.94      0.92      0.93      1000
weighted avg       0.94      0.94      0.94      1000



По ходу выполнения ноутбука — менял следующие параметры:
- Количество эпох обучения
- Количество скрытых слоёв
- Размер скрытых слоёв

Сейчас подобраны оптимальные настройки