Импорты

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

Загрузка датасета для классификации

In [2]:
df_heart = pd.read_csv('heart.csv')
X_cls = df_heart.drop('target', axis=1)
y_cls = df_heart['target']

Разделение на тестовые и тренировочные

In [3]:
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_cls, y_cls, test_size=0.2, random_state=42, stratify=y_cls
)

Загрузка датасета для регрессии

In [4]:
df_energy = pd.read_csv('energy_efficiency_data.csv')
X_reg = df_energy.drop(['Heating_Load', 'Cooling_Load'], axis=1)
y_reg = df_energy['Heating_Load']

Определение признаков

In [5]:
num_features = ['Relative_Compactness', 'Surface_Area', 'Wall_Area', 'Roof_Area', 'Overall_Height']
cat_features = ['Orientation', 'Glazing_Area', 'Glazing_Area_Distribution']

Разделение на тестовые и тренировочные

In [6]:
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

KNN классификатор

In [7]:
class MyKNNClassifier:
    def __init__(self, k=5):
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = np.array(X)
        self.y_train = np.array(y)
        return self

    def _predict_single(self, x):
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        k_nearest_idx = np.argpartition(distances, self.k)[:self.k]
        k_nearest_labels = self.y_train[k_nearest_idx]
        values, counts = np.unique(k_nearest_labels, return_counts=True)
        return values[np.argmax(counts)]

    def predict(self, X):
        return np.array([self._predict_single(x) for x in X])

KNN регрессор

In [8]:
class MyKNNRegressor:
    def __init__(self, k=5):
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = np.array(X)
        self.y_train = np.array(y)
        return self

    def _predict_single(self, x):
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        k_nearest_idx = np.argpartition(distances, self.k)[:self.k]
        return np.mean(self.y_train[k_nearest_idx])

    def predict(self, X):
        return np.array([self._predict_single(x) for x in X])

Метрики для классификации

In [9]:
def evaluate_classification(y_true, y_pred):
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
        'ROC-AUC': roc_auc_score(y_true, y_pred)
    }

Метрики для регрессии

In [10]:
def evaluate_regression(y_true, y_pred):
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'R2': r2_score(y_true, y_pred)
    }

Запуск базовой модели KNN sklearn (классификация)

In [11]:
k_base = 5
sk_cls_base = KNeighborsClassifier(n_neighbors=k_base)
sk_cls_base.fit(X_train_cls, y_train_cls)
y_pred_cls_sk_base = sk_cls_base.predict(X_test_cls)

Запуск кастомной базовой модели KNN (классификация)

In [12]:
custom_cls_base = MyKNNClassifier(k=k_base)
custom_cls_base.fit(X_train_cls.values, y_train_cls.values)
y_pred_cls_custom_base = custom_cls_base.predict(X_test_cls.values)

Вывод метрик классификации

In [13]:
print("Sklearn:", evaluate_classification(y_test_cls, y_pred_cls_sk_base))
print("Custom :", evaluate_classification(y_test_cls, y_pred_cls_custom_base))

Sklearn: {'Accuracy': 0.697560975609756, 'F1': 0.6990291262135923, 'ROC-AUC': 0.6978571428571428}
Custom : {'Accuracy': 0.697560975609756, 'F1': 0.6990291262135923, 'ROC-AUC': 0.6978571428571428}


Запуск базовой модели KNN sklearn (регрессия)

In [14]:
sk_reg_base = KNeighborsRegressor(n_neighbors=k_base)
sk_reg_base.fit(X_train_reg, y_train_reg)
y_pred_reg_sk_base = sk_reg_base.predict(X_test_reg)

Запуск кастомной базовой модели KNN (регрессия)

In [15]:
custom_reg_base = MyKNNRegressor(k=k_base)
custom_reg_base.fit(X_train_reg.values, y_train_reg.values)
y_pred_reg_custom_base = custom_reg_base.predict(X_test_reg.values)

Вывод метрик регрессии

In [16]:
print("Sklearn:", evaluate_regression(y_test_reg, y_pred_reg_sk_base))
print("Custom :", evaluate_regression(y_test_reg, y_pred_reg_custom_base))

Sklearn: {'MAE': 1.5660519480519488, 'RMSE': np.float64(2.210241192691452), 'R2': 0.9531315984331779}
Custom : {'MAE': 1.5574155844155846, 'RMSE': np.float64(2.2115974422921654), 'R2': 0.95307406194929}


Применим улучшения

Скейлинг

In [17]:
scaler_cls = StandardScaler()
X_train_cls_scaled = scaler_cls.fit_transform(X_train_cls)
X_test_cls_scaled = scaler_cls.transform(X_test_cls)

Подбор k

In [18]:
param_grid = {'n_neighbors': list(range(3, 16, 2))}
grid_cls = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='f1')
grid_cls.fit(X_train_cls_scaled, y_train_cls)
best_k_cls = grid_cls.best_params_['n_neighbors']

Запуск улучшенной модели KNN sklearn (классификация)

In [19]:
sk_cls_opt = KNeighborsClassifier(n_neighbors=best_k_cls)
sk_cls_opt.fit(X_train_cls_scaled, y_train_cls)
y_pred_cls_sk_opt = sk_cls_opt.predict(X_test_cls_scaled)

Запуск кастомной улучшенной модели KNN (классификация)

In [20]:
custom_cls_opt = MyKNNClassifier(k=best_k_cls)
custom_cls_opt.fit(X_train_cls_scaled, y_train_cls)
y_pred_cls_custom_opt = custom_cls_opt.predict(X_test_cls_scaled)

Вывод метрик классификации

In [21]:
print("Sklearn:", evaluate_classification(y_test_cls, y_pred_cls_sk_opt))
print("Custom :", evaluate_classification(y_test_cls, y_pred_cls_custom_opt))

Sklearn: {'Accuracy': 0.9463414634146341, 'F1': 0.9483568075117371, 'ROC-AUC': 0.9459523809523809}
Custom : {'Accuracy': 0.9463414634146341, 'F1': 0.9483568075117371, 'ROC-AUC': 0.9459523809523809}


Подготовка данных регресии

In [22]:
preprocessor_reg = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), cat_features)
    ]
)

Повторное разделение на тестовые и тренировочные

In [23]:
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

Применение препроцессинга

In [24]:
X_train_reg_prep = preprocessor_reg.fit_transform(X_train_reg)
X_test_reg_prep = preprocessor_reg.transform(X_test_reg)

Подбор k по MAE

In [25]:
param_grid = {'n_neighbors': list(range(3, 16, 2))}
grid_reg = GridSearchCV(
    KNeighborsRegressor(),
    param_grid,
    cv=5,
    scoring='neg_mean_absolute_error'
)
grid_reg.fit(X_train_reg_prep, y_train_reg)
best_k_reg = grid_reg.best_params_['n_neighbors']

Запуск улучшенной модели KNN sklearn (регрессия)

In [26]:
sk_reg_opt = KNeighborsRegressor(n_neighbors=best_k_reg)
sk_reg_opt.fit(X_train_reg_prep, y_train_reg)
y_pred_sk = sk_reg_opt.predict(X_test_reg_prep)

Запуск кастомной улучшенной модели KNN (регрессия)

In [27]:
custom_reg_opt = MyKNNRegressor(k=best_k_reg)
custom_reg_opt.fit(X_train_reg_prep, y_train_reg)
y_pred_custom = custom_reg_opt.predict(X_test_reg_prep)

Вывод метрик регрессии

In [28]:
print("Sklearn:", evaluate_regression(y_test_reg, y_pred_sk))
print("Custom :", evaluate_regression(y_test_reg, y_pred_custom))

Sklearn: {'MAE': 1.492844155844156, 'RMSE': np.float64(2.2953609398427615), 'R2': 0.9494521394288211}
Custom : {'MAE': 1.42174025974026, 'RMSE': np.float64(2.2894307124289623), 'R2': 0.949712989979414}
