In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from deslib.des import DESKNN, KNORAE, KNORAU, KNOP, DESP
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# грузим
print("Processing HIGGS dataset...")
try:
    higgs = fetch_openml(name='HIGGS', version=1, as_frame=False)
except Exception as e:
    raise ValueError("Error downloading the HIGGS dataset. Check its availability on OpenML.") from e

X_higgs = higgs.data
y_higgs = pd.to_numeric(higgs.target, errors='coerce').astype(int)

# очистка данных от NaN значений для корректной работы
X_higgs_df = pd.DataFrame(X_higgs)

X_higgs_df = X_higgs_df.apply(pd.to_numeric, errors='coerce')  # в числовой формат

# убираем строки с NaN значениями из X и y
mask_higgs = ~pd.isnull(y_higgs)  # делаем маску для строк с NaN в y
X_higgs_clean = X_higgs_df[mask_higgs]  # убираем строки с NaN из X
y_higgs_clean = y_higgs[mask_higgs]  # убираем строки с NaN из y

X_higgs_clean = X_higgs_clean.dropna(axis=0)

y_higgs_clean = y_higgs_clean[X_higgs_clean.index]

X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(
    X_higgs_clean, y_higgs_clean, test_size=0.2, random_state=42
)

print("Processing Wine Quality – Red dataset...")
try:
    wine = fetch_openml(name='wine-quality-red', version=1, as_frame=True)
except Exception as e:
    raise ValueError("Error downloading the Wine Quality – Red dataset. Check its availability on OpenML.") from e

df_wine = wine.frame.copy()
df_wine['target'] = df_wine['class']
X_wine = df_wine.drop(columns=['class', 'target'])
y_wine = df_wine['target']

X_wine_df = pd.DataFrame(X_wine)  # преобразую, чтобы дальше использовать apply

X_wine_df = X_wine_df.apply(pd.to_numeric, errors='coerce')  # в числовой формат

# убираем строки с NaN значениями из X и y
mask_wine = ~pd.isnull(y_wine)  # делаем маску для строк с NaN в y
X_wine_clean = X_wine_df[mask_wine]  # убираем строки с NaN из X
y_wine_clean = y_wine[mask_wine]  # убираем строки с NaN из y

X_wine_clean = X_wine_clean.dropna(axis=0)

y_wine_clean = y_wine_clean[X_wine_clean.index]

X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(
    X_wine_clean, y_wine_clean, test_size=0.2, random_state=42
)

y_train_w, y_test_w = y_train_w.to_numpy().astype(float), y_test_w.to_numpy().astype(float)


Processing HIGGS dataset...
Processing Wine Quality – Red dataset...


In [8]:
# обучаем классификаторы
svc_model_higgs = SVC(probability=True, random_state=42)  # SVC для ансамбля
svc_model_higgs.fit(X_train_h, y_train_h)

In [9]:
knn_model_higgs = KNeighborsClassifier(n_neighbors=5)  # настройка KNeighborsClassifier
knn_model_higgs.fit(X_train_h, y_train_h)

In [10]:
pool_classifiers_higgs = [knn_model_higgs, svc_model_higgs]

In [12]:
# Параметры для перебора
k_values = [5, 7]
metrics = ['DF', 'Q']
voting_methods = ['hard', 'soft']
knn_metrics = ['minkowski', 'mahalanobis']

best_accuracy = 0
best_mse = float('inf')  # Изначально лучшее MSE бесконечно большое
best_params = {}

# Перебор параметров в цикле
for k in k_values:
    for metric in metrics:
        for voting in voting_methods:
            for knn_metric in knn_metrics:
                try:
                    # Создаем модель DESKNN с текущими параметрами
                    desknn_model = DESKNN(pool_classifiers=pool_classifiers_higgs,
                                          k=k,
                                          metric=metric,
                                          voting=voting,
                                          knn_metric=knn_metric)

                    # Обучаем модель
                    desknn_model.fit(X_train_h, y_train_h)

                    # Оценка accuracy и MSE для HIGGS
                    y_pred_higgs = desknn_model.predict(X_test_h)
                    accuracy = accuracy_score(y_test_h, y_pred_higgs)
                    mse = mean_squared_error(y_test_h, y_pred_higgs)

                    # Сохраняем лучшие параметры для accuracy и MSE
                    if accuracy > best_accuracy:
                        best_accuracy = accuracy
                        best_params['accuracy'] = {
                            'k': k,
                            'metric': metric,
                            'voting': voting,
                            'knn_metric': knn_metric
                        }

                    if mse < best_mse:
                        best_mse = mse
                        best_params['mse'] = {
                            'k': k,
                            'metric': metric,
                            'voting': voting,
                            'knn_metric': knn_metric
                        }

                except ValueError as e:
                    print(f"Warning: ValueError with parameters k={k}, metric={metric}, "
                          f"voting={voting}, knn_metric={knn_metric}: {e}")
                except Exception as e:
                    print(f"Warning: Unexpected error with parameters k={k}, metric={metric}, "
                          f"voting={voting}, knn_metric={knn_metric}: {e}")

# Печать лучших параметров и результатов
if best_params:
    print(f"Best Accuracy Parameters: {best_params['accuracy']}")
    print(f"Best Accuracy: {best_accuracy:.4f}")
    print(f"Best MSE Parameters: {best_params['mse']}")
    print(f"Best MSE: {best_mse:.4f}")
else:
    print("No valid parameters found.")

Best Accuracy: 0.6302
Best Accuracy Parameters: {'k': 7, 'metric': 'DF', 'voting': 'hard', 'knn_metric': 'minkowski'}
Best MSE: 0.3698
Best MSE Parameters: {'k': 7, 'metric': 'DF', 'voting': 'hard', 'knn_metric': 'minkowski'}


In [15]:
from sklearn.model_selection import GridSearchCV

# Параметры для перебора для каждого метода
param_grid_knorae = {
    'k': [5, 7],
    'pct_accuracy': [0.3, 0.5, 0.7],
    'pct_diversity': [0.3, 0.5],
}

param_grid_knorau = {
    'k': [5, 7],
    'pct_accuracy': [0.3, 0.5, 0.7],
    'pct_diversity': [0.3, 0.5],
}

param_grid_knop = {
    'k': [5, 7],
    'metric': ['df', 'q', 'ratio'],
    'with_IH': [True, False],
}

param_grid_desp = {
    'k': [5, 7],
    'metric': ['df', 'q', 'ratio'],
    'voting': ['hard', 'soft'],
}

# Используем GridSearchCV для каждого метода
def grid_search_for_model(model_class, param_grid):
    model = model_class(pool_classifiers=pool_classifiers_higgs)  # Создаем модель с пулом классификаторов
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
    grid_search.fit(X_train_h, y_train_h)
    return grid_search.best_estimator_, grid_search.best_params_

# Перебор для KNORAE
best_knorae_model, best_knorae_params = grid_search_for_model(KNORAE, param_grid_knorae)

# Перебор для KNORAU
best_knorau_model, best_knorau_params = grid_search_for_model(KNORAU, param_grid_knorau)

# Перебор для KNOP
best_knop_model, best_knop_params = grid_search_for_model(KNOP, param_grid_knop)

# Перебор для DESP
best_desp_model, best_desp_params = grid_search_for_model(DESP, param_grid_desp)

# Оценка accuracy и MSE для лучших моделей
models = [best_knorae_model, best_knorau_model, best_knop_model, best_desp_model]
for model in models:
    y_pred_higgs = model.predict(X_test_h)
    accuracy = accuracy_score(y_test_h, y_pred_higgs)
    mse = mean_squared_error(y_test_h, y_pred_higgs)
    print(f"Model: {model.__class__.__name__} - Accuracy: {accuracy:.4f}, MSE: {mse:.4f}")

Model: KNORAE - Accuracy: 0.8329, MSE: 0.2671
Model: KNORAU - Accuracy: 0.6215, MSE: 0.3779
Model: KNOP - Accuracy: 0.6332, MSE: 0.3673
Model: DESP - Accuracy: 0.5410, MSE: 0.4317


Про методы:
desknn использует динамический выбор ансамбля на основе как точности, так и разнообразия классификаторов
knorae выбирает классификаторы, основываясь исключительно на их точности в соответствующем регионе компетенции. Он дает лучший результт на датасете higgs, тк он хорошо подходит для датасетов с множеством признаков
knorau кроме точности учитывает также неопределенность классификаторов
knop выбирает классификаторы, основываясь на их разнообразии
desp использует динамический выбор ансамбля с обрезкой (pruning) для удаления неэффективных классификаторов

In [16]:
knn_model_wine = KNeighborsClassifier(n_neighbors=5)  # настройка KNeighborsClassifier
knn_model_wine.fit(X_train_h, y_train_h)

In [17]:
svc_model_wine = SVC(probability=True, random_state=42)  # SVC для ансамбля
svc_model_wine.fit(X_train_h, y_train_h)

In [19]:
pool_classifiers_wine = [knn_model_wine, svc_model_wine]

param_grid_knorae = {
    'k': [5, 7],
    'pct_accuracy': [0.3, 0.5, 0.7],
    'pct_diversity': [0.3, 0.5],
}

param_grid_knorau = {
    'k': [5, 7],
    'pct_accuracy': [0.3, 0.5, 0.7],
    'pct_diversity': [0.3, 0.5],
}

param_grid_knop = {
    'k': [5, 7],
    'metric': ['df', 'q', 'ratio'],
    'with_IH': [True, False],
}

param_grid_desp = {
    'k': [5, 7],
    'metric': ['df', 'q', 'ratio'],
    'voting': ['hard', 'soft'],
}

param_grid_desknn = {
    'k': [5, 7],
    'metric': ['euclidean', 'manhattan'],
}

# Используем GridSearchCV для каждого метода
def grid_search_for_model(model_class, param_grid):
    model = model_class(pool_classifiers=pool_classifiers_wine)  # Создаем модель с пулом классификаторов
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
    grid_search.fit(X_train_w, y_train_w)
    return grid_search.best_estimator_, grid_search.best_params_

# Перебор для KNORAE
best_knorae_model, best_knorae_params = grid_search_for_model(KNORAE, param_grid_knorae)

# Перебор для KNORAU
best_knorau_model, best_knorau_params = grid_search_for_model(KNORAU, param_grid_knorau)

# Перебор для KNOP
best_knop_model, best_knop_params = grid_search_for_model(KNOP, param_grid_knop)

# Перебор для DESP
best_desp_model, best_desp_params = grid_search_for_model(DESP, param_grid_desp)

# Перебор для DESKNN
best_desknn_model, best_desknn_params = grid_search_for_model(DESKNN, param_grid_desknn)

# Оценка accuracy и MSE для лучших моделей
models = [best_desknn_model, best_knorae_model, best_knorau_model, best_knop_model, best_desp_model]
for model in models:
    y_pred_wine = model.predict(X_test_w)
    accuracy = accuracy_score(y_test_w, y_pred_wine)
    mse = mean_squared_error(y_test_w, y_pred_wine)
    print(f"Model: {model.__class__.__name__} - Accuracy: {accuracy:.4f}, MSE: {mse:.4f}")

Model: DESKNN - Accuracy: 0.7917, MSE: 0.2861
Model: KNORAE - Accuracy: 0.7519, MSE: 0.2245
Model: KNORAU - Accuracy: 0.6621, MSE: 0.3491
Model: KNOP - Accuracy: 0.6484, MSE: 0.3816
Model: DESP - Accuracy: 0.6612, MSE: 0.3375


Тут лучший результат у desknn, тк использует динамический выбор классификаторов