In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import numpy as np

df = pd.read_csv('ml_dataset.csv')

df['d12'] = df['d12'].str.replace(',', '.').astype(float)

df

Unnamed: 0,sd_d_in,sd_d_ex,sd_m_2,d12,N1,N2,time1,time2
0,0.01,0.01,0.01,0.0001,96,1610,1188000,0
1,0.01,0.03,0.01,0.0001,89,1541,729000,0
2,0.01,0.05,0.01,0.0001,92,1517,399000,0
3,0.01,0.07,0.01,0.0001,97,1448,363000,0
4,0.01,0.09,0.01,0.0001,95,1578,471000,0
...,...,...,...,...,...,...,...,...
16589,0.11,0.09,0.14,0.0007,97,1817,273000,0
16590,0.11,0.11,0.14,0.0007,97,1915,294000,0
16591,0.11,0.13,0.14,0.0007,90,1893,207000,0
16592,0.11,0.15,0.14,0.0007,90,1881,192000,0


In [None]:
df.loc[(df['N1'] < 100) & (df['N2'] > 1000), 'N1'] = 0
df.loc[(df['N2'] < 100) & (df['N1'] > 1000), 'N2'] = 0
df

Unnamed: 0,sd_d_in,sd_d_ex,sd_m_2,d12,N1,N2,time1,time2
0,0.01,0.01,0.01,0.0001,0,1610,1188000,0
1,0.01,0.03,0.01,0.0001,0,1541,729000,0
2,0.01,0.05,0.01,0.0001,0,1517,399000,0
3,0.01,0.07,0.01,0.0001,0,1448,363000,0
4,0.01,0.09,0.01,0.0001,0,1578,471000,0
...,...,...,...,...,...,...,...,...
16589,0.11,0.09,0.14,0.0007,0,1817,273000,0
16590,0.11,0.11,0.14,0.0007,0,1915,294000,0
16591,0.11,0.13,0.14,0.0007,0,1893,207000,0
16592,0.11,0.15,0.14,0.0007,0,1881,192000,0


Рассмотрим бинарную классификацию

In [None]:
conditions = [
    (df['N1'] < 100) | (df['N2'] < 100),
    (df['N1'] >= 100) & (df['N2'] >= 100)
]
choices = [0, 1]
df['extinction_status'] = np.select(conditions, choices)

X = df[['sd_d_in', 'sd_d_ex', 'sd_m_2', 'd12']]
y = df['extinction_status']
y.value_counts()

0    14473
1     2121
Name: extinction_status, dtype: int64

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Создание модели логистической регрессии
logreg = LogisticRegression()

# Определение параметров для GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['saga'],
    'l1_ratio': [0.25, 0.5, 0.75],
    'class_weight': [None, 'balanced']
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train_scaled, y_train)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Best parameters: {'C': 10, 'class_weight': None, 'l1_ratio': 0.25, 'penalty': 'l1', 'solver': 'saga'}
Accuracy: 0.9234709249774028
Confusion Matrix:
 [[2802   77]
 [ 177  263]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.96      2879
           1       0.77      0.60      0.67       440

    accuracy                           0.92      3319
   macro avg       0.86      0.79      0.82      3319
weighted avg       0.92      0.92      0.92      3319



In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Создание модели логистической регрессии
logreg = LogisticRegression()

# Определение параметров для GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'class_weight': [None, 'balanced']
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train_scaled, y_train)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Best parameters: {'C': 1000, 'class_weight': None, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.9234709249774028
Confusion Matrix:
 [[2802   77]
 [ 177  263]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.96      2879
           1       0.77      0.60      0.67       440

    accuracy                           0.92      3319
   macro avg       0.86      0.79      0.82      3319
weighted avg       0.92      0.92      0.92      3319



Попытаемся лучше

Попробуем теперь oversampling

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Создание модели логистической регрессии
logreg = LogisticRegression()

# Определение параметров для GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['saga'],
    'l1_ratio': [0.25, 0.5, 0.75],
    'class_weight': [None, 'balanced']
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='f1_macro')


oversampler = SMOTE()
X_train_over, y_train_over = oversampler.fit_resample(X_train_scaled, y_train)

# Обучение модели на сбалансированных данных
grid_search.fit(X_train_over, y_train_over)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Best parameters: {'C': 10, 'class_weight': None, 'l1_ratio': 0.25, 'penalty': 'l2', 'solver': 'saga'}
Accuracy: 0.8586923772220548
Confusion Matrix:
 [[2456  423]
 [  46  394]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.85      0.91      2879
           1       0.48      0.90      0.63       440

    accuracy                           0.86      3319
   macro avg       0.73      0.87      0.77      3319
weighted avg       0.92      0.86      0.87      3319



In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Создание модели логистической регрессии
logreg = LogisticRegression()

# Определение параметров для GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'class_weight': [None, 'balanced']
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='f1_macro')


oversampler = SMOTE()
X_train_over, y_train_over = oversampler.fit_resample(X_train_scaled, y_train)

# Обучение модели на сбалансированных данных
grid_search.fit(X_train_over, y_train_over)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Best parameters: {'C': 0.1, 'class_weight': None, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.8577884905091895
Confusion Matrix:
 [[2453  426]
 [  46  394]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.85      0.91      2879
           1       0.48      0.90      0.63       440

    accuracy                           0.86      3319
   macro avg       0.73      0.87      0.77      3319
weighted avg       0.92      0.86      0.87      3319



Теперь попробуем undersampling

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Применение undersampling к обучающим данным
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_scaled, y_train)

# Создание модели логистической регрессии
logreg = LogisticRegression()

# Определение параметров для GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['saga'],
    'l1_ratio': [0.25, 0.5, 0.75],
    'class_weight': [None, 'balanced']
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train_resampled, y_train_resampled)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Best parameters: {'C': 0.1, 'class_weight': None, 'l1_ratio': 0.75, 'penalty': 'elasticnet', 'solver': 'saga'}
Accuracy: 0.8547755347996384
Confusion Matrix:
 [[2443  436]
 [  46  394]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.85      0.91      2879
           1       0.47      0.90      0.62       440

    accuracy                           0.85      3319
   macro avg       0.73      0.87      0.77      3319
weighted avg       0.91      0.85      0.87      3319



In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Применение undersampling к обучающим данным
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_scaled, y_train)

# Создание модели логистической регрессии
logreg = LogisticRegression()

# Определение параметров для GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'class_weight': [None, 'balanced']
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train_resampled, y_train_resampled)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Best parameters: {'C': 0.1, 'class_weight': None, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 0.8538716480867731
Confusion Matrix:
 [[2438  441]
 [  44  396]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.85      0.91      2879
           1       0.47      0.90      0.62       440

    accuracy                           0.85      3319
   macro avg       0.73      0.87      0.76      3319
weighted avg       0.91      0.85      0.87      3319



Лучше не получилоcь

Рассмотрим kNN

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Создание модели kNN
knn = KNeighborsClassifier()

# Определение параметров для GridSearchCV
param_grid = {
    'n_neighbors': range(1, 31),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
[CV] END ...metric=euclidean, n_neighbors=1, weights=uniform; total time=   0.2s
[CV] END ...metric=euclidean, n_neighbors=1, weights=uniform; total time=   0.2s
[CV] END ...metric=euclidean, n_neighbors=1, weights=uniform; total time=   0.1s
[CV] END ...metric=euclidean, n_neighbors=1, weights=uniform; total time=   0.1s
[CV] END ...metric=euclidean, n_neighbors=1, weights=uniform; total time=   0.2s
[CV] END ..metric=euclidean, n_neighbors=1, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=1, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=1, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=1, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=1, weights=distance; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=2, weights=uniform; total time=   0.2s
[CV] END ...metric=euclidean, n_neighbors=2, w

Рассмотрим теперь SVM

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Создание модели SVM
svm = SVC(random_state=42)

# Определение параметров для GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly'],
    'degree': [2, 3],
    'gamma': ['scale', 'auto'],
    'class_weight': [None, 'balanced']
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=linear; total time=   2.1s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=linear; total time=   2.2s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=linear; total time=   3.7s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=linear; total time=   2.6s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=linear; total time=   2.6s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=poly; total time=   2.8s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=poly; total time=   3.2s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=poly; total time=   1.7s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=poly; total time=   1.1s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=poly; total time=   1.0s
[CV] END C=0.1, class_weight

KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Создание модели SVM
svm = SVC(random_state=42)

# Определение параметров для GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['rbf'],
    'gamma': ['scale', 'auto'],
    'class_weight': [None, 'balanced']
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ..C=0.1, class_weight=None, gamma=scale, kernel=rbf; total time=   1.9s
[CV] END ..C=0.1, class_weight=None, gamma=scale, kernel=rbf; total time=   2.3s
[CV] END ..C=0.1, class_weight=None, gamma=scale, kernel=rbf; total time=   1.8s
[CV] END ..C=0.1, class_weight=None, gamma=scale, kernel=rbf; total time=   1.1s
[CV] END ..C=0.1, class_weight=None, gamma=scale, kernel=rbf; total time=   1.0s
[CV] END ...C=0.1, class_weight=None, gamma=auto, kernel=rbf; total time=   1.0s
[CV] END ...C=0.1, class_weight=None, gamma=auto, kernel=rbf; total time=   1.0s
[CV] END ...C=0.1, class_weight=None, gamma=auto, kernel=rbf; total time=   1.0s
[CV] END ...C=0.1, class_weight=None, gamma=auto, kernel=rbf; total time=   1.1s
[CV] END ...C=0.1, class_weight=None, gamma=auto, kernel=rbf; total time=   1.0s
[CV] END C=0.1, class_weight=balanced, gamma=scale, kernel=rbf; total time=   2.0s
[CV] END C=0.1, class_weight=balanced, gamma=s

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Создание модели SVM
svm = SVC(random_state=42)

# Определение параметров для GridSearchCV
param_grid = {
    'C': [100, 1000, 10000],
    'kernel': ['rbf'],
    'gamma': ['scale', 'auto'],
    'class_weight': [None, 'balanced']
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ..C=100, class_weight=None, gamma=scale, kernel=rbf; total time=   1.1s
[CV] END ..C=100, class_weight=None, gamma=scale, kernel=rbf; total time=   0.9s
[CV] END ..C=100, class_weight=None, gamma=scale, kernel=rbf; total time=   0.9s
[CV] END ..C=100, class_weight=None, gamma=scale, kernel=rbf; total time=   0.9s
[CV] END ..C=100, class_weight=None, gamma=scale, kernel=rbf; total time=   1.0s
[CV] END ...C=100, class_weight=None, gamma=auto, kernel=rbf; total time=   0.9s
[CV] END ...C=100, class_weight=None, gamma=auto, kernel=rbf; total time=   1.0s
[CV] END ...C=100, class_weight=None, gamma=auto, kernel=rbf; total time=   0.9s
[CV] END ...C=100, class_weight=None, gamma=auto, kernel=rbf; total time=   1.0s
[CV] END ...C=100, class_weight=None, gamma=auto, kernel=rbf; total time=   0.9s
[CV] END C=100, class_weight=balanced, gamma=scale, kernel=rbf; total time=   1.6s
[CV] END C=100, class_weight=balanced, gamma=s

Используем oversampling

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Применение oversampling к обучающим данным
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_scaled, y_train)

# Создание модели SVM
svm = SVC(random_state=42)

# Определение параметров для GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly'],
    'degree': [2, 3],
    'gamma': ['scale', 'auto'],
    'class_weight': [None, 'balanced']
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train_resampled, y_train_resampled)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=linear; total time=  10.9s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=linear; total time=  10.1s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=linear; total time=  11.3s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=linear; total time=  11.0s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=linear; total time=   9.3s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=poly; total time=   9.1s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=poly; total time=   9.0s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=poly; total time=  10.0s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=poly; total time=  10.1s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=poly; total time=  10.1s
[CV] END C=0.1, class_weight

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Применение oversampling к обучающим данным
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_scaled, y_train)

# Создание модели SVM
svm = SVC(random_state=42)

# Определение параметров для GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['rbf'],
    'gamma': ['scale', 'auto'],
    'class_weight': [None, 'balanced']
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train_resampled, y_train_resampled)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Применение oversampling к обучающим данным
oversampler = SMOTE()
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_scaled, y_train)

# Создание модели SVM
svm = SVC(random_state=42)

# Определение параметров для GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly'],
    'degree': [2, 3],
    'gamma': ['scale', 'auto'],
    'class_weight': [None, 'balanced']
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train_resampled, y_train_resampled)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=linear; total time=   8.6s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=linear; total time=   6.6s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=linear; total time=   5.8s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=linear; total time=   6.6s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=linear; total time=   5.6s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=poly; total time=  10.7s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=poly; total time=   9.7s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=poly; total time=   9.3s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=poly; total time=   9.6s
[CV] END C=0.1, class_weight=None, degree=2, gamma=scale, kernel=poly; total time=   9.9s
[CV] END C=0.1, class_weight

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Применение oversampling к обучающим данным
oversampler = SMOTE()
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_scaled, y_train)

# Создание модели SVM
svm = SVC(random_state=42)

# Определение параметров для GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['rbf'],
    'gamma': ['scale', 'auto'],
    'class_weight': [None, 'balanced']
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train_resampled, y_train_resampled)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Используем undersampling

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Применение undersampling к обучающим данным
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_scaled, y_train)

# Создание модели SVM
svm = SVC(random_state=42)

# Определение параметров для GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto'],
    'class_weight': [None, 'balanced']
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train_resampled, y_train_resampled)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Best parameters: {'C': 100, 'class_weight': None, 'degree': 2, 'gamma': 'auto', 'kernel': 'rbf'}
Accuracy: 0.9255799939740886
Confusion Matrix:
 [[2638  241]
 [   6  434]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96      2879
           1       0.64      0.99      0.78       440

    accuracy                           0.93      3319
   macro avg       0.82      0.95      0.87      3319
weighted avg       0.95      0.93      0.93      3319



In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Применение undersampling к обучающим данным
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_scaled, y_train)

# Создание модели SVM
svm = SVC(random_state=42)

# Определение параметров для GridSearchCV
param_grid = {
    'C': [100, 1000, 10000, 100000],
    'kernel': ['rbf'],
    'gamma': ['auto'],
    'class_weight': [None]
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train_resampled, y_train_resampled)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ...C=100, class_weight=None, gamma=auto, kernel=rbf; total time=   0.3s
[CV] END ...C=100, class_weight=None, gamma=auto, kernel=rbf; total time=   0.2s
[CV] END ...C=100, class_weight=None, gamma=auto, kernel=rbf; total time=   0.3s
[CV] END ...C=100, class_weight=None, gamma=auto, kernel=rbf; total time=   0.2s
[CV] END ...C=100, class_weight=None, gamma=auto, kernel=rbf; total time=   0.3s
[CV] END ..C=1000, class_weight=None, gamma=auto, kernel=rbf; total time=   0.9s
[CV] END ..C=1000, class_weight=None, gamma=auto, kernel=rbf; total time=   0.9s
[CV] END ..C=1000, class_weight=None, gamma=auto, kernel=rbf; total time=   0.7s
[CV] END ..C=1000, class_weight=None, gamma=auto, kernel=rbf; total time=   0.4s
[CV] END ..C=1000, class_weight=None, gamma=auto, kernel=rbf; total time=   0.8s
[CV] END .C=10000, class_weight=None, gamma=auto, kernel=rbf; total time=   3.5s
[CV] END .C=10000, class_weight=None, gamma=auto,

Рассмотрим решающее дерево

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Создание модели решающего дерева
dt = DecisionTreeClassifier(random_state=42)

# Определение параметров для GridSearchCV
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 30, 100, 300, 1000],
    'min_samples_split': [2, 5, 10, 30, 100],
    'min_samples_leaf': [1, 2, 4, 20, 100]
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train_scaled, y_train)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test_scaled)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, ma

Расмммотрим теперь анасмбли деревьев

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Определение параметров для GridSearchCV
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [5, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Создание и обучение модели с GridSearchCV
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train, y_train)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.9s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.6s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.0s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.9s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   2.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   2.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   2.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   4.1s
[CV] END max_depth=5, min_samples_lea

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Определение параметров для GridSearchCV
param_grid = {
    'n_estimators': [500, 1500, 5000],
    'max_depth': [20],
    'min_samples_split': [5],
    'min_samples_leaf': [2]
}

# Создание и обучение модели с GridSearchCV
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train, y_train)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   3.4s
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   3.5s
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   3.0s
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   3.1s
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=500; total time=   3.7s
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=1500; total time=   9.3s
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=1500; total time=  10.0s
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=1500; total time=  10.0s
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=1500; total time=   9.9s
[CV] END max_depth=20, min_

Классификация градиентным бустингом

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Определение параметров для GridSearchCV
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 7, 11],
    'min_samples_split': [2, 6, 10],
    'min_samples_leaf': [1, 3, 5]
}

# Создание и обучение модели с GridSearchCV
gb = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(gb, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train, y_train)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.8s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.9s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Определение параметров для GridSearchCV
param_grid = {
    'n_estimators': [500, 1500, 4500],
    'learning_rate': [0.3, 1, 3],
    'max_depth': [1,2,3],
    'min_samples_split': [1, 2],
    'min_samples_leaf': [1]
}

# Создание и обучение модели с GridSearchCV
gb = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(gb, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train, y_train)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END learning_rate=0.3, max_depth=1, min_samples_leaf=1, min_samples_split=1, n_estimators=500; total time=   0.0s
[CV] END learning_rate=0.3, max_depth=1, min_samples_leaf=1, min_samples_split=1, n_estimators=500; total time=   0.0s
[CV] END learning_rate=0.3, max_depth=1, min_samples_leaf=1, min_samples_split=1, n_estimators=500; total time=   0.0s
[CV] END learning_rate=0.3, max_depth=1, min_samples_leaf=1, min_samples_split=1, n_estimators=500; total time=   0.0s
[CV] END learning_rate=0.3, max_depth=1, min_samples_leaf=1, min_samples_split=1, n_estimators=500; total time=   0.0s
[CV] END learning_rate=0.3, max_depth=1, min_samples_leaf=1, min_samples_split=1, n_estimators=1500; total time=   0.0s
[CV] END learning_rate=0.3, max_depth=1, min_samples_leaf=1, min_samples_split=1, n_estimators=1500; total time=   0.0s
[CV] END learning_rate=0.3, max_depth=1, min_samples_leaf=1, min_samples_split=1, n_estimators=1500; to

Наивный Байессовский классификатор

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Определение параметров для GridSearchCV
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]
}

# Создание и обучение модели с GridSearchCV
nb = GaussianNB()
grid_search = GridSearchCV(nb, param_grid, cv=5, scoring='f1_macro', verbose=2)
grid_search.fit(X_train, y_train)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Предсказание на тестовой выборке с использованием лучшей модели
y_pred = best_model.predict(X_test)

# Оценка качества лучшей модели
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV] END ................................var_smoothing=1e-09; total time=   0.0s
[CV] END ................................var_smoothing=1e-09; total time=   0.0s
[CV] END ................................var_smoothing=1e-09; total time=   0.0s
[CV] END ................................var_smoothing=1e-09; total time=   0.0s
[CV] END ................................var_smoothing=1e-09; total time=   0.0s
[CV] END ................................var_smoothing=1e-08; total time=   0.0s
[CV] END ................................var_smoothing=1e-08; total time=   0.0s
[CV] END ................................var_smoothing=1e-08; total time=   0.0s
[CV] END ................................var_smoothing=1e-08; total time=   0.0s
[CV] END ................................var_smoothing=1e-08; total time=   0.0s
[CV] END ................................var_smoothing=1e-07; total time=   0.0s
[CV] END ................................var_smo

С помощью нейросетей

In [None]:
!pip install keras==2.12.0

Collecting keras==2.12.0
  Downloading keras-2.12.0-py2.py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 2.15.0
    Uninstalling keras-2.15.0:
      Successfully uninstalled keras-2.15.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.15.0 requires keras<2.16,>=2.15.0, but you have keras 2.12.0 which is incompatible.[0m[31m
[0mSuccessfully installed keras-2.12.0


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import f1_score

# Функция для создания модели
def create_model(optimizer='adam', activation='relu', neurons=32):
    model = Sequential()
    model.add(Dense(neurons, input_dim=X_train.shape[1], activation=activation))
    model.add(Dense(neurons, activation=activation))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Создание обертки KerasClassifier
model = KerasClassifier(build_fn=create_model, verbose=2)

# Определение диапазонов гиперпараметров
param_grid = {
    'optimizer': ['adam', 'rmsprop'],
    'activation': ['relu', 'tanh'],
    'neurons': [16, 32, 64],
    'epochs': [100]
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='f1_macro', verbose=2)
grid_search.fit(X_train, y_train)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Оценка качества лучшей модели на тестовой выборке
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
Epoch 33/100
277/277 - 0s - loss: 0.2030 - accuracy: 0.9215 - 449ms/epoch - 2ms/step
Epoch 34/100
277/277 - 0s - loss: 0.2015 - accuracy: 0.9206 - 453ms/epoch - 2ms/step
Epoch 35/100
277/277 - 0s - loss: 0.1924 - accuracy: 0.9237 - 447ms/epoch - 2ms/step
Epoch 36/100
277/277 - 0s - loss: 0.1891 - accuracy: 0.9262 - 461ms/epoch - 2ms/step
Epoch 37/100
277/277 - 0s - loss: 0.1830 - accuracy: 0.9299 - 441ms/epoch - 2ms/step
Epoch 38/100
277/277 - 0s - loss: 0.1880 - accuracy: 0.9247 - 452ms/epoch - 2ms/step
Epoch 39/100
277/277 - 0s - loss: 0.1826 - accuracy: 0.9295 - 435ms/epoch - 2ms/step
Epoch 40/100
277/277 - 0s - loss: 0.1781 - accuracy: 0.9292 - 438ms/epoch - 2ms/step
Epoch 41/100
277/277 - 1s - loss: 0.1760 - accuracy: 0.9296 - 591ms/epoch - 2ms/step
Epoch 42/100
277/277 - 1s - loss: 0.1728 - accuracy: 0.9312 - 640ms/epoch - 2ms/step
Epoch 43/100
277/277 - 1s - loss: 0.1742 - accuracy: 0.9287 - 605ms/

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import f1_score

# Функция для создания модели
def create_model(optimizer='adam', activation='relu', neurons=32):
    model = Sequential()
    model.add(Dense(neurons, input_dim=X_train.shape[1], activation=activation))
    model.add(Dense(neurons, activation=activation))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Создание обертки KerasClassifier
model = KerasClassifier(build_fn=create_model, verbose=2)

# Определение диапазонов гиперпараметров
param_grid = {
    'optimizer': ['rmsprop'],
    'activation': ['relu'],
    'neurons': [64, 128, 256],
    'epochs': [100, 300, 1000]
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='f1_macro', verbose=2)
grid_search.fit(X_train, y_train)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Оценка качества лучшей модели на тестовой выборке
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
Epoch 511/1000
277/277 - 1s - loss: 0.1257 - accuracy: 0.9418 - 705ms/epoch - 3ms/step
Epoch 512/1000
277/277 - 1s - loss: 0.1272 - accuracy: 0.9400 - 726ms/epoch - 3ms/step
Epoch 513/1000
277/277 - 1s - loss: 0.1261 - accuracy: 0.9424 - 720ms/epoch - 3ms/step
Epoch 514/1000
277/277 - 1s - loss: 0.1255 - accuracy: 0.9403 - 706ms/epoch - 3ms/step
Epoch 515/1000
277/277 - 1s - loss: 0.1229 - accuracy: 0.9418 - 708ms/epoch - 3ms/step
Epoch 516/1000
277/277 - 1s - loss: 0.1229 - accuracy: 0.9436 - 670ms/epoch - 2ms/step
Epoch 517/1000
277/277 - 1s - loss: 0.1236 - accuracy: 0.9421 - 723ms/epoch - 3ms/step
Epoch 518/1000
277/277 - 1s - loss: 0.1249 - accuracy: 0.9421 - 716ms/epoch - 3ms/step
Epoch 519/1000
277/277 - 1s - loss: 0.1219 - accuracy: 0.9438 - 738ms/epoch - 3ms/step
Epoch 520/1000
277/277 - 1s - loss: 0.1238 - accuracy: 0.9427 - 691ms/epoch - 2ms/step
Epoch 521/1000
277/277 - 1s - loss: 0.1235 - acc

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import f1_score

# Функция для создания модели
def create_model(optimizer='adam', activation='relu', neurons=32):
    model = Sequential()
    model.add(Dense(neurons, input_dim=X_train.shape[1], activation=activation))
    model.add(Dense(neurons, activation=activation))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Создание обертки KerasClassifier
model = KerasClassifier(build_fn=create_model, verbose=2)

# Определение диапазонов гиперпараметров
param_grid = {
    'optimizer': ['rmsprop'],
    'activation': ['relu'],
    'neurons': [128],
    'epochs': [1000, 3000, 10000]
}

# Создание и обучение модели с GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='f1_macro', verbose=2)
grid_search.fit(X_train, y_train)

# Получение наилучших параметров и лучшей модели
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best parameters:", best_params)

# Оценка качества лучшей модели на тестовой выборке
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
277/277 - 0s - loss: 0.1067 - accuracy: 0.9502 - 472ms/epoch - 2ms/step
Epoch 1984/3000
277/277 - 0s - loss: 0.1060 - accuracy: 0.9513 - 464ms/epoch - 2ms/step
Epoch 1985/3000
277/277 - 0s - loss: 0.1066 - accuracy: 0.9490 - 478ms/epoch - 2ms/step
Epoch 1986/3000
277/277 - 0s - loss: 0.1071 - accuracy: 0.9496 - 465ms/epoch - 2ms/step
Epoch 1987/3000
277/277 - 1s - loss: 0.1062 - accuracy: 0.9498 - 709ms/epoch - 3ms/step
Epoch 1988/3000
277/277 - 1s - loss: 0.1059 - accuracy: 0.9512 - 709ms/epoch - 3ms/step
Epoch 1989/3000
277/277 - 1s - loss: 0.1045 - accuracy: 0.9513 - 713ms/epoch - 3ms/step
Epoch 1990/3000
277/277 - 0s - loss: 0.1054 - accuracy: 0.9513 - 467ms/epoch - 2ms/step
Epoch 1991/3000
277/277 - 0s - loss: 0.1064 - accuracy: 0.9495 - 463ms/epoch - 2ms/step
Epoch 1992/3000
277/277 - 0s - loss: 0.1056 - accuracy: 0.9521 - 477ms/epoch - 2ms/step
Epoch 1993/3000
277/277 - 0s - loss: 0.1084 - accuracy: 0.9506 - 463ms/