# ***ML-7. Прогнозирование биологического ответа (HW-3)***

#### ***Необходимо***:
1. обучить две модели: 
    - логистическую регрессию
    - случайный лес. 
2. сделать подбор гиперпараметров с помощью базовых и продвинутых методов оптимизации. 
Важно использовать все четыре метода (GridSearchCV, RandomizedSearchCV, Hyperopt, Optuna) хотя бы по разу, максимальное количество итераций не должно превышать 50.

***В качестве метрики будем использовать F1-score.***



In [1]:
# Импортируем используемые библиотеки.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import cross_val_score
import hyperopt
from hyperopt import hp, fmin, tpe, Trials
import optuna

%matplotlib inline
# The bundled seaborn styles were renamed to 'seaborn-v0_8*' in Matplotlib 3.6
# and the old names were later removed; pick whichever name this version has.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

  from .autonotebook import tqdm as notebook_tqdm
  plt.style.use('seaborn')


In [2]:
# Read the training dataset from disk and display it as the cell output.
data_path = 'data/_train_sem09 (1).csv'
bio_answer = pd.read_csv(data_path)
bio_answer

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.000000,0.497009,0.10,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.033300,0.480124,0.00,0.0,0.209791,0.610350,0.356453,0.517720,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.000000,0.538825,0.00,0.5,0.196344,0.724230,0.235606,0.288764,0.805110,...,0,0,0,0,0,0,0,0,0,0
4,0,0.100000,0.517794,0.00,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3746,1,0.033300,0.506409,0.10,0.0,0.209887,0.633426,0.297659,0.376124,0.727093,...,0,0,0,0,0,0,0,0,0,0
3747,1,0.133333,0.651023,0.15,0.0,0.151154,0.766505,0.170876,0.404546,0.787935,...,0,0,1,0,1,0,1,0,0,0
3748,0,0.200000,0.520564,0.00,0.0,0.179949,0.768785,0.177341,0.471179,0.872241,...,0,0,0,0,0,0,0,0,0,0
3749,1,0.100000,0.765646,0.00,0.0,0.536954,0.634936,0.342713,0.447162,0.672689,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Separate the target column from the feature matrix.
target_column = 'Activity'
y = bio_answer[target_column]
x = bio_answer.drop(columns=[target_column])

# Hold out 20% of the rows for testing; stratify on the target and fix the
# seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

## ***1. Логистическая регрессия***

#### Посмотрим на метрику f1-score при установленных по умолчанию гиперпараметрах.

In [4]:
# Baseline: logistic regression with default hyperparameters (only the seed is fixed).
log_reg = linear_model.LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Predict on both splits.
log_reg_train_pred = log_reg.predict(X_train)
log_reg_test_pred = log_reg.predict(X_test)

# Compute the metrics first, then report them.
baseline_train_f1 = metrics.f1_score(y_train, log_reg_train_pred)
baseline_test_accuracy = log_reg.score(X_test, y_test)
baseline_test_f1 = metrics.f1_score(y_test, log_reg_test_pred)

print('f1_score на тренировочном наборе: {:.2f}'.format(baseline_train_f1))
print("accuracy на тестовом наборе: {:.2f}".format(baseline_test_accuracy))
print('f1_score на тестовом наборе: {:.2f}'.format(baseline_test_f1))

f1_score на тренировочном наборе: 0.88
accuracy на тестовом наборе: 0.76
f1_score на тестовом наборе: 0.78


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### ***1.1 GridSearchCV***

In [5]:
# Задаем искомые гиперпараметры в виде словаря.
param_grid = {'penalty': ['l2', 'none'],
              'solver': ['lbfgs', 'sag'],
              'C': list(np.linspace(0.01, 1, 10, dtype=float))
              }

# С помощью GridSearchCV находим наилучшие параметры для логистической регрессии.
grid_search_lr = GridSearchCV(
    estimator=linear_model.LogisticRegression(
    random_state=42,
    max_iter=50
    ), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)  

# Обучаем модель и предсказываем результаты.
%time grid_search_lr.fit(X_train, y_train)
y_train_pred = grid_search_lr.predict(X_train)
y_test_pred = grid_search_lr.predict(X_test)

# Рассчитываем метрики.
print('f1_score на тренировочном наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print("accuracy на тестовом наборе: {:.2f}".format(grid_search_lr.score(X_test, y_test)))
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search_lr.best_params_))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

CPU times: user 2.45 s, sys: 248 ms, total: 2.69 s
Wall time: 1min 38s
f1_score на тренировочном наборе: 0.85
accuracy на тестовом наборе: 0.76
f1_score на тестовом наборе: 0.79
Наилучшие значения гиперпараметров: {'C': 0.12, 'penalty': 'l2', 'solver': 'sag'}




_________________
### ***Вывод:***
*С помощью подобранных гиперпараметров классом GridSearchCV нам удалось улучшить метрику f1-score, она составила - 0.79.*
_________________

###  ***1.2 RandomizedSearchCV***

In [6]:
# Задаем искомые гиперпараметры в виде словаря.
param_distributions = {'penalty': ['l2', 'none'] ,
              'solver': ['lbfgs', 'sag'],
               'C': list(np.linspace(0.01, 1, 10, dtype=float))},

# С помощью RandomizedSearchCV находим наилучшие параметры для логистической регрессии.            
random_search_lr = RandomizedSearchCV(
    estimator=linear_model.LogisticRegression(random_state=42), 
    param_distributions=param_distributions, 
    cv=5, 
    n_iter = 15, 
    n_jobs = -1
)  

# Обучаем модель и предсказываем результаты.
%time random_search_lr.fit(X_train, y_train)
y_train_pred = random_search_lr.predict(X_train)
y_test_pred = random_search_lr.predict(X_test)

# Рассчитываем метрики.
print('f1_score на тренировочном наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print("accuracy на тестовом наборе: {:.2f}".format(random_search_lr.score(X_test, y_test)))
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search_lr.best_params_))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

CPU times: user 930 ms, sys: 98.8 ms, total: 1.03 s
Wall time: 1min 14s
f1_score на тренировочном наборе: 0.85
accuracy на тестовом наборе: 0.76
f1_score на тестовом наборе: 0.79
Наилучшие значения гиперпараметров: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.12}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


_________________
### ***Вывод:***
*С помощью подобранных гиперпараметров классом RandomizedSearchCV нам не удалось улучшить метрику f1-score: она осталась прежней — 0.79, как и при GridSearchCV. Но при этом время их поиска немного сократилось.*
_________________

###  ***1.3 Hyperopt***

In [7]:
# Hyperopt search space. The option lists are kept as separate names because
# they are indexed later to decode the result returned by fmin.
penalties = ['l2', 'none']
solvers = ['sag', 'lbfgs']
space = dict(
    penalty=hp.choice('penalty', penalties),
    solver=hp.choice('solver', solvers),
    C=hp.uniform('C', 0, 1),
)


def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=42):
    """Hyperopt objective for logistic regression.

    Args:
        params: keyword arguments for LogisticRegression, sampled from the
            search space (here: 'penalty', 'solver' and 'C').
        cv: number of cross-validation folds.
        X: feature matrix used for cross-validation.
        y: target vector used for cross-validation.
        random_state: seed passed to LogisticRegression.

    Returns:
        The negated mean cross-validated F1 score (fmin minimises its objective).
    """
    # Build the model; honour the caller's seed instead of a hard-coded 42.
    model = linear_model.LogisticRegression(**params, random_state=random_state)

    # Mean F1 over the cross-validation folds.
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    # fmin minimises, so return the score with the opposite sign.
    return -score


# Trials object records every evaluation, so the search can be resumed later.
trials = Trials()

# Run the TPE search for 25 evaluations with a seeded generator.
best = fmin(
    fn=hyperopt_rf,
    space=space,
    algo=tpe.suggest,
    max_evals=25,
    trials=trials,
    rstate=np.random.default_rng(42),
)

# Show the best hyperparameters found.
print("Наилучшие значения гиперпараметров {}".format(best))

  0%|          | 0/25 [00:00<?, ?trial/s, best loss=?]



  4%|▍         | 1/25 [00:09<03:53,  9.71s/trial, best loss: -0.7665205856775353]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

  8%|▊         | 2/25 [00:12<02:03,  5.39s/trial, best loss: -0.7706393419675128]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 12%|█▏        | 3/25 [00:22<02:47,  7.63s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

 16%|█▌        | 4/25 [00:24<01:57,  5.61s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 20%|██        | 5/25 [00:34<02:19,  6.96s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

 24%|██▍       | 6/25 [00:36<01:45,  5.53s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

 28%|██▊       | 7/25 [00:39<01:20,  4.49s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 32%|███▏      | 8/25 [00:49<01:49,  6.43s/trial, best loss: -0.7823550164726636]



 36%|███▌      | 9/25 [01:00<02:02,  7.67s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

 40%|████      | 10/25 [01:02<01:29,  5.98s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 44%|████▍     | 11/25 [01:12<01:42,  7.30s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

 48%|████▊     | 12/25 [01:15<01:15,  5.83s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 52%|█████▏    | 13/25 [01:24<01:23,  6.94s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

 56%|█████▌    | 14/25 [01:26<01:00,  5.48s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

 60%|██████    | 15/25 [01:28<00:44,  4.47s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 64%|██████▍   | 16/25 [01:39<00:55,  6.18s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

 68%|██████▊   | 17/25 [01:41<00:40,  5.03s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

 72%|███████▏  | 18/25 [01:44<00:30,  4.29s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

 76%|███████▌  | 19/25 [01:46<00:22,  3.71s/trial, best loss: -0.7823550164726636]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


 80%|████████  | 20/25 [01:56<00:27,  5.58s/trial, best loss: -0.7823550164726636]



 84%|████████▍ | 21/25 [02:06<00:28,  7.06s/trial, best loss: -0.7823550164726636]



 88%|████████▊ | 22/25 [02:17<00:24,  8.12s/trial, best loss: -0.7823550164726636]



 96%|█████████▌| 24/25 [02:34<00:08,  8.40s/trial, best loss: -0.7823550164726636]



100%|██████████| 25/25 [02:47<00:00,  6.69s/trial, best loss: -0.7823550164726636]
Наилучшие значения гиперпараметров {'C': 0.07366102545052411, 'penalty': 0, 'solver': 0}




In [8]:
# Decode the hyperopt result: the 'penalty' and 'solver' entries of `best`
# are used as indices into the option lists.
chosen_penalty = penalties[best['penalty']]
chosen_solver = solvers[best['solver']]

# Build and fit logistic regression with the selected hyperparameters.
model = linear_model.LogisticRegression(
    penalty=chosen_penalty,
    solver=chosen_solver,
    C=best['C'],
    random_state=42,
)
model.fit(X_train, y_train)

# Predict on both splits.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Compute the metrics, then report them.
train_f1 = metrics.f1_score(y_train, y_train_pred)
test_accuracy = model.score(X_test, y_test)
test_f1 = metrics.f1_score(y_test, y_test_pred)

print('f1_score на обучающем наборе: {:.2f}'.format(train_f1))
print("accuracy на тестовом наборе: {:.2f}".format(test_accuracy))
print('f1_score на тестовом наборе: {:.2f}'.format(test_f1))

f1_score на обучающем наборе: 0.84
accuracy на тестовом наборе: 0.77
f1_score на тестовом наборе: 0.80




In [9]:
# Resume the search: the same `trials` object is passed again and max_evals
# is raised to 30.
best = fmin(
    fn=hyperopt_rf,
    space=space,
    algo=tpe.suggest,
    max_evals=30,
    trials=trials,
    rstate=np.random.default_rng(42),
)

# Show the best hyperparameters found.
print("Наилучшие значения гиперпараметров {}".format(best))

 83%|████████▎ | 25/30 [00:00<?, ?trial/s, best loss=?]



 87%|████████▋ | 26/30 [00:10<00:40, 10.24s/trial, best loss: -0.7823550164726636]



 90%|█████████ | 27/30 [00:21<00:31, 10.64s/trial, best loss: -0.7823550164726636]



 93%|█████████▎| 28/30 [00:30<00:19,  9.88s/trial, best loss: -0.7823550164726636]



 97%|█████████▋| 29/30 [00:39<00:09,  9.57s/trial, best loss: -0.7823550164726636]



100%|██████████| 30/30 [00:48<00:00,  9.64s/trial, best loss: -0.7823550164726636]
Наилучшие значения гиперпараметров {'C': 0.07366102545052411, 'penalty': 0, 'solver': 0}




In [10]:
# Decode the hyperopt result again: the 'penalty' and 'solver' entries of
# `best` are used as indices into the option lists.
chosen_penalty = penalties[best['penalty']]
chosen_solver = solvers[best['solver']]

# Rebuild and refit logistic regression with the selected hyperparameters.
model = linear_model.LogisticRegression(
    penalty=chosen_penalty,
    solver=chosen_solver,
    C=best['C'],
    random_state=42,
)
model.fit(X_train, y_train)

# Predict on both splits.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Compute the metrics, then report them.
train_f1 = metrics.f1_score(y_train, y_train_pred)
test_accuracy = model.score(X_test, y_test)
test_f1 = metrics.f1_score(y_test, y_test_pred)

print('f1_score на обучающем наборе: {:.2f}'.format(train_f1))
print("accuracy на тестовом наборе: {:.2f}".format(test_accuracy))
print('f1_score на тестовом наборе: {:.2f}'.format(test_f1))

f1_score на обучающем наборе: 0.84
accuracy на тестовом наборе: 0.77
f1_score на тестовом наборе: 0.80




_________________
### ***Вывод:***
*С помощью Hyperopt и использования кросс-валидации нам удалось улучшить метрику f1-score, она составила 0.80. Также за счет этого увеличилась и accuracy. Без использования кросс-валидации Hyperopt не справился с поиском лучших гиперпараметров и выдал худшие результаты.*
_________________

###  ***1.4 Optuna***

In [11]:
def optuna_rf(trial, cv=5, X=X_train, y=y_train, random_state=42):
    """Optuna objective: mean cross-validated F1 of a logistic regression.

    Args:
        trial: Optuna trial used to sample `penalty`, `solver` and `C`.
        cv: Cross-validation setting forwarded to `cross_val_score`.
        X: Feature matrix used for cross-validation.
        y: Target labels used for cross-validation.
        random_state: Seed forwarded to the estimator.

    Returns:
        Mean F1-score over the cross-validation folds (to be maximized).
    """
    # Search space.
    penalties = ['l2', 'none']
    solvers = ['sag', 'lbfgs']
    penalty = trial.suggest_categorical('penalty', penalties)
    solver = trial.suggest_categorical('solver', solvers)
    # C must be strictly positive, so the lower bound is kept above zero.
    C = trial.suggest_float('C', 1e-3, 1)

    model = linear_model.LogisticRegression(
        penalty=penalty,
        solver=solver,
        C=C,
        random_state=random_state,
    )

    # cross_val_score fits its own clones of the estimator on every fold,
    # so fitting the model beforehand would only waste time.
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    return score


# Create the study. The sampler is seeded so that the sequence of suggested
# hyperparameters is repeatable between runs.
study = optuna.create_study(
    study_name="LogisticRegression",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Evaluate the objective n_trials times.
study.optimize(optuna_rf, n_trials=20)

print("Наилучшие значения гиперпараметров {}".format(study.best_params))


[32m[I 2023-04-01 23:27:04,533][0m A new study created in memory with name: LogisticRegression[0m
[32m[I 2023-04-01 23:27:18,403][0m Trial 0 finished with value: 0.7665205856775353 and parameters: {'penalty': 'none', 'solver': 'sag', 'C': 0.262133514948119}. Best is trial 0 with value: 0.7665205856775353.[0m
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_mo

Наилучшие значения гиперпараметров {'penalty': 'l2', 'solver': 'lbfgs', 'C': 0.13703743153801257}


In [12]:
# Build the logistic regression from the best Optuna parameters and fit it
# (fit() returns the fitted estimator itself).
model = linear_model.LogisticRegression(random_state=42, **study.best_params).fit(X_train, y_train)

# Predict labels for the training and the test samples.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Report the metrics.
print('f1_score на тренировочном наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print("accuracy на тестовом наборе: {:.2f}".format(metrics.accuracy_score(y_test, y_test_pred)))
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тренировочном наборе: 0.85
accuracy на тестовом наборе: 0.76
f1_score на тестовом наборе: 0.79


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


_________________
### ***Вывод:***
*С помощью Optuna и кросс-валидации не удалось улучшить метрику f1-score по сравнению с Hyperopt, а время поиска гиперпараметров увеличилось.*
_________________

## ***2. Случайный лес***

#### Посмотрим на метрику f1-score при гиперпараметрах, установленных по умолчанию.

In [13]:
# Baseline: random forest with default hyperparameters
# (fit() returns the fitted estimator itself).
forest = ensemble.RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the training and the test samples.
forest_train_pred, forest_test_pred = forest.predict(X_train), forest.predict(X_test)

# Report the metrics.
print('f1_score на тренировочном наборе: {:.2f}'.format(metrics.f1_score(y_train, forest_train_pred)))
print("accuracy на тестовом наборе: {:.2f}".format(metrics.accuracy_score(y_test, forest_test_pred)))
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, forest_test_pred)))

f1_score на тренировочном наборе: 1.00
accuracy на тестовом наборе: 0.79
f1_score на тестовом наборе: 0.81


### ***2.1 GridSearchCV***

In [14]:
# Задаем искомые гиперпараметры в виде словаря.
param_grid = {'n_estimators': list(range(80, 200, 30)),
            'min_samples_leaf': [3],
            'max_depth': list(np.linspace(20, 40, 5, dtype=int)),
            'criterion': ['gini', 'entropy']
            }

# С помощью GridSearchCV находим наилучшие параметры для логистической регрессии.
grid_search_forest = GridSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=42), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)  

# Обучаем модель и предсказываем результаты.
%time grid_search_forest.fit(X_train, y_train)
y_train_pred = grid_search_forest.predict(X_train)
y_test_pred = grid_search_forest.predict(X_test)

# Рассчитываем метрики.
print('f1_score на тренировочном наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print("accuracy на тестовом наборе: {:.2f}".format(grid_search_forest.score(X_test, y_test)))
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search_forest.best_params_))

CPU times: user 1.94 s, sys: 120 ms, total: 2.06 s
Wall time: 2min 18s
f1_score на тренировочном наборе: 0.98
accuracy на тестовом наборе: 0.80
f1_score на тестовом наборе: 0.82
Наилучшие значения гиперпараметров: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 3, 'n_estimators': 110}


_________________
### ***Вывод:***
*С помощью гиперпараметров, подобранных классом GridSearchCV, нам удалось улучшить метрику f1-score: она составила 0.82.*
_________________

###  ***2.2 RandomizedSearchCV***

In [15]:
# Задаем искомые гиперпараметры в виде словаря.
param_distributions = {'n_estimators': list(range(80, 200, 30)),
            'min_samples_leaf': [3],
            'max_depth': list(np.linspace(20, 40, 5, dtype=int)),
            'criterion': ['gini', 'entropy']
            }

# С помощью RandomizedSearchCV находим наилучшие параметры для логистической регрессии.            
random_search_forest = RandomizedSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=42), 
    param_distributions=param_distributions, 
    cv=5, 
    n_iter = 15, 
    n_jobs = -1
)  

# Обучаем модель и предсказываем результаты.
%time random_search_forest.fit(X_train, y_train)
y_train_pred = random_search_forest.predict(X_train)
y_test_pred = random_search_forest.predict(X_test)

# Рассчитываем метрики.
print('f1_score на тренировочном наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print("accuracy на тестовом наборе: {:.2f}".format(random_search_forest.score(X_test, y_test)))
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search_forest.best_params_))

CPU times: user 1.65 s, sys: 76.2 ms, total: 1.72 s
Wall time: 56 s
f1_score на тренировочном наборе: 0.98
accuracy на тестовом наборе: 0.80
f1_score на тестовом наборе: 0.82
Наилучшие значения гиперпараметров: {'n_estimators': 110, 'min_samples_leaf': 3, 'max_depth': 20, 'criterion': 'entropy'}


_________________
### ***Вывод:***
*С гиперпараметрами, подобранными классом RandomizedSearchCV, метрика f1-score осталась такой же, как и при использовании GridSearchCV.*
_________________

###  ***2.3 Hyperopt***

In [16]:
# Hyperopt search space for the random forest.
# `criterions` stays a separate list: the search result stores the chosen
# criterion as an index into it.
criterions = ['gini', 'entropy']
space = dict(
    n_estimators=hp.quniform('n_estimators', 80, 200, 1),
    max_depth=hp.quniform('max_depth', 15, 40, 1),
    min_samples_leaf=hp.quniform('min_samples_leaf', 2, 10, 1),
    criterion=hp.choice('criterion', criterions),
)


def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=42):
    """Hyperopt objective: negated mean cross-validated F1 of a random forest.

    Args:
        params: Sampled point of the search space with the keys
            `n_estimators`, `max_depth`, `min_samples_leaf` and `criterion`.
        cv: Cross-validation setting forwarded to `cross_val_score`.
        X: Feature matrix used for cross-validation.
        y: Target labels used for cross-validation.
        random_state: Seed forwarded to the estimator.

    Returns:
        The mean F1-score over the folds with the sign flipped, because
        Hyperopt minimizes the objective.
    """
    # The numeric values are cast to int before being passed to the forest.
    forest_params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'min_samples_leaf': int(params['min_samples_leaf']),
        'criterion': params['criterion'],
    }
    model = ensemble.RandomForestClassifier(**forest_params, random_state=random_state)

    # Evaluate the candidate with cross-validation.
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    # Negate: the optimizer minimizes, while F1 has to be maximized.
    return -score


# Trials object that records every evaluation of the search.
trials = Trials()

# Run the hyperparameter search with the TPE algorithm.
best = fmin(
    fn=hyperopt_rf,
    space=space,
    algo=tpe.suggest,
    max_evals=25,
    trials=trials,
    rstate=np.random.default_rng(42),
)

# Report the best hyperparameter values found.
print("Наилучшие значения гиперпараметров {}".format(best))

100%|██████████| 25/25 [01:51<00:00,  4.45s/trial, best loss: -0.8117572520257845]
Наилучшие значения гиперпараметров {'criterion': 1, 'max_depth': 32.0, 'min_samples_leaf': 2.0, 'n_estimators': 146.0}


In [17]:
# Build the random forest from the hyperparameters found by Hyperopt.
# `best` holds numeric values as floats and the criterion as an index into
# `criterions`, hence the casts and the lookup.
model = ensemble.RandomForestClassifier(
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_leaf=int(best['min_samples_leaf']),
    criterion=criterions[best['criterion']],
    random_state=42,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself

# Predict labels for the training and the test samples.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Report the metrics.
print('f1_score на обучающем наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print("accuracy на тестовом наборе: {:.2f}".format(metrics.accuracy_score(y_test, y_test_pred)))
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на обучающем наборе: 0.99
accuracy на тестовом наборе: 0.80
f1_score на тестовом наборе: 0.82


In [18]:
# Continue the search on the same `trials` log, now with a total evaluation
# budget of 30.
best = fmin(
    fn=hyperopt_rf,
    space=space,
    algo=tpe.suggest,
    max_evals=30,
    trials=trials,
    rstate=np.random.default_rng(42),
)

# Report the best hyperparameter values found.
print("Наилучшие значения гиперпараметров {}".format(best))

100%|██████████| 30/30 [00:33<00:00,  6.64s/trial, best loss: -0.8132438164868298]
Наилучшие значения гиперпараметров {'criterion': 1, 'max_depth': 39.0, 'min_samples_leaf': 2.0, 'n_estimators': 196.0}


In [19]:
# Build the random forest from the hyperparameters found by the extended
# Hyperopt search. `best` holds numeric values as floats and the criterion
# as an index into `criterions`, hence the casts and the lookup.
model = ensemble.RandomForestClassifier(
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_leaf=int(best['min_samples_leaf']),
    criterion=criterions[best['criterion']],
    random_state=42,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself

# Predict labels for the training and the test samples.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Report the metrics.
print('f1_score на обучающем наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print("accuracy на тестовом наборе: {:.2f}".format(metrics.accuracy_score(y_test, y_test_pred)))
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на обучающем наборе: 0.99
accuracy на тестовом наборе: 0.81
f1_score на тестовом наборе: 0.82


_________________
### ***Вывод:***
*С помощью Hyperopt и кросс-валидации мы смогли улучшить accuracy, а f1-score осталась такой же, как и при использовании GridSearchCV.*
_________________

###  ***2.4 Optuna***

In [20]:

def optuna_rf(trial, cv=5, X=X_train, y=y_train, random_state=42):
    """Optuna objective: mean cross-validated F1 of a random forest.

    Args:
        trial: Optuna trial used to sample `n_estimators`, `max_depth`,
            `min_samples_leaf` and `criterion`.
        cv: Cross-validation setting forwarded to `cross_val_score`.
        X: Feature matrix used for cross-validation.
        y: Target labels used for cross-validation.
        random_state: Seed forwarded to the estimator.

    Returns:
        Mean F1-score over the cross-validation folds (to be maximized).
    """
    # Search space. The step of 1 is the suggest_int default, so it is not
    # passed explicitly (passing it positionally is deprecated in Optuna).
    criterions = ['gini', 'entropy']
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 5, 40)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 10)
    criterion = trial.suggest_categorical('criterion', criterions)

    model = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        criterion=criterion,
        random_state=random_state,
    )

    # cross_val_score fits its own clones of the estimator on every fold,
    # so fitting the model beforehand would only waste time.
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    return score


# Create the study. The sampler is seeded so that the sequence of suggested
# hyperparameters is repeatable between runs.
study = optuna.create_study(
    study_name="RandomForestClassifier",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Evaluate the objective n_trials times.
study.optimize(optuna_rf, n_trials=30)

print("Наилучшие значения гиперпараметров {}".format(study.best_params))


[32m[I 2023-04-01 23:34:50,134][0m A new study created in memory with name: RandomForestClassifier[0m
[32m[I 2023-04-01 23:34:56,263][0m Trial 0 finished with value: 0.7906917841425475 and parameters: {'n_estimators': 185, 'max_depth': 20, 'min_samples_leaf': 10, 'criterion': 'gini'}. Best is trial 0 with value: 0.7906917841425475.[0m
[32m[I 2023-04-01 23:35:01,534][0m Trial 1 finished with value: 0.7939125237708222 and parameters: {'n_estimators': 152, 'max_depth': 34, 'min_samples_leaf': 9, 'criterion': 'gini'}. Best is trial 1 with value: 0.7939125237708222.[0m
[32m[I 2023-04-01 23:35:04,219][0m Trial 2 finished with value: 0.7970093147599608 and parameters: {'n_estimators': 58, 'max_depth': 27, 'min_samples_leaf': 5, 'criterion': 'gini'}. Best is trial 2 with value: 0.7970093147599608.[0m
[32m[I 2023-04-01 23:35:08,718][0m Trial 3 finished with value: 0.7989428609474201 and parameters: {'n_estimators': 99, 'max_depth': 17, 'min_samples_leaf': 5, 'criterion': 'gini'}. 

Наилучшие значения гиперпараметров {'n_estimators': 86, 'max_depth': 27, 'min_samples_leaf': 2, 'criterion': 'entropy'}


In [21]:
# Build the random forest from the best Optuna parameters and fit it
# (fit() returns the fitted estimator itself).
model = ensemble.RandomForestClassifier(random_state=42, **study.best_params).fit(X_train, y_train)

# Predict labels for the training and the test samples.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Report the metrics.
print('f1_score на тренировочном наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print("accuracy на тестовом наборе: {:.2f}".format(metrics.accuracy_score(y_test, y_test_pred)))
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

f1_score на тренировочном наборе: 0.99
accuracy на тестовом наборе: 0.80
f1_score на тестовом наборе: 0.82


_________________
### ***Вывод:***
*С помощью Optuna и кросс-валидации не удалось улучшить результат по сравнению с Hyperopt: f1-score осталась равной 0.82, а accuracy составила 0.80.*

***Наилучший результат мы получили при использовании параметров, найденных Hyperopt для случайного леса: {'n_estimators': 196, 'max_depth': 39, 'min_samples_leaf': 2, 'criterion': 'entropy'}. У нас получилось повысить значение метрики f1-score с 0.78 до 0.82.***
_________________