#### <center> Задача
Необходимо обучить две модели: логистическую регрессию и случайный лес. Далее нужно сделать подбор гиперпараметров с помощью базовых и продвинутых методов оптимизации. Важно использовать все четыре метода (GridSeachCV, RandomizedSearchCV, Hyperopt, Optuna) хотя бы по разу, максимальное количество итераций не должно превышать 50.

#### <center> Критерий оценивания

| Балл | Критерий                                                                 |
|------|--------------------------------------------------------------------------|
| 0    | Задание не выполнено                                                     |
| 1    | Обучено две модели; гипепараметры подобраны при помощи одного метода     |
| 2    | Обучено две модели; гипепараметры подобраны при помощи двух методов      |
| 3    | Обучено две модели; гипепараметры подобраны при помощи трёх методов      |
| 4    | Обучено две модели; гипепараметры подобраны при помощи четырёх методов   |
| 5    | Обучено две модели; гипепараметры подобраны при помощи четырёх методов; использована кросс-валидация  |

#### <center> Импортирую библиотеки

In [22]:
import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
import hyperopt
from hyperopt import hp, fmin, tpe, Trials
import optuna

import warnings 
warnings.filterwarnings('ignore')

#### <center> Выгружаю данные

In [2]:
data = pd.read_csv('data/_train_sem09 (1).csv')
data.head(3)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0


###  Информация о данных:
 
* Activity - содержит экспериментальные данные, описывающие фактический биологический ответ [0, 1]; 
* Столбцы D1 - D1776 представляют собой молекулярные дескрипторы — это вычисляемые свойства, которые могут фиксировать некоторые характеристики молекулы, например размер, форму или состав элементов.


#### <center> Обучение модели

#### Разделяю данные на матрицу наблюдений и вектор правильных ответов

In [5]:
x = data.drop('Activity', axis= 1)
y = data['Activity']

#### Создаю тренировочный и тестовый наборы данных

In [8]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

#### <center> Обучение логистической регрессии

#### Создание модели логистической регрессии

In [9]:
log_model = linear_model.LogisticRegression(random_state=42, max_iter=1000)
log_model.fit(X_train, y_train)
print('f1 test score: {:.2f}'.format(metrics.f1_score(y_test, log_model.predict(X_test))))

f1 test score: 0.79


#### Подбор гиперпараметров с помощью GridSearchCV

In [20]:
param_grid = [
              {'penalty': ['l2', 'none'], 
              'solver': ['lbfgs', 'sag']}, 
              
              {'penalty': ['l1', 'l2'] ,
              'solver': ['liblinear', 'saga']}
]
grid_search = GridSearchCV(
    estimator=linear_model.LogisticRegression(random_state=42, max_iter=1000), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)  
grid_search.fit(X_train, y_train) 
print('f1 test score: {:.2f}'.format(metrics.f1_score(y_test, grid_search.predict(X_test))))
print('best hyperparameter values: {}'.format(grid_search.best_params_))

f1 test score: 0.78
best hyperparameter values: {'penalty': 'l1', 'solver': 'liblinear'}


#### Подбор гиперпараметров с помощью RandomizedSearchCV

In [21]:
param_distributions = {'penalty': ['l2', 'none'],
              'solver': ['lbfgs', 'sag'],
               'C': list(np.linspace(0.01, 1, 10, dtype=float))}
            
random_search = RandomizedSearchCV(
    estimator=linear_model.LogisticRegression(random_state=42, max_iter=1000), 
    param_distributions=param_distributions, 
    cv=5, 
    n_iter = 10, 
    n_jobs = -1
)  
random_search.fit(X_train, y_train) 
print('f1 test score: {:.2f}'.format(metrics.f1_score(y_test, random_search.predict(X_test))))
print('best hyperparameter values: {}'.format(random_search.best_params_))

f1 test score: 0.79
best hyperparameter values: {'solver': 'sag', 'penalty': 'l2', 'C': 0.12}


#### Подбор гиперпараметров с помощью библиотеки Hyperopt

In [28]:
random_state = 42

space = {
    'solver': hp.choice('solver', ['lbfgs', 'sag']),
    'penalty': hp.choice('penalty', ['l2']),
    'C': hp.loguniform('C', np.log(0.001), np.log(1))
}

def hyperopt_log(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    
    params = {'solver': str(params['solver']), 
             'penalty': str(params['penalty']), 
             'C': float(params['C'])
              }
    
    model = linear_model.LogisticRegression(**params, random_state=random_state)
    score = cross_val_score(model, X, y, cv=5, scoring='f1', n_jobs=-1).mean()
    return -score

In [30]:
trials = Trials()

best = fmin(
    fn=hyperopt_log,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
    rstate=np.random.default_rng(random_state)
)

print(f'best hyperparameter values: {best}')

100%|██████████| 20/20 [00:45<00:00,  2.29s/trial, best loss: -0.7852193385451705]
best hyperparameter values: {'C': 0.06675090446914045, 'penalty': 0, 'solver': 1}


In [38]:
model_hyperopt_log = linear_model.LogisticRegression(
    random_state=random_state, 
    penalty='l2',
    solver='sag',
    C=best['C']
)
model_hyperopt_log.fit(X_train, y_train)
print('f1 test score: {:.2f}'.format(metrics.f1_score(y_test, model_hyperopt_log.predict(X_test))))

f1 test score: 0.79


#### Подбор гиперпараметров с помощью библиотеки Optuna

In [45]:
def optuna_log(trial):
    solver = trial.suggest_categorical('solver', ['lbfgs', 'saga'])
    penalty = trial.suggest_categorical('penalty', ['l2'])
    C = trial.suggest_loguniform('C', 0.001, 1.0)
    

    model = linear_model.LogisticRegression(solver=solver, 
                                            penalty=penalty, 
                                            C=C, 
                                            max_iter=1000, 
                                            random_state=random_state)
    model.fit(X_train, y_train)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='f1', n_jobs=-1).mean()

    return score

In [46]:
study = optuna.create_study(study_name='LogisticRegression', direction='maximize')
study.optimize(optuna_log, n_trials=20)

[I 2024-06-19 15:08:28,461] A new study created in memory with name: LogisticRegression
[I 2024-06-19 15:08:36,073] Trial 0 finished with value: 0.7622075595841825 and parameters: {'solver': 'saga', 'penalty': 'l2', 'C': 0.0026634831206733384}. Best is trial 0 with value: 0.7622075595841825.
[I 2024-06-19 15:08:38,474] Trial 1 finished with value: 0.782727851898531 and parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.023219312349087747}. Best is trial 1 with value: 0.782727851898531.
[I 2024-06-19 15:08:41,789] Trial 2 finished with value: 0.7800286937008891 and parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.22178003856327774}. Best is trial 1 with value: 0.782727851898531.
[I 2024-06-19 15:08:42,854] Trial 3 finished with value: 0.7744793325663044 and parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.007072474905209571}. Best is trial 1 with value: 0.782727851898531.
[I 2024-06-19 15:08:43,638] Trial 4 finished with value: 0.757276342785774 and parameters: {'solv

In [48]:
print(f'best hyperparameter values: {study.best_params}')
print('f1 train score: {:.2f}'.format(study.best_value))

best hyperparameter values: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.041840088995997705}
f1 train score: 0.79


In [49]:
model_optuna_log = linear_model.LogisticRegression(**study.best_params,random_state=random_state)
model_optuna_log.fit(X_train, y_train)
print('f1 test score: {:.2f}'.format(metrics.f1_score(y_test, model_optuna_log.predict(X_test))))

f1 test score: 0.80


#### <center> Обучение случайного леса

#### Создание модели случайного леса

In [50]:
rf_model = ensemble.RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
print('f1 test score: {:.2f}'.format(metrics.f1_score(y_test, rf_model.predict(X_test))))

f1 test score: 0.83


#### Подбор гиперпараметров с помощью GridSearchCV

In [54]:
param_grid = {'n_estimators': list(range(100, 200, 30)),
              'min_samples_leaf': [5, 7],
              'max_depth': [10, 15, 20, 25]
              }
            
grid_search_rf = GridSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=42), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)  
grid_search_rf.fit(X_train, y_train) 
print('f1 train score: {:.2f}'.format(metrics.f1_score(y_test, grid_search_rf.predict(X_test))))
print('best hyperparameter values: {}'.format(grid_search_rf.best_params_))

f1 train score: 0.83
best hyperparameter values: {'max_depth': 15, 'min_samples_leaf': 5, 'n_estimators': 130}


#### Подбор гиперпараметров с помощью RandomizedSearchCV

In [55]:
param_distributions = {
    'n_estimators': list(range(100, 200, 30)),
    'min_samples_leaf': [5, 7],
    'max_depth': [10, 15, 20 ,25]
}
random_search_rf = RandomizedSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=42),
    param_distributions=param_distributions,
    cv=5,
    n_iter=10, 
    n_jobs=-1
)
random_search_rf.fit(X_train, y_train)
print('f1 train score: {:.2f}'.format(metrics.f1_score(y_test, random_search_rf.predict(X_test))))
print('best hyperparameter values: {}'.format(random_search_rf.best_params_))

f1 train score: 0.83
best hyperparameter values: {'n_estimators': 130, 'min_samples_leaf': 5, 'max_depth': 15}


#### Подбор гиперпараметров с помощью библиотеки Hyperopt

In [56]:
random_state = 42

space = {
    'n_estimators': hp.quniform('n_estimators', 100, 300, 10),
    'max_depth': hp.quniform('max_depth', 15, 40, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 3, 7, 1)
}

In [59]:
def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state = random_state):
    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'min_samples_leaf': int(params['min_samples_leaf'])
    }
    model = ensemble.RandomForestClassifier(**params, random_state=random_state)
    model.fit(X, y)
    score = cross_val_score(model, X, y, cv=5, scoring='f1', n_jobs=-1).mean()
    return -score

In [60]:
trials = Trials()

best = fmin(
    hyperopt_rf,
    space=space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
    rstate=np.random.default_rng(random_state)
)
print(f'best hyperparameter values{best}')

100%|██████████| 20/20 [01:30<00:00,  4.53s/trial, best loss: -0.806936534807253]
best hyperparameter values{'max_depth': 22.0, 'min_samples_leaf': 3.0, 'n_estimators': 200.0}


In [61]:
model_hyperopt_rf = ensemble.RandomForestClassifier(
    random_state=random_state, 
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_leaf=int(best['min_samples_leaf'])
)
model_hyperopt_rf.fit(X_train, y_train)
print('f1 test score: {:.2f}'.format(metrics.f1_score(y_test, model_hyperopt_rf.predict(X_test))))

f1 test score: 0.84


#### Подбор гиперпараметров с помощью библиотеки Optuna

In [62]:
def optuna_rf(trial):
  n_estimators = trial.suggest_int('n_estimators', 100, 300, 10)
  max_depth = trial.suggest_int('max_depth', 15, 40, 1)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 7, 1)

  model = ensemble.RandomForestClassifier(n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf,
                                          random_state=random_state)
  
  model.fit(X_train, y_train)
  score = cross_val_score(model, X_train, y_train, cv=5, scoring='f1', n_jobs=-1).mean()

  return score

In [63]:
study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
study.optimize(optuna_rf, n_trials=20)

[I 2024-06-19 16:02:40,104] A new study created in memory with name: RandomForestClassifier
[I 2024-06-19 16:02:43,369] Trial 0 finished with value: 0.8101733070737465 and parameters: {'n_estimators': 120, 'max_depth': 16, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.8101733070737465.
[I 2024-06-19 16:02:48,103] Trial 1 finished with value: 0.8013943713980026 and parameters: {'n_estimators': 190, 'max_depth': 39, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.8101733070737465.
[I 2024-06-19 16:02:52,587] Trial 2 finished with value: 0.8020087808027588 and parameters: {'n_estimators': 180, 'max_depth': 23, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.8101733070737465.
[I 2024-06-19 16:02:56,509] Trial 3 finished with value: 0.7976112308785548 and parameters: {'n_estimators': 170, 'max_depth': 38, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.8101733070737465.
[I 2024-06-19 16:03:01,484] Trial 4 finished with value: 0.8013943713980026 and parameters: {'n_

In [64]:
print(f'best hyperparameter values: {study.best_params}')
print('f1 train score: {:.2f}'.format(study.best_value))

best hyperparameter values: {'n_estimators': 120, 'max_depth': 16, 'min_samples_leaf': 3}
f1 train score: 0.81


In [65]:
model_optuna_rf = ensemble.RandomForestClassifier(**study.best_params, random_state=random_state)
model_optuna_rf.fit(X_train, y_train)
print('f1 test score: {:.2f}'.format(metrics.f1_score(y_test, model_optuna_rf.predict(X_test))))

f1 test score: 0.84
