In [74]:
#Loading libreris
import pandas as pd
import numpy as np

#Sklearn libreris
from sklearn import linear_model #линейные модели
from sklearn import tree #деревья решений
from sklearn import ensemble #ансамбли
from sklearn import metrics #метрики

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

import optuna
import hyperopt
from hyperopt import hp, fmin, tpe, Trials

#Libreris for visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [75]:
data =  pd.read_csv('data/_train_sem09__1_.zip')

In [76]:
data.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [77]:
data.isnull().mean().sort_values(ascending=False)

Activity    0.0
D1181       0.0
D1192       0.0
D1191       0.0
D1190       0.0
           ... 
D589        0.0
D588        0.0
D587        0.0
D586        0.0
D1776       0.0
Length: 1777, dtype: float64

In [78]:
X = data.drop(columns='Activity')
y = data['Activity']

In [79]:
#We divide thr initial sample into training and test in the ratio of 70/30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## The basic model of linear regression

In [80]:
# Creating a model with basic parameters
log_reg = linear_model.LogisticRegression(random_state=42, max_iter = 1000)
log_reg.fit(X_train, y_train)

In [81]:
y_train_predict = log_reg.predict(X_train)
y_test_predict = log_reg.predict(X_test)

f1_train_logreg = metrics.f1_score(y_train, y_train_predict)
f1_test_logreg = metrics.f1_score(y_test, y_test_predict)

print('f1 Train:',f1_train_logreg)
print('f1 Test:',f1_test_logreg)

f1 Train: 0.8864831425920198
f1 Test: 0.7890535917901939


# The basic model of random forest

In [82]:
randon_forest = ensemble.RandomForestClassifier()
#Обучаем модель 
randon_forest.fit(X_train, y_train)

In [83]:
y_train_predict = randon_forest.predict(X_train)
y_test_predict = randon_forest.predict(X_test)

f1_train_rf = metrics.f1_score(y_train, y_train_predict)
f1_test_rf = metrics.f1_score(y_test, y_test_predict)

print('f1 Train:',f1_train_rf)
print('f1 Test:',f1_test_rf)

f1 Train: 1.0
f1 Test: 0.8354430379746833


# GridSearchCV

### LogisticRegression

In [113]:
param_grid = [ {'C': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 1]}              
              ]

grid_search = GridSearchCV(
    estimator = linear_model.LogisticRegression(random_state=42, max_iter = 1000), 
    param_grid = param_grid, 
    cv = 5, 
    n_jobs = -1
)

grid_search.fit(X_train, y_train) 
print("accuracy на тестовом наборе: {:.2f}".format(grid_search.score(X_test, y_test)))

y_test_pred = grid_search.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(grid_search.best_params_))

lr_f1_grid = metrics.f1_score(y_test, y_test_pred)

accuracy на тестовом наборе: 0.76
f1_score на тестовом наборе: 0.79
Наилучшие значения гиперпараметров: {'C': 0.1}


### RandomForestClassifier

In [114]:
param_grid = {'n_estimators': list(range(100, 200, 30)),
              'min_samples_leaf': list(np.linspace(2,10,1, dtype=int)),
              'max_depth': list(np.linspace(10, 25, 5, dtype=int))
              }
            
random_search_forest = GridSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=42), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs = -1
)  
random_search_forest.fit(X_train, y_train) 

y_train_pred = random_search_forest.predict(X_train)
print('f1_score на обучающем наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print("accuracy на тестовом наборе: {:.2f}".format(random_search_forest.score(X_test, y_test)))

y_test_pred = random_search_forest.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search_forest.best_params_))

rf_f1_grid = metrics.f1_score(y_test, y_test_pred)

f1_score на обучающем наборе: 0.99
accuracy на тестовом наборе: 0.80
f1_score на тестовом наборе: 0.83
Наилучшие значения гиперпараметров: {'max_depth': 21, 'min_samples_leaf': 2, 'n_estimators': 100}


# RandomizedSearchCV

### LogisticRegression

In [110]:
#np.linspace(start(от), stop(до), num=50(количество),dtype- тип данных)
param_distributions = [{'C': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 1]}]

random_search = RandomizedSearchCV(
    estimator=linear_model.LogisticRegression(random_state=42, max_iter=1000), 
    param_distributions=param_distributions, 
    cv=5, 
    n_iter = 10, 
    n_jobs = -1
)

random_search.fit(X_train, y_train) 
print("accuracy на тестовом наборе: {:.2f}".format(random_search.score(X_test, y_test)))

y_test_pred = random_search.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search.best_params_))

lr_f1_randsearch = metrics.f1_score(y_test, y_test_pred)



accuracy на тестовом наборе: 0.76
f1_score на тестовом наборе: 0.79
Наилучшие значения гиперпараметров: {'C': 0.1}


### RandomForestClassifier

In [90]:
param_distributions = {
                    'n_estimators': list(range(100, 200, 30)),
                    'min_samples_leaf': list(np.linspace(2,10,1, dtype=int)),
                    'max_depth': list(np.linspace(10, 25, 5, dtype=int))
                      }
            
random_search_forest = RandomizedSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=42), 
    param_distributions=param_distributions, 
    cv=5,
    n_iter = 10, 
    n_jobs = -1
)  
random_search_forest.fit(X_train, y_train) 

y_train_pred = random_search_forest.predict(X_train)
print('f1_score на обучающем наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print("accuracy на тестовом наборе: {:.2f}".format(random_search_forest.score(X_test, y_test)))

y_test_pred = random_search_forest.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))
print("Наилучшие значения гиперпараметров: {}".format(random_search_forest.best_params_))

rf_f1_randsearch = metrics.f1_score(y_test, y_test_pred)

f1_score на обучающем наборе: 0.99
accuracy на тестовом наборе: 0.81
f1_score на тестовом наборе: 0.83
Наилучшие значения гиперпараметров: {'n_estimators': 130, 'min_samples_leaf': 2, 'max_depth': 21}


# Hyperopt

In [91]:
from sklearn.model_selection import cross_val_score
import hyperopt
from hyperopt import hp, fmin, tpe, Trials

### LogisticRegression

In [92]:
# зададим пространство поиска гиперпараметров
space={
       'C': hp.quniform('C', 0.1, 1, 0.1)
      }

In [93]:
# зафксируем random_state
random_state = 42

def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    # функция получает комбинацию гиперпараметров в "params"
    params = {'C': params['C']}
    # используем эту комбинацию для построения модели
    model = linear_model.LogisticRegression(**params, random_state=random_state, max_iter= 2000)
    model.fit(X, y)
    # обучать модель можно также с помощью кросс-валидации
    # применим  cross validation с тем же количеством фолдов
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()
    # метрику необходимо минимизировать, поэтому ставим знак минус
    return -score

In [94]:
trials = Trials() # используется для логирования результатов

best = fmin(hyperopt_rf, # наша функция 
          space=space, # пространство гиперпараметров
          algo=tpe.suggest, # алгоритм оптимизации, установлен по умолчанию, задавать необязательно
          max_evals=20, # максимальное количество итераций
          trials=trials, # логирование результатов
          rstate=np.random.default_rng(random_state)# фиксируем для повторяемости результата
         )
print("Наилучшие значения гиперпараметров {}".format(best))

100%|██████████| 20/20 [02:31<00:00,  7.57s/trial, best loss: -0.7864364208494703]
Наилучшие значения гиперпараметров {'C': 0.1}


In [95]:
# рассчитаем точность для тестовой выборки
model = linear_model.LogisticRegression(
    random_state=random_state, 
    C=best['C']
)

model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)

print('f1_score на обучающем наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print("accuracy на тестовом наборе: {:.2f}".format(model.score(X_test, y_test)))
y_test_pred = model.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

lr_f1_hyperopt = metrics.f1_score(y_test, y_test_pred)

f1_score на обучающем наборе: 0.85
accuracy на тестовом наборе: 0.76
f1_score на тестовом наборе: 0.79


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### RandomForestClassifier

In [96]:
# зададим пространство поиска гиперпараметров
space={'n_estimators': hp.quniform('n_estimators', 100, 200, 1),
       'max_depth' : hp.quniform('max_depth', 10, 25, 1),
       'min_samples_leaf': hp.quniform('min_samples_leaf', 2, 10, 1)
      }

In [97]:
# зафксируем random_state
random_state = 42

def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    # функция получает комбинацию гиперпараметров в "params"
    params = {'n_estimators': int(params['n_estimators']), 
              'max_depth': int(params['max_depth']), 
             'min_samples_leaf': int(params['min_samples_leaf'])
              }
    # используем эту комбинацию для построения модели
    model = ensemble.RandomForestClassifier(**params, random_state=random_state)
    model.fit(X, y)
    # обучать модель можно также с помощью кросс-валидации
    # применим  cross validation с тем же количеством фолдов
    score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()
    # метрику необходимо минимизировать, поэтому ставим знак минус
    return -score

In [98]:
trials = Trials() # используется для логирования результатов

best = fmin(hyperopt_rf, # наша функция 
          space=space, # пространство гиперпараметров
          algo=tpe.suggest, # алгоритм оптимизации, установлен по умолчанию, задавать необязательно
          max_evals=20, # максимальное количество итераций
          trials=trials, # логирование результатов
          rstate=np.random.default_rng(random_state)# фиксируем для повторяемости результата
         )
print("Наилучшие значения гиперпараметров {}".format(best))

100%|██████████| 20/20 [02:21<00:00,  7.05s/trial, best loss: -0.8059903096858185]
Наилучшие значения гиперпараметров {'max_depth': 14.0, 'min_samples_leaf': 2.0, 'n_estimators': 103.0}


In [99]:
# рассчитаем точность для тестовой выборки
model = ensemble.RandomForestClassifier(
    random_state=random_state, 
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_leaf=int(best['min_samples_leaf'])
)

model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)

print('f1_score на обучающем наборе: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print("accuracy на тестовом наборе: {:.2f}".format(model.score(X_test, y_test)))

y_test_pred = model.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

rf_f1_hyperopt = metrics.f1_score(y_test, y_test_pred)

f1_score на обучающем наборе: 0.97
accuracy на тестовом наборе: 0.81
f1_score на тестовом наборе: 0.84


# Optuna

In [100]:
import optuna

### LogisticRegression

In [101]:
def optuna_lr(trial, cv=5, X=X_train, y=y_train, random_state=random_state):
  # задаем пространства поиска гиперпараметров
  C = trial.suggest_float("C", 0.1, 1)

  # создаем модель
  model = linear_model.LogisticRegression(C = C, max_iter=2000)
  # обучаем модель
  model.fit(X_train, y_train)
  score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

  return score

In [102]:
# cоздаем объект исследования
# можем напрямую указать, что нам необходимо максимизировать метрику direction="maximize"
random_state = 42

study = optuna.create_study(study_name="LogisticRegression", direction="maximize")
# ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_lr, n_trials=20)

[32m[I 2023-02-20 13:48:57,003][0m A new study created in memory with name: LogisticRegression[0m
[32m[I 2023-02-20 13:49:03,285][0m Trial 0 finished with value: 0.7763397390315823 and parameters: {'C': 0.4549526037381143}. Best is trial 0 with value: 0.7763397390315823.[0m
[32m[I 2023-02-20 13:49:11,250][0m Trial 1 finished with value: 0.7701069795023867 and parameters: {'C': 0.8194968164627985}. Best is trial 0 with value: 0.7763397390315823.[0m
[32m[I 2023-02-20 13:49:16,923][0m Trial 2 finished with value: 0.7792431579427856 and parameters: {'C': 0.324667396756494}. Best is trial 2 with value: 0.7792431579427856.[0m
[32m[I 2023-02-20 13:49:22,321][0m Trial 3 finished with value: 0.7812534362163072 and parameters: {'C': 0.264799863101205}. Best is trial 3 with value: 0.7812534362163072.[0m
[32m[I 2023-02-20 13:49:26,202][0m Trial 4 finished with value: 0.7831189559934645 and parameters: {'C': 0.11733286570736294}. Best is trial 4 with value: 0.7831189559934645.[0m


In [103]:
# выводим результаты на обучающей выборке
print("Наилучшие значения гиперпараметров {}".format(study.best_params))
print("f1_score на обучающем наборе: {:.2f}".format(study.best_value))

Наилучшие значения гиперпараметров {'C': 0.10035461171836424}
f1_score на обучающем наборе: 0.79


In [104]:
# рассчитаем точность для тестовой выборки
model = linear_model.LogisticRegression(**study.best_params,random_state=random_state, max_iter=2000)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
print("accuracy на тестовом наборе: {:.2f}".format(model.score(X_test, y_test)))
y_test_pred = model.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

lr_f1_optuna = metrics.f1_score(y_test, y_test_pred)

accuracy на тестовом наборе: 0.76
f1_score на тестовом наборе: 0.79


### RandomForestClassifier

In [105]:
def optuna_rf(trial, cv=5, X=X_train, y=y_train, random_state=random_state):
  # задаем пространства поиска гиперпараметров
  n_estimators = trial.suggest_int('n_estimators', 100, 300, 10)
  max_depth = trial.suggest_int('max_depth', 15, 40, 1)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 7, 1)

  # создаем модель
  model = ensemble.RandomForestClassifier(n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf,
                                          random_state=random_state)
  # обучаем модель
  model.fit(X_train, y_train)
  score = cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

  return score

In [106]:
# cоздаем объект исследования
# можем напрямую указать, что нам необходимо максимизировать метрику direction="maximize"
random_state = 42

study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
# ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_rf, n_trials=20)

[32m[I 2023-02-20 13:50:52,314][0m A new study created in memory with name: RandomForestClassifier[0m
[32m[I 2023-02-20 13:51:03,030][0m Trial 0 finished with value: 0.800227613802776 and parameters: {'n_estimators': 210, 'max_depth': 37, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.800227613802776.[0m
[32m[I 2023-02-20 13:51:14,306][0m Trial 1 finished with value: 0.8050621970382581 and parameters: {'n_estimators': 180, 'max_depth': 26, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.8050621970382581.[0m
[32m[I 2023-02-20 13:51:27,754][0m Trial 2 finished with value: 0.8013181749714681 and parameters: {'n_estimators': 260, 'max_depth': 34, 'min_samples_leaf': 6}. Best is trial 1 with value: 0.8050621970382581.[0m
[32m[I 2023-02-20 13:51:39,953][0m Trial 3 finished with value: 0.7985614102493408 and parameters: {'n_estimators': 250, 'max_depth': 15, 'min_samples_leaf': 6}. Best is trial 1 with value: 0.8050621970382581.[0m
[32m[I 2023-02-20 13:51:46,065]

In [107]:
# выводим результаты на обучающей выборке
print("Наилучшие значения гиперпараметров {}".format(study.best_params))
print("f1_score на обучающем наборе: {:.2f}".format(study.best_value))

Наилучшие значения гиперпараметров {'n_estimators': 100, 'max_depth': 16, 'min_samples_leaf': 3}
f1_score на обучающем наборе: 0.81


In [108]:
# рассчитаем точность для тестовой выборки
model = ensemble.RandomForestClassifier(**study.best_params, random_state=random_state)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
print("accuracy на тестовом наборе: {:.2f}".format(model.score(X_test, y_test)))
y_test_pred = model.predict(X_test)
print('f1_score на тестовом наборе: {:.2f}'.format(metrics.f1_score(y_test, y_test_pred)))

rf_f1_optuna = metrics.f1_score(y_test, y_test_pred)

accuracy на тестовом наборе: 0.81
f1_score на тестовом наборе: 0.83


In [115]:
lr_comparison = pd.Series({
    'Base': f1_test_logreg,
    'GridSearch': lr_f1_grid,
    'RandomSearch':lr_f1_randsearch,
    'Hyperopt': lr_f1_hyperopt,
    'Optuna':lr_f1_optuna
    })

In [137]:
lr_comparison.sort_values(ascending=False)

GridSearch      0.794552
RandomSearch    0.794552
Optuna          0.794552
Hyperopt        0.792711
Base            0.789054
dtype: float64

### For these settings, GridSearch showed the best result

In [139]:
rf_comparison = pd.Series({
    'Base': f1_test_rf,
    'GridSearch': rf_f1_grid,
    'RandomSearch':rf_f1_randsearch,
    'Hyperopt': rf_f1_hyperopt,
    'Optuna':rf_f1_optuna
    })

In [140]:
rf_comparison.sort_values(ascending=False)

Hyperopt        0.837963
Base            0.835443
RandomSearch    0.832184
Optuna          0.832168
GridSearch      0.829885
dtype: float64

### For these settings, Hyperopt showed the best result