In [1]:
import pandas as pd
import numpy as np
import warnings
import hyperopt
import optuna

from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics
from hyperopt import hp, fmin, tpe, Trials

warnings.filterwarnings('ignore')
random_state = 42

# 1. Загрузка и подготовка данных

In [2]:
data = pd.read_csv('data/_train_sem09 (1).csv')
data.head(3)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# проверка сбалансированности целевого признака
print(data['Activity'].value_counts(normalize=True)
)# матрица значений признаков
X = data.drop(['Activity'], axis=1)
# вектор правильных ответов
y = data['Activity']

Activity
1    0.542255
0    0.457745
Name: proportion, dtype: float64


*Данные относительно сбалансированы по классам*

In [4]:
# разбивка данных на train/test выборки
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=random_state
)

# 2. Обучаем базовые модели

In [5]:
# логистическая регрессия
log_reg_model = linear_model.LogisticRegression(
    random_state=random_state,
    max_iter=1000
)


log_reg_model.fit(X_train, y_train)


print(f'log_reg train f1 scores {metrics.f1_score(y_train, log_reg_model.predict(X_train))}')
print(f'log_reg test f1 scores {metrics.f1_score(y_test, log_reg_model.predict(X_test))}')

# случайный лес
rand_forest_model = ensemble.RandomForestClassifier(
    random_state=random_state
)

rand_forest_model.fit(X_train, y_train)


print(f'forest train f1 scores {metrics.f1_score(y_train, rand_forest_model.predict(X_train))}')
print(f'forest test f1 scores {metrics.f1_score(y_test, rand_forest_model.predict(X_test))}')


log_reg train f1 scores 0.8875154511742892
log_reg test f1 scores 0.7890535917901939
forest train f1 scores 1.0
forest test f1 scores 0.8279816513761468


# 3. GridSearchCV

In [7]:
%%time
# для логистической регрессии

param_grid = [
        {
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'C': [0.9, 0.95, 1, 1.1]
        }
]

grid_search_lr = model_selection.GridSearchCV(
        estimator=log_reg_model,
        param_grid=param_grid,
        scoring='f1',
        cv=5,
        n_jobs=1
)

grid_search_lr.fit(X_train, y_train)


print(f'оптимальные значения параметров {grid_search_lr.best_params_}')
print('train/test метрика f1 соответственно: ',
      grid_search_lr.score(X_train, y_train),
      grid_search_lr.score(X_test, y_test),
      sep='\n'
      )

In [None]:
%%time
# для случайного леса

param_grid = {
    'n_estimators': [50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 8, 11, 13, 16, 19],
    'min_samples_leaf': [1, 3, 6, 9, 11, 15]
}

grid_search_rf = model_selection.GridSearchCV(
    estimator=rand_forest_model,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1
)

grid_search_rf.fit(X_train, y_train)
print(f'оптимальные значения параметров {grid_search_rf.best_params_}')
print('train/test метрика f1 соответственно: ',
      round(grid_search_rf.score(X_train, y_train), 2),
      round(grid_search_rf.score(X_test, y_test), 2),
      sep='\n'
      )

CPU times: total: 16 s
Wall time: 10min 48s
оптимальные значения параметров {'criterion': 'gini', 'max_depth': 16, 'min_samples_leaf': 1, 'n_estimators': 200}
train/test метрика f1 соответственно: 
1.0
0.84


*Очень большие затраты по ресурсам на этот метод уходит, метрика логистической регрессии не выросла, метрика на случайном лесе стала лучше, но переобученность модели видно не вооруженным глазом*

# 4. RandomizedSearchCV

In [None]:
%%time
# для логистической регрессии

param_distributions = [
        {
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'C': [0.8, 0.85, 0.9, 0.95, 1, 1.05, 1.1]
        },
        {
        'penalty': ['elasticnet'],
        'l1_ratio': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        }
]

random_search_lr = model_selection.RandomizedSearchCV(
    estimator=log_reg_model,
    param_distributions=param_distributions,
    n_iter=25,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

random_search_lr.fit(X_train, y_train)
print(f'оптимальные значения параметров {random_search_lr.best_params_}')
print('train/test метрика f1 соответственно: ',
      round(random_search_lr.score(X_train, y_train), 2),
      round(random_search_lr.score(X_test, y_test), 2),
      sep='\n'
      )

CPU times: total: 1min 16s
Wall time: 12min 56s
оптимальные значения параметров {'solver': 'saga', 'penalty': 'l1', 'C': 0.8}
train/test метрика f1 соответственно: 
0.87
0.79


In [None]:
%%time
# для случайного леса

param_distributions = {'min_samples_leaf': list(np.linspace(1, 100, 50, dtype=int)),
              'max_depth': list(np.linspace(1, 30, 50, dtype=int)),
              'criterion':['entropy','gini']
              }
            
random_search_rf = model_selection.RandomizedSearchCV(
    estimator=rand_forest_model, 
    param_distributions=param_distributions, 
    cv=5,
    n_iter = 25, 
    n_jobs = -1
)

random_search_rf.fit(X_train, y_train)
print(f'оптимальные значения параметров {random_search_rf.best_params_}')
print('train/test метрика f1 соответственно: ',
      round(random_search_rf.score(X_train, y_train), 2),
      round(random_search_rf.score(X_test, y_test), 2),
      sep='\n'
      )

CPU times: total: 14 s
Wall time: 30.1 s
оптимальные значения параметров {'min_samples_leaf': 7, 'max_depth': 13, 'criterion': 'entropy'}
train/test метрика f1 соответственно: 
0.91
0.8


*На логистической регрессии без улучшений, на случайном лесе тестовая метрика чуть ухудшилась, но переобучение снизилось (если так корректно сказать). Времязатраты снизились в разы*

# 5. Hyperopt

In [6]:
%%time
# для логистической регрессии
pena = ['l1', 'l2']
solv = ['liblinear', 'saga']
space = {
    'penalty' : hp.choice('penalty', pena),
    'solver': hp.choice('solver', solv),
    'C': hp.uniform('C', 0.01, 1)
}

def hyperopt_lr(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    params = {
        'penalty' : str(params['penalty']),
        'solver': str(params['solver']),
        'C': float(params['C'])
}

    model = linear_model.LogisticRegression(**params, random_state=random_state)

    model.fit(X, y)

    score = model_selection.cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    return -score


trials = Trials()

best=fmin(hyperopt_lr, 
          space=space, 
          algo=tpe.suggest, 
          max_evals=30, 
          trials=trials,
          rstate=np.random.default_rng(random_state))

print("Наилучшие значения гиперпараметров {}".format(best))


hyper_model_lr = linear_model.LogisticRegression(
    penalty=pena[best['penalty']],
    solver=solv[best['solver']],
    C=float(best['C']),
    max_iter=1000,
    random_state=random_state
)


hyper_model_lr.fit(X_train, y_train)

print(f'метрика на тренировочной выборке {metrics.f1_score(y_train, hyper_model_lr.predict(X_train))}')
print(f'метрика на тестовой выборке {metrics.f1_score(y_test, hyper_model_lr.predict(X_test))}')


  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 30/30 [04:40<00:00,  9.36s/trial, best loss: -0.7849372073971516]
Наилучшие значения гиперпараметров {'C': 0.358197306683544, 'penalty': 0, 'solver': 0}
метрика на тестовой выборке 0.7972665148063781
метрика на кросс-валидации 0.7849372073971516
CPU times: total: 1min 24s
Wall time: 4min 42s


In [7]:
%%time
# для случайного леса
crit = ['gini', 'entropy']

space = {
    'criterion': hp.choice('criterion', crit),
    'n_estimators': hp.quniform('n_estimators', 10, 250, 10),
    'max_depth': hp.quniform('max_depth', 1, 30, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 100, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1)
}


def hyperopt_rf(params, cv=5, X=X_train, y=y_train, random_state=random_state):
    params = {
        'criterion': str(params['criterion']),
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'min_samples_leaf': int(params['min_samples_leaf']),
        'min_samples_split': int(params['min_samples_split'])
}

    model = ensemble.RandomForestClassifier(**params, random_state=random_state)

    score = model_selection.cross_val_score(model, X, y, cv=cv, scoring="f1", n_jobs=-1).mean()

    return -score


trials = Trials()

best=fmin(hyperopt_rf, 
          space=space, 
          algo=tpe.suggest, 
          max_evals=30, 
          trials=trials,
          rstate=np.random.default_rng(random_state))

print("Наилучшие значения гиперпараметров {}".format(best))


hyper_model_rf = ensemble.RandomForestClassifier(
    criterion=crit[best['criterion']],
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    min_samples_leaf=int(best['min_samples_leaf']),
    min_samples_split=int(best['min_samples_split']),
    random_state=random_state
)


hyper_model_rf.fit(X_train, y_train)

print(f'метрика на тестовой выборке {metrics.f1_score(y_train, hyper_model_rf.predict(X_train))}')
print(f'метрика на тестовой выборке {metrics.f1_score(y_test, hyper_model_rf.predict(X_test))}')


  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 30/30 [01:31<00:00,  3.06s/trial, best loss: -0.8100956483985531]
Наилучшие значения гиперпараметров {'criterion': 1, 'max_depth': 25.0, 'min_samples_leaf': 2.0, 'min_samples_split': 10.0, 'n_estimators': 150.0}
метрика на тестовой выборке 0.8348837209302326
метрика на кросс-валидации 0.8100956483985531
CPU times: total: 4.34 s
Wall time: 1min 41s


# 6. Optuna

In [None]:
%%time
# для логистической регрессии
def optuna_lr(trial):
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])
    C = trial.suggest_uniform('C', 0.01, 1)
    
    model = linear_model.LogisticRegression(
        penalty=penalty,
        solver=solver,
        C=C,
        random_state=random_state,
        max_iter=1000
    )
    
    return model_selection.cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train,
        cv=5,
        scoring='f1',
        n_jobs=-1
    ).mean()
    

study = optuna.create_study(study_name='LogisticRegression', direction='maximize')
study.optimize(optuna_lr, n_trials=25)

print("значения гиперпараметров {}".format(study.best_params))
print("метрика на кросс-валидации: {:.2f}".format(study.best_value))

optuna_model_lr = linear_model.LogisticRegression(
    **study.best_params,
    random_state=random_state
)
optuna_model_lr.fit(X_train, y_train)
print(f'метрика на тестовой выборке {metrics.f1_score(y_test, optuna_model_lr.predict(X_test))}')

In [6]:
%%time
# для случайного леса
def optuna_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 300, 10)
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    max_depth = trial.suggest_int('max_depth', 5, 30, 1)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30, 1)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 30, 1)
    
    model = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
        random_state=random_state
    )
    
    return model_selection.cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring='f1',
        cv=5,
        n_jobs=-1
    ).mean()
    
study = optuna.create_study(
    study_name='RandomForestClassifier',
    direction='maximize')
study.optimize(
    optuna_rf,
    n_trials=50,
    n_jobs=-1
)

print("значения гиперпараметров {}".format(study.best_params))
print("метрика на кросс-валидации: {:.2f}".format(study.best_value))

optuna_model_rf = ensemble.RandomForestClassifier(
    **study.best_params,
    random_state=random_state
)
optuna_model_rf.fit(X_train, y_train)
print(f'метрика на тестовой выборке {metrics.f1_score(y_test, optuna_model_rf.predict(X_test))}')


[I 2024-08-25 20:58:53,020] A new study created in memory with name: RandomForestClassifier
[I 2024-08-25 20:59:05,674] Trial 1 finished with value: 0.7790015456246401 and parameters: {'n_estimators': 130, 'criterion': 'entropy', 'max_depth': 30, 'min_samples_leaf': 23, 'min_samples_split': 25}. Best is trial 1 with value: 0.7790015456246401.
[I 2024-08-25 20:59:09,582] Trial 0 finished with value: 0.791925715318489 and parameters: {'n_estimators': 120, 'criterion': 'gini', 'max_depth': 28, 'min_samples_leaf': 10, 'min_samples_split': 21}. Best is trial 0 with value: 0.791925715318489.
[I 2024-08-25 20:59:10,086] Trial 2 finished with value: 0.782807022890545 and parameters: {'n_estimators': 180, 'criterion': 'entropy', 'max_depth': 12, 'min_samples_leaf': 15, 'min_samples_split': 26}. Best is trial 0 with value: 0.791925715318489.
[I 2024-08-25 20:59:16,667] Trial 5 finished with value: 0.7782317655172789 and parameters: {'n_estimators': 240, 'criterion': 'gini', 'max_depth': 17, 'min

значения гиперпараметров {'n_estimators': 100, 'criterion': 'gini', 'max_depth': 23, 'min_samples_leaf': 5, 'min_samples_split': 9}
метрика на кросс-валидации: 0.80
метрика на тестовой выборке 0.8344827586206897
CPU times: total: 11 s
Wall time: 2min 24s
