# Strojenie hiperparametrów

## Wczytanie pakietów i danych

In [None]:
pip install light

In [8]:
import pandas as pd
import numpy as np

from catboost  import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_curve, auc, plot_roc_curve
from sklearn.metrics import plot_confusion_matrix

from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

In [2]:
df = pd.read_csv('preprocessed_data.csv')
X = df.drop(columns=['income_level'])
y = df['income_level']

In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45222 entries, 0 to 45221
Data columns (total 49 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   age                                   45222 non-null  float64
 1   workclass_State-gov                   45222 non-null  int64  
 2   workclass_Self-emp-not-inc            45222 non-null  int64  
 3   workclass_Private                     45222 non-null  int64  
 4   workclass_Federal-gov                 45222 non-null  int64  
 5   workclass_Local-gov                   45222 non-null  int64  
 6   workclass_Self-emp-inc                45222 non-null  int64  
 7   workclass_Without-pay                 45222 non-null  int64  
 8   education_num                         45222 non-null  int64  
 9   marital_status_Never-married          45222 non-null  int64  
 10  marital_status_Married-civ-spouse     45222 non-null  int64  
 11  marital_status_

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=45, stratify=y)

#### Funkcje pomocnicze

In [167]:
def simple_fit(clfs, cols=None):
    for i, clf in enumerate(clfs):
        if cols is not None and cols[i] is not None:              
            clf.fit(X_train.loc[:, cols[i]], y_train)
        
            y_pred = clf.predict(X_test.loc[:, cols[i]])
            y_pred_train = clf.predict(X_train.loc[:, cols[i]])
        else:
            clf.fit(X_train, y_train)  
            
            y_pred = clf.predict(X_test)
            y_pred_train = clf.predict(X_train)
        
        name = clf.__class__.__name__
        print(f"{name} train set accuracy score : {accuracy_score(y_train, y_pred_train):.4f}")
        print(f"{name} test set accuracy score : {accuracy_score(y_test, y_pred):.4f}")
        print("")

In [81]:
def cross_val(clf, cols=None):
        print("5-fold CV results")
        if cols is not None:
            cv_score = cross_validate(clf, X_train.loc[:, cols], y_train, scoring=["accuracy", "f1", "roc_auc"], return_train_score=True, n_jobs=-1)
        else:
            cv_score = cross_validate(clf, X_train, y_train, scoring=["accuracy", "f1", "roc_auc"], return_train_score=True, n_jobs=-1)

        name = clf.__class__.__name__

        print(f'{name} train accuracy : {np.mean(cv_score["train_accuracy"]) * 100:.2f}% +- {np.std(cv_score["train_accuracy"]) * 100:.2f}%')
        print(f'{name} test accuracy : {np.mean(cv_score["test_accuracy"]) * 100:.2f}% +- {np.std(cv_score["test_accuracy"]) * 100:.2f}%')
        print('')
        print(f'{name} test F1 : {np.mean(cv_score["test_f1"]) * 100:.2f}% +- {np.std(cv_score["test_f1"]) * 100:.2f}%')
        print(f'{name} test ROC AUC : {np.mean(cv_score["test_roc_auc"]) * 100:.2f}% +- {np.std(cv_score["test_roc_auc"]) * 100:.2f}%')
        print(f'{name} fit time : {np.mean(cv_score["fit_time"])}')
        print('')
        return cv_score

In [19]:
def insert_results_to_df(df, res, row_name):
    df.loc[row_name] = [np.mean(res["test_accuracy"]), \
                                np.mean(res["test_roc_auc"]), np.mean(res["test_f1"])]

In [20]:
indicators = ['accuracy', 'roc auc', 'f1']
t = pd.DataFrame(columns = indicators) 

## Wybór modeli
Po analizie wyników z naszej wcześniejszej pracy do dalszej analizy wybraliśmy modele CatBoostClassifier oraz XGBClassifier. Cechowały się one najwyższą dokładnością.
Poniżej przypomnimy jakie wyniki średnio osiągały testowane przez nas modele.

In [15]:
classifiers1 = [
    DecisionTreeClassifier(random_state=0, max_depth=3),
    LogisticRegression(random_state=0, max_iter=1000),
    AdaBoostClassifier(random_state=0, n_estimators=1000),
    RandomForestClassifier(random_state=0, n_estimators=750, max_depth=4),
    CatBoostClassifier(random_state=0, depth=3, silent=True, n_estimators=100),
    XGBClassifier(random_state=0, learning_rate=0.4, booster='gbtree', max_depth=4, eval_metric="logloss", use_label_encoder=False)
]

In [16]:
simple_fit(classifiers1)

DecisionTreeClassifier train set accuracy score : 0.8390
DecisionTreeClassifier test set accuracy score : 0.8404

LogisticRegression train set accuracy score : 0.8492
LogisticRegression test set accuracy score : 0.8496

AdaBoostClassifier train set accuracy score : 0.8703
AdaBoostClassifier test set accuracy score : 0.8685

RandomForestClassifier train set accuracy score : 0.8348
RandomForestClassifier test set accuracy score : 0.8389

CatBoostClassifier train set accuracy score : 0.8757
CatBoostClassifier test set accuracy score : 0.8687

XGBClassifier train set accuracy score : 0.8799
XGBClassifier test set accuracy score : 0.8707



## XGBClassifier
Aby móc zobaczyć różnicę, pokażemy jak ten model radzi sobie z dotychczasowymi ustawieniami parametrów. Następnie skorzystamy z metod dostrajania parametrów oraz doboru znaczących zmiennych i ocenimy różnicę.

### Model przed zmianami
Widzimy, że można posądzać model o lekki overfitting.

In [30]:
xg_boost = XGBClassifier(random_state=0,
                    learning_rate=0.4, 
                    booster='gbtree', 
                    max_depth=4,  
                    eval_metric="logloss",
                    use_label_encoder=False)

In [31]:
res = cross_val(xg_boost)

insert_results_to_df(t, res, 'XGBoost initial')

5-fold CV results
XGBClassifier train accuracy : 88.11% +- 0.12%
XGBClassifier test accuracy : 86.96% +- 0.40%

XGBClassifier test F1 : 71.50% +- 0.86%
XGBClassifier test ROC AUC : 92.71% +- 0.30%
XGBClassifier fit time : 2.273709297180176



### Grid Search
Przechodzimy do strojenia parametrów. Poniżej wybraliśmy parametry, które uważamy za najistotniejsze.

In [57]:
xg_boost = XGBClassifier()

parameters = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
    'booster': ['gbtree', 'gblinear', 'dart'],
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'random_state': [1],
    'eval_metric': ['logloss', 'rmse', 'error'],
    'use_label_encoder': [False]
}

xgb_grid = GridSearchCV(xg_boost, parameters, cv=3, n_jobs=-1)
xgb_grid.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs...
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
             n_jobs=-1,
             param_grid={'booster': ['gbtree', '

In [61]:
df = pd.DataFrame(xgb_grid.cv_results_)
df = df.loc[:,['params', 'mean_test_score']]

grid_search_results = df.sort_values('mean_test_score', ascending=False) \
                        .reset_index(drop=True)
grid_search_results

Unnamed: 0,params,mean_test_score
0,"{'booster': 'gbtree', 'eval_metric': 'rmse', 'gamma': 0.5, 'learning_rate': 0.4, 'max_depth': 3, 'min_child_weight': 1, 'random_state': 1, 'use_label_encoder': False}",0.872155
1,"{'booster': 'dart', 'eval_metric': 'error', 'gamma': 0.5, 'learning_rate': 0.4, 'max_depth': 3, 'min_child_weight': 1, 'random_state': 1, 'use_label_encoder': False}",0.872155
2,"{'booster': 'dart', 'eval_metric': 'logloss', 'gamma': 0.5, 'learning_rate': 0.4, 'max_depth': 3, 'min_child_weight': 1, 'random_state': 1, 'use_label_encoder': False}",0.872155
3,"{'booster': 'gbtree', 'eval_metric': 'logloss', 'gamma': 0.5, 'learning_rate': 0.4, 'max_depth': 3, 'min_child_weight': 1, 'random_state': 1, 'use_label_encoder': False}",0.872155
4,"{'booster': 'gbtree', 'eval_metric': 'error', 'gamma': 0.5, 'learning_rate': 0.4, 'max_depth': 3, 'min_child_weight': 1, 'random_state': 1, 'use_label_encoder': False}",0.872155
...,...,...
3775,"{'booster': 'gblinear', 'eval_metric': 'rmse', 'gamma': 2, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 5, 'random_state': 1, 'use_label_encoder': False}",0.827603
3776,"{'booster': 'gblinear', 'eval_metric': 'logloss', 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 6, 'min_child_weight': 10, 'random_state': 1, 'use_label_encoder': False}",0.827603
3777,"{'booster': 'gblinear', 'eval_metric': 'error', 'gamma': 0.5, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 10, 'random_state': 1, 'use_label_encoder': False}",0.827603
3778,"{'booster': 'gblinear', 'eval_metric': 'rmse', 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 4, 'min_child_weight': 10, 'random_state': 1, 'use_label_encoder': False}",0.827574


Wyniki nie poprawiły się w znaczący sposób. Widać też, że wiele modeli osiąga identyczne wyniki. 
Przyjrzymy się pierwszym 10 modelom oraz przeprowadzimy crossvalidację na najlepszym modelu by móc porównać go z modelem początkowym.

In [63]:
top10 = grid_search_results.iloc[:10, :]

top10

Unnamed: 0,params,mean_test_score
0,"{'booster': 'gbtree', 'eval_metric': 'rmse', 'gamma': 0.5, 'learning_rate': 0.4, 'max_depth': 3, 'min_child_weight': 1, 'random_state': 1, 'use_label_encoder': False}",0.872155
1,"{'booster': 'dart', 'eval_metric': 'error', 'gamma': 0.5, 'learning_rate': 0.4, 'max_depth': 3, 'min_child_weight': 1, 'random_state': 1, 'use_label_encoder': False}",0.872155
2,"{'booster': 'dart', 'eval_metric': 'logloss', 'gamma': 0.5, 'learning_rate': 0.4, 'max_depth': 3, 'min_child_weight': 1, 'random_state': 1, 'use_label_encoder': False}",0.872155
3,"{'booster': 'gbtree', 'eval_metric': 'logloss', 'gamma': 0.5, 'learning_rate': 0.4, 'max_depth': 3, 'min_child_weight': 1, 'random_state': 1, 'use_label_encoder': False}",0.872155
4,"{'booster': 'gbtree', 'eval_metric': 'error', 'gamma': 0.5, 'learning_rate': 0.4, 'max_depth': 3, 'min_child_weight': 1, 'random_state': 1, 'use_label_encoder': False}",0.872155
5,"{'booster': 'dart', 'eval_metric': 'rmse', 'gamma': 0.5, 'learning_rate': 0.4, 'max_depth': 3, 'min_child_weight': 1, 'random_state': 1, 'use_label_encoder': False}",0.872155
6,"{'booster': 'dart', 'eval_metric': 'rmse', 'gamma': 2, 'learning_rate': 0.3, 'max_depth': 5, 'min_child_weight': 1, 'random_state': 1, 'use_label_encoder': False}",0.871536
7,"{'booster': 'gbtree', 'eval_metric': 'logloss', 'gamma': 2, 'learning_rate': 0.3, 'max_depth': 5, 'min_child_weight': 1, 'random_state': 1, 'use_label_encoder': False}",0.871536
8,"{'booster': 'gbtree', 'eval_metric': 'rmse', 'gamma': 2, 'learning_rate': 0.3, 'max_depth': 5, 'min_child_weight': 1, 'random_state': 1, 'use_label_encoder': False}",0.871536
9,"{'booster': 'dart', 'eval_metric': 'logloss', 'gamma': 2, 'learning_rate': 0.3, 'max_depth': 5, 'min_child_weight': 1, 'random_state': 1, 'use_label_encoder': False}",0.871536


Wybieramy najlepszy model i testujemy go tak jak model początkowy.

In [62]:
xg_boost_best = XGBClassifier(booster='gbtree', 
                              eval_metric='rmse',
                              gamma=0.5,
                              learning_rate=0.4,
                              max_depth=3,
                              min_child_weight=1,
                              random_state=1,
                              use_label_encoder=False)

res = cross_val(xg_boost_best)

insert_results_to_df(t, res, 'XGBoost best GS')

5-fold CV results
XGBClassifier train accuracy : 87.69% +- 0.06%
XGBClassifier test accuracy : 87.05% +- 0.35%

XGBClassifier test F1 : 71.45% +- 0.79%
XGBClassifier test ROC AUC : 92.73% +- 0.32%
XGBClassifier fit time : 1.7649950981140137



In [63]:
t

Unnamed: 0,accuracy,roc auc,f1
XGBoost initial,0.86959,0.927138,0.715033
XGBoost best GS,0.870474,0.92728,0.714495


### Wnioski
W każdym z najlepszych modeli jest `learning rate=0.4`, `max_depth=3`,`min_child_weight=1`oraz `gamma=0.5`, widać że metryka nie ma wielkiego wpływu i że booster `gbtree` i `dart` są zbliżone. 

Najlepszy model osiąga lepsze wyniki niż model bazowy.

## Selekcja zmiennych

### SelectKbest

In [77]:
from sklearn.feature_selection import SelectKBest

selector = SelectKBest(k=30)
selector.fit_transform(X_train, y_train)

selected = selector.get_support()

In [82]:
res = cross_val(xg_boost_best, cols=selected)
insert_results_to_df(t, res, 'XGBoost best 30')

5-fold CV results
XGBClassifier train accuracy : 87.16% +- 0.06%
XGBClassifier test accuracy : 86.65% +- 0.27%

XGBClassifier test F1 : 70.32% +- 0.60%
XGBClassifier test ROC AUC : 92.50% +- 0.36%
XGBClassifier fit time : 1.3052058219909668



Zostawienie jedynie 30 zmiennych trochę pogorszyło jakość modelu.

In [83]:
selector = SelectKBest(k=40)
selector.fit_transform(X_train, y_train)

selected = selector.get_support()

res = cross_val(xg_boost_best, cols=selected)

insert_results_to_df(t, res, 'XGBoost best 40')

5-fold CV results
XGBClassifier train accuracy : 87.60% +- 0.07%
XGBClassifier test accuracy : 87.00% +- 0.30%

XGBClassifier test F1 : 71.35% +- 0.74%
XGBClassifier test ROC AUC : 92.71% +- 0.32%
XGBClassifier fit time : 1.5544206619262695



Zostawienie 40 zmiennych daje lepsze efekty niż przy 30, jednak nadal jest to gorszy wynik niż model ze wszystkimi zmiennymi.

In [84]:
selector = SelectKBest(k=46)
selector.fit_transform(X_train, y_train)

selected = selector.get_support()

res = cross_val(xg_boost_best, cols=selected)

insert_results_to_df(t, res, 'XGBoost best 46')

5-fold CV results
XGBClassifier train accuracy : 87.69% +- 0.07%
XGBClassifier test accuracy : 87.04% +- 0.35%

XGBClassifier test F1 : 71.44% +- 0.74%
XGBClassifier test ROC AUC : 92.74% +- 0.34%
XGBClassifier fit time : 1.109172248840332



In [88]:
t.sort_values(by=['accuracy'], ascending=False)

Unnamed: 0,accuracy,roc auc,f1
XGBoost best GS,0.870474,0.92728,0.714495
XGBoost best GS2,0.870415,0.927032,0.714842
XGBoost best 46,0.870415,0.927431,0.714378
XGBoost best 40,0.870002,0.927106,0.713468
XGBoost initial,0.86959,0.927138,0.715033
XGBoost best 30,0.866494,0.925001,0.703236


#### Wnioski
W przypadku tego modelu redukcja zmiennych nie wnosi korzyści.
Spróbujemy teraz dokonać redukcji metodą Recursive Feature Elimination.

### Recursive Feature Elimination

In [89]:
from sklearn.feature_selection import RFE

rfe_selector = RFE(xg_boost_best, n_features_to_select=40, step=1)
rfe_selector = rfe_selector.fit(X_train, y_train)

res = cross_val(xg_boost_best, cols=rfe_selector.support_)

insert_results_to_df(t, res, 'XGBoost best 40 RFE')

5-fold CV results
XGBClassifier train accuracy : 87.73% +- 0.06%
XGBClassifier test accuracy : 87.03% +- 0.44%

XGBClassifier test F1 : 71.42% +- 1.00%
XGBClassifier test ROC AUC : 92.75% +- 0.32%
XGBClassifier fit time : 1.6363305568695068



In [113]:
rfe_selector2 = RFE(xg_boost_best, n_features_to_select=45, step=1)
rfe_selector2 = rfe_selector2.fit(X_train, y_train) 

res = cross_val(xg_boost_best, cols=rfe_selector2.support_)

insert_results_to_df(t, res, 'XGBoost best 45 RFE')

5-fold CV results
XGBClassifier train accuracy : 87.69% +- 0.08%
XGBClassifier test accuracy : 87.07% +- 0.39%

XGBClassifier test F1 : 71.49% +- 0.90%
XGBClassifier test ROC AUC : 92.71% +- 0.33%
XGBClassifier fit time : 1.728564977645874



In [91]:
rfe_selector = RFE(xg_boost_best, n_features_to_select=47, step=1)
rfe_selector = rfe_selector.fit(X_train, y_train) 

res = cross_val(xg_boost_best, cols=rfe_selector.support_)

insert_results_to_df(t, res, 'XGBoost best 47 RFE')

5-fold CV results
XGBClassifier train accuracy : 87.69% +- 0.06%
XGBClassifier test accuracy : 87.05% +- 0.35%

XGBClassifier test F1 : 71.45% +- 0.79%
XGBClassifier test ROC AUC : 92.73% +- 0.32%
XGBClassifier fit time : 1.6473766326904298



In [92]:
t.sort_values(by=['accuracy'], ascending=False)

Unnamed: 0,accuracy,roc auc,f1
XGBoost best 45 RFE,0.87074,0.927096,0.714907
XGBoost best GS,0.870474,0.92728,0.714495
XGBoost best 47 RFE,0.870474,0.92728,0.714495
XGBoost best GS2,0.870415,0.927032,0.714842
XGBoost best 46,0.870415,0.927431,0.714378
XGBoost best 40 RFE,0.870268,0.927484,0.714233
XGBoost best 40,0.870002,0.927106,0.713468
XGBoost initial,0.86959,0.927138,0.715033
XGBoost best 30,0.866494,0.925001,0.703236


### Potencjalne kolumny do odrzucenia

In [114]:
cols = -(rfe_selector.support_.astype('int16') - 1)
cols = cols.astype('bool')
X_train.loc[:,cols].columns

Index(['occupation_Armed-Forces', 'sex_Female'], dtype='object')

In [115]:
cols2 = -(rfe_selector2.support_.astype('int16') - 1)
cols2 = cols2.astype('bool')
X_train.loc[:,cols2].columns

Index(['workclass_Without-pay', 'marital_status_Separated',
       'occupation_Armed-Forces', 'sex_Female'],
      dtype='object')

### Obserwacje
Przy usunięciu 2 zmiennych - `sex_Female`oraz `occupation_Armed-Forces`  model osiąga nieznacznie wyższe wyniki niż z nimi. Jednak różnice są tak małe, że może to być kwestia doboru zbioru treningowego.

### Weryfikacja na zbiorze testowym

In [118]:
xg_boost_best.fit(X_train, y_train)
y_pred = xg_boost_best.predict(X_test)
accuracy_score(y_pred, y_test)

0.8712188218644967

Weryfikacja okazała się miłym zaskoczeniem. Nie doświadczyliśmy overfittingu.
Sprawdźmy co jesli usuniemy kolumny `sex_Female` oraz `occupation_Armed-Forces`.

In [122]:
xg_boost_best = XGBClassifier(booster='gbtree', 
                              eval_metric='rmse',
                              gamma=0.5,
                              learning_rate=0.4,
                              max_depth=3,
                              min_child_weight=1,
                              random_state=1,
                              use_label_encoder=False)
xg_boost_best.fit(X_train.drop(columns=['occupation_Armed-Forces', 'sex_Female']), y_train)

y_pred = xg_boost_best.predict(X_test.drop(columns=['occupation_Armed-Forces', 'sex_Female']))
accuracy_score(y_pred, y_test)

0.8712188218644967

Wynik się nie zmienił dlatego porzucimy te dwie kolumny w finalnym modelu.

## CatBoostClassifier

In [159]:
catboost_base = CatBoostClassifier(learning_rate=0.04, depth=6, verbose = False)

res = cross_val(catboost_base)
insert_results_to_df(t, res, 'CatBoost initial')

5-fold CV results
CatBoostClassifier train accuracy : 88.85% +- 0.14%
CatBoostClassifier test accuracy : 87.04% +- 0.35%

CatBoostClassifier test F1 : 71.47% +- 0.87%
CatBoostClassifier test ROC AUC : 92.77% +- 0.32%
CatBoostClassifier fit time : 14.27340931892395



### Optymalizowane parametry:
- iterations (100)
- l2_leaf_reg
- min_data_in_leaf
- grow_policy
- learning_rate
- depth

### Grid Search

In [145]:
cat_boost = CatBoostClassifier(loss_function='Logloss', silent=True)
grid = {'l2_leaf_reg': [1, 2],
        'min_data_in_leaf': [2, 3, 4],
        'grow_policy' : ['SymmetricTree', 'Depthwise'],
        'learning_rate': [0.04, 0.1, 0.2, 0.3, 0.4, 0.5],
        'depth': [3, 4, 5, 6]}

cat_grid = GridSearchCV(model, grid, cv=3, n_jobs=-1)
cat_grid.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=<catboost.core.CatBoostClassifier object at 0x000002B1014474F0>,
             n_jobs=-1,
             param_grid={'depth': [3, 4, 5, 6],
                         'grow_policy': ['SymmetricTree', 'Depthwise'],
                         'l2_leaf_reg': [1, 2],
                         'learning_rate': [0.04, 0.1, 0.2, 0.3, 0.4, 0.5],
                         'min_data_in_leaf': [2, 3, 4]})

In [146]:
df = pd.DataFrame(cat_grid.cv_results_)
df = df.loc[:,['params', 'mean_test_score']]

grid_search_results = df.sort_values('mean_test_score', ascending=False) \
                        .reset_index(drop=True)
grid_search_results.iloc[0:4, 0] #najlepsze uzyskane wyniki
grid_search_results.iloc[0, 0]

{'depth': 3,
 'grow_policy': 'Depthwise',
 'l2_leaf_reg': 2,
 'learning_rate': 0.04,
 'min_data_in_leaf': 2}

In [151]:
cat_boost_best = CatBoostClassifier(loss_function='Logloss', silent=True, depth=3, grow_policy='Depthwise', l2_leaf_reg=2, learning_rate=0.04, min_data_in_leaf=2)
res = cross_val(cat_boost_best)
insert_results_to_df(t, res, 'CatBoost best GS')

5-fold CV results
CatBoostClassifier train accuracy : 87.84% +- 0.06%
CatBoostClassifier test accuracy : 87.14% +- 0.34%

CatBoostClassifier test F1 : 71.63% +- 0.79%
CatBoostClassifier test ROC AUC : 92.79% +- 0.32%
CatBoostClassifier fit time : 12.363013505935669



In [158]:
t

Unnamed: 0,accuracy,roc auc,f1
XGBoost initial,0.86959,0.927138,0.715033
XGBoost best GS,0.870474,0.92728,0.714495
XGBoost best GS2,0.870415,0.927032,0.714842
XGBoost best 30,0.866494,0.925001,0.703236
XGBoost best 40,0.870002,0.927106,0.713468
XGBoost best 46,0.870415,0.927431,0.714378
XGBoost best 40 RFE,0.870268,0.927484,0.714233
XGBoost best 45 RFE,0.87074,0.927096,0.714907
XGBoost best 47 RFE,0.870474,0.92728,0.714495
CatBoost best GS,0.871418,0.927888,0.716308


 #### Najlepsze parametry :
`depth=3`, `grow_policy=Depthwise`,
 `iterations=100`,
 `l2_leaf_reg=2`,
 `learning_rate=0.04`,
 `min_data_in_leaf=2`

### Selekcja Zmiennych

In [135]:
selector = SelectKBest(k=44)
selector.fit_transform(X_train, y_train)
selected = selector.get_support()
selected

X_train.columns[selected]
selector.pvalues_[selected]

imp_df = pd.DataFrame({"name":X_train.columns[selected], "importance":selector.pvalues_[selected]}).sort_values(by = 'importance')
imp_df

Unnamed: 0,name,importance
0,age,0.0
29,relationship_Own-child,0.0
33,sex_Male,0.0
14,occupation_Exec-managerial,0.0
27,relationship_Husband,0.0
35,capital_gain,0.0
8,marital_status_Married-civ-spouse,0.0
34,sex_Female,0.0
6,education_num,0.0
7,marital_status_Never-married,0.0


### Sprawdzenie po odrzuceniu 5 najmniej ważnych cech

In [139]:
res = cross_val(cat_boost_best, cols=selected)
insert_results_to_df(t, res, 'CatBoost best 44')

5-fold CV results
CatBoostClassifier train accuracy : 87.79% +- 0.06%
CatBoostClassifier test accuracy : 87.12% +- 0.31%

CatBoostClassifier test F1 : 71.60% +- 0.74%
CatBoostClassifier test ROC AUC : 92.73% +- 0.27%
CatBoostClassifier fit time : 1.2009706020355224



Odrzucenie zmiennych delikatnie pogorszyło wynik modelu. Dlatego w przypadku CatBoost zrezygnujemy z redukcji zmiennych.

### Końcowa ewaluacja modelu

In [161]:
cat_boost_best.fit(X_train, y_train)
y_pred = cat_boost_best.predict(X_test)

accuracy_score(y_test, y_pred)

0.8710419246417831

### Tymczasem bazowy model na danych testowych...

In [144]:
catboost_base.fit(X_train, y_train)
y_pred = catboost_base.predict(X_test)

accuracy_score(y_test, y_pred)

0.8708650274190696

### Wnioski
Strojenie hiperparametrów w klasyfikatorze CatBoost nie przyniosło efektów. Podobnie redukcja zmiennych nie wpływa na poprawę modelu.

## Podsumowanie

### Wyniki krosswalidacji

In [164]:
t.sort_values(by=['accuracy'], ascending=False)

Unnamed: 0,accuracy,roc auc,f1
CatBoost best GS,0.871418,0.927888,0.716308
CatBoost best 44,0.871182,0.927262,0.716044
XGBoost best 45 RFE,0.87074,0.927096,0.714907
XGBoost best GS,0.870474,0.92728,0.714495
XGBoost best 47 RFE,0.870474,0.92728,0.714495
CatBoost initial,0.870445,0.927683,0.714682
XGBoost best GS2,0.870415,0.927032,0.714842
XGBoost best 46,0.870415,0.927431,0.714378
XGBoost best 40 RFE,0.870268,0.927484,0.714233
XGBoost best 40,0.870002,0.927106,0.713468


### Porównanie wyników na zbiorze testowym

In [174]:
classifiers = [xg_boost, xg_boost_best, xg_boost_best, xg_boost_best, catboost_base, cat_boost_best, cat_boost_best]
cols = [None, None, rfe_selector.support_, rfe_selector2.support_, None, None, selected]
simple_fit(classifiers, cols)

XGBClassifier train set accuracy score : 0.8871
XGBClassifier test set accuracy score : 0.8691

XGBClassifier train set accuracy score : 0.8765
XGBClassifier test set accuracy score : 0.8712

XGBClassifier train set accuracy score : 0.8765
XGBClassifier test set accuracy score : 0.8712

XGBClassifier train set accuracy score : 0.8763
XGBClassifier test set accuracy score : 0.8710

CatBoostClassifier train set accuracy score : 0.8851
CatBoostClassifier test set accuracy score : 0.8709

CatBoostClassifier train set accuracy score : 0.8770
CatBoostClassifier test set accuracy score : 0.8710

CatBoostClassifier train set accuracy score : 0.8773
CatBoostClassifier test set accuracy score : 0.8698



Wyniki są bardzo zbliżone. Strojenie parametrów wprowadziło minimalne zmiany. Różnice między modelami mogą zależeć od podziału zbioru na testowy i treningowy.

Ponieważ dokładność modeli jest prawie identyczna uznaliśmy, że wybierzemy model, pod kątem tego aby był jak najszybszy.

Ostatecznie zdecydujemy się na model XGBClassifier o parametrach: `learning rate=0.4`, `max_depth=3`,`min_child_weight=1`, `gamma=0.5` oraz `booster=gbtree`.
Zrezygnujemy także z kolumn `sex_Female` oraz z kolumny `occupation_Armed-Forces`.