## Dobór Hiperpaprametrów

In [9]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint, uniform
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [3]:
df_train = pd.read_csv('../data_with_features/train_with_features.csv')
df_test = pd.read_csv('../data_with_features/test_with_features.csv')

In [4]:
y = df_train['phase']
x = df_train.drop(['cellid', 'phase', 'order_within_phase', 'order'], axis = 1)

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.2, random_state = 0, stratify = y)

### HistGradBoost

In [4]:
hgb_clf = HistGradientBoostingClassifier()

In [5]:
param_grid = {
    'max_iter': sp_randint(50, 500),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': sp_randint(3, 15),
    'min_samples_leaf': sp_randint(1, 20),
    'max_leaf_nodes': sp_randint(10, 100),
    'max_bins': sp_randint(100, 255),
    'l2_regularization': uniform(0, 1),
}

random_search = RandomizedSearchCV(
    estimator=hgb_clf,
    param_distributions=param_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    n_jobs=-1,
    scoring='accuracy',
)

random_search.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [6]:
random_search.best_params_

{'l2_regularization': np.float64(0.3858918742087857),
 'learning_rate': np.float64(0.06498615487691643),
 'max_bins': 112,
 'max_depth': 13,
 'max_iter': 464,
 'max_leaf_nodes': 31,
 'min_samples_leaf': 12}

In [7]:
random_search.best_score_

np.float64(0.5775)

In [9]:
pd.DataFrame(random_search.cv_results_).to_excel('../parameter_search_results/random_search.xlsx', index=False)

In [10]:
param_grid = {
    'l2_regularization': [0.4, 0.5, 0.6],
    'learning_rate': [0.05, 0.1, 0.15],
    'max_bins': [50,100,150],
    'max_depth': [5,10,15],
    'max_iter': [100,1000],
    'max_leaf_nodes': [50,100],
    'min_samples_leaf': [10,20],
}

cv_search = GridSearchCV(
    estimator=hgb_clf,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
    scoring='accuracy',
)

cv_search.fit(x_train, y_train)

Fitting 3 folds for each of 648 candidates, totalling 1944 fits


In [11]:
cv_search.best_params_

{'l2_regularization': 0.6,
 'learning_rate': 0.1,
 'max_bins': 50,
 'max_depth': 10,
 'max_iter': 100,
 'max_leaf_nodes': 50,
 'min_samples_leaf': 10}

In [12]:
param_grid = {
    'l2_regularization': [0.6,0.7],
    'learning_rate': [0.1],
    'max_bins': [25, 50, 75],
    'max_depth': [10],
    'max_iter': [50,100,150],
    'max_leaf_nodes': [40,50,60],
    'min_samples_leaf': [8,15],
}

cv_search = GridSearchCV(
    estimator=hgb_clf,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
    scoring='accuracy',
)

cv_search.fit(x_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [13]:
cv_search.best_params_

{'l2_regularization': 0.6,
 'learning_rate': 0.1,
 'max_bins': 25,
 'max_depth': 10,
 'max_iter': 50,
 'max_leaf_nodes': 60,
 'min_samples_leaf': 8}

In [14]:
param_grid = {
    'l2_regularization': [0.55,0.6,0.65],
    'learning_rate': [0.1],
    'max_bins': [20,30],
    'max_depth': [10],
    'max_iter': [40,50,60],
    'max_leaf_nodes': [60,70,80,90],
    'min_samples_leaf': [8,10,12],
}

cv_search = GridSearchCV(
    estimator=hgb_clf,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
    scoring='accuracy',
)

cv_search.fit(x_train, y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


In [15]:
cv_search.best_params_

{'l2_regularization': 0.65,
 'learning_rate': 0.1,
 'max_bins': 20,
 'max_depth': 10,
 'max_iter': 40,
 'max_leaf_nodes': 60,
 'min_samples_leaf': 10}

In [16]:
param_grid = {
    'l2_regularization': [0.65],
    'learning_rate': [0.1],
    'max_bins': [15,20,25],
    'max_depth': [10],
    'max_iter': [30,35,40],
    'max_leaf_nodes': [50,55,60],
    'min_samples_leaf': [9,10,11],
}

cv_search = GridSearchCV(
    estimator=hgb_clf,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
    scoring='accuracy',
)

cv_search.fit(x_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [17]:
cv_search.best_params_

{'l2_regularization': 0.65,
 'learning_rate': 0.1,
 'max_bins': 20,
 'max_depth': 10,
 'max_iter': 40,
 'max_leaf_nodes': 55,
 'min_samples_leaf': 10}

In [18]:
param_grid = {
    'l2_regularization': [0.65],
    'learning_rate': [0.1],
    'max_bins': [18,20,22],
    'max_depth': [10],
    'max_iter': [38,40,42,44],
    'max_leaf_nodes': [52,55,57],
    'min_samples_leaf': [10],
}

cv_search = GridSearchCV(
    estimator=hgb_clf,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
    scoring='accuracy',
)

cv_search.fit(x_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [19]:
cv_search.best_params_

{'l2_regularization': 0.65,
 'learning_rate': 0.1,
 'max_bins': 20,
 'max_depth': 10,
 'max_iter': 42,
 'max_leaf_nodes': 55,
 'min_samples_leaf': 10}

In [20]:
param_grid = {
    'l2_regularization': [0.65],
    'learning_rate': [0.1],
    'max_bins': [20,21],
    'max_depth': [10],
    'max_iter': [41,42,43],
    'max_leaf_nodes': [53,54,55,56],
    'min_samples_leaf': [10],
}

cv_search = GridSearchCV(
    estimator=hgb_clf,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
    scoring='accuracy',
)

cv_search.fit(x_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [21]:
cv_search.best_params_

{'l2_regularization': 0.65,
 'learning_rate': 0.1,
 'max_bins': 20,
 'max_depth': 10,
 'max_iter': 43,
 'max_leaf_nodes': 54,
 'min_samples_leaf': 10}

In [22]:
hgb_clf = HistGradientBoostingClassifier(
    l2_regularization=0.65,
    learning_rate=0.1,
    max_bins=20,
    max_depth=10,
    max_iter=43,
    max_leaf_nodes=54,
    min_samples_leaf=10,
)

hgb_clf.fit(x_train, y_train)

y_pred_hgb = hgb_clf.predict(x_val)

accuracy_score(y_val, y_pred_hgb)

0.5813953488372093

### Losowe lasy

In [5]:
rf_clf = RandomForestClassifier()

param_grid = {
    'n_estimators': sp_randint(50, 500),
    'max_depth': [None] + list(sp_randint(3, 20).rvs(10)),
    'min_samples_split': sp_randint(2, 20),
    'min_samples_leaf': sp_randint(1, 20),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

random_search = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=param_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='accuracy',
)

random_search.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [6]:
random_search.best_params_

{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': np.int64(16),
 'max_features': None,
 'min_samples_leaf': 17,
 'min_samples_split': 2,
 'n_estimators': 471}

In [7]:
pd.DataFrame(random_search.cv_results_).to_excel('../parameter_search_results/random_search_randomforest.xlsx', index=False)

In [10]:
param_grid = {
    'n_estimators': [100,300,500],
    'max_depth': [None,8,10,12,14],
    'min_samples_split': [3,8,12,18],
    'min_samples_leaf': [3,8,12,18],
    'max_features': [None],
    'bootstrap': [True],
    'criterion': ['gini']
}

cv_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
    scoring='accuracy',
)

cv_search.fit(x_train, y_train)

Fitting 3 folds for each of 240 candidates, totalling 720 fits


In [11]:
cv_search.best_params_

{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 12,
 'max_features': None,
 'min_samples_leaf': 18,
 'min_samples_split': 8,
 'n_estimators': 100}

In [13]:
param_grid = {
    'n_estimators': [50,100,200],
    'max_depth': [11,12,13],
    'min_samples_split': [6,8,10],
    'min_samples_leaf': [15,18,21],
    'max_features': [None],
    'bootstrap': [True],
    'criterion': ['gini']
}

cv_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
    scoring='accuracy',
)

cv_search.fit(x_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [14]:
cv_search.best_params_

{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 11,
 'max_features': None,
 'min_samples_leaf': 21,
 'min_samples_split': 8,
 'n_estimators': 200}

In [15]:
param_grid = {
    'n_estimators': [150,200,250],
    'max_depth': [11],
    'min_samples_split': [7,8,9],
    'min_samples_leaf': [21,22,23],
    'max_features': [None],
    'bootstrap': [True],
    'criterion': ['gini']
}

cv_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
    scoring='accuracy',
)

cv_search.fit(x_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [16]:
cv_search.best_params_

{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 11,
 'max_features': None,
 'min_samples_leaf': 23,
 'min_samples_split': 7,
 'n_estimators': 150}

In [17]:
param_grid = {
    'n_estimators': [125,150,175],
    'max_depth': [11],
    'min_samples_split': [5,6,7],
    'min_samples_leaf': [23,25,27],
    'max_features': [None],
    'bootstrap': [True],
    'criterion': ['gini']
}

cv_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
    scoring='accuracy',
)

cv_search.fit(x_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [18]:
cv_search.best_params_

{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 11,
 'max_features': None,
 'min_samples_leaf': 25,
 'min_samples_split': 7,
 'n_estimators': 125}

In [19]:
param_grid = {
    'n_estimators': [110,125,140],
    'max_depth': [11],
    'min_samples_split': [7],
    'min_samples_leaf': [24,25,26],
    'max_features': [None],
    'bootstrap': [True],
    'criterion': ['gini']
}

cv_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
    scoring='accuracy',
)

cv_search.fit(x_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


In [20]:
cv_search.best_params_

{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 11,
 'max_features': None,
 'min_samples_leaf': 24,
 'min_samples_split': 7,
 'n_estimators': 110}

In [23]:
rf_clf = RandomForestClassifier(
    random_state=42,
    bootstrap=True,
    criterion='gini',
    max_depth=11,
    max_features=None,
    min_samples_leaf=24,
    min_samples_split=7,
    n_jobs=-1,
    n_estimators=110,
)

rf_clf.fit(x_train, y_train)

y_pred_rf = rf_clf.predict(x_val)

accuracy_score(y_val, y_pred_rf)

0.584717607973422

### XGBoost

In [24]:
xgb_clf = XGBClassifier()

In [25]:
le = LabelEncoder()
y_numeric_train = le.fit_transform(y_train)  # Zamienia 'G1' -> 0 itd
y_numeric_val = le.fit_transform(y_val)

In [26]:
param_grid = {
    'n_estimators': sp_randint(50, 500),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': sp_randint(3, 10),
    'min_child_weight': sp_randint(1, 10),
    'gamma': uniform(0, 0.5),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 1)
}

random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='accuracy',
)

random_search.fit(x_train, y_numeric_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [27]:
random_search.best_params_

{'colsample_bytree': np.float64(0.9307195311236314),
 'gamma': np.float64(0.4795373972820139),
 'learning_rate': np.float64(0.11275727715356301),
 'max_depth': 8,
 'min_child_weight': 8,
 'n_estimators': 72,
 'reg_alpha': np.float64(0.4620058036441327),
 'reg_lambda': np.float64(0.9472833396118153),
 'subsample': np.float64(0.661340561246432)}

In [None]:
pd.DataFrame(random_search.cv_results_).to_excel('../parameter_search_results/random_search_xgboost.xlsx', index=False)

In [28]:
xgb_clf = XGBClassifier(
    colsample_bytree=1,
    gamma=0.5,
    learning_rate=0.1,
    max_depth=8,
    min_child_weight=8,
    n_estimators=100,
    reg_alpha=0.5,
    reg_lambda=1,
    subsample=0.66,
)

In [29]:
xgb_clf.fit(x_train, y_numeric_train)

y_pred_xgb = xgb_clf.predict(x_val)

accuracy_score(y_numeric_val, y_pred_xgb)

0.5714285714285714

### Regresja logistyczna

In [30]:
log_reg = LogisticRegression(max_iter=10000)

# pipeline z imputerem Nanow i logistyczna regresja
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with the mean
    ('log_reg', LogisticRegression(max_iter=10000))
])

#
param_grid = {
    'log_reg__C': uniform(0.1, 10),
    'log_reg__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'log_reg__penalty': ['l2']
}

# 3
random_search = RandomizedSearchCV(
    estimator=pipeline,  # Use the pipeline as the estimator
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

random_search.fit(x_train, y_numeric_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [31]:
random_search.best_params_

{'log_reg__C': np.float64(3.845401188473625),
 'log_reg__penalty': 'l2',
 'log_reg__solver': 'saga'}

In [33]:
log_reg = LogisticRegression(max_iter=10000, C=3.845, solver='saga')

log_reg.fit(x_train, y_numeric_train)

y_pred_log_reg = log_reg.predict(x_val)

accuracy_score(y_numeric_val, y_pred_log_reg)

0.53156146179402

### Klasyfikator głosujący

In [14]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators=[('xgb', xgb_clf), ('rf', rf_clf), ('hgb', hgb_clf), ('logreg', log_reg)],
    voting='soft'
)

voting_clf.fit(x_train, y_train)

y_pred_voting = voting_clf.predict(x_val)

accuracy_score(y_val, y_pred_voting)

0.5813953488372093