## Dobór Hiperparametrów

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint, uniform
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

In [None]:
# Read the feature-engineered train and test tables in one pass.
df_train, df_test = map(
    pd.read_csv,
    (
        '../data_with_features/train_with_features.csv',
        '../data_with_features/test_with_features.csv',
    ),
)

In [None]:
# Target is the 'phase' column; the features are every remaining column
# except the cell id and the two ordering columns.
y = df_train['phase']
x = df_train.drop(columns=['cellid', 'phase', 'order_within_phase', 'order'])

# Hold out 20% for validation, stratified on the target, with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.2,
    random_state=0,
)

### HistGradBoost

In [None]:
# Base estimator cloned by every HistGradientBoosting search below.
# Seeded so that repeated fits with the same hyperparameters give the same
# score; without a seed the estimator draws from NumPy's global RNG wherever
# it needs randomness (binning subsample, early-stopping split).
hgb_clf = HistGradientBoostingClassifier(random_state=42)

In [None]:
# Search space for the randomized search over hgb_clf.
# scipy's randint(low, high) draws integers from low..high-1;
# uniform(loc, scale) draws floats from [loc, loc + scale].
param_grid = {
    'max_iter': sp_randint(50, 500),
    'learning_rate': uniform(0.01, 0.3),   # [0.01, 0.31]
    'max_depth': sp_randint(3, 15),
    'min_samples_leaf': sp_randint(1, 20),
    'max_leaf_nodes': sp_randint(10, 100),
    'max_bins': sp_randint(100, 255),      # 100..254
    'l2_regularization': uniform(0, 1),
}

# 100 sampled candidates, each scored by accuracy with 3-fold CV on all cores.
random_search = RandomizedSearchCV(
    estimator=hgb_clf,
    param_distributions=param_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    # Fixed seed so the same candidates are sampled on every run, matching
    # the other randomized searches in this notebook.
    random_state=42,
    n_jobs=-1,
    scoring='accuracy',
)

random_search.fit(x_train, y_train)

In [None]:
# Show the parameter setting that scored best in the most recently fitted randomized search.
random_search.best_params_

In [None]:
# Show the mean cross-validated score (accuracy) of that best candidate.
random_search.best_score_

In [None]:
# Write the per-candidate CV results table of the randomized search to an Excel file.
pd.DataFrame(data=random_search.cv_results_).to_excel(
    excel_writer='../parameter_search_results/random_search.xlsx',
    index=False,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid for hgb_clf: 648 candidates x 3 folds.
param_grid = dict(
    l2_regularization=[0.4, 0.5, 0.6],
    learning_rate=[0.05, 0.1, 0.15],
    max_bins=[50, 100, 150],
    max_depth=[5, 10, 15],
    max_iter=[100, 1000],
    max_leaf_nodes=[50, 100],
    min_samples_leaf=[10, 20],
)

# Accuracy-scored 3-fold CV over every combination, using all cores.
cv_search = GridSearchCV(hgb_clf, param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)

cv_search.fit(x_train, y_train)

In [None]:
# Show the best parameter combination found by the grid search fitted just above.
cv_search.best_params_

In [None]:
# Narrower grid for hgb_clf with learning_rate and max_depth pinned:
# 108 candidates x 3 folds.
param_grid = dict(
    l2_regularization=[0.6, 0.7],
    learning_rate=[0.1],
    max_bins=[25, 50, 75],
    max_depth=[10],
    max_iter=[50, 100, 150],
    max_leaf_nodes=[40, 50, 60],
    min_samples_leaf=[8, 15],
)

# Accuracy-scored 3-fold CV over every combination, using all cores.
cv_search = GridSearchCV(hgb_clf, param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)

cv_search.fit(x_train, y_train)

In [None]:
# Show the best parameter combination found by the grid search fitted just above.
cv_search.best_params_

In [None]:
# Grid for hgb_clf with learning_rate and max_depth pinned:
# 216 candidates x 3 folds.
param_grid = dict(
    l2_regularization=[0.55, 0.6, 0.65],
    learning_rate=[0.1],
    max_bins=[20, 30],
    max_depth=[10],
    max_iter=[40, 50, 60],
    max_leaf_nodes=[60, 70, 80, 90],
    min_samples_leaf=[8, 10, 12],
)

# Accuracy-scored 3-fold CV over every combination, using all cores.
cv_search = GridSearchCV(hgb_clf, param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)

cv_search.fit(x_train, y_train)

In [None]:
# Show the best parameter combination found by the grid search fitted just above.
cv_search.best_params_

In [None]:
# Grid for hgb_clf with l2_regularization, learning_rate and max_depth pinned:
# 81 candidates x 3 folds.
param_grid = dict(
    l2_regularization=[0.65],
    learning_rate=[0.1],
    max_bins=[15, 20, 25],
    max_depth=[10],
    max_iter=[30, 35, 40],
    max_leaf_nodes=[50, 55, 60],
    min_samples_leaf=[9, 10, 11],
)

# Accuracy-scored 3-fold CV over every combination, using all cores.
cv_search = GridSearchCV(hgb_clf, param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)

cv_search.fit(x_train, y_train)

In [None]:
# Show the best parameter combination found by the grid search fitted just above.
cv_search.best_params_

In [None]:
# Grid for hgb_clf varying only max_bins, max_iter and max_leaf_nodes:
# 36 candidates x 3 folds.
param_grid = dict(
    l2_regularization=[0.65],
    learning_rate=[0.1],
    max_bins=[18, 20, 22],
    max_depth=[10],
    max_iter=[38, 40, 42, 44],
    max_leaf_nodes=[52, 55, 57],
    min_samples_leaf=[10],
)

# Accuracy-scored 3-fold CV over every combination, using all cores.
cv_search = GridSearchCV(hgb_clf, param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)

cv_search.fit(x_train, y_train)

In [None]:
# Show the best parameter combination found by the grid search fitted just above.
cv_search.best_params_

In [None]:
# Grid for hgb_clf varying only max_bins, max_iter and max_leaf_nodes:
# 24 candidates x 3 folds.
param_grid = dict(
    l2_regularization=[0.65],
    learning_rate=[0.1],
    max_bins=[20, 21],
    max_depth=[10],
    max_iter=[41, 42, 43],
    max_leaf_nodes=[53, 54, 55, 56],
    min_samples_leaf=[10],
)

# Accuracy-scored 3-fold CV over every combination, using all cores.
cv_search = GridSearchCV(hgb_clf, param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)

cv_search.fit(x_train, y_train)

In [None]:
# Show the best parameter combination found by the grid search fitted just above.
cv_search.best_params_

In [None]:
# Final HistGradientBoosting model with hand-fixed hyperparameters.
# Seeded like the final random-forest models so the reported validation
# accuracy is reproducible from run to run.
hgb_clf = HistGradientBoostingClassifier(
    l2_regularization=0.65,
    learning_rate=0.1,
    max_bins=20,
    max_depth=10,
    max_iter=43,
    max_leaf_nodes=54,
    min_samples_leaf=10,
    random_state=42,
)

hgb_clf.fit(x_train, y_train)

y_pred_hgb = hgb_clf.predict(x_val)

# Accuracy on the held-out validation split.
accuracy_score(y_val, y_pred_hgb)

### Losowe lasy

In [None]:
# Base forest cloned by every random-forest search below. Seeded so that CV
# scores of near-identical candidates (e.g. n_estimators 205 vs 210) are
# comparable rather than dominated by run-to-run bootstrap noise.
rf_clf = RandomForestClassifier(random_state=42)

# Search space. scipy's randint(low, high) draws integers from low..high-1.
param_grid = {
    'n_estimators': sp_randint(50, 500),
    # None (no depth limit) plus ten depths drawn from 3..19. The draw is
    # seeded so the candidate list is identical on every run, and converted
    # to plain Python ints.
    'max_depth': [None] + sp_randint(3, 20).rvs(10, random_state=42).tolist(),
    'min_samples_split': sp_randint(2, 20),
    'min_samples_leaf': sp_randint(1, 20),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

# 100 sampled candidates, each scored by accuracy with 3-fold CV on all cores.
random_search = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=param_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='accuracy',
)

random_search.fit(x_train, y_train)

In [None]:
# Show the parameter setting that scored best in the most recently fitted randomized search.
random_search.best_params_

In [None]:
# Write the per-candidate CV results table of the randomized search to an Excel file.
pd.DataFrame(data=random_search.cv_results_).to_excel(
    excel_writer='../parameter_search_results/random_search_randomforest.xlsx',
    index=False,
)

In [None]:
# Exhaustive grid for rf_clf with max_features, bootstrap and criterion pinned:
# 240 candidates x 3 folds.
param_grid = dict(
    n_estimators=[100, 300, 500],
    max_depth=[None, 8, 10, 12, 14],
    min_samples_split=[3, 8, 12, 18],
    min_samples_leaf=[3, 8, 12, 18],
    max_features=[None],
    bootstrap=[True],
    criterion=['gini'],
)

# Accuracy-scored 3-fold CV over every combination, using all cores.
cv_search = GridSearchCV(rf_clf, param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)

cv_search.fit(x_train, y_train)

In [None]:
# Show the best parameter combination found by the grid search fitted just above.
cv_search.best_params_

In [None]:
# Grid for rf_clf varying n_estimators and the two min_samples_* settings:
# 27 candidates x 3 folds.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None],
    min_samples_split=[10, 12, 14],
    min_samples_leaf=[10, 12, 14],
    max_features=[None],
    bootstrap=[True],
    criterion=['gini'],
)

# Accuracy-scored 3-fold CV over every combination, using all cores.
cv_search = GridSearchCV(rf_clf, param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)

cv_search.fit(x_train, y_train)

In [None]:
# Show the best parameter combination found by the grid search fitted just above.
cv_search.best_params_

In [None]:
# Grid for rf_clf varying n_estimators and the two min_samples_* settings:
# 36 candidates x 3 folds.
param_grid = dict(
    n_estimators=[150, 200, 250],
    max_depth=[None],
    min_samples_split=[13, 14, 15, 16],
    min_samples_leaf=[11, 12, 13],
    max_features=[None],
    bootstrap=[True],
    criterion=['gini'],
)

# Accuracy-scored 3-fold CV over every combination, using all cores.
cv_search = GridSearchCV(rf_clf, param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)

cv_search.fit(x_train, y_train)

In [None]:
# Show the best parameter combination found by the grid search fitted just above.
cv_search.best_params_

In [None]:
# Grid for rf_clf varying only n_estimators and min_samples_split:
# 25 candidates x 3 folds.
param_grid = dict(
    n_estimators=[170, 185, 200, 215, 230],
    max_depth=[None],
    min_samples_split=[16, 17, 18, 19, 20],
    min_samples_leaf=[13],
    max_features=[None],
    bootstrap=[True],
    criterion=['gini'],
)

# Accuracy-scored 3-fold CV over every combination, using all cores.
cv_search = GridSearchCV(rf_clf, param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)

cv_search.fit(x_train, y_train)

In [None]:
# Show the best parameter combination found by the grid search fitted just above.
cv_search.best_params_

In [None]:
# Grid for rf_clf varying only n_estimators: 5 candidates x 3 folds.
param_grid = dict(
    n_estimators=[205, 210, 215, 220, 225],
    max_depth=[None],
    min_samples_split=[19],
    min_samples_leaf=[13],
    max_features=[None],
    bootstrap=[True],
    criterion=['gini'],
)

# Accuracy-scored 3-fold CV over every combination, using all cores.
cv_search = GridSearchCV(rf_clf, param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)

cv_search.fit(x_train, y_train)

In [None]:
# Show the best parameter combination found by the grid search fitted just above.
cv_search.best_params_

In [None]:
# Seeded random forest with hand-fixed hyperparameters (unlimited depth, 210 trees).
rf_clf = RandomForestClassifier(
    n_estimators=210,
    criterion='gini',
    max_depth=None,
    max_features=None,
    min_samples_split=19,
    min_samples_leaf=13,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

# Fit on the training split, then predict the validation split.
y_pred_rf = rf_clf.fit(x_train, y_train).predict(x_val)

# Accuracy on the held-out validation split.
accuracy_score(y_true=y_val, y_pred=y_pred_rf)

In [None]:
# Alternative seeded random forest (depth capped at 10, 320 trees). This
# rebinds rf_clf, so it is the forest that the voting ensemble below uses.
rf_clf = RandomForestClassifier(
    n_estimators=320,
    criterion='gini',
    max_depth=10,
    max_features=None,
    min_samples_split=17,
    min_samples_leaf=10,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

# Fit on the training split, then predict the validation split.
y_pred_rf = rf_clf.fit(x_train, y_train).predict(x_val)

# Accuracy on the held-out validation split.
accuracy_score(y_true=y_val, y_pred=y_pred_rf)

### XGBoost

In [None]:
# Default-configured XGBoost classifier; used as the base estimator of the randomized search below.
xgb_clf = XGBClassifier()

In [None]:
# Encode the string phase labels as integers for the models fitted on
# y_numeric_train.
le = LabelEncoder()
y_numeric_train = le.fit_transform(y_train)  # e.g. 'G1' -> 0, etc.
# Reuse the mapping learned on the training labels. Refitting on the
# validation labels could assign different integers to the same classes if
# the two label sets ever differ, silently corrupting the validation score.
y_numeric_val = le.transform(y_val)

In [None]:
# Search space for xgb_clf. scipy's randint(low, high) draws integers from
# low..high-1; uniform(loc, scale) draws floats from [loc, loc + scale].
param_grid = dict(
    n_estimators=sp_randint(50, 500),
    learning_rate=uniform(0.01, 0.3),      # [0.01, 0.31]
    max_depth=sp_randint(3, 10),
    min_child_weight=sp_randint(1, 10),
    gamma=uniform(0, 0.5),
    subsample=uniform(0.6, 0.4),           # [0.6, 1.0]
    colsample_bytree=uniform(0.6, 0.4),    # [0.6, 1.0]
    reg_alpha=uniform(0, 1),
    reg_lambda=uniform(0, 1),
)

# 100 seeded candidates, each scored by accuracy with 5-fold CV on all cores.
random_search = RandomizedSearchCV(
    xgb_clf,
    param_grid,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

# Fitted on the integer-encoded labels.
random_search.fit(x_train, y_numeric_train)

In [None]:
# Show the parameter setting that scored best in the most recently fitted randomized search.
random_search.best_params_

In [None]:
# Write the per-candidate CV results table of the randomized search to an Excel file.
pd.DataFrame(data=random_search.cv_results_).to_excel(
    excel_writer='../parameter_search_results/random_search_xgboost.xlsx',
    index=False,
)

In [None]:
# XGBoost classifier with hand-fixed hyperparameters; this rebinds xgb_clf,
# so it is the instance the voting ensemble below receives.
xgb_clf = XGBClassifier(
    # boosting schedule and tree shape
    n_estimators=100,
    learning_rate=0.1,
    max_depth=8,
    min_child_weight=8,
    gamma=0.5,
    # row / column subsampling
    subsample=0.66,
    colsample_bytree=1,
    # regularization terms
    reg_alpha=0.5,
    reg_lambda=1,
)

In [None]:
# Train on the integer-encoded training labels, then predict the validation split.
y_pred_xgb = xgb_clf.fit(x_train, y_numeric_train).predict(x_val)

# Accuracy against the integer-encoded validation labels.
accuracy_score(y_true=y_numeric_val, y_pred=y_pred_xgb)

### Regresja logistyczna

In [None]:
log_reg = LogisticRegression(max_iter=10000)

# Pipeline: mean-impute NaNs, then logistic regression. The estimator defined
# above is reused as the final step instead of building an identical second
# instance; the search clones the pipeline, so log_reg itself is not fitted.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('log_reg', log_reg),
])

# Search space; the 'log_reg__' prefix routes each parameter to the pipeline's
# logistic-regression step. uniform(0.1, 10) draws C from [0.1, 10.1].
# NOTE(review): features are not scaled before 'sag'/'saga' — confirm this is intended.
param_grid = {
    'log_reg__C': uniform(0.1, 10),
    'log_reg__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'log_reg__penalty': ['l2']
}

# 100 seeded candidates with 5-fold CV on all cores. Accuracy is already the
# default score for a classifier; it is spelled out to match the other searches.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='accuracy',
)

random_search.fit(x_train, y_numeric_train)

In [None]:
# Show the parameter setting that scored best in the most recently fitted randomized search.
random_search.best_params_

In [None]:
# Final logistic-regression model with hand-fixed C and solver. The search
# tuned these inside a mean-imputing pipeline, and LogisticRegression rejects
# NaN inputs, so the same imputation step is kept here. Otherwise the voting
# ensemble, which fits this estimator on the raw features, would fail on
# missing values.
log_reg = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('log_reg', LogisticRegression(max_iter=10000, C=3.845, solver='saga')),
])

### Klasyfikator głosujący

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the four models configured earlier in the notebook.
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[
        ('xgb', xgb_clf),
        ('rf', rf_clf),
        ('hgb', hgb_clf),
        ('logreg', log_reg),
    ],
)

# Fit on the original (string) labels, then predict the validation split.
y_pred_voting = voting_clf.fit(x_train, y_train).predict(x_val)

# Accuracy on the held-out validation split.
accuracy_score(y_true=y_val, y_pred=y_pred_voting)