In [4]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG so any NumPy-based randomness that is not given
# an explicit random_state is repeatable across "Restart & Run All".
np.random.seed(42)

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,f1_score
from sklearn.decomposition import PCA

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE 

from sklearn.ensemble import GradientBoostingClassifier

In [5]:
# Read the raw feature table and the class labels; both CSV files are parsed
# without a header row, so their columns are numbered 0..n-1.
data = pd.read_csv('./data/raw_data/data.csv', header=None)
labels = pd.read_csv('./data/raw_data/labels.csv', header=None)

# The first len(labels) rows of `data` are treated as the labelled samples;
# every remaining row is kept aside as unlabelled data to predict later.
data_labelled = data.iloc[:len(labels)]
data_unlabelled = data.iloc[len(labels):]

In [6]:
# Prepare the label frame: name its single column 'target' and encode the raw
# class codes as booleans (1 -> False, 2 -> True).
labels = labels.rename(columns={0: 'target'})

# Only encode while the column still holds the raw codes. Re-running the mapping
# on an already-boolean column would silently corrupt it (True == 1 maps to
# False, False has no entry and becomes NaN), so the cell is kept idempotent.
if not pd.api.types.is_bool_dtype(labels['target']):
    # Fail loudly on unknown codes instead of letting .map() turn them into NaN.
    if not labels['target'].isin([1, 2]).all():
        raise ValueError(
            "labels must contain only the class codes 1 and 2, found: "
            f"{labels['target'].unique().tolist()}"
        )
    labels['target'] = labels['target'].map({1: False, 2: True})

In [7]:
# Work on copies so the frames loaded earlier are never modified downstream.
X, y = data_labelled.copy(), labels.copy()

# Hold out 20% of the labelled rows for the final evaluation. Stratifying on y
# keeps the class proportions the same in both splits; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [9]:
# Model selection: tune a SMOTE + gradient-boosting pipeline with a 5-fold
# cross-validated grid search (F1 scoring), then evaluate the refit best
# pipeline once on the held-out test split.
#
# SMOTE is a step of the imblearn Pipeline, so the oversampling happens only
# while fitting (on each CV training fold) and never on the data being scored.
# The SelectKBest / PCA steps that used to be commented out here are disabled.
gradient_boosting_pipeline = Pipeline(
    [
        ('smote', SMOTE(random_state=42)),
        ('model', GradientBoostingClassifier(random_state=42)),
    ]
)

gradient_boosting_search = GridSearchCV(
    estimator=gradient_boosting_pipeline,
    param_grid={
        'model__learning_rate': [0.1],
        'model__max_depth': [2, 3, 4, 5],
        # A node can only be split if both children keep at least
        # min_samples_leaf samples, i.e. it needs >= 2 * min_samples_leaf
        # samples. Every min_samples_leaf below is >= 2, so min_samples_split
        # values of 2, 3 and 4 all yield the same trees; searching just one of
        # them avoids fitting every model three times. Widen this list again if
        # min_samples_leaf=1 is ever added to the grid.
        'model__min_samples_split': [2],
        'model__min_samples_leaf': [2, 5, 7, 10, 20],
        'model__max_features': [3, 4, 5, 6],
        'model__subsample': [0.8],
        'model__n_estimators': [200],
    },
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=1,
)
# y_train is a single-column DataFrame; estimators expect a 1-D target.
gradient_boosting_search.fit(X_train, y_train.values.ravel())
# With the default refit=True this is the pipeline refit on all of X_train
# using the best parameter combination.
gradient_boosting_best = gradient_boosting_search.best_estimator_

print('Best parameters:\n', gradient_boosting_search.best_params_)
print('Best score in cross-validation:\n', round(gradient_boosting_search.best_score_, 3))

# Predict the test split once and reuse the predictions for both the F1 score
# (same metric as the search's scoring='f1') and the confusion matrix.
y_pred = gradient_boosting_best.predict(X_test)
print('Score in test dataset:\n', round(f1_score(y_test.values.ravel(), y_pred), 3))
print('Confusion matrix:\n', confusion_matrix(y_true=y_test.values.ravel(), y_pred=y_pred))

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits
Best parameters:
 {'model__learning_rate': 0.1, 'model__max_depth': 2, 'model__max_features': 5, 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 200, 'model__subsample': 0.8, 'selector__k': 100}
Best score in cross-validation:
 0.905
Score in test dataset:
 1.0
Confusion marix:
 [[31  0]
 [ 0  5]]


300 fits failed out of a total of 7200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/marius/miniforge3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/marius/miniforge3/lib/python3.9/site-packages/imblearn/pipeline.py", line 266, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "/Users/marius/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 586, in fit
    n_stages = self._fit_stages(
  File "/Users/marius/miniforge3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 663, in _f

In [None]:
# Apply the tuned pipeline to the rows that have no label.
y_pred_validation = gradient_boosting_best.predict(data_unlabelled)

# Show how many unlabelled rows were assigned to each predicted class.
pd.DataFrame(data=y_pred_validation).value_counts()