In [20]:
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import recall_score

In [21]:
train_data = pd.read_csv('../../data/raw/train.csv',  encoding= 'unicode_escape')
y_train = train_data["Unusual"]                      #defining the labels
X_train = train_data.drop(["Unusual"], axis=1)
test_data = pd.read_csv('../../data/raw/test.csv',  encoding= 'unicode_escape')
y_test = test_data["Unusual"]                      #defining the labels
X_test = test_data.drop(["Unusual"], axis=1)

In [22]:
# Define the pipeline
pipeline = Pipeline([
    ('preprocessing', joblib.load('../pipelines/PreprocessingPipeline.joblib')),
    ('classifier', None)
])

In [23]:
# Define the hyperparameter grid for Random Forest
rf_param_grid = {
    'classifier': [RandomForestClassifier()],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_split': [2, 5],
}

# Define the hyperparameter grid for XGBoost
xgb_param_grid = {
    'classifier': [XGBClassifier()],
    'classifier__learning_rate': [0.1, 0.01],
    'classifier__max_depth': [3, 5],
    'classifier__n_estimators': [50, 100, 200],
}


In [26]:
# Define the hyperparameter grid for both classifiers
param_grid = [rf_param_grid, xgb_param_grid]

# Define the cross-validation strategy
cv = 5

# Define the GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid = param_grid, cv=cv, n_jobs=-1, scoring='recall')

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and score
print('Best hyperparameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

Best hyperparameters: {'classifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=200, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...), 'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__n_estimators': 200}
Best score: 0.9139712655960676


In [27]:
print("Test set score:", grid_search.score(X_test, y_test))

Test set score: 0.9162759544541192
