In [1]:
#Importing packages

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from bayes_opt import BayesianOptimization
from sklearn.impute import SimpleImputer
import lightgbm

In [2]:
#Loading the files Xtrain_scaled_pca, Xtest_scaled_pca, Ytrain, Ytest from the 
#preprocessing notebook.

folder_path = '/kaggle/input/rsna-breast-cancer-detection-preprocessing-2/'

with open(folder_path + 'Xtest_scaled_pca.pickle', 'rb') as file:
    Xtest_scaled_pca = pickle.load(file)
with open(folder_path + 'Xtrain_scaled_pca.pickle', 'rb') as file:
    Xtrain_scaled_pca = pickle.load(file)
with open(folder_path + 'Ytest.pickle', 'rb') as file:
    Ytest = pickle.load(file)
with open(folder_path + 'Ytrain.pickle', 'rb') as file:
    Ytrain = pickle.load(file)

# Logistic Regression

Let's first use logistic regression. We will use Bayesian optimization to do the hyperparameter search. We will use the F1-score as the evaluation metric, because we want to minimize the number of false positives.

In [3]:
def logreg_eval(C):
    logreg = LogisticRegression(solver = 'liblinear', max_iter = 500, C=C)
    cv_results = cross_val_score(logreg, Xtrain_scaled_pca, Ytrain, scoring="f1",cv=5)
    return cv_results.mean()

In [4]:
logregBO = BayesianOptimization(logreg_eval, {'C': (0.01,200)})
logregBO.maximize(init_points=5,n_iter=10,acq='ucb')

|   iter    |  target   |     C     |
-------------------------------------


Passing acquisition function parameters or gaussian process parameters to maximize
is no longer supported, and will cause an error in future releases. Instead,
please use the "set_gp_params" method to set the gp params, and pass an instance
 of bayes_opt.util.UtilityFunction using the acquisition_function argument

  


| [0m1        [0m | [0m0.9538   [0m | [0m33.93    [0m |
| [95m2        [0m | [95m0.9542   [0m | [95m11.84    [0m |
| [0m3        [0m | [0m0.9538   [0m | [0m32.12    [0m |
| [0m4        [0m | [0m0.9535   [0m | [0m85.47    [0m |
| [0m5        [0m | [0m0.9538   [0m | [0m62.5     [0m |
| [0m6        [0m | [0m0.9488   [0m | [0m0.03289  [0m |
| [0m7        [0m | [0m0.9538   [0m | [0m20.13    [0m |
| [0m8        [0m | [0m0.9538   [0m | [0m49.58    [0m |
| [0m9        [0m | [0m0.9535   [0m | [0m102.3    [0m |
| [0m10       [0m | [0m0.9535   [0m | [0m118.6    [0m |
| [0m11       [0m | [0m0.9538   [0m | [0m73.34    [0m |
| [0m12       [0m | [0m0.9535   [0m | [0m135.7    [0m |
| [0m13       [0m | [0m0.9535   [0m | [0m152.1    [0m |
| [0m14       [0m | [0m0.9535   [0m | [0m168.1    [0m |
| [0m15       [0m | [0m0.9535   [0m | [0m183.8    [0m |


We see that the F1-score doesn't vary much over the whole range of C, and stays at around 0.95. Let's now choose the best value C=38.05 and make predictions on the test set.

In [5]:
logreg = LogisticRegression(solver = 'liblinear', max_iter = 500, C=38.05)
logreg.fit(Xtrain_scaled_pca, Ytrain)
print(f'F1-score on training data: {f1_score(logreg.predict(Xtrain_scaled_pca), Ytrain):.2f}')
print(f'F1-score on test data: {f1_score(logreg.predict(Xtest_scaled_pca), Ytest):.2f}')

F1-score on training data: 0.96
F1-score on test data: 0.64


# Random Forest

Next, we try random forest. We go back to using GridSearchCV because we want to optimize over discrete (integer) values of the parameter min_samples_split, and it's hard to deal with discrete hyperparameter values with BayesianOptimization.

In [6]:
rf = RandomForestClassifier(random_state=47, n_estimators=1000, n_jobs=-1)
parameters = {"min_samples_split": [2,4,6],
             "max_depth": [3,5,7],
             "criterion": ["gini","entropy"]}

cv_rf = GridSearchCV(rf, param_grid=parameters, scoring='f1', cv=3)

Printing the best parameter value and the best F1-score of the train set:

In [7]:
cv_rf.fit(Xtrain_scaled_pca, Ytrain)
print(cv_rf.best_score_)
print(cv_rf.best_params_)

0.9351033066160749
{'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 6}


Printing the F1-score on the test set:

In [8]:
Ypred = cv_rf.predict(Xtest_scaled_pca)
print("F1-score of the test set:", f1_score(Ytest, Ypred))

F1-score of the test set: 0.2285714285714286


Unfortunately, the F1-score of the test set is much lower than the one of the train set.

# Gradient boosting

In [9]:
gb = GradientBoostingClassifier(random_state=47, n_estimators=1000)
parameters = {"learning_rate": [0.5,1],
             "min_samples_split": [2,4],
             "max_depth": [3,5]}

cv_gb = GridSearchCV(gb, param_grid=parameters, scoring='f1', cv=3)

Printing the best parameter value and the best F1-score of the train set:

In [10]:
cv_gb.fit(Xtrain_scaled_pca, Ytrain)
print(cv_gb.best_score_)
print(cv_gb.best_params_)

0.9739732227789645
{'learning_rate': 0.5, 'max_depth': 3, 'min_samples_split': 4}


Printing the F1-score on the test set:

In [11]:
Ypred = cv_gb.predict(Xtest_scaled_pca)
print("F1-score of the test set:", f1_score(Ytest, Ypred))

F1-score of the test set: 0.3364055299539171


Again, the F1-score of the test set is considerably lower than the one of the train set.

Overall, it looks like logistic regression is the best algorithm.