In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

### Data import

In [2]:
data = load_breast_cancer(as_frame=True)

### Data preparation

In [3]:
# (separate) X and y
X = data.data
y = data.target

In [4]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101, shuffle=True)

In [5]:
# scale the data
scaler = StandardScaler()

X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

### Testing of different ML models

In [6]:
# define models to test (for this comparison, default settings are used with every model)
seed = 101
log_model = LogisticRegression(random_state=seed)
SVC_model = SVC(random_state=seed)
GBC_model = GradientBoostingClassifier(random_state=seed)
RF_model = RandomForestClassifier(random_state=seed)

In [7]:
# test the different models parallelly on the test set
models = [log_model, SVC_model, RF_model, GBC_model]
results = []

for m in models:
    m.fit(X_train_sc, y_train)
    y_pred = m.predict(X_test_sc)
    model_results = [str(m), accuracy_score(y_test, y_pred), precision_score(y_test, y_pred), 
                     recall_score(y_test, y_pred), roc_auc_score(y_test, y_pred)]
    results.append(model_results)

# create a df with the results
test_results = pd.DataFrame(results, columns=('model', 'accuracy', 'precision', 'recall', 'ROC AUC'))

In [8]:
test_results

Unnamed: 0,model,accuracy,precision,recall,ROC AUC
0,LogisticRegression(random_state=101),0.976608,0.971963,0.990476,0.972511
1,SVC(random_state=101),0.982456,0.981132,0.990476,0.980087
2,RandomForestClassifier(random_state=101),0.953216,0.961905,0.961905,0.950649
3,GradientBoostingClassifier(random_state=101),0.959064,0.962264,0.971429,0.955411


##### SVM produces the best results with the default settings.

<br>

In [9]:
# test the different models parallelly on the train set to compare the results with the above
models = [log_model, SVC_model, RF_model, GBC_model]
results = []

for m in models:
    m.fit(X_train_sc, y_train)
    y_pred = m.predict(X_train_sc)
    model_results = [str(m), accuracy_score(y_train, y_pred), precision_score(y_train, y_pred), 
                     recall_score(y_train, y_pred), roc_auc_score(y_train, y_pred)]
    results.append(model_results)

# create a df with the results
train_results = pd.DataFrame(results, columns=('model', 'accuracy', 'precision', 'recall', 'ROC AUC'))

In [10]:
train_results

Unnamed: 0,model,accuracy,precision,recall,ROC AUC
0,LogisticRegression(random_state=101),0.984925,0.984252,0.992063,0.982333
1,SVC(random_state=101),0.982412,0.980392,0.992063,0.978908
2,RandomForestClassifier(random_state=101),1.0,1.0,1.0,1.0
3,GradientBoostingClassifier(random_state=101),1.0,1.0,1.0,1.0


##### RF and GBM models are definitely overfitted to the train set. This might be the reason for their poorer performance on the test set.
##### The performance of SVM is very close to one another on both data sets.