In [1]:
import pandas as pd

# Location of the CSV inside the Kaggle input mount.
DATA = '/kaggle/input/breast-cancer-dataset/breast-cancer.csv'
# Read the file, using the 'id' column as the row index.
df = pd.read_csv(DATA, index_col=['id'])
# Preview the first rows.
df.head()

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


All of our data is numerical except for the target class, and we have no missing data. How balanced is our target class?

In [2]:
# Count how many rows carry each diagnosis label and show the counts as a plain dict.
diagnosis_counts = df['diagnosis'].value_counts()
diagnosis_counts.to_dict()

{'B': 357, 'M': 212}

Benign tumors outnumber malignant tumors about five to three (357 to 212), so our classes are reasonably well balanced. Let's visualize our data using dimension reduction to see whether it shows structure or looks like random noise.

In [3]:
import arrow
from umap import UMAP

# Fit a 2-D UMAP embedding of the feature columns and store it on df as 'x'/'y'
# for plotting. The fitted `umap` object is kept so later cells can project
# other rows with the same model.
time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=False, n_jobs=1, low_memory=False, n_epochs=201)
# Also drop any 'x'/'y' left by a previous run of this cell; otherwise a re-run
# would feed the old embedding back into UMAP as two extra features.
# errors='ignore' keeps the first run (where those columns do not exist) working.
umap_features = df.drop(columns=['diagnosis', 'x', 'y'], errors='ignore')
df[['x', 'y']] = umap.fit_transform(X=umap_features)
print('done with UMAP in {}'.format(arrow.now() - time_start))

done with UMAP in 0:00:12.007108


In [4]:
from plotly import express

# Scatter the UMAP coordinates, colored by diagnosis; the title lets the figure
# stand alone when the notebook is skimmed.
express.scatter(
    data_frame=df,
    x='x',
    y='y',
    color='diagnosis',
    title='UMAP embedding colored by diagnosis',
)

We see a lot of clustering and some separation: for most of our cases, the nearest instance in this plot is of the same kind. We might expect a model to do well. Let's build a model.

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the label. The UMAP
# coordinates 'x'/'y' are plot-only columns, so they are excluded from the features.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['x', 'y', 'diagnosis']),
    df['diagnosis'],
    test_size=0.2,
    random_state=2024,
    stratify=df['diagnosis'],
)

logreg = LogisticRegression(max_iter=100000, tol=1e-12).fit(X_train, y_train)
print('model fit in {} iterations'.format(logreg.n_iter_[0]))

# Predict once and reuse the labels for every metric instead of re-running
# the model for each report.
y_pred = logreg.predict(X=X_test)
print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=y_pred)))
print('f1: {:5.4f}'.format(f1_score(average='weighted', y_true=y_test, y_pred=y_pred)))
print(classification_report(y_true=y_test, y_pred=y_pred))

model fit in 2158 iterations
accuracy: 0.9649
f1: 0.9651
              precision    recall  f1-score   support

           B       0.99      0.96      0.97        72
           M       0.93      0.98      0.95        42

    accuracy                           0.96       114
   macro avg       0.96      0.97      0.96       114
weighted avg       0.97      0.96      0.97       114



Let's take a look at our model probabilities.

In [6]:
# Collect per-test-row results: the highest class probability, the true and
# predicted labels, and whether the prediction matched the truth.
test_probabilities = logreg.predict_proba(X=X_test)
probability_df = pd.DataFrame(
    data={
        'probability': test_probabilities.max(axis=1),
        'true': y_test.tolist(),
        'pred': logreg.predict(X=X_test),
    }
)
probability_df['correct'] = probability_df['true'].eq(probability_df['pred'])
# Project the test rows with the already-fitted UMAP model so they can be plotted.
test_embedding = umap.transform(X=X_test)
probability_df['x'] = test_embedding[:, 0]
probability_df['y'] = test_embedding[:, 1]

probability_df.head()

Unnamed: 0,probability,true,pred,correct,x,y
0,0.996963,B,B,True,-1.983557,5.715521
1,0.999982,M,M,True,11.896913,2.327398
2,0.999455,B,B,True,0.026627,10.481048
3,0.999686,B,B,True,-1.105289,8.834642
4,0.999418,B,B,True,-1.775157,1.643788


What is the mean model probability conditioned on whether the model is correct?

In [7]:
# Average the model probability separately for incorrect and correct predictions.
mean_probability_by_outcome = probability_df.groupby(by=['correct'])[['probability']].mean()
mean_probability_by_outcome.to_dict()

{'probability': {False: 0.6700285772004945, True: 0.962947503641099}}

The mean model probability when the model is correct is nearly 1.0 (about 0.96), which suggests that most of the model probabilities are very close to 1.0. Let's plot the model probabilities.

In [8]:
# Plot the test rows in UMAP space, colored by model probability, with one panel
# per value of 'correct'; hovering shows the true label. The title lets the
# figure stand alone when the notebook is skimmed.
express.scatter(
    data_frame=probability_df,
    x='x',
    y='y',
    color='probability',
    facet_col='correct',
    hover_name='true',
    title='Model probability for test cases in UMAP space, split by correctness',
)