In [78]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

The goal of this exercise is to use Logistic Regression
to predict breast cancer. It is always important to understand the data before training any Machine Learning algorithm. The data is described in **breast-cancer-wisconsin.names**. We suggest manually adding the column names to the DataFrame.

Preliminary:

- If needed, replace missing values with the median of the column.

- Handle the column `Sample code number`. This column should not be used to train the model as it doesn't contain information about breast cancer. There are two solutions: drop it or set it as index.

In [63]:
# The nine cytological features, in the order they appear in the data file.
feature_names = [
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    "Mitoses",
]
# Full header: identifier first, target last.
columns = ["Sample code number", *feature_names, "Class"]

# Column names are supplied via `names=`; "?" is parsed as a missing value.
# The sample identifier is dropped right away so it can never be used as a feature.
df = (
    pd.read_csv('data/breast-cancer-wisconsin.data', names=columns, na_values="?")
    .drop(columns=['Sample code number'])
)

# Show only the rows that contain at least one missing value.
rows_with_nan = df.isna().any(axis=1)
df_nan = df.loc[rows_with_nan]
df_nan

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
23,8,4,5,1,2,,7,3,1,4
40,6,6,6,9,6,,7,8,1,2
139,1,1,1,1,1,,2,1,1,2
145,1,1,3,1,2,,2,1,1,2
158,1,1,2,1,3,,1,1,1,2
164,5,1,1,1,2,,3,1,1,2
235,3,1,4,1,2,,3,1,1,2
249,3,1,1,1,2,,3,1,1,2
275,3,1,3,1,2,,2,1,1,2
292,8,8,8,1,2,,6,10,1,4


In [64]:
# Replace every missing value with the median of its column.
# Reassign instead of using `inplace=True`: an in-place `fillna` returns None,
# so the cell would display nothing, and `inplace` hides state changes from the reader.
# NOTE(review): the median is computed on the full data set, i.e. before the
# train/test split performed further down.
df = df.fillna(df.median())
df

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1.0,3,1,1,2
1,5,4,4,5,7,10.0,3,2,1,2
2,3,1,1,1,2,2.0,3,1,1,2
3,6,8,8,1,3,4.0,3,7,1,2
4,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2.0,1,1,1,2
695,2,1,1,1,2,1.0,1,1,1,2
696,5,10,10,3,7,3.0,8,10,2,4
697,4,8,6,4,3,4.0,10,6,1,4


1. Print the proportion of class `Benign`. What would the accuracy be if the model always predicted `Benign`?
   Later this week, we will learn about other metrics, such as AUC, that will help us tackle highly imbalanced data sets.

In [66]:
# Re-encode the target from the raw codes {2, 4} to {0, 1}.
# `replace` leaves values that are not keys untouched, so re-running this cell
# is harmless; `map` would turn the already-encoded 0/1 into NaN on a second run.
df['Class'] = df['Class'].replace({2: 0, 4: 1})

# `replace` does not flag unexpected codes the way `map` would (as NaN), so check explicitly.
assert df['Class'].isin([0, 1]).all(), "unexpected value in 'Class' column"

# Target vector and feature matrix.
y = df['Class']
X = df.drop(columns='Class')

In [69]:
# Share of samples whose class is 0, reported as "Benign".
# A model that always answers "Benign" is right exactly on those samples,
# so the same number is also its accuracy.
is_benign = df['Class'].eq(0)
prop_benign = is_benign.mean()
print(f"Proportion of Benign cases: {prop_benign:.2%}")
print(f"Baseline accuracy (always predict Benign): {prop_benign:.2%}")

Proportion of Benign cases: 65.52%
Baseline accuracy (always predict Benign): 65.52%


2. Using `train_test_split`, split the data set into a train set and a test set (20%). Both sets should have approximately the same proportion of class `Benign`. Use `random_state = 43`.

In [71]:
# 80/20 split. `stratify=y` keeps the class proportions (approximately) equal
# in both subsets; `random_state` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=43,
)

# Sanity check: the share of class 0 (Benign) should be nearly the same in both subsets.
prop_benign_test = (y_test == 0).mean()
prop_benign_train = (y_train == 0).mean()
print(f"Proportion of Benign test: {prop_benign_test:.2%}")
print(f"Proportion of Benign train: {prop_benign_train:.2%}")

Proportion of Benign test: 65.71%
Proportion of Benign train: 65.47%


3. Fit the logistic regression on the train set. Predict on the train set and test set. Compute the score on the train set and test set. 92-97% accuracy is expected on the test set.

In [77]:
# Fit a logistic regression with default settings on the training split
# (`fit` returns the estimator itself, so the call can be chained).
clf = LogisticRegression().fit(X_train, y_train)

# `score` predicts on the given features and compares against the given labels.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(train_accuracy)
print(test_accuracy)

0.9695885509838998
0.9642857142857143


4. Compute the confusion matrix on both sets (train and test). Analyse the number of false negatives and false positives.

In [79]:
# Confusion matrices for BOTH splits, as the exercise asks.
# Rows are the true classes, columns the predicted classes, in label order (0, 1).
cm_train = confusion_matrix(y_train, clf.predict(X_train))
cm_test = confusion_matrix(y_test, clf.predict(X_test))

# For a binary problem, ravel() yields (tn, fp, fn, tp).
# With class 1 as the positive class, a false negative is a class-1 sample
# that the model predicted as class 0 (Benign).
for split_name, cm in (("train", cm_train), ("test", cm_test)):
    tn, fp, fn, tp = cm.ravel()
    print(f"{split_name}: false positives = {fp}, false negatives = {fn}")

print("Train confusion matrix:")
print(cm_train)

# Keep the test matrix as the cell's displayed value.
cm_test

array([[90,  2],
       [ 3, 45]])