In [130]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [94]:
# Target column "Class" encoding:
#   2 = benign
#   4 = malignant

# Column names are supplied explicitly via `names`.
COLUMN_NAMES = [
    "Sample code number",
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    "Mitoses",
    "Class",
]

# Empty fields and "?" are both read as missing values; the sample code
# number becomes the row index rather than a feature column.
df = pd.read_csv(
    "breast-cancer-wisconsin.data",
    names=COLUMN_NAMES,
    index_col="Sample code number",
    na_values=["", "?"],
)
df

Unnamed: 0_level_0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
Sample code number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000025,5,1,1,1,2,1.0,3,1,1,2
1002945,5,4,4,5,7,10.0,3,2,1,2
1015425,3,1,1,1,2,2.0,3,1,1,2
1016277,6,8,8,1,3,4.0,3,7,1,2
1017023,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
776715,3,1,1,1,3,2.0,1,1,1,2
841769,2,1,1,1,2,1.0,1,1,1,2
888820,5,10,10,3,7,3.0,8,10,2,4
897471,4,8,6,4,3,4.0,10,6,1,4


In [95]:
# Count the missing entries in every column.
missing_per_column = df.isna().sum()
missing_per_column

Clump Thickness                 0
Uniformity of Cell Size         0
Uniformity of Cell Shape        0
Marginal Adhesion               0
Single Epithelial Cell Size     0
Bare Nuclei                    16
Bland Chromatin                 0
Normal Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64

In [96]:
# Replace the missing "Bare Nuclei" values with the column median.
# NOTE(review): the median is taken over the whole dataset, i.e. before the
# train/test split performed further down, so test rows influence the imputed
# value — confirm this is acceptable or impute from the training split only.
bare_nuclei = df.loc[:, "Bare Nuclei"]
df.loc[:, "Bare Nuclei"] = bare_nuclei.fillna(bare_nuclei.median())

# Re-check: every column should now report zero missing values.
df.isna().sum()

Clump Thickness                0
Uniformity of Cell Size        0
Uniformity of Cell Shape       0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

In [97]:
# Relative frequency of each class (2 = benign, 4 = malignant).
# `normalize=True` is spelled out: the first positional parameter of
# value_counts is `normalize`, so the former `value_counts(2)` only worked
# because 2 is truthy.
df.loc[:, "Class"].value_counts(normalize=True)

Class
2    0.655222
4    0.344778
Name: proportion, dtype: float64

In [113]:
# Separate the predictors from the target column (Class: 2 = benign, 4 = malignant).
X = df.drop(columns=["Class"])
y = df.loc[:, "Class"]

# Hold out 20% of the samples. Stratifying on the target keeps the class
# proportions approximately equal in both splits; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=43, stratify=y)

# Class proportions in each split. `normalize=True` is passed by keyword instead
# of relying on a truthy positional argument (`value_counts(2)`).
print(y_train.value_counts(normalize=True), "\n")
y_test.value_counts(normalize=True)

Class
2    0.654741
4    0.345259
Name: proportion, dtype: float64 



Class
2    0.657143
4    0.342857
Name: proportion, dtype: float64

In [127]:
# Fit a logistic regression with default settings on the training split
# (fit() returns the estimator itself, so the call can be chained).
log = LogisticRegression().fit(X_train, y_train)

# Hard class predictions and per-class probabilities for each split.
y_preds_train, y_probas_train = log.predict(X_train), log.predict_proba(X_train)
y_preds_test, y_probas_test = log.predict(X_test), log.predict_proba(X_test)

# Preview the first ten predictions. Column 1 of predict_proba corresponds to
# the second entry of log.classes_, which is class 4 here.
for preview in (y_preds_train[:10], y_probas_train[:10, 1], y_preds_test[:10]):
    print(preview, "\n")

y_probas_test[:10, 1]

[4 2 4 2 2 2 2 4 2 2] 

[0.99600525 0.00908951 0.99992741 0.00528932 0.02098032 0.00582947
 0.03566504 0.99515453 0.00788556 0.01065907] 

[2 2 2 4 2 4 2 2 2 4] 



array([0.01747894, 0.22504709, 0.00699085, 0.54019127, 0.00152929,
       0.9986223 , 0.33611399, 0.01228123, 0.00438318, 0.99972361])

In [129]:
# Score of the fitted model on the training split, then on the held-out split.
train_score = log.score(X_train, y_train)
test_score = log.score(X_test, y_test)
print(train_score, test_score, sep="\n")

0.9695885509838998
0.9642857142857143


In [131]:
# Training-split confusion matrix: rows are true classes, columns are predicted
# classes, both in sorted label order (2, then 4).
confusion_matrix(y_true=y_train, y_pred=y_preds_train)

array([[357,   9],
       [  8, 185]])

In [132]:
# Test-split confusion matrix: rows are true classes, columns are predicted
# classes, both in sorted label order (2, then 4).
confusion_matrix(y_true=y_test, y_pred=y_preds_test)

array([[90,  2],
       [ 3, 45]])