In [59]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Load the breast cancer dataset that ships with scikit-learn.
cancer_data = load_breast_cancer()

# Feature matrix as a DataFrame, with one named column per feature.
df_features = pd.DataFrame(
    data=cancer_data.data,
    columns=cancer_data.feature_names,
)

# Labels as a single-column DataFrame named 'target'.
df_target = pd.DataFrame(data=cancer_data.target, columns=['target'])

# Combined frame: all feature columns followed by the 'target' column.
df = pd.concat((df_features, df_target), axis="columns")

# Hold out 20% of the rows for testing (test_size=0.2) and train on the remaining 80%.
# x_* hold the features, y_* hold the matching labels.
# random_state is fixed so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    df_features,
    cancer_data.target,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression classifier on the training split.
# max_iter is raised to 3000 because an error was raised without it (original author's note).
model = LogisticRegression(max_iter=3000).fit(x_train, y_train)

# Predicted labels for the held-out test features.
y_pred = model.predict(x_test)

# F1 score of the predictions against the true test labels.
report = f1_score(y_true=y_test, y_pred=y_pred)
print("F1 Score:", report)

F1 Score: 0.9655172413793103


In [60]:
# Positions within the test split where the prediction disagrees with the true label.
misclassified_indices = np.flatnonzero(y_pred != y_test)

# Show at most the first five misclassified test rows.
separator = "----" * 10
for i in misclassified_indices[:5]:
    # Label/value pairs, printed one per line in this order.
    report_lines = (
        ("Misclassified Example:", i),
        ("Actual label:", y_test[i]),
        ("Predicted label:", y_pred[i]),
        ("Features:", x_test.iloc[i]),  # .iloc because i is a position, not an index label
    )
    for label, value in report_lines:
        print(label, value)
    print(separator)

Misclassified Example: 20
Actual label: 0
Predicted label: 1
Features: mean radius                 13.800000
mean texture                15.790000
mean perimeter              90.430000
mean area                  584.100000
mean smoothness              0.100700
mean compactness             0.128000
mean concavity               0.077890
mean concave points          0.050690
mean symmetry                0.166200
mean fractal dimension       0.065660
radius error                 0.278700
texture error                0.620500
perimeter error              1.957000
area error                  23.350000
smoothness error             0.004717
compactness error            0.020650
concavity error              0.017590
concave points error         0.009206
symmetry error               0.012200
fractal dimension error      0.003130
worst radius                16.570000
worst texture               20.860000
worst perimeter            110.300000
worst area                 812.400000
worst smoothness 

In [61]:
# Data sources:
# scikit-learn loader: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html
# Original UCI dataset: https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic