In [3]:
pip install kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.10-py3-none-any.whl.metadata (31 kB)
Downloading kagglehub-0.3.10-py3-none-any.whl (63 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.10
Note: you may need to restart the kernel to use updated packages.


In [185]:
#Loading in dataset from Kaggle
import kagglehub
import pandas as pd
import numpy as np
import os

# Download latest version
path = kagglehub.dataset_download("utkarshx27/breast-cancer-wisconsin-diagnostic-dataset")

print("Path to dataset files:", path)
print("Files in dataset directory:", os.listdir(path))

csv_file = os.path.join(path, "brca.csv")

Path to dataset files: /Users/sofiafischel/.cache/kagglehub/datasets/utkarshx27/breast-cancer-wisconsin-diagnostic-dataset/versions/1
Files in dataset directory: ['brca.csv']


In [307]:
#Logistic Regression model 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix)
from sklearn.metrics import (mean_squared_error, accuracy_score, precision_score, recall_score)

df = pd.read_csv(csv_file)
y = df['y']
df.drop('y', axis='columns', inplace=True)

#MinMaxScaler
scaler = MinMaxScaler(feature_range = (0,1))
model=scaler.fit(df)
scaled_data=model.transform(df)
scaled_data=pd.DataFrame(data = scaled_data, columns = df.columns)
df = scaled_data.join(y)

#Using all 30 features
X = df.drop('y', axis='columns')
y = df['y']

#print(df.head())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state = 42)

lr = LogisticRegression()
lr.fit(X_train, y_train)
predict = lr.predict(X_test)

In [275]:
from sklearn.model_selection import cross_validate

CV = cross_validate(lr, X, y, cv=10, scoring=["accuracy"])
print("Accuracy")
print(CV["test_accuracy"])

print("Average Accuracy = ", sum(CV["test_accuracy"]) / len(CV["test_accuracy"]))

Accuracy
[1.         0.96491228 0.98245614 0.98245614 1.         1.
 0.98245614 1.         1.         0.98214286]
Average Accuracy =  0.9894423558897243


In [309]:
#Probability Analysis
from scipy import stats
probabilities = lr.predict_proba(X_test)
np.set_printoptions(suppress=True)

avgMaxVals = np.mean(np.max(probabilities, axis=1))
print(f"Average probability value for chosen class: {avgMaxVals} \n")

#Average probability score of all cells classed as B
probabilitiesB = probabilities[:, 0]
filteredProbabilitiesB = probabilitiesB[predict == 'B']
avgBVals = np.mean(filteredProbabilitiesB)
print(f"Average probability value for class B: {avgBVals}")
#Excluding outliers
z = np.abs(stats.zscore(filteredProbabilitiesB))
inliersIndices = np.where(z < 2)[0]
filteredProbabilitiesB_NoOutliers = filteredProbabilitiesB[inliersIndices]
print(f"Average probability value without outliers for class B: {np.mean(filteredProbabilitiesB_NoOutliers)} \n")

#Average probability score of all cells classed as M
probabilitiesM = probabilities[:, 1]
filteredProbabilitiesM = probabilitiesM[predict == 'M']
avgMVals = np.mean(filteredProbabilitiesM)
print(f"Average probability value for class M: {avgMVals}")
#Excluding outliers
z = np.abs(stats.zscore(filteredProbabilitiesM))
inliersIndices = np.where(z < 2)[0]
filteredProbabilitiesM_NoOutliers = filteredProbabilitiesM[inliersIndices]
print(f"Average probability value without outliers for class M: {np.mean(filteredProbabilitiesM_NoOutliers)} \n")

Average probability value for chosen class: 0.9421112821637193 

Average probability value for class B: 0.9493748281028732
Average probability value without outliers for class B: 0.9689073073929567 

Average probability value for class M: 0.9296594891251698
Average probability value without outliers for class M: 0.9532050978191594 



In [311]:
#Classic metrics analysis
print(f"Classification report: \n {classification_report(y_test, predict)}")
print(f"Accuracy score: {accuracy_score(y_test, predict)} \n")

matrix = confusion_matrix(y_test, predict)
TN, FP, FN, TP = matrix.ravel()
print(f"True Positives (TP): {TP}")
print(f"False Positives (FP): {FP}")
print(f"True Negatives (TN): {TN}")
print(f"False Negatives (FN): {FN} \n")

print(f"Recall - TP/(TP+FN): {TP/(TP+FN)}")
print(f"False Negative Rate - FN/(TP+FN): {FN/(TP+FN)}")

Classification report: 
               precision    recall  f1-score   support

           B       0.99      1.00      0.99        71
           M       1.00      0.98      0.99        43

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114

Accuracy score: 0.9912280701754386 

True Positives (TP): 42
False Positives (FP): 0
True Negatives (TN): 71
False Negatives (FN): 1 

Recall - TP/(TP+FN): 0.9767441860465116
False Negative Rate - FN/(TP+FN): 0.023255813953488372


In [305]:
#Confusion Matrix for both classes (essentially overkill!)
from sklearn.metrics import (classification_report, confusion_matrix, multilabel_confusion_matrix)
from sklearn.metrics import (mean_squared_error, accuracy_score, precision_score, recall_score)

import warnings
warnings.filterwarnings('ignore')  

print("Accuracy: ", accuracy_score(y_test, predict))

allMatricies = multilabel_confusion_matrix(y_test, predict)
print("\n")
for i, matrix in enumerate(allMatricies):
    TN, FP, FN, TP = matrix.ravel()

    #Explicitly writing out the confusion matrix values for each category
    print(f"Confusion Matrix for {i}:")
    print(f"  True Positives (TP): {TP}")
    print(f"  False Positives (FP): {FP}")
    print(f"  True Negatives (TN): {TN}")
    print(f"  False Negatives (FN): {FN}")
    print("\n")

print(classification_report(y_test, predict))


Accuracy:  1.0


Confusion Matrix for 0:
  True Positives (TP): 72
  False Positives (FP): 0
  True Negatives (TN): 42
  False Negatives (FN): 0


Confusion Matrix for 1:
  True Positives (TP): 42
  False Positives (FP): 0
  True Negatives (TN): 72
  False Negatives (FN): 0


              precision    recall  f1-score   support

           B       1.00      1.00      1.00        72
           M       1.00      1.00      1.00        42

    accuracy                           1.00       114
   macro avg       1.00      1.00      1.00       114
weighted avg       1.00      1.00      1.00       114



In [337]:
#Using the top five most important features
df = pd.read_csv(csv_file)
y = df['y']
df.drop('y', axis='columns', inplace=True)

#MinMaxScaler
scaler = MinMaxScaler(feature_range = (0,1))
model=scaler.fit(df)
scaled_data=model.transform(df)
scaled_data=pd.DataFrame(data = scaled_data, columns = df.columns)
df = scaled_data.join(y)

#Using top 5 features
X = df[['x.area_worst', 'x.concave_pts_worst', 'x.radius_worst', 'x.perimeter_worst', 'x.concave_pts_mean']]
y = df['y']

#print(df.head())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state = 42)

lr = LogisticRegression()
lr.fit(X_train, y_train)
predict = lr.predict(X_test)

from sklearn.model_selection import cross_validate

CV = cross_validate(lr, X, y, cv=10, scoring=["accuracy"])
print("Accuracy")
print(CV["test_accuracy"])

print("Average Accuracy = ", sum(CV["test_accuracy"]) / len(CV["test_accuracy"]))

#Classic metrics analysis
print(f"Classification report: \n {classification_report(y_test, predict)}")
print(f"Accuracy score: {accuracy_score(y_test, predict)} \n")

matrix = confusion_matrix(y_test, predict)
TN, FP, FN, TP = matrix.ravel()
print(f"True Positives (TP): {TP}")
print(f"False Positives (FP): {FP}")
print(f"True Negatives (TN): {TN}")
print(f"False Negatives (FN): {FN} \n")

print(f"Recall - TP/(TP+FN): {TP/(TP+FN)}")
print(f"False Negative Rate - FN/(TP+FN): {FN/(TP+FN)}")

Accuracy
[0.92982456 0.85964912 0.96491228 0.94736842 0.94736842 0.98245614
 0.92982456 1.         0.94736842 0.94642857]
Average Accuracy =  0.9455200501253133
Classification report: 
               precision    recall  f1-score   support

           B       0.91      0.99      0.95        71
           M       0.97      0.84      0.90        43

    accuracy                           0.93       114
   macro avg       0.94      0.91      0.92       114
weighted avg       0.93      0.93      0.93       114

Accuracy score: 0.9298245614035088 

True Positives (TP): 36
False Positives (FP): 1
True Negatives (TN): 70
False Negatives (FN): 7 

Recall - TP/(TP+FN): 0.8372093023255814
False Negative Rate - FN/(TP+FN): 0.16279069767441862
