In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Step i: Load & clean data
df = pd.read_csv("breast-cancer.csv")

# Drop columns with missing or unknown values
df.replace(['?', 'na', 'NA', 'NaN'], np.nan, inplace=True)
df.dropna(inplace=True)

# Remove negative values
df = df[(df.select_dtypes(include=[np.number]) >= 0).all(axis=1)]

# Step j: Outlier removal using Z-score
from scipy import stats
z_scores = np.abs(stats.zscore(df.select_dtypes(include=[np.number])))
df = df[(z_scores < 3).all(axis=1)]

# Step k: Transform diagnosis to 0/1
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Separate features and target
X = df.drop(['id', 'diagnosis'], axis=1)
y = df['diagnosis']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step l: Model 1 - Naïve Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_preds = nb.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_preds))

# Model 2 - Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_preds))

# Classification Reports
print("\nNaive Bayes Report:\n", classification_report(y_test, nb_preds))
print("\nLogistic Regression Report:\n", classification_report(y_test, lr_preds))


Naive Bayes Accuracy: 0.9489795918367347
Logistic Regression Accuracy: 0.9897959183673469

Naive Bayes Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96        59
           1       0.95      0.92      0.94        39

    accuracy                           0.95        98
   macro avg       0.95      0.94      0.95        98
weighted avg       0.95      0.95      0.95        98


Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        59
           1       1.00      0.97      0.99        39

    accuracy                           0.99        98
   macro avg       0.99      0.99      0.99        98
weighted avg       0.99      0.99      0.99        98

