In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from scipy import stats

# Location of the raw CSV, kept in one named constant so it can be changed in
# a single place when the notebook runs on another machine.
DATA_PATH = "/Users/akshay/Desktop/dsbda_practical/newdata/breast-cancer.csv"

# Load the dataset. raw_df is never modified, so the file contents stay
# available for inspection after cleaning.
raw_df = pd.read_csv(DATA_PATH)

# --- i. Data Cleaning (Remove NA, ?, Negative values etc.) ---

# Replace '?' with NaN and drop missing values.
# Columns that hold no data at all are removed first: a single fully empty
# column would otherwise make the row-wise dropna() discard every row.
# The final copy() gives an independent frame, so the column assignment
# below does not write into a filtered view.
df = (
    raw_df
    .replace('?', np.nan)
    .dropna(axis=1, how='all')
    .dropna()
    .copy()
)

# Convert all numeric columns (everything except 'id' and 'diagnosis').
# Entries that cannot be parsed as numbers become NaN.
numeric_cols = df.columns.drop(['id', 'diagnosis'])
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Drop rows with negative values. NaN produced by the coercion above fails the
# `>= 0` comparison, so rows with unparseable entries are removed here as well.
df = df[(df[numeric_cols] >= 0).all(axis=1)]

print("✅ Cleaned Data Sample:")
print(df.head())

# --- j. Error Correcting (Outlier detection and removal) ---

# Compute the absolute z-score of every value in the numeric columns, then
# keep a row only if all of its z-scores are below 3.
feature_values = df[numeric_cols]
z_scores = np.abs(stats.zscore(feature_values))
is_inlier = (z_scores < 3).all(axis=1)
df = df[is_inlier]

# Summary statistics of the rows that survived the filter
outlier_free_summary = df.describe()
print("\n✅ After Removing Outliers:")
print(outlier_free_summary)

# --- k. Data Transformation ---

# Encode 'diagnosis' to 0 (benign) and 1 (malignant)
diagnosis_codes = df['diagnosis'].map({'B': 0, 'M': 1})

# map() turns any label other than 'B' / 'M' into NaN. Stop here with a clear
# message instead of passing NaN targets on to the models.
unmapped = diagnosis_codes.isna()
if unmapped.any():
    unexpected = sorted(df.loc[unmapped, 'diagnosis'].astype(str).unique())
    raise ValueError(f"Unexpected diagnosis labels: {unexpected}")

# assign() returns a new frame. At this point `df` is the result of boolean
# filtering, and writing a column straight into such a frame triggers pandas'
# SettingWithCopyWarning.
df = df.assign(diagnosis=diagnosis_codes)

# Separate features and target
X = df.drop(columns=['id', 'diagnosis'])
y = df['diagnosis']

# Normalize features: zero mean / unit variance per column.
# Note that this scaler is fitted on every row of X, before any train/test split.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("\n✅ Transformed Features:")
print(pd.DataFrame(X_scaled, columns=X.columns).head())

# --- l. Build Models & Compare ---

# Train-test split on the *unscaled* features.
# X_scaled was standardised with the mean and variance of every row, so
# splitting it would let the held-out rows influence the training features.
# test_size and random_state are unchanged from the original split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply those same statistics
# to the test rows, so the test set stays unseen during preprocessing.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Logistic Regression
lr_model = LogisticRegression(max_iter=500)
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
lr_acc = accuracy_score(y_test, lr_preds)

# Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)
nb_acc = accuracy_score(y_test, nb_preds)

# Results: accuracy of each model on the held-out 20% of rows
print("\n✅ Accuracy Comparison for Breast Cancer Prediction:")
print(f"Logistic Regression Accuracy: {lr_acc * 100:.2f}%")
print(f"Naive Bayes Accuracy: {nb_acc * 100:.2f}%")

✅ Cleaned Data Sample:
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  radius_worst  texture_worst  pe