In [1]:
import pandas as pd
import numpy as np

# Location of the Breast Cancer Wisconsin dataset (CSV).
# NOTE: this is an absolute, machine-specific path; point it at your own copy
# of brca.csv before running on another machine.
DATA_PATH = '/Users/maryamazgomi/Documents/ECS171/brca.csv'
df = pd.read_csv(DATA_PATH)

from sklearn.preprocessing import StandardScaler, LabelEncoder

# Drop the "Unnamed: 0" index column if it exists. Reassigning (instead of
# inplace=True) keeps the step a plain expression with no hidden mutation.
if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])

# Encode the target variable as integers. LabelEncoder assigns codes in sorted
# label order, so with the labels "B" and "M" this yields B -> 0, M -> 1.
label_encoder = LabelEncoder()
df["y"] = label_encoder.fit_transform(df["y"])

# Standardize the feature columns (zero mean, unit variance per column).
# Features are selected by NAME (everything except "y") rather than by
# position: a positional slice such as df.columns[:-1] silently assumes "y" is
# the last column and would otherwise scale the target and skip a real feature.
scaler = StandardScaler()
feature_columns = df.columns.drop("y")
df[feature_columns] = scaler.fit_transform(df[feature_columns])

# Preview the first few rows after preprocessing.
# NOTE: a bare expression is only rendered when it is the LAST statement of a
# notebook cell; in the middle of a cell this line produces no output.
df.head()

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

# --- Bootstrap / out-of-bag (OOB) estimate of logistic-regression accuracy ---
# Each iteration draws a bootstrap sample (with replacement), trains on it, and
# scores the model on the rows that were NOT drawn (the OOB rows). The spread of
# the OOB accuracies gives a percentile interval for the accuracy.

# Define parameters
n_iterations = 200  # Number of bootstrap samples
sample_size = len(df)  # Sample size same as original dataset
RANDOM_SEED = 42  # Fixed seed so the reported numbers are reproducible
rng = np.random.default_rng(RANDOM_SEED)

# Separate features and target variable
X = df.drop(columns=["y"]).values
y = df["y"].values

# Store accuracy results (one entry per usable bootstrap iteration)
accuracy_scores = []

for _ in range(n_iterations):
    # Generate a bootstrap sample (row positions drawn with replacement)
    indices = rng.choice(sample_size, size=sample_size, replace=True)
    X_bootstrap, y_bootstrap = X[indices], y[indices]

    # Identify the out-of-bag (OOB) samples: row positions never drawn above
    oob_indices = np.setdiff1d(np.arange(sample_size), indices)
    if oob_indices.size == 0:
        continue  # Skip iteration if no OOB samples

    # A draw that contains a single class cannot be fitted by the classifier
    if np.unique(y_bootstrap).size < 2:
        continue

    X_oob, y_oob = X[oob_indices], y[oob_indices]

    # Train a logistic regression model. The scaler is fitted INSIDE the
    # iteration, on the bootstrap rows only, so no statistics of the OOB rows
    # leak into training. Standardizing is unaffected by an earlier per-column
    # shift/rescale, so this holds even though `df` was already standardized
    # on the full dataset before this point.
    model = make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=200, solver="liblinear", random_state=RANDOM_SEED),
    )
    model.fit(X_bootstrap, y_bootstrap)

    # Evaluate model on OOB samples
    y_pred = model.predict(X_oob)
    acc = accuracy_score(y_oob, y_pred)
    accuracy_scores.append(acc)

# Fail loudly instead of reporting NaN statistics if every iteration was skipped
if not accuracy_scores:
    raise RuntimeError("No bootstrap iteration produced a usable out-of-bag evaluation.")

# Summarize: mean OOB accuracy and the 2.5th / 97.5th percentiles of the scores
mean_accuracy = np.mean(accuracy_scores)
conf_interval = np.percentile(accuracy_scores, [2.5, 97.5])

print(f"Mean Accuracy: {mean_accuracy:.4f}")
print(f"95% Confidence Interval: [{conf_interval[0]:.4f}, {conf_interval[1]:.4f}]")

Mean Accuracy: 0.9736
95% Confidence Interval: [0.9523, 0.9912]
