In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

#Import Dataset
df = pd.read_csv('/Users/maryamazgomi/Documents/ECS171/brca.csv') #Breast Cancer Wisconsin Dataset

#Remove Index Column
if "Unnamed: 0" in df.columns:
    df.drop(columns=["Unnamed: 0"], inplace=True)

#Encode Target Variable (Diagnosis Column)
#Benign = 0, Malignant = 1
label_encoder = LabelEncoder()
df["y"] = label_encoder.fit_transform(df["y"])

# Normalize the feature columns
scaler = StandardScaler()
feature_columns = df.columns[:-1]  # All columns except 'y'
df[feature_columns] = scaler.fit_transform(df[feature_columns])

#Define parameters
#Number of Bootstrap Samples
#Equate the sample size to number of samples in dataset
n_iterations = 1000 
sample_size = len(df)

#Drop unecessary columns that aren't needed for calculation
X = df.drop(columns=["y"]).values
Y = df["y"].values

#Split Data into Training and Testing Sets
#80/20 split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.80, random_state=42)

# Store accuracy results
accuracy_scores = []

for i in range(n_iterations):
    #Generate Bootstrap Sample that Includes Replacement
    indices = np.random.choice(range(sample_size), size=sample_size, replace=True)
    X_bootstrap, Y_bootstrap = X[indices], Y[indices]

    #Train Logistic Regression Model
    model = LogisticRegression(max_iter=200)
    model.fit(X_bootstrap, Y_bootstrap)

    #Confidence Intervals
    y_pred = model.predict(X_test)
    accuracy_scores.append(accuracy_score(Y_test, y_pred))


mean_accuracy = np.mean(accuracy_scores)

alpha = 100-95
conf_interval = np.percentile(accuracy_scores, [alpha/2, 100-alpha/2])

print(f"Mean Accuracy: {mean_accuracy:.4f}")
print(f"569 samples were used to the 95% Confidence Interval of: [{conf_interval[0]:.4f}, {conf_interval[1]:.4f}]")

Mean Accuracy: 0.9801
569 samples were used to the 95% Confidence Interval of: [0.9649, 0.9912]
