In [15]:
import pandas as pd

# Load the dataset to get a basic understanding of its structure and contents

dat = pd.read_csv('Downloads/breast_cancer_data.csv')

# Display the first few rows of the dataset to understand its features
data.head()


Unnamed: 0.1,Unnamed: 0,ID,Diagnosis,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,Diagnosis_numeric
0,0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


In [16]:
# Convert 'Diagnosis' to a numerical format where Malignant (M) = 1 and Benign (B) = 0
data['Diagnosis_numeric'] = data['Diagnosis'].apply(lambda x: 1 if x == 'M' else 0)

# Selecting the specified features for correlation analysis
features_to_analyze = ['texture1', 'area1', 'area3', 'smoothness1', 'smoothness3', 'Diagnosis_numeric']

# Compute the correlation matrix for the specified features and the diagnosis
correlation_matrix = data[features_to_analyze].corr()

# Display the correlation values between features and the diagnosis
correlation_matrix['Diagnosis_numeric']


texture1             0.415185
area1                0.708984
area3                0.733825
smoothness1          0.358560
smoothness3          0.421465
Diagnosis_numeric    1.000000
Name: Diagnosis_numeric, dtype: float64

In [17]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Selecting the features and target for the model
features = data[['texture1', 'area1', 'smoothness1']]
target = data['Diagnosis_numeric']

# Scaling the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Splitting the dataset into training and testing sets for initial model fitting
# This split is only necessary for an initial look at model performance. Cross-validation will be used for a more robust evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

# Initializing models
log_reg = LogisticRegression(random_state=42)
svm_model = SVC(random_state=42)
rf_model = RandomForestClassifier(random_state=42)

# Dictionary to store models and their names for easy access
models = {
    "Logistic Regression": log_reg,
    "SVM": svm_model,
    "Random Forest": rf_model
}

# Cross-validation scores
cv_scores = {}

for model_name, model in models.items():
    # Compute 10-fold cross-validation scores
    scores = cross_val_score(model, features_scaled, target, cv=10)
    cv_scores[model_name] = scores.mean()

cv_scores


{'Logistic Regression': 0.9190476190476191,
 'SVM': 0.9296679197994988,
 'Random Forest': 0.9209273182957395}

In [10]:
from sklearn.utils import resample
import numpy as np

# Function to perform bootstrapping and estimate model accuracy
def bootstrap_accuracy(model, X, y, n_iterations=1000, test_size=0.2):
    np.random.seed(42)  # For reproducibility
    accuracy_scores = []
    
    # Total number of samples in the dataset
    n_samples = X.shape[0]
    n_test_samples = int(n_samples * test_size)
    
    for _ in range(n_iterations):
        # Randomly sample with replacement from the original dataset
        X_resampled, y_resampled = resample(X, y)
        
        # Split the resampled data into training and testing sets
        X_train_resampled = X_resampled[:-n_test_samples]
        y_train_resampled = y_resampled[:-n_test_samples]
        X_test_resampled = X_resampled[-n_test_samples:]
        y_test_resampled = y_resampled[-n_test_samples:]
        
        # Fit the model and evaluate on the test set
        model.fit(X_train_resampled, y_train_resampled)
        predictions = model.predict(X_test_resampled)
        accuracy = accuracy_score(y_test_resampled, predictions)
        
        accuracy_scores.append(accuracy)
    
    return accuracy_scores

# Using SVM model for bootstrapping
svm_accuracy_scores = bootstrap_accuracy(svm_model, features_scaled, target)

# Calculate the mean accuracy and the 95% confidence interval of the accuracy scores
mean_accuracy = np.mean(svm_accuracy_scores)
confidence_interval = np.percentile(svm_accuracy_scores, [2.5, 97.5])

mean_accuracy, confidence_interval


(0.936716814159292, array([0.88495575, 0.98230088]))