# Downloading and Formatting Dataset

In [72]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [73]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_diagnostic.data.features 
y = breast_cancer_wisconsin_diagnostic.data.targets 
  
# metadata 
print(breast_cancer_wisconsin_diagnostic.metadata) 
  
# variable information 
print(breast_cancer_wisconsin_diagnostic.variables) 

{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'published_in': 'Electronic imaging', 'year': 1993, 'url': 'https://www.semanticscholar.org/paper/53

## Make pandas df of data

In [74]:
# get list of column names 
import pandas as pd

variables = pd.DataFrame(breast_cancer_wisconsin_diagnostic.variables)
headerlist = variables['name'].to_list()
print(headerlist)

['ID', 'Diagnosis', 'radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1', 'compactness1', 'concavity1', 'concave_points1', 'symmetry1', 'fractal_dimension1', 'radius2', 'texture2', 'perimeter2', 'area2', 'smoothness2', 'compactness2', 'concavity2', 'concave_points2', 'symmetry2', 'fractal_dimension2', 'radius3', 'texture3', 'perimeter3', 'area3', 'smoothness3', 'compactness3', 'concavity3', 'concave_points3', 'symmetry3', 'fractal_dimension3']


In [75]:
# create dataframe to see data better
mydata = pd.read_csv("wdbc.data", names = ['ID', 'Diagnosis', 'radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1', 'compactness1', 'concavity1', 'concave_points1', 'symmetry1', 'fractal_dimension1', 'radius2', 'texture2', 'perimeter2', 'area2', 'smoothness2', 'compactness2', 'concavity2', 'concave_points2', 'symmetry2', 'fractal_dimension2', 'radius3', 'texture3', 'perimeter3', 'area3', 'smoothness3', 'compactness3', 'concavity3', 'concave_points3', 'symmetry3', 'fractal_dimension3'])
mydata

Unnamed: 0,ID,Diagnosis,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [76]:
# Convert string to float: 'M' 
# y['Diagnosis'] = pd.to_numeric(y['Diagnosis'])
# y = y.value..replace(',', '').astype(float)

import pandas as pd

y = y.replace('M', 1)
y = y.replace('B', 0)
print(y)

     Diagnosis
0            1
1            1
2            1
3            1
4            1
..         ...
564          1
565          1
566          1
567          1
568          0

[569 rows x 1 columns]


# Using cross-validation on logistic regression w/ L1 penalty

In [77]:
import scipy as sp, numpy as np
from sklearn.model_selection import LeaveOneOut
import matplotlib.pyplot as plt
from scipy.io import loadmat 
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [78]:
# split data into folds
def kfold_indices(X, k):
    fold_size = len(X) // k
    indices = np.arange(len(X))
    folds = []
    for i in range(k):
        test_indices = indices[i * fold_size: (i + 1) * fold_size]
        train_indices = np.concatenate([indices[:i * fold_size], indices[(i + 1) * fold_size:]])
        folds.append((train_indices, test_indices))
    return folds

# Define the number of folds (K)
k = 10

# Get the fold indices
fold_indices = kfold_indices(X, k)
# print(fold_indices)

In [79]:
# Scaling the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [80]:
## Using 10-fold cross validation for logistic regression with LASSO penalty 

# Initialize machine learning model 
model = LogisticRegression(penalty = 'l1', solver = 'saga', max_iter = 50000) # Use lasso and saga

# Initialize a list to store the evaluation scores
r2_scores_list = []

# Iterate through each fold
for train_indices, test_indices in fold_indices:
    X_train, y_train = X[train_indices,], y.iloc[train_indices]
    X_test, y_test = X[test_indices], y.iloc[test_indices]
    
    # Train the model on the training data
    model.fit(X_train, y_train.to_numpy().ravel())
    
    # Make predictions on the test data
    y_pred = model.predict(X_test)
    
    # Calculate the r2 score for this fold
    r2_scores = r2_score(y_test.values.astype(float), y_pred.astype(float)) 
    
    # Append the r2 score to the list of scores
    r2_scores_list.append(r2_scores)
      
# Calculate the mean r2 score across all folds
mean_r2score = np.mean(r2_scores_list)
print("The mean R^2 score is: ", mean_r2score)

The mean R^2 score is:  0.8600405104553278


In [81]:
## Using Leave-one-out CV for logistic regression w/ LASSO penalty 

# Initialize a list to store the evaluation scores
r2_scores_list = []

for train_indices, test_indices in LeaveOneOut().split(X):  
    
    # Train the model on the training data
    model.fit(X_train, y_train.to_numpy().ravel())
    
    # Make predictions on the test data
    y_pred_loo = model.predict(X_test)
    
    # Calculate the r2 score for this fold
    r2_scores = r2_score(y_test.values.astype(float), y_pred_loo.astype(float)) 
    
    # Append the r2 score to the list of scores
    r2_scores_list.append(r2_scores)
      
# Calculate the mean r2 score across all folds
mean_r2score = np.mean(r2_scores_list)
print("The mean R^2 score is: ", mean_r2score)

The mean R^2 score is:  0.8676122931442078


# Using cross validation on SVM

In [82]:
## Using 10-fold cross validation for SVM 

# Initialize machine learning model 
model2 = SVC(kernel = 'rbf')

# Initialize a list to store the evaluation scores
r2_scores_list = []

# Iterate through each fold
for train_indices, test_indices in fold_indices:
    X_train, y_train = X[train_indices,], y.iloc[train_indices]
    X_test, y_test = X[test_indices], y.iloc[test_indices]
    
    # Train the model on the training data
    model.fit(X_train, y_train.to_numpy().ravel())
    
    # Make predictions on the test data
    y_pred = model.predict(X_test)
    
    # Calculate the r2 score for this fold
    r2_scores = r2_score(y_test.values.astype(float), y_pred.astype(float)) 
    
    # Append the r2 score to the list of scores
    r2_scores_list.append(r2_scores)
      
# Calculate the mean r2 score across all folds
mean_r2score = np.mean(r2_scores_list)
print("The mean R^2 score is: ", mean_r2score)

The mean R^2 score is:  0.8600405104553278


In [86]:
## Using Leave-one-out CV for logistic regression w/ LASSO penalty 

# Initialize a list to store the evaluation scores
r2_scores_list = []

for train_indices, test_indices in LeaveOneOut().split(X):  
    
    # Train the model on the training data
    model2.fit(X_train, y_train.to_numpy().ravel())
    
    # Make predictions on the test data
    y_pred_loo = model2.predict(X_test)
    
    # Calculate the r2 score for this fold
    r2_scores = r2_score(y_test.values.astype(float), y_pred_loo.astype(float)) 
    
    # Append the r2 score to the list of scores
    r2_scores_list.append(r2_scores)
      
# Calculate the mean r2 score across all folds
mean_r2score = np.mean(r2_scores_list)
print("The mean R^2 score is: ", mean_r2score)

The mean R^2 score is:  0.7352245862884161
