In [231]:
## Optional installs
%pip install numpy pandas scikit-learn matplotlib


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [232]:
## Imports
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

## Python & Pandas Refresher

### Load in the data

In [233]:
# Load the breast cancer dataset
cancer = load_breast_cancer()
df = pd.DataFrame(np.c_[cancer['data'], cancer['target']],
                  columns=np.append(cancer['feature_names'], ['target']))

# Display the first 5 rows of the dataframe
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


### Choose your parameters and target variable
**The parameters are the features that your model uses to make predictions, the target variable is the value that your model aims to predict.**

**In our breast cancer case, the target variable is called `target` where 0 represents a benign tumor and a 1 represents a malignant tumor**

##### **Categorical vs Continuous Data**
Categorical means the target variable is organized into buckets or "categories".

Continuous means the target variable is $\in {\mathbb{R}}$ (fancy for saying the target is a continuous or numeric value)

In the breast cancer case, the target is a categorical variable because we only have two buckets (categories) classifying the data as benign or malignant



In [234]:
# Split the data into features and target
X = df.drop(columns=['target'])
y = df['target']

### Scale the data (if necessary)

In [235]:
# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

#### Split the data into training and testing sets

In [236]:
# Split the standardized dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Let's look at a simple ML model

Remember decision trees?

In [237]:
from sklearn.tree import DecisionTreeClassifier
linear_model = DecisionTreeClassifier()
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)

# Calculate accuracy, precision, and recall
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print("Confusion Matrix:")
print(confusion)

Accuracy: 0.9385964912280702
Precision: 0.9444444444444444
Recall: 0.9577464788732394
Confusion Matrix:
[[39  4]
 [ 3 68]]


##### The results look pretty good, but can we do better?

## Neural Networks (MLP)

In [238]:
# Load the breast cancer dataset
cancer = load_breast_cancer()
df = pd.DataFrame(np.c_[cancer['data'], cancer['target']],
                  columns=np.append(cancer['feature_names'], ['target']))


In [239]:
X = df.drop(columns=['target'])
y = df['target']

In [240]:
# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [241]:
# Split the standardized dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

##### Let's look at the model as it comes, out of the box

In [242]:
# Define the MLPClassifier model
model = MLPClassifier()

In [243]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Calculate accuracy, precision, and recall
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print("Confusion Matrix:")
print(confusion)

Accuracy: 0.9736842105263158
Precision: 0.9722222222222222
Recall: 0.9859154929577465
Confusion Matrix:
[[41  2]
 [ 1 70]]




##### Are there any hyperparameters that make the model work better?

In [244]:
model = MLPClassifier(random_state=42, max_iter=2000, early_stopping=True, validation_fraction=0.1, n_iter_no_change=10)

In [245]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'hidden_layer_sizes': [(50, 50), (100, 100), (50, 100, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'batch_size': [32, 64, 128],
}

In [246]:
# Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [247]:
# Get the best model from grid search
best_model = grid_search.best_estimator_

In [248]:
# Make predictions on the testing data
y_pred = best_model.predict(X_test)

In [249]:
# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [250]:
# Print the best hyperparameters and performance metrics
print("Best NN hyperparameters found:")
print(grid_search.best_params_)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print('Confusion matrix:\n', cm)

Best NN hyperparameters found:
{'activation': 'tanh', 'alpha': 0.0001, 'batch_size': 64, 'hidden_layer_sizes': (50, 100, 50), 'solver': 'adam'}
Accuracy: 0.9736842105263158
Precision: 0.9722222222222222
Recall: 0.9859154929577465
Confusion matrix:
 [[41  2]
 [ 1 70]]


## SVM

In [251]:
# Load breast cancer dataset
cancer = load_breast_cancer()
df = pd.DataFrame(np.c_[cancer['data'], cancer['target']],
                  columns=np.append(cancer['feature_names'], ['target']))

In [252]:
X = df.drop(columns=['target'])
y = df['target']

In [253]:
# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [254]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [255]:
# Define hyperparameters to tune
hyperparams = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 
                'kernel': ['linear', 'rbf'], 
                'gamma': ['scale', 'auto']}

In [256]:
svc = SVC(probability=True)
# Perform hyperparameter tuning using k-fold cross-validation
grid_search = GridSearchCV(svc, hyperparams, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_svc = grid_search.best_estimator_

In [257]:
# Make predictions on test set
y_pred = best_svc.predict(X_test)
y_prob = best_svc.predict_proba(X_test)[:, 1]

In [258]:
# Calculate performance metrics
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [259]:
# Print SVM performance metrics
print("Best SVM hyperparameters found:")
print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', rec)
print('F1 score:', f1)
print('Confusion matrix:\n', cm)

Best SVM hyperparameters found:
Accuracy: 0.9824561403508771
Precision: 0.9726027397260274
Recall: 1.0
F1 score: 0.9861111111111112
Confusion matrix:
 [[41  2]
 [ 0 71]]


## Try It Yourself