In [None]:
## Optional installs
# Run this cell only if the packages below are missing from the kernel's environment;
# `%pip` (rather than `!pip`) installs into the environment the notebook kernel is using.
%pip install numpy pandas scikit-learn matplotlib

In [None]:
## Imports
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Python & Pandas Refresher

### Load in the data

In [None]:
# Fetch scikit-learn's bundled breast cancer dataset
cancer = load_breast_cancer()

# Build one table: the feature matrix with the labels appended as a final 'target' column
df = pd.DataFrame(
    data=np.column_stack((cancer['data'], cancer['target'])),
    columns=cancer['feature_names'].tolist() + ['target'],
)

# Preview the first 5 rows
df.head()

### Choose your parameters and target variable
**The features (loosely called "parameters" here) are the input columns your model uses to make predictions; the target variable is the value that your model aims to predict.**

**In our breast cancer case, the target variable is called `target`, where 0 represents a malignant tumor and 1 represents a benign tumor (see `cancer['target_names']`).**

##### **Categorical vs Continuous Data**
Categorical means the target variable is organized into buckets or "categories".

Continuous means the target variable is $\in {\mathbb{R}}$ (fancy for saying the target is a continuous or numeric value)

In the breast cancer case, the target is a categorical variable because we only have two buckets (categories) classifying the data as benign or malignant



In [None]:
# Separate the label column (y) from the predictor columns (X)
y = df['target']
X = df.loc[:, df.columns != 'target']

### Scale the data (if necessary)

In [None]:
# Standardize the data.
# The scaler's mean/std are learned from the training rows only, so no test-set
# statistics leak into the preprocessing. train_test_split picks rows purely from
# the number of samples, test_size and random_state, so using the same values as the
# train/test split cell that follows selects exactly the rows that become X_train.
# Keep test_size/random_state here in sync with that cell.
train_rows = train_test_split(X, test_size=0.2, random_state=42)[0]
scaler = StandardScaler().fit(train_rows)
X = scaler.transform(X)

#### Split the data into training and testing sets

In [None]:
# Hold out 20% of the standardized rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Let's look at a simple ML model

Remember decision trees?

In [None]:
# Fit a decision tree as a simple baseline.
# random_state is fixed because scikit-learn permutes the features at every split and
# breaks ties between equally good splits at random, so an unseeded tree can report
# different metrics on each run.
tree_model = DecisionTreeClassifier(random_state=42)
linear_model = tree_model  # alias: keeps the original variable name working for any later cell
tree_model.fit(X_train, y_train)
y_pred = tree_model.predict(X_test)

# Calculate accuracy, precision, and recall
# (precision and recall treat label 1 as the positive class: scikit-learn's default pos_label)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print("Confusion Matrix:")
print(confusion)

##### The results look pretty good, but can we do better?

## Neural Networks (MLP)

In [None]:
# Reload the breast cancer dataset so this section starts from the raw data
cancer = load_breast_cancer()

# Feature matrix plus the labels as a trailing 'target' column
feature_and_target_values = np.column_stack((cancer['data'], cancer['target']))
df = pd.DataFrame(
    feature_and_target_values,
    columns=cancer['feature_names'].tolist() + ['target'],
)


In [None]:
# Predictors are every column except 'target'; the label is the 'target' column itself
X, y = df.drop(columns=['target']), df['target']

In [None]:
# Standardize the data.
# Fit the scaler on the training rows only so the test set does not influence the
# mean/std used for scaling. The rows are chosen with the same test_size and
# random_state as the train/test split cell that follows, which makes train_test_split
# pick the same rows; keep the two in sync.
train_rows = train_test_split(X, test_size=0.2, random_state=42)[0]
scaler = StandardScaler().fit(train_rows)
X = scaler.transform(X)

In [None]:
# 80/20 train/test partition of the standardized data, seeded so it is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

##### Let's look at the model as it comes, out of the box

In [None]:
# Define the MLPClassifier model with its default ("out of the box") hyperparameters.
# Only random_state is set: weight initialization and batch sampling are random, so
# without a seed the scores change from run to run.
model = MLPClassifier(random_state=42)

[Scikit learn MLPClassifier documentation](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html)

In [None]:
# Train on the training split, then predict labels for the held-out test split
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions; precision and recall treat label 1 as the positive class
# (scikit-learn's default pos_label)
accuracy, precision, recall = [
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
]
confusion = confusion_matrix(y_test, y_pred)

# Report the scores
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print("Confusion Matrix:")
print(confusion)

##### Are there any hyperparameters that make the model work better?

In [None]:
# Base estimator for the hyperparameter search: seeded for repeatability, with a large
# iteration budget and early stopping (10% of the training data is held out for
# validation; training stops after 10 epochs without improvement)
model = MLPClassifier(
    random_state=42,
    max_iter=2000,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
)

In [None]:
# Candidate values for each MLP hyperparameter (3 * 2 * 2 * 3 * 3 = 108 combinations)
param_grid = dict(
    hidden_layer_sizes=[(50, 50), (100, 100), (50, 100, 50)],
    activation=['relu', 'tanh'],
    solver=['adam', 'sgd'],
    alpha=[0.0001, 0.001, 0.01],
    batch_size=[32, 64, 128],
)

In [None]:
# Exhaustively evaluate the grid with 5-fold cross-validation on the training split,
# ranking candidates by accuracy and using all available CPU cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
# Get the best model from grid search
# (best_estimator_ is the estimator refit on the full training split with the
# best-scoring hyperparameter combination; it exists because refit is left at its
# default of True)
best_model = grid_search.best_estimator_

In [None]:
# Make predictions on the testing data
# (X_test was not passed to grid_search.fit, so these are held-out predictions)
y_pred = best_model.predict(X_test)

In [None]:
# Score the tuned network on the test split; precision and recall treat label 1 as
# the positive class (scikit-learn's default pos_label)
accuracy, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Report the winning hyperparameter combination, then the test-set scores
print("Best NN hyperparameters found:")
print(grid_search.best_params_)
for label, value in (("Accuracy:", accuracy), ("Precision:", precision), ("Recall:", recall)):
    print(label, value)
print('Confusion matrix:\n', cm)

## SVM

Support Vector Machine (SVM) is a supervised machine learning algorithm used for classification and regression tasks. SVM is particularly well-suited for classification tasks. 

SVM aims to find the optimal hyperplane in an N-dimensional space to separate data points into different classes. The algorithm maximizes the margin between the closest points of different classes.

Let's make an SVM for our breast cancer data set.

In [None]:
# Reload the breast cancer dataset for the SVM section
cancer = load_breast_cancer()

# One DataFrame: the feature columns followed by a 'target' label column
all_column_names = cancer['feature_names'].tolist() + ['target']
df = pd.DataFrame(
    np.column_stack((cancer['data'], cancer['target'])),
    columns=all_column_names,
)

In [None]:
# Label column first, then everything else as the feature matrix
y = df['target']
X = df.drop(columns='target')

In [None]:
# Standardize the data.
# The scaler is fit on the training rows only, so test-set statistics never leak into
# the scaling. Those rows are selected with the same test_size and random_state as the
# train/test split cell that follows, which makes train_test_split choose the same
# rows; keep the two in sync.
train_rows = train_test_split(X, test_size=0.2, random_state=42)[0]
scaler = StandardScaler().fit(train_rows)
X = scaler.transform(X)

In [None]:
# Reserve 20% of the rows as the test set; random_state pins which rows are chosen
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Base SVM estimator. probability=True enables predict_proba; the probability
# estimates come from an internal cross-validation that shuffles the data, so
# random_state is fixed to make them repeatable.
svc = SVC(probability=True, random_state=42)

[Scikit learn SVC documentation](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)

In [None]:
# Search space for the SVM: regularization strength C (log-spaced), kernel type,
# and the kernel coefficient setting gamma
hyperparams = dict(
    C=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

In [None]:
# Perform hyperparameter tuning using k-fold cross-validation (k=5), scored by accuracy.
# n_jobs=-1 runs the candidate fits on all CPU cores, matching the MLP grid search;
# it does not change which hyperparameters are selected.
grid_search = GridSearchCV(svc, hyperparams, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Estimator refit on the whole training split with the best-scoring combination
best_svc = grid_search.best_estimator_

In [None]:
# Predicted class labels for the held-out test rows
y_pred = best_svc.predict(X_test)

# Estimated probability of the second class: column 1 of predict_proba,
# which corresponds to best_svc.classes_[1]
class_probabilities = best_svc.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

In [None]:
# Score the tuned SVM on the test split; precision, recall and F1 treat label 1 as
# the positive class (scikit-learn's default pos_label)
acc, prec, rec, f1 = [
    score(y_test, y_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
]
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Print the best SVM hyperparameters followed by the test-set performance metrics
print("Best SVM hyperparameters found:")
print(grid_search.best_params_)  # the header above announces these, so actually show them
print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', rec)
print('F1 score:', f1)
print('Confusion matrix:\n', cm)

## Try It Yourself

1. Choose a data set