# Crush Rig - LAB
Written by Matt MacDonald for CIGITI at the Hospital for Sick Children Toronto

### This notebook explores alternative models to the baseline logistic regression.
***

All tools to manipulate data will be obtained from the crush_plot.py file. The objective of this notebook is to predict the histological targets from the force/position crush data using a classifier, either logistic regression or otherwise.

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os
import os.path
import pickle
import pandas as pd
import numpy as np

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
from pdb import set_trace
from warnings import warn

In [None]:
from crush_read import *
from crush_plot import *
plt.style.use('ggplot')

The crush data must be collected using the crush rig and crush.py and stored in the expected folder structure at the root directory indicated by PATH.

In [None]:
# PATH = Path('')
# Default in crush_plot.py
PATH

Load all data and modify as needed.

In [None]:
study = study_outline(PATH)
targets = study_targets(PATH)
crushes = study_data(study)
crushes = modify(crushes)
crushes = calculate(crushes)

Prepare data for classification.

In [None]:
X, y, legend = preprocess(crushes, targets)
y = binary_classes(y)
print('Reference for categorical features:')
legend

In [None]:
X.shape

In [None]:
# Majority-class baseline: the accuracy obtained by always predicting the most
# frequent label of each target column (missing labels are not counted).
for target_name in y.columns:
    majority_label = y[target_name].value_counts().idxmax()
    n_majority = (y[target_name] == majority_label).sum()
    n_labelled = y[target_name].count()
    baseline = n_majority / n_labelled
    print(f"{target_name}\nBaseline Accuracy = {n_majority}/{n_labelled} ({baseline:.2%})")

Remove any histology related features to focus on real time predictors. Also remove the holding strain since only the STOP protocol is being considered.

In [None]:
X_full = X.copy()
X.columns

In [None]:
X = X.drop('Holding Strain', axis=1)
X.columns

The goal for the prediction algorithm is to provide a metric for preventing tissue damage intraoperatively. Thus it has the following requirements:

1. Good overall accuracy so it is reliable without being restrictive
2. High recall such that it is conservative, limiting the occurrence of false negatives
3. Simple with limited input so that it can be implemented cheaply in real time

Further to requirement 3 above, no histology features can be used to make the prediction.

In [None]:
import seaborn as sns

# Show correlations for the reduced feature set
X_corr = X.corr(method='spearman')
sns.heatmap(X_corr, cmap='RdBu', vmin=-1, vmax=1)

In [None]:
y_full = y.copy()

In [None]:
def log_fn(x):
    '''
    Return the natural log of a column, nudging it away from zero first if needed.

    :x: column of values; boolean columns are converted to float64 before the log
    :return: np.log of the (possibly shifted) values
    '''
    values = x.astype('float64') if x.dtype == 'bool' else x
    # log(0) is -inf, so shift the whole column by a small offset when any entry is exactly zero
    shifted = values + 0.001 if np.any(values == 0) else values
    return np.log(shifted)

# Log-transform every feature column, then draw one figure per feature with the
# raw histogram on the left and the log histogram on the right.
X_log = X.apply(log_fn)
for feature in X.columns:
    fig, axes = plt.subplots(1, 2)
    raw_ax, log_ax = axes
    X[feature].hist(ax=raw_ax)
    X_log[feature].hist(ax=log_ax)
    fig.suptitle(f'{feature} - Normal and Log');

Certain features, namely thickness, crush duration and relaxation stress, are closer to normally distributed after taking their log.

# Classifier Builder
Function to iteratively build and fit a number of common binary classifiers in sklearn. Inspired by and forked from https://github.com/VinGPan/Machine_Learning_3252_project

In [None]:
# Seed NumPy's global random generator so that anything drawing from it is
# repeatable from a fresh kernel. seed must be *called*: assigning to
# np.random.seed replaces the function with an integer and seeds nothing.
np.random.seed(42)

In [None]:
import warnings


def warn(*args, **kwargs):
    '''Accept any arguments a warning call might pass and do nothing with them.'''
    return None


# Replace the warnings module's warn function with the no-op above so that
# code calling warnings.warn(...) after this cell emits nothing.
warnings.warn = warn

In [None]:
# Models, transforms and model selection tools to explore
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score, confusion_matrix

# Sklearn utilities
from sklearn.externals import joblib

The models will attempt to predict significant serosa change or tissue damage. Both are histology based metrics of tissue trauma.

In [None]:
y.columns

In [None]:
y = y[['Significant Serosa Change', 'Tissue Damage']]
y.head()

In [None]:
pwd

Define functions for modelling.

In [None]:
def evaluate_classifiers(all_scores):
    '''
    Rank model results by test accuracy and pick the best result of the top three classifiers.

    :all_scores: list of score rows, each ordered as [classifier, scaler, transform, accuracy,
                 balanced_accuracy, f1_score, auc_score, params, model]; may be empty
    :return: (all_scores, top_scores) as DataFrames. all_scores holds every row sorted by
             descending accuracy; top_scores holds the highest-accuracy row of up to three
             distinct classifiers
    '''
    col_names = ['classifier', 'scaler', 'transform', 'accuracy',
                 'balanced_accuracy', 'f1_score', 'auc_score', 'params', 'model']

    ranked = sorted(all_scores, key=lambda row: row[3], reverse=True)  # sort by accuracy

    # Find top three models. Track every classifier already taken, not just the
    # previous row: after sorting, rows of one classifier are not necessarily
    # adjacent, so comparing with the last row alone lets duplicates through.
    seen = set()
    top_rows = []
    for row in ranked:
        if row[0] in seen:
            continue
        seen.add(row[0])
        top_rows.append(row)
        if len(top_rows) == 3:
            break

    # Build the frames straight from the row lists (object dtype, as before) so
    # that an empty result list yields empty frames instead of a shape error.
    top_scores = pd.DataFrame(top_rows, columns=col_names, dtype=object)
    all_scores = pd.DataFrame(ranked, columns=col_names, dtype=object)

    return all_scores, top_scores

In [None]:
def _make_pipeline_and_grid(clf_str, scl_str, tfm_str):
    '''
    Assemble the pipeline and tuning grid for one classifier/scaler/transform combination.

    :clf_str: classifier key ('logistic', 'naive_bayes', 'knn', 'random_forest', 'svc',
              'xgboost', 'adaboost' or 'gradboost')
    :scl_str: scaler key; 'min_max' and 'standard_scalar' add a scaler, anything else adds none
    :tfm_str: transform key; 'pca' adds a PCA step, anything else adds none
    :return: (pipeline, param_grid) where param_grid is a one-element list holding the grid dict
    :raises ValueError: if clf_str is not one of the known classifier keys
    '''
    # Every pipeline starts by filling missing values with the column mean
    steps = [('imputer', SimpleImputer(strategy='mean'))]
    grid = {}

    # Pick a data scaler
    if scl_str == 'min_max':
        steps.append(('scaler', MinMaxScaler()))
    elif scl_str == 'standard_scalar':
        steps.append(('scaler', StandardScaler()))

    # Pick a dimensionality reduction transformation
    if tfm_str == 'pca':
        steps.append(('transform', PCA()))
        grid["transform__n_components"] = [None, 0.95, 0.9]

    # Pick a binary classifier
    if clf_str == 'logistic':
        steps.append(('clf', LogisticRegression(multi_class='auto', random_state=0, solver='liblinear')))
        grid["clf__penalty"] = ['l1', 'l2']
        grid["clf__C"] = [0.01, 0.1, 1, 10]
        grid["clf__class_weight"] = [None, 'balanced']
    elif clf_str == 'naive_bayes':
        steps.append(('clf', GaussianNB()))
    elif clf_str == 'knn':
        steps.append(('clf', KNeighborsClassifier()))
        grid["clf__n_neighbors"] = [3, 5, 10, 20]
        grid["clf__weights"] = ['uniform', 'distance']
        grid["clf__metric"] = ['euclidean', 'manhattan']
    elif clf_str == 'random_forest':
        steps.append(('clf', RandomForestClassifier()))
        grid["clf__max_depth"] = [3, 7, 10, 20]
        grid["clf__min_samples_split"] = [10, 15, 30]
        grid["clf__n_estimators"] = [50, 100, 150, 200]
    elif clf_str == 'svc':
        steps.append(('clf', SVC(class_weight='balanced', random_state=42)))
        grid["clf__kernel"] = ['linear', 'rbf']
        grid["clf__C"] = [0.01, 0.1, 1]
    elif clf_str == 'xgboost':
        steps.append(('clf', xgb.XGBClassifier(random_state=42, objective='binary:logistic')))
        grid["clf__learning_rate"] = [0.001, 0.01, 0.1]
        grid["clf__n_estimators"] = [50, 100, 150, 200]
    elif clf_str == 'adaboost':
        steps.append(('clf', AdaBoostClassifier(random_state=42)))
        grid["clf__n_estimators"] = [50, 100, 150, 200]
    elif clf_str == 'gradboost':
        steps.append(('clf', GradientBoostingClassifier(random_state=42)))
        grid["clf__learning_rate"] = [0.001, 0.01, 0.1]
        grid["clf__n_estimators"] = [50, 100, 150, 200]
    else:
        raise ValueError(f"Unknown classifier: {clf_str}")

    return Pipeline(steps=steps), [grid]


def build_classifiers(target_idx, exp_name):
    '''
    This function provides logic for building several binary classification models and performing
    data transformations, dimensionality reduction, and tuning using GridSearchCV.

    Features and targets are read from the module-level X and y. Each fitted grid search is
    stored under output/<exp_name>/ as soon as it finishes, so an interrupted run can be
    restarted and will reuse the models already on disk. The summary file all_scores.pkl is
    written only after every combination has been processed.

    NOTE: If you have already run this experiment to completion, this function will simply
    load the old results.

    NOTE: The train/test split is seeded so that models reloaded on restart are scored on the
    same held-out rows they were tuned against. Model files written by an earlier, unseeded
    run do not have this guarantee; delete them to refit.

    :target_idx: which target to use for training (column position in y)
    :exp_name: name of the experiment for saving, suggest 'expN'
    :return: all model scores, top three model scores
    '''

    config = {'classifiers': ['logistic',
                              'naive_bayes',
                              'knn', 'random_forest',
                              'xgboost',
                              'adaboost',
                              'gradboost',
                              'svc'],
              'scalers': ['none', 'min_max', 'standard_scalar'],
              'transforms': ['none', 'pca']}

    out_dir = os.path.join("output", exp_name)
    scores_path = os.path.join(out_dir, "all_scores.pkl")

    # Create output directory, including the parent "output" folder if it is missing
    os.makedirs(out_dir, exist_ok=True)

    # Check if models already fit
    if os.path.exists(scores_path):
        # NOTE: unpickling runs arbitrary code; only load result files produced by this notebook
        with open(scores_path, "rb") as f:
            all_scores = pickle.load(f)

        return evaluate_classifiers(all_scores)

    # Split data into training and test sets (fixed seed keeps the split stable across restarts)
    ratio = 0.8
    X_train, X_test, y_train, y_test = train_test_split(
        X.values, y.values[:, target_idx], train_size=ratio, random_state=42)

    # Otherwise fit all models
    all_scores = []
    count = 0
    for clf_str in config['classifiers']:
        for scl_str in config['scalers']:
            for tfm_str in config['transforms']:
                count += 1
                pipeline, param_grid = _make_pipeline_and_grid(clf_str, scl_str, tfm_str)

                # Perform grid search
                clf = GridSearchCV(estimator=pipeline, cv=3, refit=True,
                                   param_grid=param_grid, verbose=1, scoring='balanced_accuracy')

                res_path = os.path.join(out_dir, clf_str + "_" + scl_str + "_" + tfm_str + ".pkl")
                if os.path.exists(res_path):
                    # Reuse the grid search stored by a previous run (same pickle caveat as above)
                    clf = joblib.load(res_path)
                else:
                    try:
                        clf.fit(X_train, y_train)
                    except Exception as err:
                        # Best effort: report the failing combination and move on.
                        # KeyboardInterrupt is deliberately not caught so a run can be stopped.
                        print("Crash for " + res_path + f": {err!r}")
                        continue

                    # Store the model
                    joblib.dump(clf, res_path)

                y_hat = clf.predict(X_test)

                # Compute accuracy scores
                acc = accuracy_score(y_test, y_hat)
                bal_acc = balanced_accuracy_score(y_test, y_hat)
                f1 = f1_score(y_test, y_hat)
                # NOTE: the AUC here is computed from hard class predictions, not scores/probabilities
                auc = roc_auc_score(y_test, y_hat)

                all_scores.append([clf_str, scl_str, tfm_str, acc, bal_acc, f1, auc, clf.best_params_, clf])

    # Write the summary only once everything has been processed: its existence is what
    # makes later calls skip fitting, so a partial file would hide the unfitted models.
    with open(scores_path, "wb") as f:
        pickle.dump(all_scores, f)

    print(f'Fit and tuned {count} total models!')
    return evaluate_classifiers(all_scores)

In [None]:
X_full = X.copy()

### Target 1 - Serosa Thickness
Predict significant serosa change.

In [None]:
target_idx = 0
y.columns[target_idx]

In [None]:
all_scores, top_scores = build_classifiers(target_idx, 'serosa_sig2')

In [None]:
top_scores

In [None]:
all_scores

In [None]:
all_scores[all_scores['classifier'] == 'logistic']

In [None]:
target_1_scores = [all_scores, top_scores]

### Target 2 - Trauma Score
Predict trauma score of 1 or greater.

In [None]:
target_idx = 1
y.columns[target_idx]

In [None]:
all_scores, top_scores = build_classifiers(target_idx, 'tissue_dam2')

In [None]:
top_scores

In [None]:
all_scores

In [None]:
all_scores[all_scores['classifier'] == 'logistic']

In [None]:
target_2_scores = [all_scores, top_scores]

# Stress only as a metric
### Trauma Score

In [None]:
target_idx = 1
y.columns[target_idx]

In [None]:
X = X.loc[:, ['Target Stress (MPa)']]

In [None]:
all_scores, top_scores = build_classifiers(target_idx, 'tissue_dam_stress_only')

In [None]:
top_scores

In [None]:
all_scores

In [None]:
all_scores[all_scores['classifier'] == 'logistic']

In [None]:
target_2_stress_only_scores = [all_scores, top_scores]

### Serosa Thickness

In [None]:
target_idx = 0
y.columns[target_idx]

In [None]:
all_scores, top_scores = build_classifiers(target_idx, 'serosa_sig_stress_only')

In [None]:
top_scores

In [None]:
all_scores

In [None]:
all_scores[all_scores['classifier'] == 'logistic']

In [None]:
target_1_stress_only_scores = [all_scores, top_scores]