# Crush Rig - LAB
Written by Matt MacDonald for CIGITI at the Hospital for Sick Children Toronto

### This notebook explores alternative models to the baseline logistic regression.
***

All tools to manipulate data will be obtained from the crush_plot.py file. The objective of this notebook is to predict the histological targets from the force/position crush data using a classifier, either logistic regression or otherwise.

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
from pdb import set_trace
from warnings import warn

In [None]:
from crush_read import *
from crush_plot import *
plt.style.use('ggplot')

The crush data must be collected using the crush rig and crush.py and stored in the expected folder structure at the root directory indicated by PATH.

In [None]:
# PATH = Path('')
# Default in crush_plot.py
PATH

Load all data and modify as needed.

In [None]:
study = study_outline(PATH)
targets = study_targets(PATH)
crushes = study_data(study)
crushes = modify(crushes)
crushes = calculate(crushes)

Prepare data for classification.

In [None]:
X, y, legend = prep(crushes, targets)
y = refine(y)
print('Reference for categorical features:')
legend

In [None]:
X.shape

In [None]:
for col in y.columns:
    most_common = y[col].value_counts().idxmax()
    s = (y[col] == most_common).sum()
    c = y[col].count()
    r = s / c
    print(f"{col}\nBaseline Accuracy = {s}/{c} ({r:.2%})")

Remove any histology related features to focus on real time predictors. Also remove the holding strain since only the STOP protocol is being considered.

In [None]:
X_full = X.copy()
X.columns

In [None]:
# Drop the histology-derived columns and the holding strain in one call so
# that only the real-time predictors remain in X.
X = X.drop(['Pathologist (Cathy or Corwyn)',
            'Serosal Thickness (mm)',
            'Post Serosal Thickness (mm)',
            'Holding Strain'], axis=1)
X.columns

The goal for the prediction algorithm is to provide a metric for preventing tissue damage intraoperatively. Thus it has the following requirements:

1. Good overall accuracy so it is reliable without being restrictive
2. High recall such that it is conservative, limiting the occurrence of false negatives
3. Simple with limited input so that it can be implemented cheaply in real time

Further to requirement 3 above, no histology features can be used to make the prediction.

In [None]:
# Show correlations for the reduced feature set
X_corr = X.corr(method='spearman')
sns.heatmap(X_corr, cmap='RdBu', vmin=-1, vmax=1)

# Gaussian Mixture Models

### Major Tissue Damage

Select a specific indicator from the targets and split the dataset.

In [None]:
# Display labels for each binary histology indicator, ordered as
# [label for class 0, label for class 1].
indicator_labels = {
    'Significant Serosal Change': ['No Change', 'Significant Change'],
    'Tissue Damage': ['No Damage', 'Damage'],
    'Major Tissue Damage': ['No Damage or Minor Damage', 'Major Damage'],
}
# Indicator names in the same order as the dict above (insertion order).
indicators = list(indicator_labels)

In [None]:
ind = indicators[2]
ind

In [None]:
# Only 3 positive examples for major damage!!
y[ind].sum()

As expected, 3 positive samples for major damage is not enough to form a useful model. So instead an anomaly detection algorithm will be applied to look for deviations from the normal expectation. For simplicity the validation will be excluded.

In [None]:
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
def anom_split(X, y, seed=0):
    """Split a dataset for anomaly detection.

    The training set holds negative samples only. The test set holds the
    remaining 20% of the negatives followed by every positive sample.

    Args:
        X: Feature table exposing ``.values`` (cast to float64 here).
        y: Label vector exposing ``.values``; entries equal to 1 are positives.
        seed: Random state forwarded to ``train_test_split``.

    Returns:
        ``((X_train, y_train), (X_test, y_test))`` where the label arrays are
        column vectors of shape (n, 1).
    """
    features = X.values.astype(np.float64)
    labels = y.values

    is_pos = (labels == 1)
    is_neg = ~is_pos

    neg_features = features[is_neg]
    neg_labels = labels[is_neg].reshape([-1, 1])
    pos_features = features[is_pos]
    pos_labels = labels[is_pos].reshape([-1, 1])

    # Only negatives are shuffled and split; positives never reach training.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        neg_features, neg_labels, test_size=0.2, random_state=seed)

    # Append all positives after the held-out negatives.
    test_set = (np.vstack([X_holdout, pos_features]),
                np.vstack([y_holdout, pos_labels]))
    return (X_train, y_train), test_set

In [None]:
def anom_prob_dist_GMM(dataset, n_comp=1, seed=0):
    """Fit a full-covariance Gaussian mixture to the features of a dataset.

    Args:
        dataset: Sequence whose first element is the feature matrix to fit;
            any further elements (e.g. labels) are ignored.
        n_comp: Number of mixture components.
        seed: Random state passed to ``GaussianMixture``.

    Returns:
        The fitted ``GaussianMixture`` instance.
    """
    train_features = dataset[0]
    gmm = GaussianMixture(covariance_type='full',
                          n_components=n_comp,
                          random_state=seed)
    gmm.fit(train_features)
    return gmm

In [None]:
def anom_stats(X):
    """Estimate the parameters of an independent-feature Gaussian.

    Args:
        X: 2-D array of samples (rows) by features (columns).

    Returns:
        ``(mu, Sigma2)``: the per-feature mean vector and a diagonal
        covariance matrix built from the per-feature population variances.
    """
    feature_var = np.var(X, axis=0)
    feature_mean = np.mean(X, axis=0)
    return feature_mean, np.diag(feature_var)

def anom_prob_dist(X, mu, Sigma2):
    """Evaluate a multivariate Gaussian density at each row of ``X``.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        mu: Mean vector; its size sets the dimensionality ``k``.
        Sigma2: Covariance matrix of shape (k, k).

    Returns:
        1-D array with the density of each sample.
    """
    n_dims = mu.size
    centered = X - mu
    # The pseudo-inverse is used so a singular covariance does not raise.
    precision = np.linalg.pinv(Sigma2)
    # Row-wise quadratic form (x - mu)^T Sigma^-1 (x - mu).
    mahalanobis_sq = np.sum(np.dot(centered, precision) * centered, axis=1)
    norm_const = (2 * np.pi) ** (- n_dims / 2) * np.linalg.det(Sigma2) ** (-0.5)
    return norm_const * np.exp(-0.5 * mahalanobis_sq)

In [None]:
def anom_best_eps(pval, yval, disp=False):
    """Scan density thresholds and return the one with the highest F1 score.

    A sample is flagged as an anomaly when its density is at or below the
    threshold. 1000 evenly spaced thresholds between ``1.01 * min(pval)`` and
    ``0.99 * max(pval)`` are tried.

    Args:
        pval: Density of each sample.
        yval: Ground-truth labels (1 = anomaly, 0 = normal).
        disp: When true, plot F1 against the threshold.

    Returns:
        ``(epsilon, F1)`` for the first threshold reaching the maximum F1.
    """
    thresholds = np.linspace(1.01 * min(pval), 0.99 * max(pval), 1000)
    scores = np.zeros(thresholds.shape)
    is_pos = (yval == 1)
    is_neg = (yval == 0)

    for k, threshold in enumerate(thresholds):
        flagged = (pval <= threshold)
        hits = (is_pos & flagged).sum()
        if hits <= 0:
            # No true positives: F1 stays at 0 for this threshold.
            continue
        false_alarms = (is_neg & flagged).sum()
        misses = (is_pos & ~flagged).sum()
        precision = hits / (hits + false_alarms)
        recall = hits / (hits + misses)
        scores[k] = 2 * precision * recall / (precision + recall)

    if disp:
        plt.plot(thresholds, scores)
    best = np.argmax(scores)
    return thresholds[best], scores[best]

In [None]:
print(X.shape, y.shape)

In [None]:
# Fit a diagonal-covariance Gaussian on the negative-only training split and
# evaluate its density on the test split (held-out negatives + all positives).
ds_train, ds_test = anom_split(X, y[ind])
mu, S2 = anom_stats(ds_train[0])
probs = anom_prob_dist(ds_test[0], mu, S2)
y_test = ds_test[1].ravel()
# Largest density among the positive test samples: a threshold at this value
# flags every positive sample.
eps = probs[y_test == 1].max()
eps

In [None]:
# Scan thresholds on the test densities, plot F1 against the threshold and
# report the best-scoring one.
bestEps, F1 = anom_best_eps(probs, y_test, disp=True)
print(f"epsilon = {bestEps:.6f}, F1 = {F1:.3f}")
print('Pretty bad..')

In [None]:
# Plot a 2D version of the data, highlighting damage and major damage
X_np = X.values.astype(np.float64)
scaler = StandardScaler()
X_std = scaler.fit_transform(X_np)

# Boolean row masks for trauma score 0, 1 and 2 respectively
masks = [y['Tissue Damage'] == 0,
         y['Tissue Damage'] == 1,
         y['Major Tissue Damage'] == 1]
masks[1][masks[2]] = False  # exclude damage score 2 from set of score 1

# Find outliers using simple gaussian
mu, S2 = anom_stats(X_std)
probs = anom_prob_dist(X_std, mu, S2)
# Threshold tuned for best F1 against the major damage labels
eps = anom_best_eps(probs, masks[2])[0]
outliers = probs <= eps

X_0 = X_std[masks[0], :]
X_1 = X_std[masks[1], :]
X_2 = X_std[masks[2], :]

# Principal directions of the standardized data: the columns of U from the
# SVD of its covariance matrix, with S holding the variance along each one
cov = X_std.T @ X_std / X_std.shape[0]
U, S, V = np.linalg.svd(cov)

# Project each trauma-score group onto the two leading directions
X_2D_0 = X_0 @ U[:, :2]
X_2D_1 = X_1 @ U[:, :2]
X_2D_2 = X_2 @ U[:, :2]

for i, X_2D in enumerate([X_2D_0, X_2D_1, X_2D_2]):
    plt.scatter(X_2D[:, 0], X_2D[:, 1], label=f'Trauma score {i}')
    # Circle the points of this group that the Gaussian flags as outliers
    plt.plot(X_2D[outliers[masks[i]], 0],
             X_2D[outliers[masks[i]], 1],
             'ro', ms=10, mfc='None', mew=2)
plt.legend(loc='lower right')
# The '%' presentation type already appends a percent sign, so no literal
# '%' follows the field (the original printed e.g. "95.00%%").
print('Retained {:.2%} of the variance'.format(S[:2].sum() / S.sum()))

In [None]:
# Express the two leading principal directions per unit of each original
# feature by dividing every loading by that feature's scaler scale.
scales = scaler.scale_[:, None]
eig = U[:, :2] / np.concatenate([scales, scales], axis=1)
# Per feature: both rescaled loadings and their ratio (printed as "Scale")
for i, name in enumerate(X.columns):
    print(f"{name:30s}{eig[i, 0]:6.2f}, {eig[i, 1]:6.2f}\tScale = {eig[i, 0] / eig[i, 1]:+.3f}")

In [None]:
# Max in both dimensions
idx = X_2D_1.argmax(axis=0)
for i, name in enumerate(X.columns):
    val_1 = X_1[idx[0], i]
    val_2 = X_1[idx[1], i]
    print(f"{name:30s}Max X1 = {val_1:+10.2f}\tMax X2 = {val_2:+10.2f}\tSpread = {abs(val_1 - val_2):.2f}")

It is not obvious how to separate the groups, but there are clear differences between them. In particular, trauma scores 1 and 2 appear separable, so in theory a GMM should be able to define a boundary between them.

In [None]:
# Train a GMM on the data
# NOTE(review): SEED is not defined in this notebook; presumably it comes from
# the crush_read / crush_plot star imports — confirm.
dataset_train, dataset_test = anom_split(X, y[ind], seed=SEED)
# Fit the two-component mixture with anom_prob_dist_GMM. anom_prob_dist is the
# closed-form Gaussian density (X, mu, Sigma2) and has no seed parameter, so
# calling it here raised a TypeError.
model = anom_prob_dist_GMM(dataset_train, 2, seed=SEED)

In [None]:
np.exp(model.score_samples(dataset_train[0])).round(2)

In [None]:
prob = np.exp(model.score_samples(dataset_test[0])).round(5)
prob

In [None]:
# Flatten the (n, 1) test labels into a 1-D boolean mask. The builtin bool is
# used because the np.bool alias was removed in NumPy 1.24.
y_test = dataset_test[1].reshape([-1]).astype(bool)
y_test

In [None]:
eps = np.max(prob[y_test == 1])
eps

In [None]:
prob <= eps

Not a very effective model.

In [None]:
%matplotlib inline
def log_fn(x):
    """Return the natural log of ``x``, guarding against zeros.

    Boolean input is first cast to float64. If any element equals zero, a
    constant offset of 0.001 is added to every element before the log.

    Args:
        x: Array-like exposing ``dtype`` and ``astype`` (e.g. a pandas Series
            or NumPy array).

    Returns:
        Element-wise natural logarithm of the (possibly shifted) values.
    """
    values = x.astype('float64') if x.dtype == 'bool' else x
    has_zero = np.any(values == 0)
    shifted = values + 0.001 if has_zero else values
    return np.log(shifted)

# Log-transform every feature column, then plot each feature's histogram
# (left) next to the histogram of its log (right) for visual comparison.
X_log = X.apply(log_fn)
for col in X.columns:
    fig, (ax1, ax2) = plt.subplots(1, 2)
    X[col].hist(ax=ax1)
    X_log[col].hist(ax=ax2)
    fig.suptitle(f'{col} - Normal and Log')


Certain features, namely thickness, crush duration and relaxation stress, are closer to normally distributed after a log transform.

In [None]:
%matplotlib notebook

# XGBoost