# Crush Rig - Classifier
Written by Matt MacDonald for CIGITI at the Hospital for Sick Children Toronto
***

All tools to manipulate data will be obtained from the crush_plot.py file. The objective of this notebook is to predict the histological targets from the force/position crush data using a classifier, either logistic regression or xgboost.

In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# helper module are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render figures inline; use `%matplotlib notebook` instead for interactive figures.
%matplotlib inline

In [None]:
from crush_read import *

The crush data must be collected using the crush rig and crush.py and stored in the expected folder structure at the root directory indicated by PATH.

In [None]:
# Uncomment and edit to override the data root directory:
# PATH = Path('')
# Otherwise PATH keeps the default brought in by the star import.
# NOTE(review): the original note said the default lives in crush_plot.py, but
# this notebook imports from crush_read — confirm where PATH is defined.
PATH

Load all data and modify as needed.

In [None]:
# Build the study outline and the targets from the data root, then load the
# crush data and pass it through the modify and calculate helpers in turn.
# NOTE(review): all five helpers come from the star import; what each one does
# to the data is not visible in this notebook — confirm against that module.
study = study_outline(PATH)
targets = study_targets(PATH)
crushes = study_data(study)
crushes = modify(crushes)
crushes = calculate(crushes)

Prepare data for regression and xgboost.

In [None]:
# prep returns the feature table X, the target table y and a legend; per the
# message printed below, the legend is the reference for categorical features.
X, y, legend = prep(crushes, targets)
# NOTE(review): refine comes from the star import; its effect on y is not visible here.
y = refine(y)
print('Reference for categorical features:')
legend

In [None]:
# Dimensions of the feature table (rows, columns)
X.shape

In [None]:
# Per target column: sum of the column over its count of non-missing entries,
# followed by the same ratio scaled by 100.
for col, labels in y.items():
    s, c = labels.sum(), labels.count()
    r = s / c
    print(f"{col}: {s}/{c} ({r * 100:.2f})")

In [None]:
# Summary statistics for each target column
y.describe()

The major tissue damage target is unbalanced. There may not be enough data for an accurate classifier.

Generate matrix of correlations to aid understanding.

In [None]:
# Spearman rank correlation between every pair of feature and target columns.
W = pd.concat([X, y], axis=1)
W_corr = W.corr(method='spearman')
# Pin the diverging colormap to the full correlation range [-1, 1]. Otherwise
# the colour limits follow the data, so zero correlation does not land on the
# neutral midpoint and the sign of a correlation cannot be read off the colour.
sns.heatmap(W_corr, cmap='RdBu', vmin=-1, vmax=1)

In [None]:
# Summary statistics for each feature column
X.describe()

Visualize the key variable, which is target stress. Below is the corresponding load in grams for reference.

In [None]:
# Reference table converting a load in grams to a stress in MPa.
# Force in newtons is 9.81 * load / 1000; it is divided by the area of a
# circle of diameter 5 (presumably mm, given the MPa label — confirm).
area = np.pi * (5/2)** 2
for load in np.arange(0, 1300, 100):
    force = 9.81 * load / 1000
    stress = force / area
    print(f"{stress:5.2} (MPa) = {load:5} (grams)")

In [None]:
# One scatter figure per target column, each plotted against target stress.
x_name = 'Target Stress (MPa)'
stress_values = X[x_name]
for y_name in y.columns:
    _, ax = plt.subplots()
    ax.scatter(x=stress_values, y=y[y_name])
    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)

Remove any histology-related features to focus on real-time predictors.

In [None]:
# Keep an independent copy of the complete feature table before X is narrowed
X_full = X.copy()
# Show the available feature names
X.columns

In [None]:
# Drop the histology-related columns in a single call
histology_cols = [
    'Pathologist (Cathy or Corwyn)',
    'Serosal Thickness (mm)',
    'Post Serosal Thickness (mm)',
]
X = X.drop(histology_cols, axis=1)
X.columns

Build logistic regression models.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE

In [None]:
def logreg_metrics(cm, disp=False):
    """Summarise a 2x2 confusion matrix as percentages of all samples.

    The diagonal entries give the accuracy, cm[0][1] is reported as false
    positives and cm[1][0] as false negatives, each divided by cm.sum().

    With disp=True the three figures are printed and None is returned;
    otherwise the tuple (acc, f_pos, f_neg) is returned.
    """
    total = cm.sum()
    correct = cm[0][0] + cm[1][1]
    acc, f_pos, f_neg = (100 * count / total
                         for count in (correct, cm[0][1], cm[1][0]))

    if not disp:
        return acc, f_pos, f_neg

    print(f'Accuracy = {acc:.2f}%')
    print(f'False Positives = {f_pos:.2f}%')
    print(f'False Negatives = {f_neg:.2f}%')

In [None]:
def logreg_predict(model, dataset, disp=True):
    """Predict labels for a dataset and build the matching confusion matrix.

    model   -- sequence whose first element is the fitted classifier
    dataset -- sequence (X, y); X is handed to predict unchanged, so any
               scaling must already have been applied by the caller
    disp    -- accepted for interface compatibility; not used by this function

    Returns (y_pred, cm).
    """
    features, labels = dataset[0], dataset[1]
    predictions = model[0].predict(features)
    return predictions, confusion_matrix(labels, predictions)

In [None]:
def logreg_model(X, y, seed=0, size=0.25):
    """Split, standardise and fit a logistic regression classifier.

    X, y -- pandas objects holding the features and the target
    seed -- random state used for both the split and the classifier
    size -- test_size passed to train_test_split

    Returns ((clf, scl), (X_train, y_train), (X_test, y_test)), where both
    feature arrays are the scaled versions.
    """
    # Work on plain numpy arrays from here on
    features = X.values.astype(np.float64)
    labels = y.values.ravel()

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=size, random_state=seed)

    # The scaler is fitted on the training split only and then applied to both
    scl = StandardScaler().fit(X_train)
    X_train, X_test = scl.transform(X_train), scl.transform(X_test)

    clf = LogisticRegression(random_state=seed,
                             solver='lbfgs',
                             max_iter=1000,
                             n_jobs=4).fit(X_train, y_train)

    model = (clf, scl)
    return model, (X_train, y_train), (X_test, y_test)

In [None]:
def logreg_features(X, y, n_features, seed=0):
    """Rank features by recursive feature elimination with logistic regression.

    X          -- pandas feature table (used unscaled, as passed in)
    y          -- pandas target
    n_features -- number of features RFE should keep
    seed       -- random state of the underlying classifier

    Returns the fitted RFE object (see its support_ and ranking_ attributes).
    """
    X_np = X.values.astype(np.float64)
    y_np = y.values.ravel()

    clf = LogisticRegression(random_state=seed,
                             solver='lbfgs',
                             max_iter=1000,
                             n_jobs=4)
    # n_features_to_select is keyword-only in current scikit-learn releases;
    # passing it positionally raises a TypeError there.
    rfe = RFE(clf, n_features_to_select=n_features)
    return rfe.fit(X_np, y_np)

In [None]:
def decision_bndr(model, dataset, n1, feature_names=None):
    """Plot 2D decision boundaries of a fitted classifier, one figure per feature pair.

    For every feature other than n1 the classifier is evaluated on a 100x100
    grid spanning that feature and feature n1, while all remaining features
    are held at their mean over the dataset. The dataset's own points are
    drawn on top, coloured by label.

    model         -- sequence whose first element is the fitted classifier
    dataset       -- sequence (X, y) of a 2D numpy feature array and its labels
    n1            -- column index of the feature shown on the x axis
    feature_names -- optional axis label per feature column; when omitted the
                     columns of the notebook-level DataFrame X are used
    """
    from matplotlib.colors import ListedColormap

    X_set, y_set = dataset[0], dataset[1]
    clf = model[0]
    if feature_names is None:
        feature_names = X.columns
    X_avg = np.mean(X_set, axis=0)

    def clf_grid(grid1, grid2, n1, n2):
        # Predict over the 2D grid of features n1 and n2.
        # np.tile builds fresh rows from the mean vector, so X_avg itself is
        # never modified and every figure really freezes the other features
        # at their mean (writing into X_avg leaked grid values between figures).
        points = np.tile(X_avg, (grid1.size, 1))
        points[:, n1] = grid1.ravel()
        points[:, n2] = grid2.ravel()
        # One batched predict call instead of one call per grid cell
        return np.asarray(clf.predict(points), dtype=float).reshape(grid1.shape)

    colors = ('blue', 'red')
    cmap = ListedColormap(colors)
    # The feature count comes from the data actually passed in
    for n2 in range(X_set.shape[1]):
        if n2 == n1:
            continue
        X1, X2 = np.meshgrid(np.linspace(X_set[:, n1].min() - 1, X_set[:, n1].max() + 1, 100),
                             np.linspace(X_set[:, n2].min() - 1, X_set[:, n2].max() + 1, 100))
        Z = clf_grid(X1, X2, n1, n2)

        plt.figure()
        plt.contourf(X1, X2, Z, alpha=0.75, cmap=cmap)
        plt.xlim(X1.min(), X1.max())
        plt.ylim(X2.min(), X2.max())
        for res in np.unique(y_set):
            # Falsy label -> first colour, anything else -> second colour
            color = colors[1] if res else colors[0]
            mask = y_set == res
            plt.scatter(X_set[mask, n1], X_set[mask, n2], c=color, label=res)
        plt.title(f'Classification (N={len(y_set)})')
        plt.xlabel(feature_names[n1])
        plt.ylabel(feature_names[n2])
        plt.legend()

Select a specific indicator from the targets.

In [None]:
# Target column to model; swap in one of the commented alternatives to change it
indicator = 'Significant Serosal Change'
# indicator = 'Tissue Damage'
# indicator = 'Major Tissue Damage'
# Show the available target columns
y.columns

Remove any features deemed to be irrelevant by recursive feature elimination.

In [None]:
# Run recursive feature elimination keeping 3 features, then tabulate the
# support flag and ranking of every feature, best ranking first.
rfe = logreg_features(X, y[indicator], 3)
rank = pd.DataFrame({
    'feature': X.columns.values,
    'support': rfe.support_,
    'ranking': rfe.ranking_,
}).sort_values(by='ranking')
rank

In [None]:
# Narrow X to three hand-picked feature columns.
# NOTE(review): presumably the features favoured by the RFE ranking; the list
# is hard-coded, so confirm it still matches the ranking after data changes.
X = X[['Tissue (COLON or SB)', 'Target Stress (MPa)', 'Target Stiffness (MPa)']]
X.columns

In [None]:
# Score the model over many train/test splits and refit with the best seed.
# Score = mean of train and test accuracy minus w times the mean of the train
# and test false-negative rates.
# NOTE(review): the seed is chosen using test-set results, so the metrics
# reported for the chosen split are optimistic.
n_seeds = 100
w = 3  # penalty weight on false negatives
# Explicit float dtype: an index-only Series defaults to object dtype on
# recent pandas, for which idxmin/idxmax are not supported.
accs = pd.Series(index=range(n_seeds), dtype=float)
for seed in range(n_seeds):
    model, train, test = logreg_model(X, y[indicator], seed=seed)
    _, cm = logreg_predict(model, test)
    _, cm_tr = logreg_predict(model, train)
    acc, _, f_neg = logreg_metrics(cm)
    acc_tr, _, f_neg_tr = logreg_metrics(cm_tr)
    accs[seed] = ((acc + acc_tr) / 2) - w * (f_neg + f_neg_tr) / 2

print(f"Average: metric={accs.mean():.2f}")
print(f"Worst: seed={accs.idxmin()}, metric={accs.min():.2f}")
print(f"Best: seed={accs.idxmax()}, metric={accs.max():.2f}")
# Refit on the best-scoring split and report its test metrics
model, train, test = logreg_model(X, y[indicator], seed=accs.idxmax())
y_pred, cm = logreg_predict(model, test)
logreg_metrics(cm, disp=True)

In [None]:
# Same metrics on the training split, for comparison with the test figures
_, cm_train = logreg_predict(model, train)
logreg_metrics(cm_train, disp=True)

In [None]:
# Column index of target stress, used as the x axis of every boundary plot
n1 = X.columns.get_loc('Target Stress (MPa)')
# Decision boundaries drawn over the training split
decision_bndr(model, train, n1)

In [None]:
# Decision boundaries drawn over the test split
decision_bndr(model, test, n1)

In [None]:
# Express the fitted model in the units of the original (unscaled) features.
clf = model[0]
scl = model[1]
# The classifier was trained on z = (x - mean) / scale, so in raw units each
# weight is coef / scale and the intercept shifts by -sum(coef * mean / scale).
# Leaving the intercept unshifted would pair raw-unit weights with a
# standardised-space intercept.
raw_weights = clf.coef_[0] / scl.scale_
raw_intercept = clf.intercept_[0] - np.sum(raw_weights * scl.mean_)

# Built in one step; DataFrame.append is no longer available in pandas 2.x.
importance = pd.DataFrame({
    'weight': np.append(raw_weights, raw_intercept),
    'feature': list(X.columns.values) + ['Intercept'],
})
importance

In [None]:
# IPython help lookup for DataFrame.sort_values (scratch cell)
pd.DataFrame.sort_values?

In [None]:
# Show the configuration parameters of the scaler taken from the model tuple
scl.get_params()

Build xgboost model.

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing, with a fixed random state.
# NOTE(review): y.values passes every target column, whereas the logistic
# regression cells model y[indicator] only — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.2, random_state=42)

In [None]:
# Dimensions of the training feature array
X_train.shape

In [None]:
# Fit an XGBoost classifier with its default settings.
# Note: this rebinds clf, which previously held the logistic regression model.
clf = XGBClassifier()
clf.fit(X_train, y_train)

In [None]:
# Predict on the test split and flag which predictions match the labels
y_pred = clf.predict(X_test)
y_corr = y_pred == y_test

In [None]:
# Same comparison on the training split
y_pred_train = clf.predict(X_train)
y_corr_train = y_pred_train == y_train

In [None]:
# Fraction of matching predictions on the test and training splits
print(f"test acc = {sum(y_corr) / len(y_corr)}")
print(f"train acc = {sum(y_corr_train) / len(y_corr_train)}")

In [None]:
# Draw one tree of the fitted model, laid out left to right (rankdir='LR').
# NOTE(review): num_trees=3 presumably selects the tree by index — confirm.
from xgboost import plot_tree
plot_tree(clf, rankdir='LR', num_trees=3)

In [None]:
# Legend mapping f<i> names to the feature columns of X
# (presumably the labels used in the tree plot — confirm)
for i, feat in enumerate(X.columns):
    print(f"f{i} = {feat}")

In [None]:
# Pairwise correlation between the remaining feature columns (pandas default method)
X.corr()

In [None]:
# Features and targets side by side, then their pairwise correlations
df = pd.concat([X, y], axis=1)
df.corr()

In [None]:
# Mean 'Damage Score' for each protocol code, labelled through the legend.
# NOTE(review): this needs 'Protocol' and 'Damage Score' columns in df, but X
# was narrowed to three other columns earlier in this notebook — confirm the
# columns still exist when the cells are run in order.
for prot in [0, 1]:
    prot_str = f"Protocol[{prot}]"
    avg = df.loc[df['Protocol'] == prot, 'Damage Score'].mean()
    print(f"{legend[prot_str]} average damage = {avg}")
          