# Crush Rig - Classifier
Written by Matt MacDonald for CIGITI at the Hospital for Sick Children Toronto
***

All tools to manipulate data will be obtained from the crush_plot.py file. The objective of this notebook is to predict the histological targets from the force/position crush data using a classifier, either logistic regression or xgboost.

In [None]:
# Reload edited project modules before each cell runs, so changes to the
# helper .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Alternative backend for interactive figures: replace `inline` with `notebook`.

In [None]:
from crush_read import *

The crush data must be collected using the crush rig and crush.py and stored in the expected folder structure at the root directory indicated by PATH.

In [None]:
# Root directory of the crush data. PATH is not assigned in this notebook, so
# it presumably arrives through the star import above; uncomment the next
# line to point at a different folder.
# PATH = Path('')
# Default in crush_plot.py
# Display the path currently in use.
PATH

Load all data and modify as needed.

In [None]:
# Build the study outline and the targets for PATH, then push the study data
# through the modify -> calculate steps in one chained expression.
study = study_outline(PATH)
targets = study_targets(PATH)
crushes = calculate(modify(study_data(study)))

Prepare data for regression and xgboost.

In [None]:
# prep() returns the feature frame, the unrefined targets and the legend for
# the categorical features; refine() then produces the targets used below.
X, y_raw, legend = prep(crushes, targets)
y = refine(y_raw)
print('Reference for categorical features:')
legend

In [None]:
# (rows, columns) of the feature frame.
X.shape

In [None]:
# Class balance per target column, printed as "sum/count (share%)".
# NOTE(review): treating the column sum as the number of positive samples
# assumes the targets are 0/1 encoded - confirm against refine().
for col in y.columns:
    s = y[col].sum()
    c = y[col].count()  # count() skips missing values
    r = s / c
    # The ratio is scaled to a percentage, so label it with a % sign.
    print(f"{col}: {s}/{c} ({r * 100:.2f}%)")

In [None]:
# Summary statistics for every target column.
y.describe()

The major tissue damage target is unbalanced, and there may not be enough data to train an accurate classifier.

Generate matrix of correlations to aid understanding.

In [None]:
# Spearman rank correlation between every feature and every target.
W = pd.concat([X, y], axis=1)
W_corr = W.corr(method='spearman')
# Pin the colour scale to the full correlation range [-1, 1]. Without fixed,
# symmetric limits the diverging RdBu map is stretched over whatever range the
# data spans, so the neutral colour would not sit at zero correlation.
sns.heatmap(W_corr, cmap='RdBu', vmin=-1, vmax=1);

In [None]:
# Summary statistics for every feature column.
X.describe()

Visualize the key variable, which is target stress. The table below lists the load in grams that corresponds to each stress value, for reference.

In [None]:
# Reference table: stress for loads of 0-1200 g in 100 g steps.
# Force in newtons is 9.81 * load / 1000; dividing by the circular area gives
# the stress. NOTE(review): the 5 is presumably the contact diameter in mm
# (N / mm^2 = MPa) - confirm against the rig.
contact_area = np.pi * (5 / 2) ** 2  # loop-invariant, computed once
for load in np.arange(0, 1300, 100):
    stress = (9.81 * load / 1000) / contact_area
    # `5.2f` fixes two decimals; the previous `5.2` (no type code) meant two
    # significant digits, which printed ragged values such as 0.1 and 0.6.
    print(f"{stress:5.2f} (MPa) = {load:5} (grams)")

In [None]:
# One scatter figure per target column, each plotted against target stress.
x_name = 'Target Stress (MPa)'
for y_name in y.columns:
    fig, ax = plt.subplots()
    ax.scatter(X[x_name], y[y_name])
    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)

Remove any histology related features to focus on real time predictors.

In [None]:
# Keep every column except the last three (positional slice).
# NOTE(review): X is overwritten, so running this cell again removes three
# more columns each time; re-run the prep cell first before repeating it.
X = X.iloc[:, :-3]
# Show the features that remain.
X.columns

Build logistic regression models.

In [None]:
# Shared settings for the logistic regression section:
# SEED feeds every random_state below, SIZE is the held-out test fraction.
SEED = 42
SIZE = 0.25
# List the available target columns.
y.columns

#### Significant Serosal Change

In [None]:
# Convert from pandas to numpy
X_np = X.values.astype(np.float64)
y_np = y.iloc[:, 0].values

In [None]:
# Split into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_np, y_np, test_size=SIZE, random_state=SEED)

In [None]:
# Scale input features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fit logistic regression to training set
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=SEED, solver='lbfgs')
clf.fit(X_train, y_train)


In [None]:
# Predict the held-out samples and mark, element-wise, which predictions
# match the true labels (y_corr is a boolean array).
y_pred = clf.predict(X_test)
y_corr = y_pred == y_test

In [None]:
# Share of correct test predictions, expressed as a percentage.
print(f'Accuracy = {100 * y_corr.sum() / y_corr.size}%')

In [None]:
# Dividing each coefficient by its feature's scale from the fitted scaler
# expresses the weight per unit of the original, unscaled feature.
weights = (clf.coef_ / scaler.scale_).T
importance = pd.DataFrame(weights, columns=['weight']).assign(feature=X.columns.values)
importance

In [None]:
# Make a confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Inspect the training-set column that the decision-boundary plot puts on its
# x axis (column 6). This reads X_train directly: the X_set / n1 aliases are
# only assigned in the next cell, so using them here raised NameError on a
# fresh top-to-bottom run.
np.array(X_train[:, 6])

In [None]:
# Visualize the decision boundary
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
n1, n2 = 6, 3
# X1, X2 = np.meshgrid(np.arange(start=X_set[:, n1].min() - 1,
#                                stop=X_set[:, n1].max() + 1,
#                                step=0.01),
#                      np.arange(start=X_set[:, n2].min() - 1,
#                                stop=X_set[:, n2].max() + 1,
#                                step=0.01))
print(X_set[:, n1].shape)
plt.contour(X_set[:, n1], X_set[:, n2], clf.predict(X_train), # np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, n1], X_set[y_set == j, n2],
                c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Logistic Regression (training set)')
plt.xlabel('Target Stress (MPa)')
plt.ylabel('Thickness (mm)')
plt.legend()

In [None]:
# Debug check: shape of the (n_points, 2) array formed by pairing the two
# flattened mesh arrays. Requires X1 and X2 (the meshgrid arrays of the
# decision-boundary cell) to exist in the kernel.
(np.array([X1.ravel(), X2.ravel()]).T).shape

In [None]:
# Debug check: shape of the first mesh array. Requires X1 (a meshgrid array
# of the decision-boundary cell) to exist in the kernel.
X1.shape

In [None]:
# Debug check: largest value in column 3 of the current X_train.
X_train[:, 3].max()

Build xgboost model.

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Fresh split on the unscaled features for the xgboost model. This replaces
# the X_train / X_test / y_train / y_test left by the logistic regression
# section. The shared SEED constant is used instead of repeating the literal
# 42, so both sections stay on the same seed.
# NOTE(review): test_size stays at 0.2 here although SIZE is 0.25 - confirm
# whether the different fraction is intentional. y.values keeps every target
# column, so the classifier below is presumably fitted on all targets at once.
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.2, random_state=SEED)

In [None]:
# (rows, columns) of the xgboost training split.
X_train.shape

In [None]:
# Fit an XGBClassifier with library-default hyperparameters.
# Note that this rebinds `clf`, replacing the logistic regression model.
clf = XGBClassifier()
clf.fit(X_train, y_train)

In [None]:
# Predict the test split and mark, element-wise, which predictions are right.
y_pred = clf.predict(X_test)
y_corr = y_pred == y_test

In [None]:
# Same check on the training split, to compare against the test accuracy.
y_pred_train = clf.predict(X_train)
y_corr_train = y_pred_train == y_train

In [None]:
# Fraction of correct predictions along the sample axis: one number for a
# single target, one value per column when several targets are predicted.
test_acc = y_corr.mean(axis=0)
train_acc = y_corr_train.mean(axis=0)
print(f"test acc = {test_acc}")
print(f"train acc = {train_acc}")

In [None]:
# Draw one tree of the fitted model with a left-to-right layout.
# NOTE(review): num_trees=3 presumably selects the tree at index 3 - confirm
# against the installed xgboost version.
from xgboost import plot_tree
plot_tree(clf, rankdir='LR', num_trees=3)

In [None]:
# feature numbers legend
for i, feat in enumerate(X.columns):
    print(f"f{i} = {feat}")

In [None]:
# Pairwise correlation between the feature columns (pandas default method).
X.corr()

In [None]:
# Join features and targets column-wise and show their pairwise correlation
# (pandas default method).
df = pd.concat([X, y], axis=1)
df.corr()

In [None]:
# Mean damage score for each protocol code; the legend maps the
# "Protocol[<code>]" key to the label that is printed.
for prot in (0, 1):
    prot_str = f"Protocol[{prot}]"
    in_protocol = df['Protocol'] == prot
    avg = df.loc[in_protocol, 'Damage Score'].mean()
    print(f"{legend[prot_str]} average damage = {avg}")
          