# This notebook trains a FastAI tabular model on preprocessed 'fingerprint difference' data and outputs a trained model ready to be used in inference

In [None]:
import os
import pandas as pd
import numpy as np
import pdb
import multiprocessing as mp
from functools import partial
import matplotlib.pyplot as plt
import pickle
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

from fastai.basics import *
from fastai.tabular import *

%matplotlib inline

In [None]:
# Name of the dataset to train on; the pickled input is read from the file
# '<dataset>_model_data' in the current working directory.
dataset = 'noncovalent'

# Datasets this notebook knows how to load.
known_datasets = ('acry', 'noncovalent', 'combined', 'PLPro')

# Compare strings by value (membership / ==), never with `is`: identity of
# equal strings is an interpreter implementation detail.
# Fail loudly on an unknown name -- merely printing a message would leave
# `data` undefined (or stale from a previous run of this cell) below.
if dataset not in known_datasets:
    raise ValueError('Sadly no such dataset exists: ' + repr(dataset))

# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(dataset + '_model_data', 'rb') as filehandle:
    data = pickle.load(filehandle)

# The pickled object is indexed as [X_train, y_train, X_valid, y_valid].
X_train, y_train, X_valid, y_valid = (np.array(data[i]) for i in range(4))

# Preprocess data using PCA: the projection is fitted on the training
# features only and then applied unchanged to the validation features.
preprocess = PCA(n_components = 20)
X_train = preprocess.fit_transform(X_train)
X_valid = preprocess.transform(X_valid)

In [None]:
# Display the shapes of the feature and label arrays for both splits
# (the feature arrays are the PCA-transformed ones from the cell above).
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

### Check class balance -- both valid and train should be perfectly 50:50 by construction

In [None]:
# Which split to inspect: 'train' or 'valid' (case-insensitive).
split = 'valid'

# Compare by value (==), not identity (`is`), and reject unknown names so a
# typo cannot silently fall through to the validation labels.
# The labels are kept in their own variable so that the `data` object
# loaded earlier is not overwritten by this check.
split_key = split.lower()
if split_key == 'train':
    labels = y_train
elif split_key == 'valid':
    labels = y_valid
else:
    raise ValueError("split must be 'train' or 'valid', got " + repr(split))

# Print the percentage of samples belonging to each distinct label value.
all_classes, class_counts = np.unique(labels, return_counts = True)
for class_id, class_count in zip(all_classes, class_counts):
    print('Class ' + str(class_id) + ': ' + str(100 * class_count/len(labels)))

### We convert the training and validation arrays into dataframes for input into FastAI

In [None]:
def relabel(value):
    """Translate a numeric target into its class-name string.

    Returns 'Higher_Activity' for a value equal to 1 and 'Lower_Activity'
    for a value equal to -1. Any other value is reported on stdout and
    None is returned.
    """
    if value == 1:
        return 'Higher_Activity'
    if value == -1:
        return 'Lower_Activity'
    # Neither of the two expected labels: report it and fall back to None.
    print('Unknown value: ', value)
    return None

In [None]:
# One column per feature of the (PCA-transformed) training matrix.
columns = ['Feature ' + str(i) for i in range(X_train.shape[1])]

# Training frame: features plus the class name of every sample.
df_train = pd.DataFrame(X_train, columns = columns)
df_train['Target'] = [relabel(value) for value in y_train]

# Validation frame, built the same way.
df_valid = pd.DataFrame(X_valid, columns = columns)
df_valid['Target'] = [relabel(value) for value in y_valid]

# Stack training rows first, validation rows last, with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# available in pandas 2.x.
df = pd.concat([df_train, df_valid], ignore_index = True)

## Now we use the FASTAI library

In [None]:
path = ''
classes = ['Lower_Activity', 'Higher_Activity']

# The validation rows are the last len(df_valid) rows of the combined frame.
n_rows = len(df)
valid_idx = range(n_rows - len(df_valid), n_rows)

# Build the fastai data object with 'Target' as the dependent variable.
data = TabularDataBunch.from_df(
    path,
    df,
    'Target',
    valid_idx=valid_idx,
    classes=classes,
)

### We found that the default parameters resulted in easy overfitting so we use a smaller network with non-negligible dropout

In [None]:
# Small network with dropout (see the note above on overfitting), tracked
# with accuracy and AUROC.
learner = tabular_learner(
    data,
    layers=[10, 5],
    ps=[0.3],
    metrics=[accuracy, AUROC()],
)

# First training run: 5 epochs of the one-cycle schedule with 1e-3 passed as
# the learning-rate argument.
learner.fit_one_cycle(5, 1e-3)

In [None]:
# Run fastai's learning-rate finder on the current learner and plot what the
# recorder captured, to help choose the rate for the next training run.
learner.lr_find()
learner.recorder.plot()

In [None]:
# Second training run: 5 more epochs with slice(1e-4) as the learning-rate
# argument (lower than the 1e-3 used in the first run).
learner.fit_one_cycle(5, slice(1e-4))

In [None]:
# Plot the losses recorded during the most recent training run.
learner.recorder.plot_losses()

## Analysis of model performance on validation set

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

def roc_plot(y_truth, y_probs):
    """Draw the ROC curve for a set of predictions and return its points.

    Parameters:
        y_truth: true labels; must expose a ``.numpy()`` method
            (presumably a torch tensor -- confirm against callers).
        y_probs: predicted scores for the positive class; must expose a
            ``.numpy()`` method.

    Returns:
        A DataFrame with the columns 'False Positive Rate' and
        'True Positive Rate' holding the points of the curve.
    """
    # Convert once and reuse for both the curve and the area under it.
    truth = y_truth.numpy()
    scores = y_probs.numpy()
    fpr, tpr, _ = roc_curve(truth, scores)
    auc_score = roc_auc_score(truth, scores)

    plt.figure(figsize = [8, 6])
    plt.plot(fpr, tpr, color='darkorange',
             lw=2, label = 'ROC curve (area = %0.2f)' % auc_score)
    # Dashed diagonal drawn as a reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Validation ROC (Train/Test diffs)')
    plt.legend(loc="lower right")
    plt.show()

    return pd.DataFrame({'False Positive Rate': fpr, 'True Positive Rate': tpr})

### Load a pre-saved model...

In [None]:
# Restore the exported learner and attach the validation frame as its test set.
# NOTE: the file name is fixed to 'noncovalent_model.pkl' (the name used by
# the export cell at the end of the notebook), whatever `dataset` is set to.
learner = load_learner('', 'noncovalent_model.pkl', test = TabularList.from_df(df_valid))

# Ground-truth class indices, following the order of `classes`. Iterating the
# 'Target' column directly avoids building a Series per row with iterrows().
y = torch.LongTensor([classes.index(label) for label in df_valid['Target']])

# NOTE(review): these losses come from the test set, which was attached
# without labels -- confirm they are meaningful before interpreting them.
preds,_,losses = learner.get_preds(ds_type=DatasetType.Test, with_loss=True)

### ...or use the live trained model

In [None]:
# Predictions, targets and per-sample losses from the learner trained above.
# No ds_type is passed, so fastai's default dataset is used -- presumably the
# validation set; confirm against the fastai documentation.
preds,y,losses = learner.get_preds(with_loss=True)

### Confusion Matrix

In [None]:
# Build the interpretation object from whichever (preds, y, losses) triple was
# produced by the cell run last (pre-saved or live model), then plot the
# normalized confusion matrix.
interp = ClassificationInterpretation(learner, preds, y, losses)
interp.plot_confusion_matrix(cmap = 'Greens', normalize = True)

## Results

In [None]:
# Class probabilities and true class indices for the training split.
# NOTE(review): fastai's Train loader may shuffle and drop the last partial
# batch, so this may not cover every training row -- confirm if exact
# training metrics matter.
train_probs, train_targets = learner.get_preds(DatasetType.Train)
# Use the tensor's own argmax: calling np.argmax on a tensor and then .numpy()
# on the result only works through numpy's array-wrapping fallback.
train_hits = train_probs.argmax(dim=1).numpy() == train_targets.numpy()

# Same for the validation split.
valid_probs, valid_targets = learner.get_preds(DatasetType.Valid)
valid_hits = valid_probs.argmax(dim=1).numpy() == valid_targets.numpy()

# Column 1 holds the probability of the second entry of `classes`
# ('Higher_Activity'); slicing the tensor replaces the element-by-element
# unpacking previously done with zip(*probs).
train_pos_probs = train_probs[:, 1].numpy()
valid_pos_probs = valid_probs[:, 1].numpy()

print('Train Accuracy: ', str(np.round(100 * train_hits.sum()/len(train_hits),2)), '%')
print('Valid Accuracy: ', str(np.round(100 * valid_hits.sum()/len(valid_hits),2)), '%')
print('Train AUC: ', str(np.round(100 * roc_auc_score(train_targets.numpy(), train_pos_probs),2)), '%')
print('Valid AUC: ', str(np.round(100 * roc_auc_score(valid_targets.numpy(), valid_pos_probs),2)), '%')
print('Train LogLoss: ', str(np.round(log_loss(train_targets.numpy(), train_probs.numpy()),2)))
print('Valid LogLoss: ', str(np.round(log_loss(valid_targets.numpy(), valid_probs.numpy()),2)))

## Export Model

In [None]:
# Export the learner so it can be restored later with load_learner.
# NOTE(review): the file name is fixed to 'noncovalent_model.pkl' even though
# `dataset` is configurable at the top of the notebook -- training on another
# dataset will overwrite this file; rename it here and in the loading cell
# together if that is not intended.
learner.export(file = 'noncovalent_model.pkl')