In [1]:
## Importing Dependencies

# Standard libraries
import pandas as pd
import numpy as np
import os
import csv

# Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Models
from sklearn.linear_model import Lasso
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

# Dimensionality Reduction
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector
 
# Metrics
from sklearn.metrics import matthews_corrcoef, mean_squared_error, accuracy_score, make_scorer

# Model Persistence
from joblib import dump, load

# Plotter
import matplotlib.pyplot as plt

# Argument Parser
import argparse

# Write to a log file
import logging
import sys

In [2]:
## Import the complete dataset.
def import_data(threshold):
    """Load the Ki dataset and attach the rescaled target and bucket labels.

    Reads ``PositivePeptide_Ki.csv`` from the current working directory, then
    adds two columns:

    * ``'KI (nM) rescaled'`` -- the ``'KI (nM)'`` values as transformed by
      ``rescale`` onto the (-5, 5) interval.
    * ``'Bucket'`` -- ``'KI (nM)'`` binned with ``pd.cut`` using the edges
      ``(0, threshold, 4000, inf)`` and the labels 0, 1, 2.

    Returns
    -------
    (df, base_range) : the augmented dataframe and the range returned by
    ``rescale`` (needed later to undo the rescaling).
    """
    csv_path = os.getcwd() + '/PositivePeptide_Ki.csv'
    df = pd.read_csv(csv_path)

    # Rescaled regression target plus the range needed to invert it.
    rescaled, base_range = rescale(df['KI (nM)'], destination_interval=(-5,5))
    df['KI (nM) rescaled'] = rescaled

    # Three buckets: up to threshold, up to 4000, and everything above.
    bucket_edges = (0, threshold, 4000, float('inf'))
    df['Bucket'] = pd.cut(x=df['KI (nM)'], bins=bucket_edges, labels=(0,1,2))

    return df, base_range

## Logarithmically scaling the values.
def rescale(array=np.array(0), destination_interval=(-5,5)):
    """Take the natural log of ``array`` and map it linearly onto ``destination_interval``.

    Returns
    -------
    (rescaled, saved_range) : the interpolated values and the ``(min, max)`` of
    the log-transformed input, which ``unscale`` needs to invert the mapping.
    """
    logged = np.log(array)

    # Remember the log-space extent so the transform can be undone later.
    saved_range = (logged.min(), logged.max())

    return np.interp(logged, saved_range, destination_interval), saved_range

## Inverse of the rescale function to rescale the outputs.
def unscale(array, destination_interval, source_interval=(-5,5)):
    """Invert ``rescale``: map values from ``source_interval`` back to raw values.

    ``array`` is interpolated linearly from ``source_interval`` onto
    ``destination_interval`` (the log-space range), then exponentiated.
    """
    log_values = np.interp(array, source_interval, destination_interval)
    return np.exp(log_values)

def hyperparameter_optimizer(x, y, params, model=SVR()):
    """Grid-search ``params`` for ``model`` and return the tuned model plus a filtered results table.

    Parameters
    ----------
    x, y :
        Training features and targets, passed unchanged to ``GridSearchCV.fit``.
    params : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.
    model : estimator, default ``SVR()``
        Estimator to tune.  Its parameters are updated in place with the best
        combination found; it is not fitted on the data by this function.
        NOTE: the default instance is created once when the function is
        defined, so calls that rely on the default share (and mutate) the
        same object.

    Returns
    -------
    model :
        The estimator passed in, with the best parameters set.
    df : pandas.DataFrame
        ``cv_results_`` sorted by ``std_test_score`` and filtered as described
        in the body.
    scores : list
        ``[mean train score, std train score, best mean test score,
        std test score, best params]`` for the best parameter combination.
    """
    reg = GridSearchCV(model, param_grid=params, scoring='neg_root_mean_squared_error', cv=5,
                       return_train_score=True, n_jobs=-1)
    reg.fit(x, y)

    # Collect the cross-validation results and pull out the row of the best candidate
    # before the frame is re-ordered.
    df = pd.DataFrame(reg.cv_results_)
    index = reg.best_index_
    scores = [df.loc[index, 'mean_train_score'], df.loc[index, 'std_train_score'], reg.best_score_,
              df.loc[index, 'std_test_score'], reg.best_params_]

    df = df.sort_values(by=['std_test_score'])

    # Clean up the output for the hyperparameters.  Eliminate any values that have too low of a
    # test ranking as well as eliminate anything with too high of a training score.
    # NOTE(review): scoring is 'neg_root_mean_squared_error', whose values are <= 0, so the
    # positive cut-offs below look tuned for a different metric -- confirm the intended scale.
    max_test_rank = df['rank_test_score'].max()
    index_start = df.columns.get_loc('split0_train_score')
    # `axis` has to be passed by keyword: the positional form was deprecated in pandas 1.5
    # and is rejected by pandas 2.x.
    df = df[~(df.iloc[:, index_start:] > 0.98).any(axis=1)]
    df = df[df['mean_train_score'] > 0.65]
    df = df[df['rank_test_score'] < (0.20*max_test_rank)]
    df = df[df['mean_test_score'] > 0.25]

    # Apply the best parameters to the caller's estimator.
    model.set_params(**reg.best_params_)

    return model, df, scores

In [3]:
def load_saved_clf():
    """Return the saved classifier configurations.

    Each entry is a ``(threshold, variance, name, model)`` tuple, where
    ``model`` is an unfitted classifier created with its stored
    hyperparameters.  A ``variance`` of ``False`` marks a configuration
    without a PCA variance setting.
    """
    saved_clf = [
        (10, 80, 'SVC with RBF Kernel',
         SVC(kernel='rbf', C=61, break_ties=True, class_weight=None, gamma=0.001)),
        (0.01, False, 'XGBoost Classifier',
         XGBClassifier(alpha=0.0, gamma=2, reg_lambda=1, max_depth=2, n_estimators=11, subsample=0.5)),
        (10, 85, 'Random Forest Classifier',
         RandomForestClassifier(ccp_alpha=0.1, criterion='gini', max_depth=9, max_features=1.0,
                                n_estimators=7)),
        (10, 100, 'KNN Classifier',
         KNeighborsClassifier(leaf_size=5, n_neighbors=7, p=2, weights='uniform')),
    ]

    return saved_clf

def load_regression_models():
    """Return the candidate regression models for the small and medium buckets.

    Each entry is a ``(name, params, small_model, medium_model)`` tuple.  The
    parameter grids are currently empty dicts, and the two buckets get
    separate estimator instances so that they can be fitted independently.
    """
    def _fresh_models():
        # A new, unfitted instance of every regressor, in the same order as `names`.
        return [SVR(kernel='rbf'), SVR(kernel='linear'), Lasso()]

    names = ['SVR with RBF Kernel', 'SVR with Linear Kernel', 'Lasso Regression']

    # One (empty) hyperparameter grid per model, shared by both buckets.
    params = [{} for _ in names]

    return list(zip(names, params, _fresh_models(), _fresh_models()))

# Build every saved classifier configuration and work with the first entry.
saved_clf = load_saved_clf()
selected_config = saved_clf[0]
threshold, var, name, clf = selected_config

In [4]:
def inference(x, y, buckets, ki_range, clf=SVC(), sml_reg=SVR(), med_reg=SVR()):
    """Classify each sample into a bucket, regress within that bucket, and score the result.

    Parameters
    ----------
    x : array-like
        Feature rows; must support boolean-mask row selection (``x[mask]``).
    y : pandas.Series
        Rescaled targets for the rows of ``x``.  Its index is used to look up
        the actual bucket of every sample.
    buckets : pandas.Series
        Bucket labels, indexable by ``y.index``.
    ki_range : tuple
        Log-space range passed to ``unscale`` to recover the raw values.
    clf : classifier
        Fitted classifier predicting the bucket (0, 1 or 2) of each row.
    sml_reg, med_reg : regressors
        Fitted regressors for rows predicted as bucket 0 and bucket 1.
        The defaults are unfitted placeholders; callers are expected to pass
        fitted estimators.

    Returns
    -------
    (rmse, log_rmse, df) :
        RMSE on the unscaled values, RMSE on the rescaled values, and a
        dataframe with actual/predicted values and buckets per sample.
    """
    # Actual bucket of every sample, and the bucket the classifier assigns to it.
    buckets_actual = buckets[y.index]
    buckets_pred = np.asarray(clf.predict(x))

    is_sml = buckets_pred == 0
    is_med = buckets_pred == 1

    # Start from zeros: rows predicted as the large bucket (2) are predicted as 0 for now.
    y_pred = np.zeros(len(buckets_pred), dtype=float)

    # Mask assignment keeps every prediction at its original row position.  The regressors are
    # only called when their bucket is non-empty, since predicting on zero rows is rejected.
    if is_sml.any():
        y_pred[is_sml] = sml_reg.predict(x[is_sml])
    if is_med.any():
        y_pred[is_med] = med_reg.predict(x[is_med])

    y_pred_unscaled = unscale(y_pred, ki_range)
    y_unscaled = unscale(y, ki_range)

    # RMSE on the raw scale and on the rescaled (log) scale.
    train_rmse = mean_squared_error(y_unscaled, y_pred_unscaled)**0.5
    log_train_rmse = mean_squared_error(y, y_pred)**0.5

    # Save the results in a dataframe.
    cols = ['Log Y Actual', 'Log Y Predicted', 'Y Actual', 'Y Predicted', 'Actual Bucket', 'Predicted Bucket']
    df_data = list(zip(y, y_pred, y_unscaled, y_pred_unscaled, buckets_actual, buckets_pred))
    df = pd.DataFrame(data=df_data, columns=cols)

    return train_rmse, log_train_rmse, df

In [5]:
# Working directory (used for the saved transformers) and the bucketed dataset.
path = os.getcwd()
df, ki_range = import_data(threshold)

# Split the frame into features, rescaled target, and bucket labels.
feature_cols = df.columns[1:573]
x = df[feature_cols]
y = df['KI (nM) rescaled']
buckets = df['Bucket']

# Min-max scale the features while keeping the original column names.
scaler = MinMaxScaler()
x = pd.DataFrame(scaler.fit_transform(x), columns=feature_cols)

# Keep only the features chosen by the previously saved sequential feature selector.
sfs_file = path + '/%s/sfs/%s %2.2f fs.joblib' %(name, name, threshold)
sfs = load(sfs_file)
x = sfs.transform(x)

# Configurations with a variance setting additionally go through the saved PCA.
if var != False:
    pca_file = path + '/%s/sfs-pca/%s %2.2f pca.joblib' %(name, name, threshold)
    pca = load(pca_file)
    x = pca.transform(x)

    # Keep the leading components whose cumulative explained variance stays within var percent.
    ratios = np.array(pca.explained_variance_ratio_)
    ratios = ratios[ratios.cumsum() <= (var/100)]

    # Trim x to that number of components.
    length = len(ratios)
    x = x[:, :length]

# Regression models: use the first candidate for the small and medium buckets.
reg_models = load_regression_models()
reg_name, reg_params, sml_reg, med_reg = reg_models[0]

## Seed values and k-folding required variables.
seeds = [33, 42, 55, 68, 74]
folds = len(seeds)
i = 0


In [6]:
# Hold out 1/folds of the samples for validation, stratified on the bucket labels.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, test_size=(1/folds), random_state=42, stratify=buckets
)
buckets_train = buckets[y_train.index]
buckets_valid = buckets[y_valid.index]

# Row masks selecting the training samples of the small and medium buckets.
sml_rows = buckets_train==0
med_rows = buckets_train==1

# Per-bucket training sets for the regressors.
x_train_sml, y_train_sml = x_train[sml_rows], y_train[sml_rows]
x_train_med, y_train_med = x_train[med_rows], y_train[med_rows]

# The classifier learns the bucket of every training sample; each regressor only sees its bucket.
clf.fit(x_train, buckets_train)
sml_reg.fit(x_train_sml, y_train_sml)
med_reg.fit(x_train_med, y_train_med)

In [10]:
# Score the fitted classifier/regressor combination on the training and validation splits.
train_rmse, train_rmse_log, train_df = inference(
    x_train, y_train, buckets, ki_range, clf=clf, sml_reg=sml_reg, med_reg=med_reg
)
valid_rmse, valid_rmse_log, valid_df = inference(
    x_valid, y_valid, buckets, ki_range, clf=clf, sml_reg=sml_reg, med_reg=med_reg
)