In [1]:
## Importing Dependencies

# Standard libraries
import pandas as pd
import numpy as np
import os
import csv

# Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Models
from sklearn.linear_model import Lasso
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

# Dimensionality Reduction
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector
 
# Metrics
from sklearn.metrics import matthews_corrcoef, mean_squared_error, accuracy_score, make_scorer

# Model Persistence
from joblib import dump, load

# Plotter
import matplotlib.pyplot as plt

# Argument Parser
import argparse

# Write to a log file
import logging
import sys

In [2]:
## Import the complete dataset.
def import_data(threshold):
    """
    Import the full dataset from the current path.  Also apply some of the necessary preprocessing.

    Parameters
    ----------
    None

    Returns
    -------
    df:  Dataframe of the full KI training dataset, with any values above 50,000 removed.

    base_range:  Contains the range of values within the dataframe for rescaling purposes.

    """

    # Importing the full KI set into a dataframe.
    path = os.getcwd()
    df = pd.read_csv(path + '/PositivePeptide_Ki.csv')
    

    # Rescaling the dataframe in the log10 (-5,5) range.
    df['KI (nM) rescaled'], base_range  = rescale(df['KI (nM)'], destination_interval=(-5,5))
    df['Bucket'] = pd.cut(x=df['KI (nM)'], bins=(0, threshold, 4000, float('inf')), labels=(0,1,2))
    return df, base_range

## Logarithmically scalling the values.
def rescale(array=np.array(0), destination_interval=(-5,5)):
    """
    Rescale the KI values from nM to a log scale within the range of
        a given destination interval.

    Parameters
    ----------
    array:  A numpy array of KI values, in nM.

    destination_interval: the interval that we set the range of the log scale to

    Returns
    -------
    array:  Transformed array into the log scale.
    
    saved_range:  The (min, max) range of the original given array.  Used if we need
        to rescale back into "KI (nM)" form.
    
    """

    # Rescaling the values and saving the initial range.
    array = np.log(array)
    saved_range = (array.min(), array.max())    
    array = np.interp(array, saved_range, destination_interval)

    return array, saved_range

## Inverse of the rescale function to rescale the outputs.
def unscale(array, destination_interval, source_interval=(-5,5)):
    """
    Rescales an array of log-transformed values back into "KI (nM)" form.

    Parameters
    ----------
    array:  A numpy array of KI values in log-transformed form.

    destination_interval:  The original range of KI values.

    source_interval: The current range of KI log transformed values.

    Returns
    -------
    array:  A numpy array of the KI values back in the original format.

    """

    # Undoing the previous rescaling.
    array = np.interp(array, source_interval, destination_interval)
    array = np.exp(array)

    return array

In [3]:
def load_saved_clf():
    """
    This section runs the finalized classification portion of the data across the various model types to bucketize our data
        for inference.
        
        - SVC w/RBF Kernel w/SFS and PCA @80% variance. {'C': 61, 'break_ties': True, 'class_weight': None, 'gamma': 0.001},
            Test MCC = 0.529094, Train MCC = 0.713933, Threshold @ 10.  Large Bucket Size 20, Small Bucket Size 44, Extra Bucket Size
            9.

        - XGBoost Classifier w/SFS. {'alpha': 0.0, 'gamma': 2, 'lambda': 1, 'max_depth': 2, 'n_estimators': 11, 'subsample': 0.5},
            Test MCC = 0.661811, Train MCC = 0.709423, Threshold @ 0.01.  Large Bucket Size 46, Small Bucket Size 18, Extra Bucket Size
            9.

        - Random Forest Classifier w/SFS and PCA @85% variance.  {'ccp_alpha': 0.1, 'criterion': 'gini', 'max_depth': 9, 
            'max_features': 1.0, 'n_estimators': 7}, Test MCC = 0.614015, Train MCC = 0.729953, Threshold @ 10.  Large Bucket Size 20,
            Small Bucket Size 44, Extra Bucket Size 9.

        - KNN Classifier w/SFS and PCA @100% variance. {'leaf_size': 5, 'n_neighbors': 7, 'p': 2, 'weights': 'uniform'}, 
            Test MCC = 0.61151, Train MCC = 0.564734, Threshold @10.  Large Bucket Size 20, Small Bucket Size 44, Extra Bucket Size 9.


    """
    # Create the models with the relevant hyperparameters.
    rbf = SVC(kernel='rbf', C=61, break_ties=True, class_weight=None, gamma=0.001)
    rbf_threshold = 10
    rbf_var = 80

    xgb = XGBClassifier(alpha=0.0, gamma=2, reg_lambda=1, max_depth=2, n_estimators=11, subsample=0.5)
    xgb_threshold = 0.01
    xgb_var = False         # Variance of 'False' indicates that we will not be 

    rfc = RandomForestClassifier(ccp_alpha=0.1, criterion='gini', max_depth=9, max_features=1.0, n_estimators=7)
    rfc_threshold = 10
    rfc_var = 85

    knn = KNeighborsClassifier(leaf_size=5, n_neighbors=7, p=2, weights='uniform')
    knn_threshold = 10
    knn_var = 100

    # Put the model information in 4 different list.  Models, Thresholds, Variances, and Names
    models = [rbf, xgb, rfc, knn]
    thresholds = [rbf_threshold, xgb_threshold, rfc_threshold, knn_threshold]
    vars = [rbf_var, xgb_var, rfc_var, knn_var]
    names = ['SVC with RBF Kernel', 'XGBoost Classifier', 'Random Forest Classifier', 'KNN Classifier']

    saved_clf = zip(models,thresholds,vars, names)
    return saved_clf

In [4]:
threshold = 10
seeds = [33, 42, 55, 68, 74]
i = 0

folds = len(seeds)

df, ki_range = import_data(threshold)
path = os.getcwd()

x = df[df.columns[1:573]]
y = df['KI (nM) rescaled']
buckets = df['Bucket']

x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=(1/folds), random_state=42, stratify=buckets)

# Create a small and large training set
buckets_train = buckets[y_train.index]
buckets_valid = buckets[y_valid.index]

x_train_sml = x_train[buckets_train==0]
buckets_train_sml = buckets_train[buckets_train==0]
y_train_sml = y_train[buckets_train==0]

x_train_med = x_train[buckets_train==1]
buckets_train_med = buckets_train[buckets_train==1]
y_train_med = y_train[buckets_train==1]

reg_sml = SVR()
reg_sml.fit(x_train_sml, y_train_sml)
reg_med = SVR()
reg_med.fit(x_train_med, y_train_med)

saved_clf = load_saved_clf()
i = 0

In [5]:
bucketflag=0

for clf, threshold, var, name in saved_clf:

    # Now we need to train and apply our classifier on the original dataset.  I think we should apply whichever transformations
    #   already existing on that part of the pipeline.  We should already have the saved .joblib files so this should be easier.

    # Apply Sequential Feature Selection to the x_train values.
    sfs = load(path + '/%s/sfs/%s %2.2f fs.joblib' %(name, name, threshold))
    x_train_clf = sfs.transform(x_train)
    x_valid_clf = sfs.transform(x_valid)

    # Apply PCA if applicable with the necessary var.
    if var != False:
        pca = load(path + '/%s/sfs-pca/%s %2.2f pca.joblib' %(name, name, threshold))
        x_train_clf = pca.transform(x_train_clf)
        x_valid_clf = pca.transform(x_valid_clf)

        # Dimensonality Reduction based on accepted variance.
        ratios = np.array(pca.explained_variance_ratio_)
        ratios = ratios[ratios.cumsum() <= (var/100)]
        
        # Readjust the dimensions of x based on the variance we want.
        length = len(ratios)
        x_train_clf = x_train_clf[:,0:length]
        x_valid_clf = x_valid_clf[:,0:length]

    clf.fit(x_train_clf, buckets_train)

    # Apply the transformations to the Validation set:
    bucket_valid = clf.predict(x_valid_clf)
    print(bucket_valid)

print('another')
print('test')

[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
[0 0 0 0 1 0 0 0 1 1 1 0 0 1 0]
[0 0 0 0 1 0 1 1 0 0 1 1 0 1 0]
[0 0 0 0 1 0 1 0 1 0 2 0 0 0 0]
another
test


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
x_valid_reg = x_valid[bucket_valid==bucketflag]

In [7]:
y_pred = reg_sml.predict(x_valid_reg)
alid_accuracy = accuracy_score(y_valid, y_pred)

ValueError: Found input variables with inconsistent numbers of samples: [15, 11]