In [1]:
## Importing Dependencies

# Standard libraries
import pandas as pd
import numpy as np
import os
import csv

# Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Models
from sklearn.linear_model import Lasso
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Dimensionality Reduction
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector
 
# Metrics
from sklearn.metrics import matthews_corrcoef, mean_squared_error, accuracy_score, make_scorer

# Model Persistence
from joblib import dump, load

# Plotter
import matplotlib.pyplot as plt

# Argument Parser
import argparse

# Write to a log file
import logging
import sys

In [None]:
## Import the complete dataset.
def import_data():
    """
    Import the full KI dataset from the current working directory and attach a
        log-rescaled copy of the 'KI (nM)' column.

    Parameters
    ----------
    None

    Returns
    -------
    df:  Dataframe read from 'PositivePeptide_Ki.csv', with an extra 'KI (nM) rescaled'
        column.  No rows are filtered out by this function.

    base_range:  The (min, max) of the log-transformed 'KI (nM)' values, as returned by
        rescale().  Kept for rescaling purposes.

    """

    # Importing the full KI set into a dataframe.  The file is expected to sit in
    #   the current working directory.
    csv_path = os.path.join(os.getcwd(), 'PositivePeptide_Ki.csv')
    df = pd.read_csv(csv_path)

    # Rescaling the KI values onto the (-5,5) interval on a log scale.  rescale() uses
    #   the natural log (np.log), so base_range is expressed in natural-log units.
    df['KI (nM) rescaled'], base_range  = rescale(df['KI (nM)'], destination_interval=(-5,5))

    return df, base_range

## Logarithmically scaling the values.
def rescale(array=np.array(0), destination_interval=(-5,5)):
    """
    Map KI values (in nM) onto a destination interval using a natural-log scale.

    Parameters
    ----------
    array:  Array-like of KI values, in nM.

    destination_interval:  The (low, high) interval that the smallest and largest
        log values are mapped onto.

    Returns
    -------
    array:  The log values, linearly interpolated into destination_interval.

    saved_range:  The (min, max) of the log-transformed input.  Used if we need
        to rescale back into "KI (nM)" form.

    """

    # Move into log space first; the range is measured there, not on the raw values.
    log_values = np.log(array)
    lowest, highest = log_values.min(), log_values.max()
    saved_range = (lowest, highest)

    # Linearly stretch [lowest, highest] onto the requested interval.
    rescaled = np.interp(log_values, saved_range, destination_interval)

    return rescaled, saved_range

In [None]:
def param_name_model_zipper():
    """
    Bundle the hyperparameter grid, display name, and baseline model of each classifier.

    Parameters
    ----------
    None

    Returns
    -------
    attributes:  A zip iterator of (params, name, model) tuples, ordered as SVC with
        RBF kernel, XGBoost classifier, random forest classifier.  Because it is an
        iterator, it can only be looped over once per call.

    """

    # ---- SVC with RBF kernel ----
    # Notes from earlier tuning: mean_test_score = 0.394548, std_test_score = 0.133936,
    #   mean_train_score = 0.94489, std_train_score = 0.022332.  Good results were seen at
    #   gamma = 0.01 with C = 1 and at gamma = 0.001 with C = 100.  break_ties had no
    #   effect either way, and class_weight always came out as None.
    rbf_name = 'SVC with RBF Kernel'
    rbf_params = {
        'gamma': [1e-1,1e-2,1e-3,1e-4,'scale','auto'],
        'C': [5,10,50,100,250,500,1000],
        'class_weight': [None,'balanced'],
        'break_ties': [False,True],
    }
    rbf = SVC(C=10, gamma=0.01, break_ties=True, class_weight=None)

    # ---- XGBoost classifier ----
    # Notes from earlier tuning: mean_test_score = 0.43961, std_test_score = 0.0879,
    #   mean_train_score = 0.854369, std_train_score = 0.32907.
    # Max depth doesn't seem to matter too much past 3.
    # An alternative parameter set is
    #   {'alpha': 0.4, 'gamma': 0, 'lambda': 5, 'max_depth': 9, 'n_estimators': 9, 'subsample': 0.5}
    #   with mean_test_score = 0.466982, std_test_score = 0.070494,
    #   mean_train_score = 0.815888, std_train_score = 0.049071.
    xgb_name = 'XGBoost Classifier'
    xgb_params = {
        'max_depth': np.arange(2,11,1),
        'n_estimators': np.arange(1,25,1),
        'gamma': np.arange(0,4,1),
        'subsample': [0.5,1],
        'lambda': [1,5,9],
        'alpha': np.arange(0,1.1,0.2),
    }
    xgb = XGBClassifier(alpha=1.0, gamma=1, reg_lambda=1, max_depth=4,
                        n_estimators=22, subsample=0.5)

    # ---- Random forest classifier ----
    # Notes from earlier tuning: mean_test_score = 0.465951, std_test_score = 0.056089,
    #   std_train_score = 0.01336, and 0.894403 (presumably the mean_train_score; the
    #   original note labelled it mean_test_score a second time).
    rfc_name = 'Random Forest Classifier'
    rfc_params = {
        'criterion': ['gini','entropy'],
        'max_features': ['sqrt','log2',1.0,0.3],
        'ccp_alpha': np.arange(0,0.3,0.1),
        'n_estimators': np.arange(1,25,1),
        'max_depth': np.arange(2,11,1),
    }
    rfc = RandomForestClassifier(ccp_alpha=0.0, criterion='gini', max_depth=3,
                                 max_features='sqrt', n_estimators=23)

    # Line the three collections up so that position i of each refers to the same model.
    params_list = [rbf_params, xgb_params, rfc_params]
    names = [rbf_name, xgb_name, rfc_name]
    models = [rbf, xgb, rfc]

    return zip(params_list, names, models)

In [None]:
path = os.getcwd()
threshold = 0.01

# DataFrame importing and adding 'Bucket' column.  KI values are binned into three
#   classes: (0, threshold], (threshold, 4000], and (4000, inf).
df, _ = import_data()
df['Bucket'] = pd.cut(x=df['KI (nM)'], bins=(0, threshold, 4000, float('inf')), labels=(0,1,2))

# Get x and y values.  Columns 1..572 are taken as the feature columns.
feature_columns = df.columns[1:573]
y = df['Bucket']

# MinMax-scale the features once, up front.  The scaled frame is kept under its own
#   name so the loop below can reuse it; previously the loop re-read the raw columns
#   from df, which silently threw the scaling away.
scaler = MinMaxScaler()
x_scaled = pd.DataFrame(scaler.fit_transform(df[feature_columns]), columns=feature_columns)

attributes = param_name_model_zipper()

for params, name, model in attributes:
    # Each model has its own saved feature selector; apply it to the full scaled
    #   feature set so that every iteration starts from the same input.
    sfs = load(path + '/%s/sfs/%s %2.2f fs.joblib' %(name, name, threshold))
    x = sfs.transform(x_scaled)

    # Make sure the per-model output directory exists (no error if it already does).
    pca_dir = os.path.join(path, name, 'PCA Tuning')
    os.makedirs(pca_dir, exist_ok=True)

    pca = PCA()
    pca.fit(x)