In [9]:
## Importing Dependencies

# Standard libraries
import pandas as pd
import numpy as np
import os
import csv

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Models
from sklearn.linear_model import Lasso
from sklearn.svm import SVR, SVC

# Dimensionality Reduction
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector

# Metrics
from sklearn.metrics import matthews_corrcoef, mean_squared_error, accuracy_score

# Model Persistence
from joblib import dump, load

# Plotter
import matplotlib.pyplot as plt

# Argument Parser
import argparse

# Write to a log file
import logging
import sys

In [10]:
## Create the logger
def log_files():
    """
    Create the mechanism by which we log results to a .log file.

    The root logger is configured with two handlers: one that appends to
    'threshold.log' (INFO and above) and one that echoes to stdout (DEBUG and
    above).  DEBUG messages are therefore displayed but not written to the file.

    The function is safe to call more than once (e.g. when a notebook cell is
    re-run): handlers installed by an earlier call are recognised by name and
    are not added again, so each record is emitted only once per destination.

    Parameters
    ----------
    None

    Returns
    -------
    logger:  The logger object we create to call on in other functions.
    """

    # Instantiate the logger and set the formatting and minimum level to DEBUG.
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s')

    # Names of the handlers already attached to the root logger.  Handlers
    # without an explicit name report None and never match ours.
    installed = {handler.get_name() for handler in logger.handlers}

    # Write the logs to a file (only created the first time through, so the
    # file is not opened again on repeated calls).
    if 'threshold_file' not in installed:
        file_handler = logging.FileHandler('threshold.log')
        file_handler.set_name('threshold_file')
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Display the logs in the output.
    if 'threshold_stdout' not in installed:
        stdout_handler = logging.StreamHandler(sys.stdout)
        stdout_handler.set_name('threshold_stdout')
        stdout_handler.setLevel(logging.DEBUG)
        stdout_handler.setFormatter(formatter)
        logger.addHandler(stdout_handler)

    return logger

In [11]:
## Logarithmically scaling the values.
def rescale(array=np.array(0), destination_interval=(-5, 5)):
    """
    Rescale the KI values from nM to a log scale within the range of
        a given destination interval.

    The values are transformed with the natural logarithm and then mapped
    linearly so that the smallest log value lands on destination_interval[0]
    and the largest on destination_interval[1].

    Parameters
    ----------
    array:  A numpy array (or pandas Series) of KI values, in nM.  All values
        must be strictly positive.  The default is only a placeholder; callers
        are expected to pass their own data.

    destination_interval: the (low, high) interval that we set the range of the
        log scale to.  Defaults to (-5, 5), the same interval unscale() assumes
        as its source_interval.

    Returns
    -------
    array:  Transformed array into the log scale.

    saved_range:  The (min, max) range of the log-transformed input.  Used if we
        need to rescale back into "KI (nM)" form with unscale().

    Raises
    ------
    ValueError:  If any value is zero or negative, since its logarithm would be
        -inf or nan and would corrupt the saved range.

    """

    # The logarithm is undefined for non-positive values; fail loudly instead of
    # propagating -inf / nan through the interpolation below.
    if np.any(np.asarray(array) <= 0):
        raise ValueError('rescale() requires strictly positive values.')

    # Rescaling the values and saving the initial (log-space) range.
    logged = np.log(array)
    saved_range = (logged.min(), logged.max())
    scaled = np.interp(logged, saved_range, destination_interval)

    return scaled, saved_range

## Inverse of the rescale function to rescale the outputs.
def unscale(array, destination_interval, source_interval=(-5,5)):
    """
    Map values from the scaled log interval back to "KI (nM)" units.

    Parameters
    ----------
    array:  A numpy array of KI values expressed on the scaled log interval.

    destination_interval:  The (min, max) log-space range the values are mapped
        back onto before exponentiation (the saved_range returned by rescale).

    source_interval: The interval the scaled values currently occupy.

    Returns
    -------
    array:  A numpy array of the KI values back in the original format.

    """

    # Linear map back into log space, then undo the natural logarithm.
    log_values = np.interp(array, source_interval, destination_interval)
    return np.exp(log_values)

In [12]:
## Import the complete dataset.
def import_data():
    """
    Import the full dataset from the current path.  Also apply some of the necessary preprocessing.

    Reads 'PositivePeptide_Ki.csv' from the current working directory, drops the
    rows whose 'KI (nM)' is 75,000 or more, and appends a 'KI (nM) rescaled'
    column produced by rescale().  Relies on the module-level `logger` and on
    rescale() being defined before it is called.

    Parameters
    ----------
    None

    Returns
    -------
    df:  Dataframe of the KI training dataset, keeping only rows with
        'KI (nM)' below 75,000, plus the added 'KI (nM) rescaled' column.

    base_range:  The (min, max) range returned by rescale() for the kept
        'KI (nM)' values, for rescaling purposes.

    """

    # Importing the full KI set into a dataframe.
    csv_path = os.path.join(os.getcwd(), 'PositivePeptide_Ki.csv')
    df = pd.read_csv(csv_path)
    logger.debug('The full dataset has %i examples.', len(df))

    # Large KI values are treated as outliers and removed.
    # NOTE(review): earlier documentation described the cutoff as 50uM
    # (50,000nM) but the code filters at 75,000nM -- confirm which is intended.
    # .copy() gives us an independent frame so the column assignment below does
    # not write into a filtered view of the original (SettingWithCopyWarning).
    df = df[df['KI (nM)'] < 75000].copy()
    logger.debug('Without outliers, the dataset has %i examples.', len(df))

    # Rescaling the KI values onto the (-5,5) range.  rescale() uses the natural
    # logarithm (np.log), not log10.
    df['KI (nM) rescaled'], base_range = rescale(df['KI (nM)'], destination_interval=(-5,5))

    return df, base_range

In [13]:
# Build the shared logger (root logger writing to 'threshold.log' and stdout).
# import_data() reads this module-level name, so this cell must run first.
logger = log_files()

In [14]:
# Load the filtered KI dataset together with the (min, max) range returned by
# rescale(), which is what unscale() needs to map predictions back to nM.
df, base_range = import_data()

2022-07-31 21:31:15,308 | DEBUG | The full dataset has 73 examples.
2022-07-31 21:31:15,308 | DEBUG | The full dataset has 73 examples.
2022-07-31 21:31:15,310 | DEBUG | Without outliers, the dataset has 69 examples.
2022-07-31 21:31:15,310 | DEBUG | Without outliers, the dataset has 69 examples.


In [15]:
# Load the previously saved bucket model and its two preprocessing steps from
# the current working directory.  Judging by the names these are presumably a
# fitted classifier, a SequentialFeatureSelector and a PCA -- confirm against
# the notebook that produced them.
# NOTE(review): joblib files are pickle-based; only load artifacts from a
# trusted source.
bucket_clf = load('bucket_clf.joblib')
bucket_sfs = load('bucket_sfs.joblib')
bucket_pca = load('bucket_pca.joblib')

In [21]:
# Take the columns at positions 1..572 (presumably the feature columns -- confirm
# against the CSV layout), push them through the saved selector and then the
# saved PCA, and store the classifier's prediction for every row in 'Bucket'.
x = bucket_pca.transform(
    bucket_sfs.transform(
        df[df.columns[1:573]]
    )
)
df['Bucket'] = bucket_clf.predict(x)

In [24]:
# Partition the rows by predicted bucket.  The explicit comparisons are kept on
# purpose: they work whether 'Bucket' holds booleans or 0/1 integers.
df_large = df.loc[
    df['Bucket'] == True
]
df_small = df.loc[
    df['Bucket'] == False
]

In [28]:
def _split_features_targets(bucket_df):
    """Return (features, log-scaled target, raw target) for one bucket frame.

    Columns are picked by position: 1..572 for the features, 574 for the
    log-scaled target and 573 for the raw target (presumably 'KI (nM) rescaled'
    and 'KI (nM)' respectively -- confirm against the dataframe layout).
    """
    columns = bucket_df.columns
    features = bucket_df[columns[1:573]]
    log_target = bucket_df[columns[574]]
    raw_target = bucket_df[columns[573]]
    return features, log_target, raw_target


# Same extraction for both buckets.
x_large, y_log_large, y_large = _split_features_targets(df_large)
x_small, y_log_small, y_small = _split_features_targets(df_small)