In [1]:
## Importing Dependencies

# Standard libraries
import pandas as pd
import numpy as np
import os
import csv

# Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Models
from sklearn.linear_model import Lasso
from sklearn.svm import SVR, SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Dimensionality Reduction
from sklearn.decomposition import PCA
from sklearn.feature_selection import SequentialFeatureSelector
 
# Metrics
from sklearn.metrics import matthews_corrcoef, mean_squared_error, accuracy_score, make_scorer

# Model Persistence
from joblib import dump, load

# Plotter
import matplotlib.pyplot as plt

# Argument Parser
import argparse

# Write to a log file
import logging
import sys

In [2]:
## Import the complete dataset.
def import_data():
    """
    Import the full KI dataset from the current working directory and add a
    log-rescaled copy of the 'KI (nM)' column.

    Parameters
    ----------
    None

    Returns
    -------
    df:  Dataframe read from 'PositivePeptide_Ki.csv', with an extra
        'KI (nM) rescaled' column.  No rows are filtered out by this function.

    extracted_features:  Series read from 'features.json'; callers use it to
        select feature columns of df.

    base_range:  The (min, max) of the log-transformed 'KI (nM)' column as
        returned by rescale().  Needed to undo the rescaling with unscale().

    """

    # Both input files are resolved against the current working directory.
    path = os.getcwd()
    df = pd.read_csv(os.path.join(path, 'PositivePeptide_Ki.csv'))

    # Load the list of features that were relevant in the earlier
    # classification pipeline.  'series' is the documented spelling of the
    # `typ` argument; the capitalised form only worked through a fallback.
    extracted_features = pd.read_json(os.path.join(path, 'features.json'), typ='series')

    # Map the KI values onto a log scale stretched over the (-5, 5) interval.
    # rescale() uses the natural log; the interpolated result does not depend
    # on the base, but base_range is expressed in natural-log units.
    df['KI (nM) rescaled'], base_range = rescale(df['KI (nM)'], destination_interval=(-5,5))

    return df, extracted_features, base_range

## Logarithmically scaling the values.
def rescale(array=np.array(0), destination_interval=(-5,5)):
    """
    Log-transform KI values and stretch them linearly onto a target interval.

    Parameters
    ----------
    array:  Array-like of KI values, in nM.

    destination_interval:  (low, high) interval that the smallest and largest
        log-transformed values are mapped onto.

    Returns
    -------
    array:  Numpy array of the natural-log values, linearly interpolated into
        destination_interval.

    saved_range:  The (min, max) of the natural-log-transformed input.  Pass
        this to unscale() to get back to "KI (nM)" form.

    """

    # Natural log first; the saved range is therefore in log space.
    log_values = np.log(array)
    lowest, highest = log_values.min(), log_values.max()
    saved_range = (lowest, highest)

    # Linear map of [lowest, highest] onto the destination interval.
    rescaled = np.interp(log_values, saved_range, destination_interval)

    return rescaled, saved_range

## Inverse of the rescale function to rescale the outputs.
def unscale(array, destination_interval, source_interval=(-5,5)):
    """
    Invert rescale(): map interpolated log values back into "KI (nM)" form.

    Parameters
    ----------
    array:  Array-like of rescaled KI values lying in source_interval.

    destination_interval:  The (min, max) log-space range to map back onto,
        i.e. the saved_range returned by rescale().

    source_interval:  The interval the values currently live in.

    Returns
    -------
    array:  Numpy array of the KI values after undoing the interpolation and
        exponentiating.

    """

    # Undo the linear stretch, then undo the natural log.
    log_values = np.interp(array, source_interval, destination_interval)
    return np.exp(log_values)

In [3]:
# Upper edge (in nM) of the lowest KI bucket.
threshold = 10

# Load the dataset; the rescaling range is not needed in this cell.
df, extracted_features, _ = import_data()
path = os.getcwd()

# Bin KI into three labelled buckets.  pd.cut intervals are closed on the
# right by default: (0, threshold], (threshold, 4000], (4000, inf).
bucket_edges = (0, threshold, 4000, float('inf'))
df['Bucket'] = pd.cut(x=df['KI (nM)'], bins=bucket_edges, labels=(0,1,2))

y = df['Bucket']

# Min-max scale the selected feature columns and keep their names.
scaler = MinMaxScaler()
x = pd.DataFrame(scaler.fit_transform(df[extracted_features]), columns=extracted_features)

# Load the previously fitted sequential feature selector.
# NOTE: joblib.load unpickles the file, so only load artefacts you trust.
sfs = load(path + '/SVC with RBF Kernel/sfs/SVC with RBF Kernel 10.00 fs.joblib')

# Record which features the selector kept, then reduce x to those columns.
features_taken = sfs.get_feature_names_out()
x = sfs.transform(x)

In [4]:
# Fit a PCA that keeps every component, then project the data onto it.
pca_all = PCA().fit(x)
x_all = pca_all.transform(x)

components_all = pca_all.components_
variances_all = pca_all.explained_variance_
ratios_all = np.array(pca_all.explained_variance_ratio_)

# Keep the leading components whose running total of explained-variance ratio
# stays at or below 0.8.
# NOTE(review): this retains *at most* 80% of the variance, and zero
# components if the first one alone exceeds 0.8 -- confirm this is intended
# rather than "at least 80%".
ratios_max = ratios_all[np.cumsum(ratios_all) <= 0.8]
length = len(ratios_max)
x_reduced = x_all[:, :length]

In [5]:
# Fit a PCA restricted to the first five components and project the data.
pca_5 = PCA(n_components=5).fit(x)
x_5 = pca_5.transform(x)

components_5 = pca_5.components_
variances_5 = pca_5.explained_variance_
ratios_5 = pca_5.explained_variance_ratio_

# Displayed as the cell output: share of variance explained by the five
# components together.
np.sum(ratios_5)

0.5523198488577367

In [6]:
# Show the percentage as an integer followed by a literal percent sign.
percent = 100
print('percent %i%%' % (percent,))

percent 100%
