In [1]:
"""
Goal: this notebook will be the demonstration notebook for replicating the training and testing metrics of each model type against the training/testing data as well as the GNPS testing data.

Author: Nathan Brittin 
Date: 10 - 11 - 2024

"""
print("This notebook and work was run and verified on Python 3.9.13. Running this on any other version may result in errors or deviations.")
import pandas as pd
print("Verified Pandas version:  2.2.2. Your Pandas version: ", pd.__version__)
import numpy as np
print("Verified Numpy version:  1.23.0. Your Numpy version: ", np.__version__)
import matplotlib.pyplot as plt
print("Verified Matplotlib version:  3.9.2 Your Matplotlib version: ", plt.matplotlib.__version__)
import seaborn as sns
print("Verified Seaborn version:  0.13.2. Your Seaborn version: ", sns.__version__)
import tqdm
print("Verified TQDM version:  4.66.4. Your TQDM version: ", tqdm.__version__)
from tqdm import notebook
import warnings
from sklearn.model_selection import train_test_split
import sklearn
print("Verified Sklearn version:  1.3.1. Your Sklearn version: ", sklearn.__version__)
from sklearn import linear_model
from sklearn import neural_network
from sklearn import svm
from sklearn import neighbors
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.preprocessing import LabelEncoder

This notebook and work was run and verified on Python 3.9.13. Running this on any other version may result in errors or deviations.
Verified Pandas version:  2.2.2. Your Pandas version:  2.2.2
Verified Numpy version:  1.23.0. Your Numpy version:  1.23.0
Verified Matplotlib version:  3.9.2 Your Matplotlib version:  3.9.2
Verified Seaborn version:  0.13.2. Your Seaborn version:  0.13.2
Verified TQDM version:  4.66.4. Your TQDM version:  4.66.4
Verified Sklearn version:  1.3.1. Your Sklearn version:  1.3.1


In [2]:
def import_data(training_testing_path, gnps_testing_path):
    """
    Load the training/testing table and the GNPS testing table.

    Both inputs are read as tab-separated files with pandas.

    Parameters
    ----------
    training_testing_path : path or file-like object of the training/testing TSV.
    gnps_testing_path : path or file-like object of the GNPS testing TSV.

    Returns
    -------
    (training_testing, gnps_df) : the two DataFrames, in that order.
    """
    tsv_sources = (training_testing_path, gnps_testing_path)
    training_testing, gnps_df = (pd.read_csv(source, sep='\t') for source in tsv_sources)
    return training_testing, gnps_df

def make_training_testing(training_testing, test_size=0.2, random_state=42):
    """
    Split the combined fingerprint table into stratified training and testing sets.

    The 'CID' column is removed, 'Parent Class' becomes the label vector and
    every remaining column is used as a feature. The input frame is not modified.

    Parameters
    ----------
    training_testing : DataFrame containing 'CID', 'Parent Class' and the feature columns.
    test_size : fraction of rows assigned to the testing set (default 0.2).
    random_state : seed passed to train_test_split (default 42).

    Returns
    -------
    (X_train, X_test, y_train, y_test)
    """
    # drop() returns a new frame, and the result of reset_index() is kept, so
    # the features really do get a clean 0..n-1 index before the split.
    X = training_testing.drop(columns=['CID']).reset_index(drop=True)
    y = X.pop('Parent Class')
    # Stratify on the labels so each class is split in the same proportion.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    return X_train, X_test, y_train, y_test

def make_gnps_testing(gnps_df):
    """
    Build the GNPS feature table and label vector.

    The 'Unnamed: 0' column (if present) is discarded and both returned objects
    carry a fresh 0..n-1 index, so rows stay uniquely addressable even when the
    input was concatenated from several files. The input frame is not modified.

    Parameters
    ----------
    gnps_df : DataFrame containing 'Parent Class' plus the remaining columns.

    Returns
    -------
    (X, y) : X is the frame without 'Unnamed: 0' and 'Parent Class';
             y is the 'Parent Class' column.
    """
    # errors='ignore' lets the function also accept tables that were saved
    # without the extra 'Unnamed: 0' column.
    X = gnps_df.drop(columns=['Unnamed: 0'], errors='ignore').reset_index(drop=True)
    y = X.pop('Parent Class')
    return X, y

def prepare_negative_data(negatives_path):
    """
    Load the pickled negative-compound table and shape it like the training data.

    The 'SMILES' and 'Positive' columns are removed, a constant 'Parent Class'
    column with the value "Negatives" is inserted at position 1, and all column
    labels are converted to strings.

    Parameters
    ----------
    negatives_path : path to a pickled pandas DataFrame.

    Returns
    -------
    DataFrame of negatives labelled "Negatives".
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    loaded = pd.read_pickle(negatives_path)
    negatives_df = loaded.drop(columns=["SMILES", "Positive"])
    # Label column goes in as the second column.
    negatives_df.insert(1, "Parent Class", "Negatives")
    # Column headers become strings.
    negatives_df.columns = negatives_df.columns.astype(str)
    return negatives_df

def combine_data_with_neg(training_testing, negatives_df):
    """
    Stack the negative rows underneath the training/testing rows.

    Returns a new DataFrame whose index runs 0..n-1 over the combined rows.
    """
    # ignore_index=True already yields a fresh RangeIndex for the result.
    stacked_frames = [training_testing, negatives_df]
    return pd.concat(stacked_frames, axis=0, ignore_index=True)

def encode_labels(y_train, y_test, y_gnps):
    """
    Convert the three label vectors to integer codes.

    A single LabelEncoder is fitted on the training labels and then applied to
    the testing and GNPS labels, so all three share one label-to-code mapping.

    Returns
    -------
    (y_train, y_test, y_gnps) as encoded arrays, in that order.
    """
    encoder = LabelEncoder()
    encoded_train = encoder.fit_transform(y_train)
    encoded_test, encoded_gnps = (encoder.transform(labels) for labels in (y_test, y_gnps))
    return encoded_train, encoded_test, encoded_gnps

def decode_labels(y_train, x, round=False):
    """
    Map encoded predictions back to their original class labels.

    A LabelEncoder is fitted on `y_train` and its inverse transform is applied
    to `x`.

    Parameters
    ----------
    y_train : labels used to fit the encoder.
    x : encoded values to decode.
    round : when True, `x` is rounded to the nearest integer and cast to int
            before decoding.

    Returns
    -------
    Array of decoded labels.
    """
    encoder = LabelEncoder().fit(y_train)
    codes = np.rint(x).astype(int) if round else x
    return encoder.inverse_transform(codes)

In [3]:
# Silence warnings so notices about future deprecated functions do not clutter the output.
# NOTE: called without a category, this filter ignores every warning category, not only
# deprecation-related ones, so any other warning raised by later cells is hidden as well.
warnings.filterwarnings("ignore")

In [4]:
# Input files; the names carry no directory part, so they are resolved against the working directory.
training_testing_path = "Multiclassifier_23_Drug_Class_Train-Test_Fingerprint_Matrix.tsv"
negatives_path = "Negative_Train-Test_Fingerprint_Matrix.tsv"
gnps_positive_path = "GNPS_23_Drug_Class_Fingerprints_Matrix.tsv"
# NOTE(review): this is the same file name as negatives_path, yet negatives_path is loaded with
# pd.read_pickle (inside prepare_negative_data) while this path is loaded with pd.read_csv(sep='\t').
# Confirm both file names/formats; as written, the GNPS negatives are the training negatives.
gnps_negative_path = "Negative_Train-Test_Fingerprint_Matrix.tsv"

# Import and Prepare Datasets
# 1) Positive training/testing table.
training_testing_df = pd.read_csv(training_testing_path, sep='\t')
print(f"Training and Testing Data Shape: {training_testing_df.shape}")
# 2) Negative table, relabelled with Parent Class == "Negatives".
negative_df = prepare_negative_data(negatives_path)
print(f"Negative Data Shape: {negative_df.shape}")
# 3) Append the negatives; training_testing_df is rebound to the combined frame from here on.
training_testing_df = combine_data_with_neg(training_testing_df, negative_df)
print(f"Combined Data Shape: {training_testing_df.shape}")
# 4) Stratified train/test split of the combined frame.
X_train, X_test, y_train, y_test = make_training_testing(training_testing_df)
print(f"Training Data Shape: {X_train.shape}; Testing Data Shape: {X_test.shape}")
# 5) GNPS evaluation set: positives and negatives stacked into one frame.
gnps_positive_df = pd.read_csv(gnps_positive_path, sep='\t')
gnps_negative_df = pd.read_csv(gnps_negative_path, sep='\t')
gnps_df = pd.concat([gnps_positive_df, gnps_negative_df], axis=0, ignore_index=True)
print(f"GNPS Data Shape: {gnps_df.shape}")
# X_gnps still contains the 'CID' column at this point; it is popped in the GNPS evaluation cell.
X_gnps, y_gnps = make_gnps_testing(gnps_df)
print(f"GNPS Testing Data Shape: {X_gnps.shape}")
# 6) Integer-encode all label vectors with an encoder fitted on the training labels.
y_train_encoded, y_test_encoded, y_gnps_encoded = encode_labels(y_train, y_test, y_gnps)

Training and Testing Data Shape: (8521, 3880)
Negative Data Shape: (2778, 3880)
Combined Data Shape: (11299, 3880)
Training Data Shape: (9039, 3878); Testing Data Shape: (2260, 3878)
GNPS Data Shape: (4050, 3881)
GNPS Testing Data Shape: (4050, 3879)


In [5]:
# Optimized model parameters done with a non-exhaustive grid search.
models_list = [
    linear_model.RidgeClassifier(random_state=42, alpha=10, class_weight=None, fit_intercept=False, max_iter=500, tol=0.0001),
    linear_model.Perceptron(random_state=42, n_jobs=-1),
    linear_model.PassiveAggressiveClassifier(random_state=42, n_jobs=-1, C=10, class_weight=None, early_stopping=False, fit_intercept=True, max_iter=500),
    linear_model.SGDClassifier(random_state=42, n_jobs=-1, class_weight='balanced', early_stopping=False, fit_intercept=True, loss='hinge', max_iter=1000, penalty='elasticnet'),
    linear_model.LogisticRegression(random_state=42, n_jobs=-1, C=0.1, class_weight=None, fit_intercept=True, max_iter=200, penalty='l2'),
    neighbors.KNeighborsClassifier(n_jobs=-1, leaf_size=10, n_neighbors=3, p=1, weights='distance'),
    tree.DecisionTreeClassifier(random_state=42),
    svm.SVC(random_state=42, C=1, cache_size=200, class_weight=None, decision_function_shape='ovr', gamma='scale', kernel='rbf', tol=0.01),
    neural_network.MLPClassifier(random_state=42, hidden_layer_sizes=(384,), early_stopping=True),
]

# Fitting each optimized model type.
# `tqdm` is bound to the module here (`import tqdm`), which is not callable;
# the progress bar comes from `notebook.tqdm`, as in the evaluation cells.
for model in notebook.tqdm(models_list):
    model.fit(X_train, y_train_encoded)

  0%|          | 0/9 [00:00<?, ?it/s]

In [6]:
# Testing results for each model type using accuracy, precision, recall, F1, and MCC.

# One record per model is collected and the results frame is built once at the
# end, rather than re-concatenating a DataFrame on every iteration.
test_records = []
for model in notebook.tqdm(models_list):
    # Classifiers already return the encoded integer labels, so no rounding is needed.
    predictions = model.predict(X_test)
    test_records.append({
        'Model': type(model).__name__,
        'Accuracy': accuracy_score(y_test_encoded, predictions),
        # Weighted averages account for the class imbalance; zero_division=0
        # scores a class with no predicted samples as 0 instead of warning.
        'Precision': precision_score(y_test_encoded, predictions, average='weighted', zero_division=0),
        'Recall': recall_score(y_test_encoded, predictions, average='weighted', zero_division=0),
        'F1': f1_score(y_test_encoded, predictions, average='weighted', zero_division=0),
        'MCC': matthews_corrcoef(y_test_encoded, predictions),
    })
training_results_df = pd.DataFrame(test_records, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'MCC'])
# Best model (highest MCC) first; the index keeps each model's position in models_list.
training_results_df = training_results_df.sort_values(by=['MCC'], ascending=False)
print("Testing results for each model type:")
display(training_results_df)

  0%|          | 0/9 [00:00<?, ?it/s]

Testing results for each model type:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,MCC
8,MLPClassifier,0.982743,0.983011,0.982743,0.982624,0.980658
2,PassiveAggressiveClassifier,0.982301,0.982491,0.982301,0.982308,0.98016
0,RidgeClassifier,0.981858,0.982124,0.981858,0.981869,0.979688
4,LogisticRegression,0.980088,0.980291,0.980088,0.980059,0.977721
1,Perceptron,0.980088,0.980444,0.980088,0.980086,0.977718
7,SVC,0.978761,0.979231,0.978761,0.978747,0.976262
3,SGDClassifier,0.978761,0.979055,0.978761,0.978811,0.976196
5,KNeighborsClassifier,0.961504,0.963583,0.961504,0.961305,0.957492
6,DecisionTreeClassifier,0.887611,0.890651,0.887611,0.888402,0.874302


In [7]:
# Results for testing each model type on GNPS data.

# Remove the identifier column before predicting. The guard keeps the cell
# re-runnable: on a second run 'CID' is already gone and gnps_cids is kept.
if 'CID' in X_gnps.columns:
    gnps_cids = X_gnps.pop('CID')

# One record per model is collected and the results frame is built once at the
# end, rather than re-concatenating a DataFrame on every iteration.
gnps_records = []
for model in notebook.tqdm(models_list):
    predictions = model.predict(X_gnps)
    gnps_records.append({
        'Model': type(model).__name__,
        'Accuracy': accuracy_score(y_gnps_encoded, predictions),
        # Weighted averages account for the class imbalance; zero_division=0
        # scores a class with no predicted samples as 0 instead of warning.
        'Precision': precision_score(y_gnps_encoded, predictions, average='weighted', zero_division=0),
        'Recall': recall_score(y_gnps_encoded, predictions, average='weighted', zero_division=0),
        'F1': f1_score(y_gnps_encoded, predictions, average='weighted', zero_division=0),
        'MCC': matthews_corrcoef(y_gnps_encoded, predictions),
    })
gnps_results_df = pd.DataFrame(gnps_records, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'MCC'])
# Best model (highest MCC) first; the index keeps each model's position in models_list.
gnps_results_df = gnps_results_df.sort_values(by=['MCC'], ascending=False)
print("Testing results for each model type on GNPS data:")
display(gnps_results_df)

  0%|          | 0/9 [00:00<?, ?it/s]

Testing results for each model type on GNPS data:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,MCC
7,SVC,0.935802,0.946922,0.935802,0.938867,0.879116
3,SGDClassifier,0.920988,0.933415,0.920988,0.924159,0.848906
4,LogisticRegression,0.912346,0.933847,0.912346,0.919533,0.840077
0,RidgeClassifier,0.904938,0.931591,0.904938,0.91294,0.832499
8,MLPClassifier,0.893333,0.912561,0.893333,0.898716,0.805205
2,PassiveAggressiveClassifier,0.887901,0.914407,0.887901,0.89653,0.798511
1,Perceptron,0.867654,0.910417,0.867654,0.880493,0.769399
5,KNeighborsClassifier,0.853086,0.915978,0.853086,0.873089,0.763452
6,DecisionTreeClassifier,0.738765,0.835042,0.738765,0.775078,0.578345


In [8]:
# Test FPR on 9,443 random GNPS spectra

# Relative file name, like the other datasets in this notebook, so the cell is
# not tied to one user's machine; keep Random_GNPS_Fingerprints.tsv in the
# working directory.
fpr_set = "Random_GNPS_Fingerprints.tsv"
fpr_df = pd.read_csv(fpr_set, sep='\t')
fpr_cid = fpr_df.pop('CID')
# NOTE(review): the models were fitted on frames with string column labels
# (see prepare_negative_data); confirm that casting the labels to int here is intended.
fpr_df.columns = fpr_df.columns.astype(int)

fpr_records = []
fpr_count_frames = []
for model in notebook.tqdm(models_list):
    predictions_encoded = model.predict(fpr_df)
    predictions = decode_labels(y_train, predictions_encoded, round=True)
    predictions = pd.DataFrame(predictions)
    total_pred = predictions.shape[0]
    # Per-class prediction counts for this model, kept as a one-row frame.
    counts = predictions.value_counts()
    fpr_count_frames.append(pd.DataFrame(counts).T)
    # Any prediction other than 'Negatives' is counted as a false positive.
    # Counting by comparison also works when a model predicts no 'Negatives',
    # where a value_counts() lookup would raise KeyError.
    num_negatives = int((predictions[0] == 'Negatives').sum())
    num_positives = total_pred - num_negatives
    fpr_records.append({
        'Model': type(model).__name__,
        'False Positive Rate': num_positives / total_pred,
        'Number of False Positives': num_positives,
        'Number of True Negatives': num_negatives,
    })

# Build both summary frames once instead of concatenating inside the loop.
FPR_counts = pd.concat(fpr_count_frames, axis=0, ignore_index=True) if fpr_count_frames else pd.DataFrame()
fpr_results_df = pd.DataFrame(
    fpr_records,
    columns=['Model', 'False Positive Rate', 'Number of False Positives', 'Number of True Negatives'],
)
# Lowest false positive rate first.
fpr_results_df = fpr_results_df.sort_values(by=['False Positive Rate'], ascending=True)
print("False positive rate for each model type on new random spectra:")
display(fpr_results_df)

  0%|          | 0/9 [00:00<?, ?it/s]

False positive rate for each model type on new random spectra:


Unnamed: 0,Model,False Positive Rate,Number of False Positives,Number of True Negatives
7,SVC,0.054961,519,8924
3,SGDClassifier,0.06015,568,8875
4,LogisticRegression,0.081754,772,8671
8,MLPClassifier,0.09838,929,8514
0,RidgeClassifier,0.100286,947,8496
2,PassiveAggressiveClassifier,0.100392,948,8495
1,Perceptron,0.12782,1207,8236
5,KNeighborsClassifier,0.168167,1588,7855
6,DecisionTreeClassifier,0.227152,2145,7298
