In [40]:
# import packages

import numpy as np
import pandas as pd
import timeit
import time
import statistics 
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl
from umap.umap_ import UMAP
from joblib import dump, load
from itertools import chain
from typing import Union, Any

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Non-linear classifiers
from sklearn import svm # Should do with RBF or polynomial kernel
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier 
# Note OVR -- sensitive to imbalanced dataset, OVO is less sensitive
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import (NeighborhoodComponentsAnalysis, KNeighborsClassifier)

import datashader as ds
import datashader.transfer_functions as tf
import datashader.bundling as bd
import colorcet
import matplotlib.cm
import bokeh.plotting as bpl
import bokeh.transform as btr
import holoviews as hv
import holoviews.operation.datashader as hd

import umap.plot
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

## Utilities

In [41]:
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier

def create_label_df(df):
    """Add an integer ``labels`` column encoding ``species_name``.

    A ``LabelEncoder`` is fitted on the distinct values of
    ``df['species_name']`` and then used to encode every row. The result is
    written into ``df['labels']``, so the frame passed in is modified in
    place. The original author notes this column is required by the dataset
    function.

    Args:
        df: DataFrame containing a ``species_name`` column.

    Returns:
        tuple: ``(df, le)`` - the same DataFrame object with the new
        ``labels`` column, and the fitted ``LabelEncoder``.
    """
    species = df['species_name']
    distinct_species = species.unique()

    le = preprocessing.LabelEncoder()
    le.fit(distinct_species)
    df['labels'] = le.transform(species.values)

    print(f"Unique labels {len(distinct_species)}")
    return df, le

def create_new_dataset(labels:pd.DataFrame, dateset:Union[pd.DataFrame, np.array], filtered:Union[str,list]):
    """Drop every row whose species is in ``filtered`` and join labels with data.

    Rows are selected by position with a boolean mask, so ``labels`` and
    ``dateset`` must have the same number of rows in the same order, but they
    do not need a default ``RangeIndex``.

    Args:
        labels: DataFrame with a ``species_name`` column.
        dateset: Feature table aligned row-by-row with ``labels`` (the
            misspelt parameter name is kept for backward compatibility).
            A NumPy array is wrapped in a DataFrame that reuses the index of
            the kept label rows.
        filtered: A single species name, or a list of species names, to
            exclude.

    Returns:
        pd.DataFrame: kept label rows and kept feature rows concatenated
        column-wise.
    """
    # Normalise to a list so the str and list cases share one code path
    # (previously a list fell through and the function returned None).
    excluded = [filtered] if isinstance(filtered, str) else list(filtered)

    # Positional boolean mask: avoids feeding index *labels* to .iloc, which
    # only worked when the index happened to be 0..n-1.
    keep_mask = (~labels["species_name"].isin(excluded)).to_numpy()

    final_labels = labels.iloc[keep_mask].copy()
    print(f" filtered for {filtered} : {final_labels.species_name.unique()}")

    if isinstance(dateset, pd.DataFrame):
        final_dataset = dateset.iloc[keep_mask].copy()
    else:
        # ndarray input has no .iloc; wrap it so the concat below aligns on
        # the labels' index.
        final_dataset = pd.DataFrame(np.asarray(dateset)[keep_mask], index=final_labels.index)

    return pd.concat([final_labels, final_dataset], axis=1)
        
def filtered_array(array, idx_to_keep):
    """Return the entries of ``array`` along axis 1 selected by ``idx_to_keep``.

    Args:
        array: Array-like with at least two dimensions.
        idx_to_keep: Indices along axis 1 to retain, in the order given.

    Returns:
        The result of ``np.take`` along axis 1.
    """
    selected_columns = np.take(array, idx_to_keep, axis=1)
    return selected_columns
  
def get_all_jaccard_index_with_filter( filepath:str,model:Any, label_encoder:Any , filtered_idx:Any, num_patients:int=10, threshold:float=0.95):
    """Score a classifier on ``patient0 .. patient{num_patients-1}`` using a column filter.

    For each patient, ``{filepath}/patient{i}_6mers.npy`` is loaded, reduced
    to the columns in ``filtered_idx`` and passed to ``model.predict_proba``.
    A read contributes a prediction only when at least one class probability
    reaches ``threshold``; its predicted class is then the argmax. The
    distinct predicted classes are decoded with ``label_encoder``, ``'decoy'``
    is discarded, and the remainder is scored against
    ``{filepath}/patient{i}_labels.csv`` by ``jaccard_index_per_patient``.

    Args:
        filepath: Directory holding the per-patient ``.npy`` and label files.
        model: Fitted classifier exposing ``predict_proba``.
        label_encoder: Fitted encoder whose ``inverse_transform`` maps class
            indices back to species names.
            NOTE(review): this assumes the column order of ``predict_proba``
            matches the encoder's class order - confirm for the model used.
        filtered_idx: Column indices to keep (see ``filtered_array``).
        num_patients: Number of patients to evaluate, starting from 0.
        threshold: Minimum class probability for a read to count.

    Returns:
        tuple: ``(jaccard index per patient, flattened predictions,
        flattened true labels)``.
    """
    all_jaccard_index = []
    all_pred = []
    all_true = []
    for patient_number in range(num_patients):
        patient_id = f'patient{patient_number}'
        print('predicting for {}'.format(patient_id))

        with open(f'{filepath}/{patient_id}_6mers.npy', 'rb') as read_file:
            df_test = np.load(read_file)

        transformed_data = filtered_array(df_test, filtered_idx)
        print(f"Shape of {transformed_data.shape}")

        # Only the probabilities are needed. The former extra model.predict()
        # call had an unused result and doubled inference time, because a
        # plain argmax over every read yields many false positives anyway.
        y_predprob = np.asarray(model.predict_proba(transformed_data))

        # Keep reads with at least one class at or above the threshold, then
        # take the argmax class of each such read.
        confident_reads = (y_predprob >= threshold).any(axis=1)
        confident_classes = np.unique(y_predprob[confident_reads].argmax(axis=1))
        final_predictions = label_encoder.inverse_transform(confident_classes)

        # Detected pathogens only; decoy is ignored.
        final_predictions = [item for item in final_predictions if item != 'decoy']

        ji, pred_pathogen, true_pathogen = jaccard_index_per_patient(filepath, patient_id, final_predictions)
        print('Jaccard index: {}'.format(ji))
        all_jaccard_index.append(ji)
        all_pred.append(pred_pathogen)
        all_true.append(true_pathogen)

    return all_jaccard_index, flatten(all_pred), flatten(all_true)

def print_full(x):
    """Print ``x`` with pandas' row truncation lifted to ``len(x)`` rows.

    ``display.max_rows`` is changed only for the duration of the print. The
    context manager restores whatever value was active before the call, even
    if printing raises, instead of resetting the option to the pandas default.

    Args:
        x: Object to print; must support ``len()`` (e.g. a DataFrame or
            Series).
    """
    with pd.option_context('display.max_rows', len(x)):
        print(x)

def random_forest_selector(X,y, top_n:int=10):
    """Fit a random forest and report per-feature importances.

    A ``RandomForestClassifier(random_state=1, max_depth=10)`` is fitted on
    ``(X, y)``. The ``top_n`` most important features are drawn as a
    horizontal bar chart, and the importances of all features are returned.

    Args:
        X: DataFrame of features; ``X.columns`` supplies the feature names.
        y: Target labels accepted by ``RandomForestClassifier.fit``.
        top_n: Number of highest-importance features to plot. The previous
            hard-coded slice ``[-9:]`` showed only nine although the comment
            said "top 10"; the default is now a true ten.

    Returns:
        pd.DataFrame: columns ``features`` and ``importances``, one row per
        column of ``X``, in the column order of ``X``.
    """
    # Using feature importance to select features
    model = RandomForestClassifier(random_state=1, max_depth=10)
    model.fit(X, y)
    features = X.columns
    importances = model.feature_importances_

    feature_importance_df = pd.DataFrame({"features": list(features), "importances": list(importances)})

    # Clamp so that top_n <= 0 plots nothing and top_n > n_features plots all.
    # A bare [-top_n:] would return *everything* for top_n == 0.
    n_shown = max(0, min(top_n, len(importances)))
    indices = np.argsort(importances)[len(importances) - n_shown:]

    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Relative Importance')
    plt.show()

    return feature_importance_df

def create_coarse_labels(df):
    """Add a two-way decoy / non-decoy label to ``df``.

    Writes ``coarse_species_name`` (``"decoy"`` where ``species_name`` equals
    ``"decoy"``, otherwise ``"non_decoy"``) and an integer ``labels`` column
    encoding it. Both columns are written into the frame passed in, so it is
    modified in place.

    Args:
        df: DataFrame containing a ``species_name`` column.

    Returns:
        tuple: ``(df, le)`` - the same DataFrame object and the fitted
        ``LabelEncoder`` for the coarse names.
    """
    is_pathogen = df['species_name'] != "decoy"
    df['coarse_species_name'] = np.where(is_pathogen, "non_decoy", "decoy")

    coarse_names = df['coarse_species_name']
    distinct_coarse = coarse_names.unique()

    encoder = preprocessing.LabelEncoder()
    encoder.fit(distinct_coarse)
    df['labels'] = encoder.transform(coarse_names.values)

    print(f"Unique labels {len(distinct_coarse)}")
    return df, encoder



def get_all_jaccard_index_with_transformation( filepath:str, model:Any, label_encoder:Any , x_transformer:Any, num_patients:int=10, threshold:float=0.95):
    """Score a classifier on ``patient0 .. patient{num_patients-1}`` using a feature transformer.

    For each patient, ``{filepath}/patient{i}_6mers.npy`` is loaded, passed
    through ``x_transformer.transform`` and then ``model.predict_proba``.
    A read contributes a prediction only when at least one class probability
    reaches ``threshold``; its predicted class is then the argmax. The
    distinct predicted classes are decoded with ``label_encoder``, ``'decoy'``
    is discarded, and the remainder is scored against
    ``{filepath}/patient{i}_labels.csv`` by ``jaccard_index_per_patient``.

    Args:
        filepath: Directory holding the per-patient ``.npy`` and label files.
        model: Fitted classifier exposing ``predict_proba``.
        label_encoder: Fitted encoder whose ``inverse_transform`` maps class
            indices back to species names.
            NOTE(review): this assumes the column order of ``predict_proba``
            matches the encoder's class order - confirm for the model used.
        x_transformer: An **already fitted** transformer. Only ``transform``
            is called. The previous ``fit_transform`` re-fitted it on every
            patient's test data, so the model received features from a
            different projection than it was trained on, and the caller's
            transformer was overwritten.
        num_patients: Number of patients to evaluate, starting from 0.
        threshold: Minimum class probability for a read to count.

    Returns:
        tuple: ``(jaccard index per patient, flattened predictions,
        flattened true labels)``.
    """
    all_jaccard_index = []
    all_pred = []
    all_true = []
    for patient_number in range(num_patients):
        patient_id = f'patient{patient_number}'
        print('predicting for {}'.format(patient_id))
        with open(f'{filepath}/{patient_id}_6mers.npy', 'rb') as read_file:
            df_test = np.load(read_file)

        # Apply the training-time transformation; never refit on test data.
        transformed_data = x_transformer.transform(df_test)

        # Only the probabilities are needed. The former extra model.predict()
        # call had an unused result and doubled inference time.
        y_predprob = np.asarray(model.predict_proba(transformed_data))

        # Keep reads with at least one class at or above the threshold, then
        # take the argmax class of each such read.
        confident_reads = (y_predprob >= threshold).any(axis=1)
        confident_classes = np.unique(y_predprob[confident_reads].argmax(axis=1))
        final_predictions = label_encoder.inverse_transform(confident_classes)

        # Detected pathogens only; decoy is ignored.
        final_predictions = [item for item in final_predictions if item != 'decoy']

        print(f"final_predictions {final_predictions} , unique {np.unique(final_predictions)}")
        ji, pred_pathogen, true_pathogen = jaccard_index_per_patient(filepath, patient_id, final_predictions)
        print('Jaccard index: {}'.format(ji))
        all_jaccard_index.append(ji)
        all_pred.append(pred_pathogen)
        all_true.append(true_pathogen)

    return all_jaccard_index, flatten(all_pred), flatten(all_true)

def jaccard_index_per_patient(filepath:str,fname:str, preds):
    """Score one patient's predicted pathogens against the label file.

    Reads ``{filepath}/{fname}_labels.csv`` and compares the distinct entries
    of ``preds`` with its ``labels`` column. An empty ``preds`` is treated as
    the single prediction ``'decoy'``.

    The score is ``tp / (number_of_true_rows + fp)``: every true label has to
    be predicted, and every false positive is penalised.

    Args:
        filepath: Directory containing the label CSV.
        fname: Patient identifier used as the file-name prefix.
        preds: Sequence of predicted species names.

    Returns:
        tuple: ``(score, preds, true_labels)`` where ``preds`` is the input
        (or ``['decoy']`` if it was empty) and ``true_labels`` is the
        ``labels`` column as an array.
    """
    # Generic filename
    labels_csv = f"{filepath}/{fname}_labels.csv"
    print(labels_csv)
    true_labels = pd.read_csv(labels_csv)['labels'].values

    print('my predition(s) for patient {}:'.format(fname))
    print(preds)
    print('true pathogen')
    print(true_labels)

    # Predicting no pathogen means predicting that the sample is decoy only.
    if len(preds) == 0:
        preds = ['decoy']

    # One True/False per distinct prediction: was it among the true labels?
    hits = [candidate in true_labels for candidate in np.unique(preds)]
    tp = sum(hits)
    fp = len(hits) - tp

    return tp / (len(true_labels) + fp), preds, true_labels



def predict_kmer_profile(filepath:str, fname: str, model:Any, label_encoder:Any , filtered_idx:Any, threshold:float=0.95): #expecet the fname to be of npy file
    """Predict the pathogens in one k-mer profile file and save them to CSV.

    ``{filepath}/{fname}`` is loaded, reduced to the columns in
    ``filtered_idx`` and passed to ``model.predict_proba``. A read contributes
    a prediction only when at least one class probability reaches
    ``threshold``; its predicted class is then the argmax. The distinct
    predicted classes are decoded with ``label_encoder``, ``'decoy'`` is
    discarded, and the remainder is scored against
    ``{filepath}/{id}_labels.csv`` by ``jaccard_index_per_patient``. The
    predictions returned by that call are written to ``{id}_prediction.csv``
    in the current working directory.

    Args:
        filepath: Directory holding the ``.npy`` file and the label CSV.
        fname: File name of the form ``<id>_<suffix>`` (e.g.
            ``patient0_6mers.npy``). The id is everything before the *last*
            underscore, so ids that themselves contain underscores work.
        model: Fitted classifier exposing ``predict_proba``.
        label_encoder: Fitted encoder whose ``inverse_transform`` maps class
            indices back to species names.
            NOTE(review): this assumes the column order of ``predict_proba``
            matches the encoder's class order - confirm for the model used.
        filtered_idx: Column indices to keep (see ``filtered_array``).
        threshold: Minimum class probability for a read to count.

    Raises:
        ValueError: if ``fname`` contains no underscore.
    """
    # rpartition splits on the last underscore only. The previous
    # two-way unpack of split('_') failed for ids containing underscores.
    f_id, separator, _suffix = fname.rpartition('_')
    if not separator:
        raise ValueError(f"expected a file name like '<id>_<suffix>', got {fname!r}")
    print('predicting for {}'.format(f_id))

    with open(f'{filepath}/{fname}', 'rb') as read_file:
        df_test = np.load(read_file)

    transformed_data = filtered_array(df_test, filtered_idx)
    print(f"Shape of {transformed_data.shape}")

    # Only the probabilities are needed. The former extra model.predict()
    # call had an unused result and doubled inference time.
    y_predprob = np.asarray(model.predict_proba(transformed_data))

    # Keep reads with at least one class at or above the threshold, then take
    # the argmax class of each such read.
    confident_reads = (y_predprob >= threshold).any(axis=1)
    confident_classes = np.unique(y_predprob[confident_reads].argmax(axis=1))
    final_predictions = label_encoder.inverse_transform(confident_classes)

    # Detected pathogens only; decoy is ignored.
    final_predictions = [item for item in final_predictions if item != 'decoy']

    print(f"final_predictions {final_predictions} , unique {np.unique(final_predictions)}")
    ji, pred_pathogen, true_pathogen = jaccard_index_per_patient(filepath, f_id, final_predictions)
    print('Jaccard index: {}'.format(ji))

    df = pd.DataFrame(data={'labels': pred_pathogen})
    df.to_csv(f"{f_id}_prediction.csv", index=False)

def flatten(original_list:list):
    """Flatten one level of nesting.

    Args:
        original_list: An iterable of iterables.

    Returns:
        list: the items of every inner iterable, in order, in a single list.
    """
    return [item for inner in original_list for item in inner]

def get_kmer_analysis_map(dataset):
    """Average every column of ``dataset.X_mapped`` within each label.

    A copy of ``dataset.X_mapped`` receives a ``labels`` column taken from
    ``dataset.Y``; the copy is then grouped by that column and averaged.
    ``dataset.X_mapped`` itself is left unchanged.

    Args:
        dataset: Object exposing ``X_mapped`` (a DataFrame) and ``Y``
            (one label per row of ``X_mapped``).

    Returns:
        pd.DataFrame: one row per label, with ``labels`` as the first column
        followed by the per-label means.
    """
    labelled = dataset.X_mapped.assign(labels=dataset.Y)
    per_label_mean = labelled.groupby("labels").mean()
    return per_label_mean.reset_index()
    
def getting_no_kmer_existence(analysis):
    """Report, per row of ``analysis``, the columns whose value is exactly 0.

    Only the inner columns are inspected: the first and the last column are
    skipped (``iloc[..., 1:-1]``).
    NOTE(review): skipping the first column matches a leading ``labels``
    column; confirm that the last column is also meant to be excluded.

    For each row with at least one zero, a line ``label <index> :: [...]`` is
    printed.

    Args:
        analysis: DataFrame with one row per label.

    Returns:
        list: the zero-valued column names of the **last** row only, kept for
        backward compatibility; ``[]`` when ``analysis`` has no rows.
    """
    # Initialised up front so an empty frame returns [] instead of raising
    # UnboundLocalError.
    kmer_zero = []

    # Rows are addressed by position. The index label is used only for the
    # message, so a non-default index no longer breaks .iloc.
    for position, elem in enumerate(analysis.index):
        profile = analysis.iloc[position, 1:-1]
        kmer_zero = list(profile[profile == 0].index)
        if kmer_zero:
            print(f" label {elem} ::  {kmer_zero}")

    return kmer_zero

def get_label_by_kmer(kmer_analysis):
    """Map each inner column name of ``kmer_analysis`` to that column.

    The first and the last column are excluded; every column in between is
    returned as a Series keyed by its column name.

    Args:
        kmer_analysis: DataFrame whose inner columns are to be extracted.

    Returns:
        dict: ``{column name: column Series}`` in column order.
    """
    column_names = kmer_analysis.columns
    inner_positions = range(1, len(column_names) - 1)
    return {column_names[pos]: kmer_analysis.iloc[:, pos] for pos in inner_positions}

def get_std_across_labels_by_kmer(kmer_analysis):
    """Rank columns by how much their values vary across rows.

    The columns returned by ``get_label_by_kmer`` are each reduced with
    ``np.std``, and the results are ordered from least to most variable.

    Args:
        kmer_analysis: DataFrame accepted by ``get_label_by_kmer``.

    Returns:
        dict: ``{column name: standard deviation}`` sorted ascending by value.
    """
    spread_by_kmer = {
        kmer: np.std(profile)
        for kmer, profile in get_label_by_kmer(kmer_analysis).items()
    }

    # Sort by variation
    ascending = sorted(spread_by_kmer.items(), key=lambda pair: pair[1])
    return dict(ascending)

In [42]:
# Quick check of how a profile file name splits into patient id and suffix.
fname = 'patient0_6mers.npy'
f_id, suffix_ = fname.split('_')
print(f'predicting for {f_id}')

predicting for patient0


## Run Prediction

In [43]:
# Training labels (flat-layout alternative: 'train_labels.csv').
LABEL_PATH = 'training_data/train_labels.csv'
processed_label_df, label_encoder = create_label_df(pd.read_csv(LABEL_PATH))

# How many of the least important features to discard.
NUMBER_TO_DROP = 501

# Feature-importance table (flat-layout alternative: "feature_impt.csv").
feature_impt = pd.read_csv("assets/feature_impt.csv")

# Order rows by ascending importance, skip the first NUMBER_TO_DROP of them and
# keep the row labels of the rest; these are later used as column indices.
idx_keep_by_feature_impt_1580 = (
    feature_impt
    .sort_values(by="importances")
    .iloc[NUMBER_TO_DROP:]
    .index
)
print(len(idx_keep_by_feature_impt_1580))

Unique labels 26
1580


In [44]:
# Trained model file, to be downloaded from Gdrive
# (flat-layout alternative: "rf_feature_impt_filtered_1580.joblib").
# NOTE: joblib's load() unpickles the file, so only load models from a trusted source.
MODEL_PATH = "models/rf_feature_impt_filtered_1580.joblib"
model = load(MODEL_PATH)

# Evaluate on every patient found under test_data/ with a 0.6 confidence cutoff.
all_jaccard_index, all_pred, all_true = get_all_jaccard_index_with_filter(
    filepath="test_data",
    model=model,
    label_encoder=label_encoder,
    filtered_idx=idx_keep_by_feature_impt_1580,
    threshold=0.6,
)

print(
    [f'patient {c}: {item}' for c, item in enumerate(all_jaccard_index)],
    f'avg: {np.mean(all_jaccard_index)}',
)


predicting for patient0
Shape of (10054, 1580)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.5s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.5s


test_data/patient0_labels.csv
my predition(s) for patient patient0:
['staphylococcus_aureus']
true pathogen
['staphylococcus_aureus']
Jaccard index: 1.0
predicting for patient1
Shape of (10132, 1580)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.5s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.5s


test_data/patient1_labels.csv
my predition(s) for patient patient1:
['neisseria_gonorrhoeae']
true pathogen
['staphylococcus_pyogenes']
Jaccard index: 0.0
predicting for patient2
Shape of (10022, 1580)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s


test_data/patient2_labels.csv
my predition(s) for patient patient2:
['burkholderia_pseudomallei', 'corynebacterium_ulcerans']
true pathogen
['burkholderia_pseudomallei' 'corynebacterium_ulcerans']
Jaccard index: 1.0
predicting for patient3
Shape of (9984, 1580)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s


test_data/patient3_labels.csv
my predition(s) for patient patient3:
['pseudomonas_aeruginosa']
true pathogen
['pseudomonas_aeruginosa']
Jaccard index: 1.0
predicting for patient4
Shape of (10086, 1580)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s


test_data/patient4_labels.csv
my predition(s) for patient patient4:
['corynebacterium_diphtheriae']
true pathogen
['corynebacterium_diphtheriae']
Jaccard index: 1.0
predicting for patient5
Shape of (10046, 1580)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s


test_data/patient5_labels.csv
my predition(s) for patient patient5:
['streptococcus_pneumoniae']
true pathogen
['streptococcus_pneumoniae']
Jaccard index: 1.0
predicting for patient6
Shape of (9974, 1580)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.5s


test_data/patient6_labels.csv
my predition(s) for patient patient6:
['mycobacterium_ulcerans']
true pathogen
['mycobacterium_ulcerans']
Jaccard index: 1.0
predicting for patient7
Shape of (10046, 1580)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s


test_data/patient7_labels.csv
my predition(s) for patient patient7:
['neisseria_gonorrhoeae']
true pathogen
['mycobacterium_tuberculosis' 'streptococcus_pneumoniae']
Jaccard index: 0.0
predicting for patient8
Shape of (10009, 1580)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s


test_data/patient8_labels.csv
my predition(s) for patient patient8:
['streptococcus_pneumoniae']
true pathogen
['streptococcus_pneumoniae']
Jaccard index: 1.0
predicting for patient9
Shape of (10074, 1580)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s


test_data/patient9_labels.csv
my predition(s) for patient patient9:
['burkholderia_pseudomallei', 'neisseria_gonorrhoeae']
true pathogen
['burkholderia_pseudomallei']
Jaccard index: 0.5
['patient 0: 1.0', 'patient 1: 0.0', 'patient 2: 1.0', 'patient 3: 1.0', 'patient 4: 1.0', 'patient 5: 1.0', 'patient 6: 1.0', 'patient 7: 0.0', 'patient 8: 1.0', 'patient 9: 0.5'] avg: 0.75


In [47]:
# Single-file prediction for patient0, using the model, label encoder and
# column filter prepared in the cells above and the same 0.6 cutoff.
predict_kmer_profile(
    filepath="test_data",
    fname="patient0_6mers.npy",
    model=model,
    label_encoder=label_encoder,
    filtered_idx=idx_keep_by_feature_impt_1580,
    threshold=0.6,
)

predicting for patient0
Shape of (10054, 1580)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.4s


final_predictions ['staphylococcus_aureus'] , unique ['staphylococcus_aureus']
test_data/patient0_labels.csv
my predition(s) for patient patient0:
['staphylococcus_aureus']
true pathogen
['staphylococcus_aureus']
Jaccard index: 1.0
