### Tune and test NERVE

In [None]:
### Implement new version of select module

In [None]:
import subprocess, os
import pandas as pd
from code import Protein
import numpy as np
from scipy.stats import hypergeom as hg
import matplotlib.pyplot as plt
import random

In [None]:
class RunNerve():
    """Store NERVE command-line parameters and run the NERVE program.

    Attributes:
        args: mapping of option name -> value; every entry is passed to the
            script as ``--<name> <str(value)>``.
        program: path of the NERVE script to execute.
    """
    def __init__(self):
        """Set the default arguments and the NERVE script path."""
        self.args = {'annotation': True, 'e_value': 1e-10, 'gram': None, 'minlength': 9, 'mismatch': 1,
                     'mouse': True, 'mouse_peptides_sum_limit': .15, 'proteome1': None, 'proteome2': None, 
                     'p_ad_extracellular_filter': .38, 'p_ad_no_citoplasm_filter': .46, 'padlimit': .5, 
                     'razor': True, 'razlen': 50, 'select': True, 'substitution': 3, 
                     'transmemb_doms_limit': 3, 'virlimit': .5, 'virulent': True, 'working_dir': None, 
                     'NERVE_dir': '../../', 'iFeature_dir': '/iFeature', 'DeepFri_dir': '/DeepFri'
                     }
        self.program = "../../code/NERVE.py"

    def print_args(self):
        """Print the current arguments and the NERVE script path."""
        print(f'Arguments are:\n{self.args}')
        print(f'NERVE executable path is:\n{self.program}')
    
    def run(self):
        """Run the NERVE script in a subprocess with the stored arguments.

        Every entry of ``self.args`` is converted to text for the command
        line only; ``self.args`` itself is left untouched so the object can
        be inspected or re-run with its original (typed) values. Entries
        whose value is ``None`` are passed as the literal text ``None``;
        remove them from ``self.args`` beforehand if that is not wanted.

        Returns:
            tuple: ``(stdout, stderr)`` of the finished process, as bytes.
        """
        import sys

        args_list = []
        for key, value in self.args.items():
            args_list.append(f'--{key}')
            args_list.append(str(value))
        # use the interpreter running this notebook so NERVE sees the same environment
        cmd = [sys.executable or 'python3', self.program] + args_list
        # stderr must be piped as well, otherwise communicate() returns None for it
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output, error = process.communicate()
        return output, error

In [None]:
class Localization:
    """Subcellular localization label together with its reliability score."""

    def __init__(self, localization, reliability):
        """Store the label as text and the reliability as a float."""
        label = str(localization)
        score = float(reliability)
        self.localization = label
        self.reliability = score

def protein_loader(proteome):
    """Build Protein objects from a NERVE ``vaccine_candidates`` table.

    Args:
        proteome: a pandas DataFrame (subclasses included), or anything
            ``pd.read_csv`` accepts (e.g. a path to the CSV file).

    Returns:
        list: one ``Protein.Protein`` per row, with its attributes filled
        from the row's columns.
    """
    df = proteome if isinstance(proteome, pd.DataFrame) else pd.read_csv(proteome)
    # turn every missing value (NaN) into None so it can be tested with `is None`
    df = df.astype(object).where(pd.notnull(df), None)

    def zero_if_missing(value):
        """Return 0 for a missing cell, the cell value otherwise."""
        return 0 if value is None else value

    def index_list(value):
        """Return [] for a missing cell, else the indices 0..len(value)-1.

        NOTE(review): this keeps the original behavior of storing a range as
        long as the raw cell rather than the peptides themselves; if the cell
        is the text form of a list, the length is its character count -
        confirm this is intended.
        """
        return [] if value is None else list(range(len(value)))

    list_of_proteins = []
    for index, row in df.iterrows():
        protein = Protein.Protein(row['id'], row['sequence'])
        protein.accession = row['uniprot_accession_code']
        protein.length = row['length']
        protein.transmembrane_doms = zero_if_missing(row['transmembrane_doms'])
        protein.localization = [Localization(row['localization'], row['localization score'])]
        protein.p_vir = row['virulence_probability']
        protein.p_ad = row['adhesin_probability']
        protein.conservation_score = row['conservation_score']
        protein.shared_human_peps = zero_if_missing(row['shared_human_peps'])
        protein.shared_mouse_peps = zero_if_missing(row['shared_mouse_peps'])
        protein.shared_conserv_proteome_peps = zero_if_missing(row['shared_conserv_proteome_peps'])
        protein.sapiens_peptides_sum = zero_if_missing(row['human_peptides_sum'])
        protein.mouse_peptides_sum = zero_if_missing(row['mouse_peptides_sum'])
        protein.annotations = row['annotations']
        protein.list_of_peptides_from_comparison_with_mhcpep_sapiens = index_list(
            row['list_of_peptides_from_comparison_with_mhcpep_sapiens'])
        protein.list_of_peptides_from_comparison_with_mhcpep_mouse = index_list(
            row['list_of_peptides_from_comparison_with_mhcpep_mouse'])
        protein.original_sequence_if_razor = row['original_sequence_if_razor']
        protein.tmhmm_seq = row['tmhmm_seq']
        list_of_proteins.append(protein)
    return list_of_proteins

def select(list_of_proteins, transmemb_doms_limit,
           padlimit, mouse, mouse_peptides_sum_limit, virlimit, virulent)->list:
    """Return the proteins kept as vaccine candidates.

    A protein is kept unless its first localization is "Cytoplasmic" or that
    localization's reliability is below 7.49. All other parameters are
    accepted but not used by this version of the filter.
    """

    def is_candidate(candidate):
        best = candidate.localization[0]
        if best.localization == "Cytoplasmic":
            return False
        # written as "not below" so a reliability that compares False keeps the protein
        return not (best.reliability < 7.49)

    return [candidate for candidate in list_of_proteins if is_candidate(candidate)]



### Prepare files for tuning

In [None]:
# prepare files for tuning
def to_fasta(df, outfile):
    """Write the rows of a DataFrame to a FASTA file.

    Each row becomes one record: the header is ``>`` followed by the
    ``protein`` and ``organism name`` columns joined by ``_``; the body is
    the ``sequence`` column with all spaces removed.

    Args:
        df: DataFrame with ``protein``, ``organism name`` and ``sequence`` columns.
        outfile: path of the FASTA file to create (overwritten if it exists).
    """
    # the context manager closes the file even if a row raises
    with open(outfile, 'w') as handle:
        for index, row in df.iterrows():
            header = '>' + row['protein'] + '_' + row['organism name']
            sequence = row['sequence'].replace(" ", "")
            handle.write(header + '\n' + sequence + '\n')

In [None]:
# NERVE2
# Write the NERVE 2 tuning antigens to one FASTA file per gram type.

df = pd.read_excel('../../database/antigens/test_antigens_summary_v2.xlsx', sheet_name = 'nerve_2_tests')
# keep only rows that are not flagged with Nerve_2_test == 1
df = df[df['Nerve_2_test'] != 1]
# split by the value of the 'gram' column
df_neg = df[df['gram'] == '-']
df_pos = df[df['gram'] == '+']
to_fasta(df_neg, './tuning/gram_neg_2/gram_neg_2.fasta')
to_fasta(df_pos, './tuning/gram_pos_2/gram_pos_2.fasta')

In [None]:
# NERVE 1
# Write the antigens of the 'nerve_1_tests' sheet to one FASTA file per gram type.
# Unlike the NERVE2 cell, no rows are filtered out here.

df = pd.read_excel('../../database/antigens/test_antigens_summary_v2.xlsx', sheet_name = 'nerve_1_tests')
# split by the value of the 'gram' column
df_neg = df[df['gram'] == '-']
df_pos = df[df['gram'] == '+']
to_fasta(df_neg, './tuning/gram_neg_1/gram_neg_1.fasta')
to_fasta(df_pos, './tuning/gram_pos_1/gram_pos_1.fasta')

In [None]:
# Run nerve on gram positive and negative datasets
# Each FASTA written above is processed with the matching gram setting.

for test_set, gram in zip(['gram_neg_1', 'gram_pos_1', 'gram_neg_2', 'gram_pos_2'], 
                         ['n', 'p', 'n', 'p']):
    
    nerve_run = RunNerve()
    nerve_run.args['proteome1'] = f'./tuning/{test_set}/{test_set}.fasta'
    nerve_run.args['gram'] = gram
    # drop the option entirely so no '--proteome2' argument is passed
    nerve_run.args.pop('proteome2')
    # the select step is switched off for these runs
    nerve_run.args['select'] = False
    nerve_run.args['working_dir'] = f'./tuning/{test_set}/'
    # NOTE(review): out/err are overwritten on every iteration and never inspected,
    # so a failed run goes unnoticed
    out, err = nerve_run.run()


### Run tuning

In [None]:
# number of rows in the 'nerve_2_tests' sheet (before any filtering)
len(pd.read_excel('../../database/antigens/test_antigens_summary_v2.xlsx', sheet_name = 'nerve_2_tests'))

In [None]:
# Load the NERVE 2 antigen table and keep the rows not flagged with Nerve_2_test == 1.
nerve_2_tuning = pd.read_excel('../../database/antigens/test_antigens_summary_v2.xlsx', sheet_name = 'nerve_2_tests')
nerve_2_tuning = nerve_2_tuning[(nerve_2_tuning['Nerve_2_test'] != 1)]
# distinct proteome identifiers of the remaining antigens
tuning_proteomes  = list(nerve_2_tuning['Proteome ID'].unique())
print(f'Proteomes: {len(tuning_proteomes)}, antigens: {len(nerve_2_tuning)}')

In [7]:
# run parameter grid search and collect recall, coverage, fold-enrichment, p-value
# each condition is a [virlimit, padlimit] pair, both from 0.50 to 0.95 in steps of 0.05
CONDITIONS = [[i/100, s/100] for i in range(50, 100, 5) for s in range(50, 100, 5)]

# read in precomputed nerve predictions on known antigens
nerve_2_tuning_results = pd.read_csv('./tuning/nerve_2_tuning/vaccine_candidates.csv')

# The loaded proteins do not depend on the thresholds, so everything is loaded
# once instead of re-reading every CSV for each condition.
list_of_known_antigens = protein_loader(nerve_2_tuning_results)
known_antigen_seqs = {protein.sequence for protein in list_of_known_antigens}

# Proteins of every tuning proteome with the known antigens excluded.
# A new list is built instead of calling list.remove() while iterating,
# which skips the element following each removal.
background_proteomes = []
for proteome in tuning_proteomes:
    path = os.path.join('./tuning/', proteome, 'vaccine_candidates.csv')
    background_proteomes.append([protein for protein in protein_loader(path)
                                 if protein.sequence not in known_antigen_seqs])

outlist = []
for virlimit, padlimit in CONDITIONS:
    # evaluate for known antigens
    selected_antigens = select(list_of_proteins = list_of_known_antigens,
                        transmemb_doms_limit = 3,
                        padlimit = padlimit, mouse = "True",
                        mouse_peptides_sum_limit = .15,
                        virlimit = virlimit, virulent = "True")
    # totals cover the known antigens plus the background of every proteome
    number_of_selected_proteins = len(selected_antigens)
    number_of_total_proteins = len(list_of_known_antigens)

    # evaluate for proteomes
    for list_of_proteins in background_proteomes:
        selected_proteins = select(list_of_proteins = list_of_proteins,
                            transmemb_doms_limit = 3,
                            padlimit = padlimit, mouse = "True",
                            mouse_peptides_sum_limit = .15,
                            virlimit = virlimit, virulent = "True")
        number_of_selected_proteins += len(selected_proteins)
        number_of_total_proteins += len(list_of_proteins)

    # antigens expected among the selected proteins if selection were random
    expected_bpas = round((len(list_of_known_antigens) * number_of_selected_proteins) / number_of_total_proteins, 2)
    recall = round((len(selected_antigens) / len(list_of_known_antigens))*100, 2)
    coverage = round((number_of_selected_proteins / number_of_total_proteins)*100, 2)
    # nothing selected at all -> enrichment is undefined rather than a ZeroDivisionError
    fold_enrichment = round(len(selected_antigens)/expected_bpas, 2) if expected_bpas else float('nan')
    # probability of selecting at least this many antigens by chance
    p_value = hg.sf(len(selected_antigens)-1,
                    number_of_total_proteins, len(list_of_known_antigens), number_of_selected_proteins)
    values = [virlimit, padlimit, recall, coverage, expected_bpas, fold_enrichment, p_value]
    print(values)
    outlist.append(values)

[0.5, 0.5, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.5, 0.55, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.5, 0.6, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.5, 0.65, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.5, 0.7, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.5, 0.75, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.5, 0.8, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.5, 0.85, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.5, 0.9, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.5, 0.95, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.55, 0.5, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.55, 0.55, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.55, 0.6, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.55, 0.65, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.55, 0.7, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.55, 0.75, 45.61, 28.58, 32.59, 1.6, 8.217911521065035e-05]
[0.55, 0.8, 45.61, 28.58, 

KeyboardInterrupt: 

In [None]:
# Tabulate the grid-search rows collected in outlist (one row per virlimit/padlimit pair).
df = pd.DataFrame(outlist, columns = ['virlimit', 'padlimit', 'recall (%)',
                                'coverage (%)', 'expected_bpas', 'fold-enrichment', 'p-value'])
df

In [None]:
# Heat map of recall over the virlimit (x) / padlimit (y) grid.
new_column = df.virlimit.unique()
new_index = df.padlimit.unique()
# one matrix row per padlimit, one column per virlimit
plot_outlist = [
    [df[(df.padlimit == pad) & (df.virlimit == vir)]['recall (%)'].to_list()[0]
     for vir in new_column]
    for pad in new_index
]
plot_df = pd.DataFrame(plot_outlist, index=new_index, columns=new_column)

plt.imshow(plot_df, cmap ="RdYlBu", )
plt.colorbar(label='Recall (%)')
# tick positions follow the matrix shape: columns along x, rows along y
plt.xticks(range(len(plot_df.columns)), plot_df.columns)
plt.yticks(range(len(plot_df.index)), plot_df.index)
plt.xlabel('virlimit')
plt.ylabel('padlimit')
plt.show()

In [None]:
# Heat map of coverage over the virlimit (x) / padlimit (y) grid.
new_column = df.virlimit.unique()
new_index = df.padlimit.unique()
# one matrix row per padlimit, one column per virlimit
plot_outlist = [
    [df[(df.padlimit == pad) & (df.virlimit == vir)]['coverage (%)'].to_list()[0]
     for vir in new_column]
    for pad in new_index
]
plot_df = pd.DataFrame(plot_outlist, index=new_index, columns=new_column)

plt.imshow(plot_df, cmap ="RdYlBu", )
plt.colorbar(label='Coverage (%)')
# tick positions follow the matrix shape: columns along x, rows along y
plt.xticks(range(len(plot_df.columns)), plot_df.columns)
plt.yticks(range(len(plot_df.index)), plot_df.index)
plt.xlabel('virlimit')
plt.ylabel('padlimit')
plt.show()

In [None]:
# Heat map of fold-enrichment over the virlimit (x) / padlimit (y) grid.
new_column = df.virlimit.unique()
new_index = df.padlimit.unique()
# one matrix row per padlimit, one column per virlimit
plot_outlist = [
    [df[(df.padlimit == pad) & (df.virlimit == vir)]['fold-enrichment'].to_list()[0]
     for vir in new_column]
    for pad in new_index
]
plot_df = pd.DataFrame(plot_outlist, index=new_index, columns=new_column)

plt.imshow(plot_df, cmap ="RdYlBu", )
plt.colorbar(label='fold-enrichment')
# tick positions follow the matrix shape: columns along x, rows along y
plt.xticks(range(len(plot_df.columns)), plot_df.columns)
plt.yticks(range(len(plot_df.index)), plot_df.index)
plt.xlabel('virlimit')
plt.ylabel('padlimit')
plt.show()

In [None]:
#df.to_csv('./tuning/NERVE_2_tuning.csv', index=False)

In [None]:
# same on nerve 1 dataset
# Load the 'nerve_1_tests' sheet and keep the rows not flagged with nerve_1_test == 1.

nerve_1_tuning = pd.read_excel('../../database/antigens/test_antigens_summary_v2.xlsx', sheet_name = 'nerve_1_tests')
nerve_1_tuning = nerve_1_tuning[nerve_1_tuning['nerve_1_test'] != 1]
# this replaces the tuning_proteomes list built from the NERVE 2 table
tuning_proteomes  = list(nerve_1_tuning['Proteome ID'].unique())
print(f'Proteomes: {len(tuning_proteomes)}, antigens: {len(nerve_1_tuning)}')

In [None]:
# run parameter grid search and collect recall, coverage, fold-enrichment, p-value
# each condition is a [virlimit, padlimit] pair, both from 0.50 to 0.95 in steps of 0.05
CONDITIONS = [[i/100, s/100] for i in range(50, 100, 5) for s in range(50, 100, 5)]

# read in precomputed nerve predictions on known antigens
# NOTE(review): this cell runs on the NERVE 1 proteomes but still reads the
# nerve_2_tuning antigen predictions - confirm this is the intended file.
nerve_2_tuning_results = pd.read_csv('./tuning/nerve_2_tuning/vaccine_candidates.csv')

# The loaded proteins do not depend on the thresholds, so everything is loaded
# once instead of re-reading every CSV for each condition.
list_of_known_antigens = protein_loader(nerve_2_tuning_results)
known_antigen_seqs = {protein.sequence for protein in list_of_known_antigens}

# Proteins of every tuning proteome with the known antigens excluded.
# A new list is built instead of calling list.remove() while iterating,
# which skips the element following each removal.
background_proteomes = []
for proteome in tuning_proteomes:
    path = os.path.join('./tuning/', proteome, 'vaccine_candidates.csv')
    background_proteomes.append([protein for protein in protein_loader(path)
                                 if protein.sequence not in known_antigen_seqs])

outlist = []
for virlimit, padlimit in CONDITIONS:
    # evaluate for known antigens
    selected_antigens = select(list_of_proteins = list_of_known_antigens,
                        transmemb_doms_limit = 3,
                        padlimit = padlimit, mouse = "True",
                        mouse_peptides_sum_limit = .15,
                        virlimit = virlimit, virulent = "True")
    # totals cover the known antigens plus the background of every proteome
    number_of_selected_proteins = len(selected_antigens)
    number_of_total_proteins = len(list_of_known_antigens)

    # evaluate for proteomes
    for list_of_proteins in background_proteomes:
        selected_proteins = select(list_of_proteins = list_of_proteins,
                            transmemb_doms_limit = 3,
                            padlimit = padlimit, mouse = "True",
                            mouse_peptides_sum_limit = .15,
                            virlimit = virlimit, virulent = "True")
        number_of_selected_proteins += len(selected_proteins)
        number_of_total_proteins += len(list_of_proteins)

    # antigens expected among the selected proteins if selection were random
    expected_bpas = round((len(list_of_known_antigens) * number_of_selected_proteins) / number_of_total_proteins, 2)
    recall = round((len(selected_antigens) / len(list_of_known_antigens))*100, 2)
    coverage = round((number_of_selected_proteins / number_of_total_proteins)*100, 2)
    # nothing selected at all -> enrichment is undefined rather than a ZeroDivisionError
    fold_enrichment = round(len(selected_antigens)/expected_bpas, 2) if expected_bpas else float('nan')
    # probability of selecting at least this many antigens by chance
    p_value = hg.sf(len(selected_antigens)-1,
                    number_of_total_proteins, len(list_of_known_antigens), number_of_selected_proteins)
    values = [virlimit, padlimit, recall, coverage, expected_bpas, fold_enrichment, p_value]
    outlist.append(values)

In [None]:
# Tabulate the grid-search rows collected in outlist (one row per virlimit/padlimit pair).
df = pd.DataFrame(outlist, columns = ['virlimit', 'padlimit', 'recall (%)',
                                'coverage (%)', 'expected_bpas', 'fold-enrichment', 'p-value'])
df

In [None]:
# Heat map of recall over the virlimit (x) / padlimit (y) grid.
new_column = df.virlimit.unique()
new_index = df.padlimit.unique()
# one matrix row per padlimit, one column per virlimit
plot_outlist = [
    [df[(df.padlimit == pad) & (df.virlimit == vir)]['recall (%)'].to_list()[0]
     for vir in new_column]
    for pad in new_index
]
plot_df = pd.DataFrame(plot_outlist, index=new_index, columns=new_column)

plt.imshow(plot_df, cmap ="RdYlBu", )
plt.colorbar(label='recall (%)')
# tick positions follow the matrix shape: columns along x, rows along y
plt.xticks(range(len(plot_df.columns)), plot_df.columns)
plt.yticks(range(len(plot_df.index)), plot_df.index)
plt.xlabel('virlimit')
plt.ylabel('padlimit')
plt.show()

In [None]:
# Heat map of coverage over the virlimit (x) / padlimit (y) grid.
new_column = df.virlimit.unique()
new_index = df.padlimit.unique()
# one matrix row per padlimit, one column per virlimit
plot_outlist = [
    [df[(df.padlimit == pad) & (df.virlimit == vir)]['coverage (%)'].to_list()[0]
     for vir in new_column]
    for pad in new_index
]
plot_df = pd.DataFrame(plot_outlist, index=new_index, columns=new_column)

plt.imshow(plot_df, cmap ="RdYlBu", )
plt.colorbar(label='Coverage (%)')
# tick positions follow the matrix shape: columns along x, rows along y
plt.xticks(range(len(plot_df.columns)), plot_df.columns)
plt.yticks(range(len(plot_df.index)), plot_df.index)
plt.xlabel('virlimit')
plt.ylabel('padlimit')
plt.show()

In [None]:
# Heat map of fold-enrichment over the virlimit (x) / padlimit (y) grid.
new_column = df.virlimit.unique()
new_index = df.padlimit.unique()
# one matrix row per padlimit, one column per virlimit
plot_outlist = [
    [df[(df.padlimit == pad) & (df.virlimit == vir)]['fold-enrichment'].to_list()[0]
     for vir in new_column]
    for pad in new_index
]
plot_df = pd.DataFrame(plot_outlist, index=new_index, columns=new_column)

plt.imshow(plot_df, cmap ="RdYlBu", )
plt.colorbar(label='fold-enrichment')
# tick positions follow the matrix shape: columns along x, rows along y
plt.xticks(range(len(plot_df.columns)), plot_df.columns)
plt.yticks(range(len(plot_df.index)), plot_df.index)
plt.xlabel('virlimit')
plt.ylabel('padlimit')
plt.show()

In [None]:
# Save the grid-search table without the index column.
# NOTE(review): the file lands directly in ./tuning/, next to the per-proteome directories.
df.to_csv('./tuning/NERVE_1_tuning.csv', index=False)

### Run k-fold cross-validation on NERVE_2 data

In [None]:
# Reload the NERVE 2 antigen table and keep the rows not flagged with Nerve_2_test == 1.
nerve_2_tuning = pd.read_excel('../../database/antigens/test_antigens_summary_v2.xlsx',
                               sheet_name = 'nerve_2_tests')
nerve_2_tuning = nerve_2_tuning[(nerve_2_tuning['Nerve_2_test'] != 1)]

In [None]:
# find split with lower difference between groups
# Try 2000 seeds: shuffle the proteome ids, assign every antigen row to one of
# four classes by the position of its proteome, and keep the seed whose class
# sizes have the smallest standard error.
sem = 100  # smallest standard error seen so far
random_num = 0  # seed that produced it
for i in range(2000):
    proteomes = nerve_2_tuning['Proteome ID'].unique()
    random.seed(i)
    random.shuffle(proteomes)
    outlist = []
    for index, row in nerve_2_tuning.iterrows():
        proteome = row['Proteome ID']
        # the four slices partition positions 0..33 of the shuffled ids;
        # NOTE(review): a proteome beyond position 33 gets no class, which would
        # leave outlist shorter than the table
        if proteome in proteomes[0:7]:
            outlist.append(0)     
        if proteome in proteomes[7:15]:
            outlist.append(1)
        if proteome in proteomes[15:22]:
            outlist.append(2)
        if proteome in proteomes[22:34]:
            outlist.append(3)
    nerve_2_tuning['class'] = outlist
    # standard error of the number of antigens per class
    tmp_sem = nerve_2_tuning[['class']].value_counts().sem()
    if tmp_sem < sem:
        sem = tmp_sem
        random_num = i
print(sem, random_num)

In [None]:
# split the dataset into the four classes (0-3) used for cross-validation

proteomes = nerve_2_tuning['Proteome ID'].unique()
# fixed seed - presumably the one reported by the seed search; TODO confirm
random.seed(476)
random.shuffle(proteomes)
outlist = []
for index, row in nerve_2_tuning.iterrows():
    proteome = row['Proteome ID']
    # class is decided by the position of the proteome in the shuffled ids
    if proteome in proteomes[0:7]:
        outlist.append(0)     
    if proteome in proteomes[7:15]:
        outlist.append(1)
    if proteome in proteomes[15:22]:
        outlist.append(2)
    if proteome in proteomes[22:34]:
        outlist.append(3)
nerve_2_tuning['class'] = outlist
# histogram of class sizes; despite its name, tmp_sem holds the plot object here
tmp_sem = nerve_2_tuning[['class']].plot.hist()

In [None]:
# perform cross validation
# For every class: grid-search virlimit/padlimit on the antigens and proteomes of
# the other classes, pick one parameter pair, then score it on the held-out class.

def _load_antigen_predictions(antigen_names):
    """Return the precomputed antigen predictions restricted to the given protein names."""
    results = pd.read_csv('./tuning/nerve_2_tuning/vaccine_candidates.csv')
    # keep only the text before the first '_' of each id so it can be matched
    # against the 'protein' column of the antigen table
    results.id = [element[0] for element in results.id.str.split('_')]
    return results[results.id.isin(antigen_names)]


def _load_background(proteome_ids, antigen_seqs):
    """Load each proteome's proteins, leaving out those whose sequence is a known antigen."""
    background = []
    for proteome in proteome_ids:
        path = os.path.join('./tuning/', proteome, 'vaccine_candidates.csv')
        # filter into a new list; removing while iterating would skip elements
        background.append([protein for protein in protein_loader(path)
                           if protein.sequence not in antigen_seqs])
    return background


def _selection_stats(known_antigens, background, virlimit, padlimit):
    """Run select() on antigens and background; return
    [recall, coverage, expected_bpas, fold_enrichment, p_value]."""
    selected_antigens = select(list_of_proteins = known_antigens,
                        transmemb_doms_limit = 3,
                        padlimit = padlimit, mouse = "True",
                        mouse_peptides_sum_limit = .15,
                        virlimit = virlimit, virulent = "True")
    # counters are local, so every evaluation starts from the antigens only
    number_of_selected_proteins = len(selected_antigens)
    number_of_total_proteins = len(known_antigens)
    for list_of_proteins in background:
        selected_proteins = select(list_of_proteins = list_of_proteins,
                            transmemb_doms_limit = 3,
                            padlimit = padlimit, mouse = "True",
                            mouse_peptides_sum_limit = .15,
                            virlimit = virlimit, virulent = "True")
        number_of_selected_proteins += len(selected_proteins)
        number_of_total_proteins += len(list_of_proteins)

    expected_bpas = round((len(known_antigens) * number_of_selected_proteins /\
                           number_of_total_proteins), 2)
    recall = round((len(selected_antigens) / len(known_antigens))*100, 2)
    coverage = round((number_of_selected_proteins / number_of_total_proteins)*100, 2)
    # nothing selected at all -> enrichment is undefined rather than a ZeroDivisionError
    fold_enrichment = round(len(selected_antigens)/expected_bpas, 2) if expected_bpas else float('nan')
    p_value = hg.sf(len(selected_antigens)-1,
                    number_of_total_proteins, len(known_antigens), number_of_selected_proteins)
    return [recall, coverage, expected_bpas, fold_enrichment, p_value]


# parameter grid: [virlimit, padlimit] pairs
CONDITIONS = [[i/100, s/100] for i in range(50, 100, 5) for s in range(50, 100, 5)]

outvalues = []
for split in nerve_2_tuning['class'].unique():
    # test_set holds the classes used to choose the parameters, val_set the held-out class
    test_set = nerve_2_tuning[nerve_2_tuning['class'] != split]
    val_set = nerve_2_tuning[nerve_2_tuning['class'] == split]
    test_proteomes = list(test_set['Proteome ID'].unique())
    val_proteomes = list(val_set['Proteome ID'].unique())
    test_antigens = list(test_set['protein'].unique())
    val_antigens = list(val_set['protein'].unique())

    # load the antigens and proteomes of this split once, not once per condition
    list_of_known_antigens = protein_loader(_load_antigen_predictions(test_antigens))
    background = _load_background(
        test_proteomes, {protein.sequence for protein in list_of_known_antigens})

    # run parameter grid search and collect recall, coverage, p-value
    outlist = [[virlimit, padlimit] + _selection_stats(list_of_known_antigens, background,
                                                       virlimit, padlimit)
               for virlimit, padlimit in CONDITIONS]

    # get padlimit and virlimit values with lower fold-enrichment
    # NOTE(review): the pair with the LOWEST fold-enrichment is chosen; if the goal is
    # the best-enriching parameters this should probably be the maximum - confirm.
    df = pd.DataFrame(outlist, columns = ['virlimit', 'padlimit', 'recall (%)',
                                'coverage (%)', 'expected_bpas', 'fold-enrichment', 'p-value'])
    best_values_df = df[df['fold-enrichment'] == df['fold-enrichment'].min()]
    # ties are broken by the lowest coverage, then by first occurrence in the grid
    best_row = best_values_df[
        best_values_df['coverage (%)'] == best_values_df['coverage (%)'].min()].iloc[0]
    virlimit = best_row['virlimit']
    padlimit = best_row['padlimit']
    print(split, best_values_df)

    # apply the chosen values to the validation set, starting from fresh counters
    # (previously the totals of the last grid condition leaked into these numbers)
    list_of_known_antigens = protein_loader(_load_antigen_predictions(val_antigens))
    background = _load_background(
        val_proteomes, {protein.sequence for protein in list_of_known_antigens})
    outvalues.append([split, virlimit, padlimit] + _selection_stats(
        list_of_known_antigens, background, virlimit, padlimit))
    print(outvalues[-1])

In [None]:
# Tabulate the per-split validation results collected in outvalues.
cv_results = pd.DataFrame(outvalues, columns = ['split', 'virlimit', 'padlimit', 'recall',
                                                'coverage', 'expected_bpas', 'fold_enrichment', 'p_value']) 
cv_results

### Run NERVE with the optimal parameters on the test set

In [None]:
# NERVE2
# Write the NERVE 2 test antigens (rows flagged Nerve_2_test == 1) to one FASTA
# file per gram type.

df = pd.read_excel('../../database/antigens/test_antigens_summary_v2.xlsx', sheet_name = 'nerve_2_tests')
df = df[df['Nerve_2_test'] == 1]
# split by the value of the 'gram' column
df_neg = df[df['gram'] == '-']
df_pos = df[df['gram'] == '+']
to_fasta(df_neg, './test/gram_neg_2/gram_neg_2.fasta')
to_fasta(df_pos, './test/gram_pos_2/gram_pos_2.fasta')

In [None]:
# Run nerve on gram positive and negative datasets
# Each test FASTA is processed with the matching gram setting.

for test_set, gram in zip(['gram_neg_2', 'gram_pos_2'], 
                         ['n', 'p']):
    
    nerve_run = RunNerve()
    nerve_run.args['proteome1'] = f'./test/{test_set}/{test_set}.fasta'
    nerve_run.args['gram'] = gram
    # drop the option entirely so no '--proteome2' argument is passed
    nerve_run.args.pop('proteome2')
    # the select step is switched off for these runs
    nerve_run.args['select'] = False
    nerve_run.args['working_dir'] = f'./test/{test_set}/'
    # NOTE(review): out/err are overwritten on every iteration and never inspected
    out, err = nerve_run.run()

In [None]:

# read in precomputed nerve predictions on known antigens
nerve_2_test_results = pd.read_csv('./test/nerve_2_test/vaccine_candidates.csv')

nerve_2_test = pd.read_excel('../../database/antigens/test_antigens_summary_v2.xlsx',
                               sheet_name = 'nerve_2_tests')
nerve_2_test = nerve_2_test[nerve_2_test['Nerve_2_test'] == 1]
test_proteomes  = list(nerve_2_test['Proteome ID'].unique())

outlist = []
number_of_selected_proteins = 0
number_of_total_proteins = 0
# thresholds applied to the test set
virlimit = .55
padlimit = .50

# evaluate for known antigens
list_of_known_antigens = protein_loader(nerve_2_test_results)
selected_antigens = select(list_of_proteins = list_of_known_antigens,
                    transmemb_doms_limit = 3,
                    padlimit = padlimit, mouse = "True",
                    mouse_peptides_sum_limit = .15,
                    virlimit = virlimit, virulent = "True")
# evaluate for proteomes
# NOTE(review): the proteome predictions are read from ./tuning/ although this is
# the test set - confirm the directory.
# NOTE(review): unlike the tuning cells, the known antigens are neither removed
# from the proteomes nor added to the totals here - confirm they are contained
# in these proteomes.
for proteome in test_proteomes:
    path = os.path.join('./tuning/', proteome, 'vaccine_candidates.csv')
    list_of_proteins = protein_loader(path)
    selected_proteins = select(list_of_proteins = list_of_proteins,
                        transmemb_doms_limit = 3,
                        padlimit = padlimit, mouse = "True",
                        mouse_peptides_sum_limit = .15,
                        virlimit = virlimit, virulent = "True")
    # populate values
    number_of_selected_proteins += len(selected_proteins)
    number_of_total_proteins += len(list_of_proteins)
# expected antigens among the selected proteins, from the totals over ALL test
# proteomes (not just the last proteome of the loop)
expected_bpas = round((len(list_of_known_antigens) * number_of_selected_proteins) / number_of_total_proteins, 2)
recall = round((len(selected_antigens) / len(list_of_known_antigens))*100, 2)
coverage = round((number_of_selected_proteins / number_of_total_proteins)*100, 2)
# recall / coverage equals observed / expected antigens; undefined when nothing is selected
fold_enrichment = recall / coverage if coverage else float('nan')
outlist.append([virlimit, padlimit, recall,
                coverage,
                expected_bpas, fold_enrichment,
                hg.sf(len(selected_antigens)-1,
                      number_of_total_proteins, len(list_of_known_antigens), number_of_selected_proteins)])


In [None]:
# Tabulate the test-set result row collected in outlist.
df = pd.DataFrame(outlist, columns = ['virlimit', 'padlimit', 'recall',
                                        'coverage', 'expected_bpas', 'fold_enrichment', 'p_value']) 
df

In [None]:
# number of antigens in the most recently loaded list_of_known_antigens
len(list_of_known_antigens)

### Run statistics

In [None]:
# Gather the NERVE predictions of every directory under ./tuning/ into one table.
proteomes = os.path.join('./tuning/')
frames = []
for proteome in os.listdir(proteomes):
    candidates_path = os.path.join(proteomes, proteome, 'vaccine_candidates.csv')
    # ./tuning/ also holds plain files (e.g. NERVE_1_tuning.csv); skip every
    # entry that has no predictions table instead of failing on it
    if not os.path.isfile(candidates_path):
        continue
    frames.append(pd.read_csv(candidates_path))
# DataFrame.append was removed in pandas 2.0; concatenate once and renumber the rows
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df.index.name = 'index'
df

In [None]:
# distribution of the adhesin_probability column over all collected proteins
df.hist('adhesin_probability')

In [None]:
# distribution of the virulence_probability column over all collected proteins
df.hist('virulence_probability')

In [None]:
# run k-fold cross validation to find best parameters
# NOTE(review): this cell is an unfinished draft and cannot run as written - see
# the notes below on select(), the bare names and stat_value.

# read in precomputed nerve predictions on known antigens
nerve_2_tuning_results = pd.read_csv('./tuning/nerve_2_tuning/vaccine_candidates.csv')
conditions = [[.9, .9]]
result_dic = {}
# iterate over every validation proteome
for val_proteome in tuning_proteomes:
    
    # antigens of the held-out proteome vs. antigens of all other proteomes
    val_set_df = nerve_2_tuning[nerve_2_tuning['Proteome ID'] == val_proteome]
    tuning_set_df = nerve_2_tuning[nerve_2_tuning['Proteome ID'] != val_proteome]
    
    # iterate over conditions
    for condition in conditions:
        for tuning_proteome in list(tuning_set_df['Proteome ID'].unique()):
            path = os.path.join('./tuning/', tuning_proteome)
            if os.path.isdir(path) == True:
                
                # NOTE(review): unpacked as (padlimit, virlimit), the reverse of the
                # (virlimit, padlimit) order used by the grid-search cells - confirm
                padlimit, virlimit = condition
                # run select on known antigens
                # NOTE(review): these are the antigens of the validation proteome although
                # the loop iterates over tuning proteomes - confirm
                known_antigens = val_set_df['protein'].to_list()
                # the protein names are joined into one regular-expression alternation
                list_of_known_antigens = protein_loader(
                    nerve_2_tuning_results[nerve_2_tuning_results.id.str.contains('|'.join(known_antigens))])
                # NOTE(review): select() as defined in this notebook has no `annotation`
                # parameter, so this call raises TypeError
                selected_antigens = select(list_of_proteins = list_of_known_antigens,
                                           transmemb_doms_limit = 3,
                                           padlimit = padlimit, mouse = True,
                                           mouse_peptides_sum_limit = .15,
                                           virlimit = virlimit, virulent = True, annotation = True)
                # run select on the full proteome
                list_of_proteins = protein_loader(os.path.join(path, 'vaccine_candidates.csv'))
                selected_proteins = select(list_of_proteins = list_of_proteins,
                                           transmemb_doms_limit = 3,
                                           padlimit = padlimit, mouse = True,
                                           mouse_peptides_sum_limit = .15,
                                           virlimit = virlimit, virulent = True, annotation = True)
                # collect data:
                # NOTE(review): placeholders - these bare names are not assigned anywhere
                # in the visible notebook, so evaluating them raises NameError
                sum_of_antigens
                sum_of_selected_antigens
                sum_of_proteins
                sum_of_selected_proteins
                
                #print(len(selected_antigens)/len(list_of_known_antigens),
                 #     len(list_of_proteins)/len(selected_proteins))
        # evaluate condition stats  
        # NOTE(review): stat_value is likewise never assigned in the visible notebook
        stat_value
        condition = stat_value  
   
    # evaluate best condition
    # use best condition to evaluate val set