### Test NERVE

In [324]:
import subprocess, os
import pandas as pd

In [325]:
class RunNerve():
    """Class to store nerve parameters and run NERVE program"""
    def __init__(self):
        """Initiate arguments and program path"""
        self.args = {'annotation': True, 'e_value': 1e-10, 'gram': None, 'minlength': 9, 'mismatch': 1,
                     'mouse': True, 'mouse_peptides_sum_limit': .15, 'proteome1': None, 'proteome2': None, 
                     'p_ad_extracellular_filter': .38, 'p_ad_no_citoplasm_filter': .46, 'padlimit': .5, 
                     'razor': True, 'razlen': 50, 'select': True, 'substitution': 3, 
                     'transmemb_doms_limit': 3, 'virlimit': .5, 'virulent': True, 'working_dir': None, 
                     'NERVE_dir': '../../', 'iFeature_dir': '/iFeature', 'DeepFri_dir': '/DeepFri'
                     }
        self.program = "../../code/NERVE.py"

    def print_args(self):
        print(f'Arguments are:\n{self.args}')
        print(f'NERVE executable path is:\n{self.program}')
    
    def run(self):
        """Pass commands to line
        program: command/program to run
        args: arguments"""
        
        program = self.program
        args = self.args
        for key in args:
            args[key] = str(args[key])
        
        args_list = []
        for item in args.items():
            args_list.append(f'--{item[0]}')
            args_list.append(item[1])
        cmd = ['python3'] + [program] + args_list
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        output, error = process.communicate()
        return output, error

In [264]:
# prepare files for tuning (NERVE2)
def to_fasta(df, outfile):
    """Converts pandas cells to fasta"""
    outfile = open(outfile, 'w')
    for index, row in df.iterrows():
        outfile.write('>' + row['protein']+ '_' + row['organism name'] + '\n'\
                      + row['sequence'].replace(" ", "") + '\n')
    outfile.close()

In [323]:
df = pd.read_excel('../../database/antigens/test_antigens_summary.xlsx', sheet_name = 'nerve_2_tests')
df = df[df['nerve_train'] == 1]
len(df['Proteome ID'].unique())

25

In [248]:
df = pd.read_excel('../../database/antigens/test_antigens_summary.xlsx', sheet_name = 'nerve_2_tests')
df = df[df['nerve_train'] == 1]
df_neg = df[df['gram'] == '-']
df_pos = df[df['gram'] == '+']
to_fasta(df_neg, './tuning/gram_neg.fasta')
to_fasta(df_pos, './tuning/gram_pos.fasta')

In [265]:
# NERVE 1

df = pd.read_excel('../../database/antigens/test_antigens_summary.xlsx', sheet_name = 'nerve_1_tests')
df_neg = df[df['gram'] == '-']
df_pos = df[df['gram'] == '+']
to_fasta(df_neg, './tuning/gram_neg_1.fasta')
to_fasta(df_pos, './tuning/gram_pos_1.fasta')

In [83]:
df[df['gram'] == '+']['Proteome ID'].unique()

array(['UP000001978', 'UP000031961', 'UP000070260', 'UP000032903',
       'UP000217443', 'UP000001584', 'UP000001020', 'UP000007137'],
      dtype=object)

In [269]:
def tuning(padlimit, virlimit, proteomes_neg=[], proteomes_pos=[]):
    """Runs NERVE tuning"""
    outdic = {'n': {},
              'p': {}}
    
    gram_neg = RunNerve()
    gram_neg.args['proteome1'] = './tuning/gram_neg_2.fasta'
    gram_neg.args['gram'] = 'n'
    gram_neg.args.pop('proteome2')
    working_dir = f'./tuning/gram_neg_padlimit_{str(padlimit)}_virlimit_{str(virlimit)}'
    gram_neg.args['working_dir'] = working_dir
    out, err = gram_neg.run()
    df_vaccines = pd.read_csv(os.path.join(working_dir, 'vaccine_candidates.csv'))
    df_non_vaccines = pd.read_csv(os.path.join(working_dir, 'discarded_proteins.csv'))    
    outdic['n']['gram_neg_recall'] = len(df_vaccines) / (len(df_vaccines) + len(df_non_vaccines))
    
    gram_pos = RunNerve()
    gram_pos.args['proteome1'] = './tuning/gram_pos_2.fasta'
    gram_pos.args['gram'] = 'p'
    gram_pos.args.pop('proteome2')
    working_dir = f'./tuning/gram_pos_padlimit_{str(padlimit)}_virlimit_{str(virlimit)}'
    gram_pos.args['working_dir'] = working_dir
    out, err = gram_pos.run()
    df_vaccines = pd.read_csv(os.path.join(working_dir, 'vaccine_candidates.csv'))
    df_non_vaccines = pd.read_csv(os.path.join(working_dir, 'discarded_proteins.csv'))    
    outdic['p']['gram_pos_recall'] = len(df_vaccines) / (len(df_vaccines) + len(df_non_vaccines))
    
    for proteome in proteomes_neg:
        prot_run = RunNerve()
        prot_run.args['proteome1'] = proteome
        prot_run.args['gram'] = 'n'
        prot_run.args.pop('proteome2')
        working_dir = f'./tuning/{proteome}_gram_n_padlimit_{str(padlimit)}_virlimit_{str(virlimit)}'
        prot_run.args['working_dir'] = working_dir
        out, err = prot_run.run()
        df_vaccines = pd.read_csv(os.path.join(working_dir, 'vaccine_candidates.csv'))
        df_non_vaccines = pd.read_csv(os.path.join(working_dir, 'discarded_proteins.csv'))    
        outdic['n'][proteome] = len(df_vaccines) / (len(df_vaccines) + len(df_non_vaccines))
     
    for proteome in proteomes_pos:
        prot_run = RunNerve()
        prot_run.args['proteome1'] = proteome
        prot_run.args['gram'] = 'p'
        prot_run.args.pop('proteome2')
        working_dir = f'./tuning/{proteome}_gram_n_padlimit_{str(padlimit)}_virlimit_{str(virlimit)}'
        prot_run.args['working_dir'] = working_dir
        out, err = prot_run.run()
        df_vaccines = pd.read_csv(os.path.join(working_dir, 'vaccine_candidates.csv'))
        df_non_vaccines = pd.read_csv(os.path.join(working_dir, 'discarded_proteins.csv'))    
        outdic['p'][proteome] = len(df_vaccines) / (len(df_vaccines) + len(df_non_vaccines))
    
    return outdic

In [None]:
#outdic = tuning(0.5, 0.5, ['UP000000419', 'UP000002676', 'UP000000540'], ['UP000001978', 'UP000031961', 'UP000001584'])
outdic = tuning(0.5, 0.5, [], [])
outdic

### Run tuning on subset of pre-computed NERVE analsysis

In [170]:
from code import Protein
import numpy as np

In [169]:
# prepare files

for folder in os.listdir('./tuning'):
    if folder != 'results' and os.path.isdir('./tuning/'+folder) == True:
        df_vaccines = pd.read_csv(f'./tuning/{folder}/vaccine_candidates.csv')
        df_non_vaccines = pd.read_csv(f'./tuning/{folder}/discarded_proteins.csv')
        df_merged = pd.concat([df_vaccines, df_non_vaccines])
        df_merged.to_csv(f'./tuning/results/{folder}.csv', index=False)

In [316]:
class Localization:
        """class to store and handle protein subcellular localizations"""
        def __init__(self, localization, reliability):
            self.localization = str(localization)
            self.reliability = float(reliability)

def protein_loader(proteome_path):
    
    df = pd.read_csv(proteome_path)
    
    df = df.astype(object).where(pd.notnull(df),None)
    list_of_proteins = []
    for index, row in df.iterrows():
        protein = Protein.Protein(row['id '], row['sequence'])
        protein.accession = row['uniprot_accession_code']
        protein.length = row['length']
        protein.transmembrane_doms = 0 if row['transmembrane_doms'] == None else row['transmembrane_doms']
        protein.localization = [Localization(row['localization'], row['localization score'])]
        protein.p_vir = row['virulence_probability']
        protein.p_ad = row['adhesin_probability']
        protein.conservation_score = row['conservation_score']
        protein.shared_human_peps = 0 if row['shared_human_peps'] == None else row['shared_human_peps']
        protein.shared_mouse_peps = 0 if row['shared_mouse_peps'] == None else row['shared_mouse_peps']
        protein.shared_conserv_proteome_peps = 0 if row['shared_conserv_proteome_peps'] == None else row['shared_conserv_proteome_peps']
        protein.sapiens_peptides_sum = 0 if row['human_peptides_sum'] == None else row['human_peptides_sum']
        protein.mouse_peptides_sum = 0 if row['mouse_peptides_sum'] == None else row['mouse_peptides_sum']
        protein.annotations = row['annotations']
        protein.list_of_peptides_from_comparison_with_mhcpep_sapiens = [] if \
        row['list_of_peptides_from_comparison_with_mhcpep_sapiens'] == None else \
        [peptide for peptide in range(len(row['list_of_peptides_from_comparison_with_mhcpep_sapiens']))]
        protein.list_of_peptides_from_comparison_with_mhcpep_mouse = [] if \
        row['list_of_peptides_from_comparison_with_mhcpep_mouse'] == None else \
        [peptide for peptide in range(len(row['list_of_peptides_from_comparison_with_mhcpep_mouse']))]
        protein.original_sequence_if_razor = row['original_sequence_if_razor']
        protein.tmhmm_seq = row['tmhmm_seq']
        list_of_proteins.append(protein)
    return list_of_proteins

def select(list_of_proteins, p_ad_no_citoplasm_filter, p_ad_extracellular_filter, transmemb_doms_limit,
           padlimit, mouse, mouse_peptides_sum_limit, virlimit, virulent, annotation)->list:
    """Selects suitable candidate proteins for vaccine production"""
    
    # annotations to exclude proteins
    exclusion_annotations = ['structural constituent of ribosome', 'DNA binding', 'DNA-binding transcription factor activity',
                             'transcription regulator activity', 'rRNA binding', 'RNA binding',
                             'aminoacyl-tRNA ligase activity', 'sequence-specific DNA binding', 
                             'catalytic activity, acting on a tRNA', 'catalytic activity, acting on RNA',
                             'tyrosine-tRNA ligase activity', 'aminoacyl-tRNA editing activity',
                             'translation factor activity, RNA binding', 'translation regulator activity',
                             'translation regulator activity, nucleic acid binding', 
                             'translation elongation factor activity', 'catalytic activity, acting on DNA'
                            ]
    
    final_list = []
    for protein in list_of_proteins:
        # exclude cytoplasmatic proteins if low PAD or VIR
        if virulent == "True":
            if (protein.localization[0].localization == "Cytoplasmic" and (protein.p_ad < padlimit and protein.p_vir < virlimit)): continue 
        if virulent != "True":
            if protein.localization[0].localization == "Cytoplasmic" and protein.p_ad < padlimit: continue 
        
        # exlude low fidelty localization prediction proteins if low PAD or VIR
        if virulent == "True":
            if protein.localization[0].reliability < 7.49 and (protein.p_vir < virlimit and protein.p_ad < padlimit): continue
        if virulent != "True":
            if protein.localization[0].reliability < 7.49 and protein.p_ad < padlimit: continue
        
        
        if (protein.transmembrane_doms >= transmemb_doms_limit) and (protein.original_sequence_if_razor is None): continue
        if protein.sapiens_peptides_sum > .15: continue
        if len(protein.list_of_peptides_from_comparison_with_mhcpep_sapiens) >= 1: continue
        if mouse == "True":
            if protein.mouse_peptides_sum > mouse_peptides_sum_limit: continue 
            if len(protein.list_of_peptides_from_comparison_with_mhcpep_mouse) >= 1: continue 
        
        annotation_flag = "False"
        if annotation == "True":
            for annot in exclusion_annotations:
                if annot in str(protein.annotations): 
                    annotation_flag = "True"
        if annotation_flag == "True": continue

        if protein.p_ad < p_ad_no_citoplasm_filter and not protein.localization[0].localization == "Extracellular": continue 
        if protein.p_ad < p_ad_extracellular_filter and protein.localization[0].localization == "Extracellular": continue
        
        final_list.append(protein)
    return final_list   

def select_old(list_of_proteins, p_ad_no_citoplasm_filter, p_ad_extracellular_filter, transmemb_doms_limit,
           padlimit, mouse, mouse_peptides_sum_limit, virlimit, virulent)->list:
    """Selects suitable candidate proteins for vaccine production"""
    final_list = []
    for protein in list_of_proteins:
        if protein.localization[0].localization == "Cytoplasmic": continue 
        #if protein.localization[0].reliability <= 3: continue
        if protein.p_ad < p_ad_no_citoplasm_filter and not protein.localization[0].localization == "Extracellular": continue 
        if protein.p_ad < p_ad_extracellular_filter and protein.localization[0].localization == "Extracellular": continue 
        if (protein.transmembrane_doms >= transmemb_doms_limit) and (protein.original_sequence_if_razor is None): continue
        if protein.sapiens_peptides_sum > .15: continue
        if len(protein.list_of_peptides_from_comparison_with_mhcpep_sapiens) >= 1: continue
        if (float(protein.localization[0].reliability) < 7.49) and (protein.p_ad < padlimit): continue
        # proteins with Unknown localization have score==0
        #if (protein.localization[0].localization == "Unknown") and (protein.p_ad < padlimit): continue
        if mouse=="True":
            if protein.mouse_peptides_sum > mouse_peptides_sum_limit: continue 
            if len(protein.list_of_peptides_from_comparison_with_mhcpep_mouse) >= 1: continue 
        if virulent=="True":
            
            if protein.p_vir < virlimit: continue
        final_list.append(protein)
    return final_list
        
def tuner(padlimit=0.89, virlimit=0.5, list_of_proteins=[], p_ad_no_citoplasm_filter = .0,\
          p_ad_extracellular_filter = .0, transmemb_doms_limit = 3,
            mouse = "True", mouse_peptides_sum_limit = .15, virulent = "True", annotation = "True", s = 'new'):
    
    pre_len = len(list_of_proteins)
    if s == 'new':
        list_of_proteins = select(list_of_proteins, p_ad_no_citoplasm_filter, p_ad_extracellular_filter, transmemb_doms_limit,
           padlimit, mouse, mouse_peptides_sum_limit, virlimit, virulent, annotation)
    else:
        list_of_proteins = select_old(list_of_proteins, p_ad_no_citoplasm_filter, p_ad_extracellular_filter, transmemb_doms_limit,
           padlimit, mouse, mouse_peptides_sum_limit, virlimit, virulent)
    post_len = len(list_of_proteins)
    return round((post_len/pre_len), 2)



In [240]:
# test virlimit and padlimit on new select

PARAMS = [[0.5, 0.5], [0.6, 0.6], [0.7, 0.7], [0.8, 0.8], [0.9, 0.9], [0.95, 0.95],\
          [0.5, 0.6], [0.5, 0.7], [0.5, 0.8], \
          [0.5, 0.9], [0.6, 0.5], [0.7, 0.5], [0.8, 0.5], [0.9, 0.5]] 
PROTEOMES = ['gram_neg', 'UP000000419', 'UP000002676', 'UP000000540', 'UP000029988', 'UP000000579',\
             'gram_pos', 'UP000001978', 'UP000031961', 'UP000001584', 'UP000070260', 'UP000032903']


for param in PARAMS:
    
    outdic = {'n': {'gram_neg': 0, 'UP000000419': 0, 'UP000002676': 0, 'UP000000540': 0, 
                    'UP000029988': 0, 'UP000000579': 0}, 
              'p': {'gram_pos': 0, 'UP000001978': 0, 'UP000031961': 0, 'UP000001584': 0, 
                    'UP000070260': 0, 'UP000032903': 0}}
    
    for proteome in PROTEOMES:
        proteome_path = './tuning/results/' + proteome + '.csv'
        list_of_proteins = protein_loader(proteome_path)
        
        data = tuner(param[0], param[1], list_of_proteins)
        
        if proteome in outdic['n']:
            outdic['n'][proteome] = data
        else:
            outdic['p'][proteome] = data
    
    print(f'Padlimit: {param[0]} Virlimit: {param[1]}')
    print(f"G_neg and G_pos test antigens recall: {outdic['n']['gram_neg']}, {outdic['p']['gram_pos']}")
    for gram in outdic:
        res = 0
        for proteome in outdic[gram]:
            if proteome != 'gram_neg' and proteome != 'gram_pos':
                res += outdic[gram][proteome] / (len(outdic[gram]) - 1)
        print(f"Gram {gram} medium extraction: {round(res, 2)}")
    print('--------------------------------------')           

Padlimit: 0.5 Virlimit: 0.5
G_neg and G_pos test antigens recall: 0.87, 0.72
Gram n medium extraction: 0.33
Gram p medium extraction: 0.5
--------------------------------------
Padlimit: 0.6 Virlimit: 0.6
G_neg and G_pos test antigens recall: 0.87, 0.69
Gram n medium extraction: 0.29
Gram p medium extraction: 0.46
--------------------------------------
Padlimit: 0.7 Virlimit: 0.7
G_neg and G_pos test antigens recall: 0.84, 0.66
Gram n medium extraction: 0.25
Gram p medium extraction: 0.42
--------------------------------------
Padlimit: 0.8 Virlimit: 0.8
G_neg and G_pos test antigens recall: 0.81, 0.66
Gram n medium extraction: 0.21
Gram p medium extraction: 0.36
--------------------------------------
Padlimit: 0.9 Virlimit: 0.9
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction: 0.17
Gram p medium extraction: 0.25
--------------------------------------
Padlimit: 0.95 Virlimit: 0.95
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction: 0.15
G

In [268]:
# test best parameters on nerve 1 data

gram_neg = RunNerve()
gram_neg.args['proteome1'] = './tuning/gram_neg_1.fasta'
gram_neg.args['gram'] = 'n'
gram_neg.args.pop('proteome2')
working_dir = f'./tuning/gram_neg_1'
gram_neg.args['working_dir'] = working_dir
gram_neg.args['virlimit'] = .95
gram_neg.args['padlimit'] = .95
out, err = gram_neg.run()
# recall: 0.71

gram_pos = RunNerve()
gram_pos.args['proteome1'] = './tuning/gram_pos_1.fasta'
gram_pos.args['gram'] = 'p'
gram_pos.args.pop('proteome2')
working_dir = f'./tuning/gram_pos_1'
gram_pos.args['working_dir'] = working_dir
gram_pos.args['virlimit'] = 0.95
gram_pos.args['padlimit'] = 0.95
out, err = gram_pos.run()
# recall: 0.87

In [281]:
# test NERVE 1 params on new dataset

PARAMS = [[0.85, 0.]]
PROTEOMES = ['gram_neg', 'UP000000419', 'UP000002676', 'UP000000540', 'UP000029988', 'UP000000579',\
             'gram_pos', 'UP000001978', 'UP000031961', 'UP000001584', 'UP000070260', 'UP000032903']


for param in PARAMS:
    
    outdic = {'n': {'gram_neg': 0, 'UP000000419': 0, 'UP000002676': 0, 'UP000000540': 0, 
                    'UP000029988': 0, 'UP000000579': 0}, 
              'p': {'gram_pos': 0, 'UP000001978': 0, 'UP000031961': 0, 'UP000001584': 0, 
                    'UP000070260': 0, 'UP000032903': 0}}
    
    for proteome in PROTEOMES:
        proteome_path = './tuning/results/' + proteome + '.csv'
        list_of_proteins = protein_loader(proteome_path)
        
        data = tuner(padlimit = param[0], virlimit = param[1], list_of_proteins = list_of_proteins,
                     p_ad_no_citoplasm_filter = 0.46, 
                    p_ad_extracellular_filter = 0.38, virulent = "False", s = 'old')
        
        if proteome in outdic['n']:
            outdic['n'][proteome] = data
        else:
            outdic['p'][proteome] = data
    
    
    print(f'Padlimit: {param[0]} Virlimit: {param[1]}')
    print(f"G_neg and G_pos test antigens recall: {outdic['n']['gram_neg']}, {outdic['p']['gram_pos']}")
    for gram in outdic:
        res = 0
        for proteome in outdic[gram]:
            if proteome != 'gram_neg' and proteome != 'gram_pos':
                res += outdic[gram][proteome] / (len(outdic[gram]) - 1)
        print(f"Gram {gram} medium extraction: {round(res, 2)}")
    print('--------------------------------------') 

Padlimit: 0.85 Virlimit: 0.0
G_neg and G_pos test antigens recall: 0.45, 0.28
Gram n medium extraction: 0.02
Gram p medium extraction: 0.03
--------------------------------------


In [273]:
# test old best parameters on nerve 1 data

gram_neg = RunNerve()
gram_neg.args['proteome1'] = './tuning/gram_neg_1.fasta'
gram_neg.args['gram'] = 'n'
gram_neg.args.pop('proteome2')
working_dir = f'./tuning/gram_neg_1_old'
gram_neg.args['working_dir'] = working_dir
gram_neg.args['virulent'] = "False"
out, err = gram_neg.run()
# recall: 0.47

gram_pos = RunNerve()
gram_pos.args['proteome1'] = './tuning/gram_pos_1.fasta'
gram_pos.args['gram'] = 'p'
gram_pos.args.pop('proteome2')
working_dir = f'./tuning/gram_pos_1_old'
gram_pos.args['working_dir'] = working_dir
gram_pos.args['virulent'] = "False"
out, err = gram_pos.run()
# recall: 0.72

In [320]:
# tests on old select
# test standard setting (padlimit, p_ad_no_citoplasm_filter, p_ad_extracellular_filter)

PARAMS = [[.89, .46, .38], [.95, .46, .38], [.85, .46, .38], [.8, .46, .38], [.75, .46, .38], [.65, .46, .38],\
          [.6, .46, .38], [.5, .46, .38], [.89, .5, .5], [.89, .6, .5], [.89, .4, .3], [.89, .3, .2], [.89, .2, .1],\
         [.89, .0, .0], [.80, .5, .5], [.70, .5, .5], [.60, .5, .5], [.50, .5, .5], [.95, .5, .5]] 
PROTEOMES = ['gram_neg', 'UP000000419', 'UP000002676', 'UP000000540', 'UP000029988', 'UP000000579',\
             'gram_pos', 'UP000001978', 'UP000031961', 'UP000001584', 'UP000070260', 'UP000032903']


for param in PARAMS:
    
    outdic = {'n': {'gram_neg': 0, 'UP000000419': 0, 'UP000002676': 0, 'UP000000540': 0, 
                    'UP000029988': 0, 'UP000000579': 0}, 
              'p': {'gram_pos': 0, 'UP000001978': 0, 'UP000031961': 0, 'UP000001584': 0, 
                    'UP000070260': 0, 'UP000032903': 0}}
    
    for proteome in PROTEOMES:
        proteome_path = './tuning/results/' + proteome + '.csv'
        list_of_proteins = protein_loader(proteome_path)
        
        data = tuner(padlimit = param[0], list_of_proteins=list_of_proteins, s = 'old', mouse='False', 
                     virulent='False', annotation = 'False', p_ad_no_citoplasm_filter=param[1],\
                     p_ad_extracellular_filter=param[2])
        
        if proteome in outdic['n']:
            outdic['n'][proteome] = data
        else:
            outdic['p'][proteome] = data
    
    print(f'Padlimit: {param[0]}, p_ad_no_citoplasm_filter: {param[1]}, p_ad_extracellular_filter: {param[2]}')
    print(f"G_neg and G_pos test antigens recall: {outdic['n']['gram_neg']}, {outdic['p']['gram_pos']}")
    for gram in outdic:
        res = 0
        for proteome in outdic[gram]:
            if proteome != 'gram_neg' and proteome != 'gram_pos':
                res += outdic[gram][proteome] / (len(outdic[gram]) - 1)
        print(f"Gram {gram} medium extraction: {round(res, 2)}")
    print('--------------------------------------')    

Padlimit: 0.89, p_ad_no_citoplasm_filter: 0.46, p_ad_extracellular_filter: 0.38
G_neg and G_pos test antigens recall: 0.45, 0.28
Gram n medium extraction: 0.02
Gram p medium extraction: 0.03
--------------------------------------
Padlimit: 0.95, p_ad_no_citoplasm_filter: 0.46, p_ad_extracellular_filter: 0.38
G_neg and G_pos test antigens recall: 0.45, 0.28
Gram n medium extraction: 0.02
Gram p medium extraction: 0.03
--------------------------------------
Padlimit: 0.85, p_ad_no_citoplasm_filter: 0.46, p_ad_extracellular_filter: 0.38
G_neg and G_pos test antigens recall: 0.45, 0.28
Gram n medium extraction: 0.02
Gram p medium extraction: 0.03
--------------------------------------
Padlimit: 0.8, p_ad_no_citoplasm_filter: 0.46, p_ad_extracellular_filter: 0.38
G_neg and G_pos test antigens recall: 0.45, 0.31
Gram n medium extraction: 0.02
Gram p medium extraction: 0.04
--------------------------------------
Padlimit: 0.75, p_ad_no_citoplasm_filter: 0.46, p_ad_extracellular_filter: 0.38
G

In [None]:
# tests on old select
# only tuning padlimit

PARAMS = [[0.94], [0.92], [0.90], [0.89], [0.88], [0.86], [0.84], [0.80], [0.75], [0.70], [0.65], [0.5]] 
PROTEOMES = ['gram_neg', 'UP000000419', 'UP000002676', 'UP000000540', 'UP000029988', 'UP000000579',\
             'gram_pos', 'UP000001978', 'UP000031961', 'UP000001584', 'UP000070260', 'UP000032903']


for param in PARAMS:
    
    outdic = {'n': {'gram_neg': 0, 'UP000000419': 0, 'UP000002676': 0, 'UP000000540': 0, 
                    'UP000029988': 0, 'UP000000579': 0}, 
              'p': {'gram_pos': 0, 'UP000001978': 0, 'UP000031961': 0, 'UP000001584': 0, 
                    'UP000070260': 0, 'UP000032903': 0}}
    
    for proteome in PROTEOMES:
        proteome_path = './tuning/results/' + proteome + '.csv'
        list_of_proteins = protein_loader(proteome_path)
        
        data = tuner(padlimit = param[0], list_of_proteins=list_of_proteins, s = 'old', mouse='False', 
                     virulent='False', annotation = 'False')
        
        if proteome in outdic['n']:
            outdic['n'][proteome] = data
        else:
            outdic['p'][proteome] = data
    
    print(f'Padlimit: {param[0]} p_ad_no_citoplasm_filter: 0, p_ad_extracellular_filter: 0')
    print(f"G_neg and G_pos test antigens recall: {outdic['n']['gram_neg']}, {outdic['p']['gram_pos']}")
    for gram in outdic:
        res = 0
        for proteome in outdic[gram]:
            if proteome != 'gram_neg' and proteome != 'gram_pos':
                res += outdic[gram][proteome] / (len(outdic[gram]) - 1)
        print(f"Gram {gram} medium extraction: {round(res, 2)}")
    print('--------------------------------------')    

In [317]:
# tests on old select
# tuning padlimit, mouse True

PARAMS = [[0.94], [0.92], [0.90], [0.89], [0.88], [0.86], [0.84], [0.80], [0.75], [0.70], [0.65], [0.5]] 
PROTEOMES = ['gram_neg', 'UP000000419', 'UP000002676', 'UP000000540', 'UP000029988', 'UP000000579',\
             'gram_pos', 'UP000001978', 'UP000031961', 'UP000001584', 'UP000070260', 'UP000032903']


for param in PARAMS:
    
    outdic = {'n': {'gram_neg': 0, 'UP000000419': 0, 'UP000002676': 0, 'UP000000540': 0, 
                    'UP000029988': 0, 'UP000000579': 0}, 
              'p': {'gram_pos': 0, 'UP000001978': 0, 'UP000031961': 0, 'UP000001584': 0, 
                    'UP000070260': 0, 'UP000032903': 0}}
    
    for proteome in PROTEOMES:
        proteome_path = './tuning/results/' + proteome + '.csv'
        list_of_proteins = protein_loader(proteome_path)
        
        data = tuner(padlimit = param[0], list_of_proteins=list_of_proteins, s = 'old', mouse='True', 
                     virulent='False', annotation = 'False')
        
        if proteome in outdic['n']:
            outdic['n'][proteome] = data
        else:
            outdic['p'][proteome] = data
    
    print(f'Padlimit: {param[0]}, p_ad_no_citoplasm_filter: 0, p_ad_extracellular_filter: 0')
    print(f"G_neg and G_pos test antigens recall: {outdic['n']['gram_neg']}, {outdic['p']['gram_pos']}")
    for gram in outdic:
        res = 0
        for proteome in outdic[gram]:
            if proteome != 'gram_neg' and proteome != 'gram_pos':
                res += outdic[gram][proteome] / (len(outdic[gram]) - 1)
        print(f"Gram {gram} medium extraction: {round(res, 2)}")
    print('--------------------------------------')   

Padlimit: 0.94, p_ad_no_citoplasm_filter: 0, p_ad_extracellular_filter: 0
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction: 0.15
Gram p medium extraction: 0.15
--------------------------------------
Padlimit: 0.92, p_ad_no_citoplasm_filter: 0, p_ad_extracellular_filter: 0
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction: 0.15
Gram p medium extraction: 0.15
--------------------------------------
Padlimit: 0.9, p_ad_no_citoplasm_filter: 0, p_ad_extracellular_filter: 0
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.89, p_ad_no_citoplasm_filter: 0, p_ad_extracellular_filter: 0
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.88, p_ad_no_citoplasm_filter: 0, p_ad_extracellular_filter: 0
G_neg and G_pos test antigens r

In [319]:
# tests on old select
# tuning padlimit, mouse True, virulent true

PARAMS = [[0.94], [0.92], [0.90], [0.89], [0.88], [0.86], [0.84], [0.80], [0.75], [0.70], [0.65], [0.5]] 
PROTEOMES = ['gram_neg', 'UP000000419', 'UP000002676', 'UP000000540', 'UP000029988', 'UP000000579',\
             'gram_pos', 'UP000001978', 'UP000031961', 'UP000001584', 'UP000070260', 'UP000032903']


for param in PARAMS:
    
    outdic = {'n': {'gram_neg': 0, 'UP000000419': 0, 'UP000002676': 0, 'UP000000540': 0, 
                    'UP000029988': 0, 'UP000000579': 0}, 
              'p': {'gram_pos': 0, 'UP000001978': 0, 'UP000031961': 0, 'UP000001584': 0, 
                    'UP000070260': 0, 'UP000032903': 0}}
    
    for proteome in PROTEOMES:
        proteome_path = './tuning/results/' + proteome + '.csv'
        list_of_proteins = protein_loader(proteome_path)
        
        data = tuner(padlimit = param[0], list_of_proteins=list_of_proteins, s = 'old', mouse='True', 
                     virulent='True', annotation = 'False')
        
        if proteome in outdic['n']:
            outdic['n'][proteome] = data
        else:
            outdic['p'][proteome] = data
    
    print(f'Padlimit: {param[0]}, p_ad_no_citoplasm_filter: 0, p_ad_extracellular_filter: 0')
    print(f"G_neg and G_pos test antigens recall: {outdic['n']['gram_neg']}, {outdic['p']['gram_pos']}")
    for gram in outdic:
        res = 0
        for proteome in outdic[gram]:
            if proteome != 'gram_neg' and proteome != 'gram_pos':
                res += outdic[gram][proteome] / (len(outdic[gram]) - 1)
        print(f"Gram {gram} medium extraction: {round(res, 2)}")
    print('--------------------------------------')   

Padlimit: 0.94, p_ad_no_citoplasm_filter: 0, p_ad_extracellular_filter: 0
G_neg and G_pos test antigens recall: 0.52, 0.41
Gram n medium extraction: 0.07
Gram p medium extraction: 0.1
--------------------------------------
Padlimit: 0.92, p_ad_no_citoplasm_filter: 0, p_ad_extracellular_filter: 0
G_neg and G_pos test antigens recall: 0.52, 0.41
Gram n medium extraction: 0.07
Gram p medium extraction: 0.1
--------------------------------------
Padlimit: 0.9, p_ad_no_citoplasm_filter: 0, p_ad_extracellular_filter: 0
G_neg and G_pos test antigens recall: 0.52, 0.41
Gram n medium extraction: 0.07
Gram p medium extraction: 0.1
--------------------------------------
Padlimit: 0.89, p_ad_no_citoplasm_filter: 0, p_ad_extracellular_filter: 0
G_neg and G_pos test antigens recall: 0.52, 0.41
Gram n medium extraction: 0.07
Gram p medium extraction: 0.1
--------------------------------------
Padlimit: 0.88, p_ad_no_citoplasm_filter: 0, p_ad_extracellular_filter: 0
G_neg and G_pos test antigens recal

In [297]:
# test a variation of the new select module

def select_variation(list_of_proteins, transmemb_doms_limit,
           padlimit, mouse, mouse_peptides_sum_limit, virlimit, virulent, annotation)->list:
    """Selects suitable candidate proteins for vaccine production"""
    
    # annotations to exclude proteins
    exclusion_annotations = ['structural constituent of ribosome', 'DNA binding', 'DNA-binding transcription factor activity',
                             'transcription regulator activity', 'rRNA binding', 'RNA binding',
                             'aminoacyl-tRNA ligase activity', 'sequence-specific DNA binding', 
                             'catalytic activity, acting on a tRNA', 'catalytic activity, acting on RNA',
                             'tyrosine-tRNA ligase activity', 'aminoacyl-tRNA editing activity',
                             'translation factor activity, RNA binding', 'translation regulator activity',
                             'translation regulator activity, nucleic acid binding', 
                             'translation elongation factor activity', 'catalytic activity, acting on DNA'
                            ]
    
    final_list = []
    for protein in list_of_proteins:
        # exclude cytoplasmatic proteins if low PAD or VIR
        #if virulent == "True":
        #    if (protein.localization[0].localization == "Cytoplasmic" and (protein.p_ad < padlimit and protein.p_vir < virlimit)): continue 
        #if virulent != "True":
        #    if protein.localization[0].localization == "Cytoplasmic" and protein.p_ad < padlimit: continue 
        if protein.localization[0].localization == "Cytoplasmic" and protein.localization[0].reliability >= 7.49: continue
        
        # exlude low fidelty localization prediction proteins if low PAD or VIR
        if virulent == "True":
            if protein.localization[0].reliability < 7.49 and (protein.p_vir < virlimit and protein.p_ad < padlimit): continue
        if virulent != "True":
            if protein.localization[0].reliability < 7.49 and protein.p_ad < padlimit: continue
        
        if (protein.transmembrane_doms >= transmemb_doms_limit) and (protein.original_sequence_if_razor is None): continue
        if protein.sapiens_peptides_sum > .15: continue
        if len(protein.list_of_peptides_from_comparison_with_mhcpep_sapiens) >= 1: continue
        if mouse == "True":
            if protein.mouse_peptides_sum > mouse_peptides_sum_limit: continue 
            if len(protein.list_of_peptides_from_comparison_with_mhcpep_mouse) >= 1: continue 
        
        annotation_flag = "False"
        if annotation == "True":
            for annot in exclusion_annotations:
                if annot in str(protein.annotations): 
                    annotation_flag = "True"
        if annotation_flag == "True": continue

        final_list.append(protein)
    return final_list   

        
def tuner_variation(padlimit=0.89, virlimit=0.95, list_of_proteins=[], 
          transmemb_doms_limit = 3,
          mouse = "True", mouse_peptides_sum_limit = .15, virulent = "True", annotation = "True", s = 'new'):
    
    pre_len = len(list_of_proteins)

    list_of_proteins = select_variation(list_of_proteins, transmemb_doms_limit,
           padlimit, mouse, mouse_peptides_sum_limit, virlimit, virulent, annotation)
    post_len = len(list_of_proteins)
    return round((post_len/pre_len), 2)

In [298]:
# tests on a variation of the new select
# change padlimit and keep other modules disabled

PARAMS = [[0.5], [0.6], [0.7], [0.8], [0.85], [0.9], [0.95]] 
PROTEOMES = ['gram_neg', 'UP000000419', 'UP000002676', 'UP000000540', 'UP000029988', 'UP000000579',\
             'gram_pos', 'UP000001978', 'UP000031961', 'UP000001584', 'UP000070260', 'UP000032903']


for param in PARAMS:
    
    outdic = {'n': {'gram_neg': 0, 'UP000000419': 0, 'UP000002676': 0, 'UP000000540': 0, 
                    'UP000029988': 0, 'UP000000579': 0}, 
              'p': {'gram_pos': 0, 'UP000001978': 0, 'UP000031961': 0, 'UP000001584': 0, 
                    'UP000070260': 0, 'UP000032903': 0}}
    
    for proteome in PROTEOMES:
        proteome_path = './tuning/results/' + proteome + '.csv'
        list_of_proteins = protein_loader(proteome_path)
        
        data = tuner_variation(padlimit = param[0], list_of_proteins=list_of_proteins, s = 'old', mouse='False', 
                     virulent='False', annotation = 'False')
        
        if proteome in outdic['n']:
            outdic['n'][proteome] = data
        else:
            outdic['p'][proteome] = data
    
    print(f'Padlimit: {param[0]}')
    print(f"G_neg and G_pos test antigens recall: {outdic['n']['gram_neg']}, {outdic['p']['gram_pos']}")
    for gram in outdic:
        res = 0
        for proteome in outdic[gram]:
            if proteome != 'gram_neg' and proteome != 'gram_pos':
                res += outdic[gram][proteome] / (len(outdic[gram]) - 1)
        print(f"Gram {gram} medium extraction: {round(res, 2)}")
    print('--------------------------------------')    

Padlimit: 0.5
G_neg and G_pos test antigens recall: 0.77, 0.75
Gram n medium extraction: 0.16
Gram p medium extraction: 0.17
--------------------------------------
Padlimit: 0.6
G_neg and G_pos test antigens recall: 0.77, 0.75
Gram n medium extraction: 0.15
Gram p medium extraction: 0.17
--------------------------------------
Padlimit: 0.7
G_neg and G_pos test antigens recall: 0.77, 0.72
Gram n medium extraction: 0.15
Gram p medium extraction: 0.17
--------------------------------------
Padlimit: 0.8
G_neg and G_pos test antigens recall: 0.74, 0.72
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.85
G_neg and G_pos test antigens recall: 0.74, 0.69
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.9
G_neg and G_pos test antigens recall: 0.74, 0.69
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.95


In [299]:
# tests on a variation of the new select
# change padlimit, mouse true

PARAMS = [[0.5], [0.6], [0.7], [0.8], [0.85], [0.9], [0.95]] 
PROTEOMES = ['gram_neg', 'UP000000419', 'UP000002676', 'UP000000540', 'UP000029988', 'UP000000579',\
             'gram_pos', 'UP000001978', 'UP000031961', 'UP000001584', 'UP000070260', 'UP000032903']


for param in PARAMS:
    
    outdic = {'n': {'gram_neg': 0, 'UP000000419': 0, 'UP000002676': 0, 'UP000000540': 0, 
                    'UP000029988': 0, 'UP000000579': 0}, 
              'p': {'gram_pos': 0, 'UP000001978': 0, 'UP000031961': 0, 'UP000001584': 0, 
                    'UP000070260': 0, 'UP000032903': 0}}
    
    for proteome in PROTEOMES:
        proteome_path = './tuning/results/' + proteome + '.csv'
        list_of_proteins = protein_loader(proteome_path)
        
        data = tuner_variation(padlimit = param[0], list_of_proteins=list_of_proteins, s = 'old', mouse='True', 
                     virulent='False', annotation = 'False')
        
        if proteome in outdic['n']:
            outdic['n'][proteome] = data
        else:
            outdic['p'][proteome] = data
    
    print(f'Padlimit: {param[0]}')
    print(f"G_neg and G_pos test antigens recall: {outdic['n']['gram_neg']}, {outdic['p']['gram_pos']}")
    for gram in outdic:
        res = 0
        for proteome in outdic[gram]:
            if proteome != 'gram_neg' and proteome != 'gram_pos':
                res += outdic[gram][proteome] / (len(outdic[gram]) - 1)
        print(f"Gram {gram} medium extraction: {round(res, 2)}")
    print('--------------------------------------')

Padlimit: 0.5
G_neg and G_pos test antigens recall: 0.77, 0.69
Gram n medium extraction: 0.15
Gram p medium extraction: 0.17
--------------------------------------
Padlimit: 0.6
G_neg and G_pos test antigens recall: 0.77, 0.69
Gram n medium extraction: 0.15
Gram p medium extraction: 0.17
--------------------------------------
Padlimit: 0.7
G_neg and G_pos test antigens recall: 0.77, 0.66
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.8
G_neg and G_pos test antigens recall: 0.74, 0.66
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.85
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.9
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.95


In [300]:
# tests on a variation of the new select
# change padlimit mouse true, annotation true

PARAMS = [[0.5], [0.6], [0.7], [0.8], [0.85], [0.9], [0.95]] 
PROTEOMES = ['gram_neg', 'UP000000419', 'UP000002676', 'UP000000540', 'UP000029988', 'UP000000579',\
             'gram_pos', 'UP000001978', 'UP000031961', 'UP000001584', 'UP000070260', 'UP000032903']


for param in PARAMS:
    
    outdic = {'n': {'gram_neg': 0, 'UP000000419': 0, 'UP000002676': 0, 'UP000000540': 0, 
                    'UP000029988': 0, 'UP000000579': 0}, 
              'p': {'gram_pos': 0, 'UP000001978': 0, 'UP000031961': 0, 'UP000001584': 0, 
                    'UP000070260': 0, 'UP000032903': 0}}
    
    for proteome in PROTEOMES:
        proteome_path = './tuning/results/' + proteome + '.csv'
        list_of_proteins = protein_loader(proteome_path)
        
        data = tuner_variation(padlimit = param[0], list_of_proteins=list_of_proteins, s = 'old', mouse='True', 
                     virulent='False', annotation = 'True')
        
        if proteome in outdic['n']:
            outdic['n'][proteome] = data
        else:
            outdic['p'][proteome] = data
    
    print(f'Padlimit: {param[0]}')
    print(f"G_neg and G_pos test antigens recall: {outdic['n']['gram_neg']}, {outdic['p']['gram_pos']}")
    for gram in outdic:
        res = 0
        for proteome in outdic[gram]:
            if proteome != 'gram_neg' and proteome != 'gram_pos':
                res += outdic[gram][proteome] / (len(outdic[gram]) - 1)
        print(f"Gram {gram} medium extraction: {round(res, 2)}")
    print('--------------------------------------')

Padlimit: 0.5
G_neg and G_pos test antigens recall: 0.77, 0.69
Gram n medium extraction: 0.15
Gram p medium extraction: 0.17
--------------------------------------
Padlimit: 0.6
G_neg and G_pos test antigens recall: 0.77, 0.69
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.7
G_neg and G_pos test antigens recall: 0.77, 0.66
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.8
G_neg and G_pos test antigens recall: 0.74, 0.66
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.85
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.9
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction: 0.15
Gram p medium extraction: 0.15
--------------------------------------
Padlimit: 0.95


In [303]:
# tests on a variation of the new select
# change padlimit mouse true, annotation true, virulent true

PARAMS = [[0.5], [0.6], [0.7], [0.8], [0.85], [0.9], [0.95]] 
PROTEOMES = ['gram_neg', 'UP000000419', 'UP000002676', 'UP000000540', 'UP000029988', 'UP000000579',\
             'gram_pos', 'UP000001978', 'UP000031961', 'UP000001584', 'UP000070260', 'UP000032903']


for param in PARAMS:
    
    outdic = {'n': {'gram_neg': 0, 'UP000000419': 0, 'UP000002676': 0, 'UP000000540': 0, 
                    'UP000029988': 0, 'UP000000579': 0}, 
              'p': {'gram_pos': 0, 'UP000001978': 0, 'UP000031961': 0, 'UP000001584': 0, 
                    'UP000070260': 0, 'UP000032903': 0}}
    
    for proteome in PROTEOMES:
        proteome_path = './tuning/results/' + proteome + '.csv'
        list_of_proteins = protein_loader(proteome_path)
        
        data = tuner_variation(padlimit = param[0], list_of_proteins=list_of_proteins, s = 'old', mouse='True', 
                     virulent='True', annotation = 'True')
        
        if proteome in outdic['n']:
            outdic['n'][proteome] = data
        else:
            outdic['p'][proteome] = data
    
    print(f'Padlimit: {param[0]}')
    print(f"G_neg and G_pos test antigens recall: {outdic['n']['gram_neg']}, {outdic['p']['gram_pos']}")
    for gram in outdic:
        res = 0
        for proteome in outdic[gram]:
            if proteome != 'gram_neg' and proteome != 'gram_pos':
                res += outdic[gram][proteome] / (len(outdic[gram]) - 1)
        print(f"Gram {gram} medium extraction: {round(res, 2)}")
    print('--------------------------------------')

Padlimit: 0.5
G_neg and G_pos test antigens recall: 0.77, 0.69
Gram n medium extraction: 0.15
Gram p medium extraction: 0.17
--------------------------------------
Padlimit: 0.6
G_neg and G_pos test antigens recall: 0.77, 0.69
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.7
G_neg and G_pos test antigens recall: 0.77, 0.66
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.8
G_neg and G_pos test antigens recall: 0.74, 0.66
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.85
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.9
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction: 0.15
Gram p medium extraction: 0.15
--------------------------------------
Padlimit: 0.95


In [305]:
# test another variation of select
def select_variation_2(list_of_proteins, transmemb_doms_limit,
           padlimit, mouse, mouse_peptides_sum_limit, virlimit, virulent)->list:
    """Selects suitable candidate proteins for vaccine production"""
    
    # annotations to exclude proteins
    exclusion_annotations = ['structural constituent of ribosome', 'DNA binding', 'DNA-binding transcription factor activity',
                             'transcription regulator activity', 'rRNA binding', 'RNA binding',
                             'aminoacyl-tRNA ligase activity', 'sequence-specific DNA binding', 
                             'catalytic activity, acting on a tRNA', 'catalytic activity, acting on RNA',
                             'tyrosine-tRNA ligase activity', 'aminoacyl-tRNA editing activity',
                             'translation factor activity, RNA binding', 'translation regulator activity',
                             'translation regulator activity, nucleic acid binding', 
                             'translation elongation factor activity', 'catalytic activity, acting on DNA'
                            ]
    
    final_list = []
    for protein in list_of_proteins:
        if protein.localization[0].localization == "Cytoplasmic" and protein.localization[0].reliability >= 7.5: continue 
        #if protein.localization[0].reliability <= 3: continue
        if (protein.transmembrane_doms >= transmemb_doms_limit) and (protein.original_sequence_if_razor is None): continue
        if protein.sapiens_peptides_sum > .15: continue
        if len(protein.list_of_peptides_from_comparison_with_mhcpep_sapiens) >= 1: continue
        if virulent == "True":
            if (float(protein.localization[0].reliability) < 7.49) and (protein.p_ad < padlimit\
                                                                       and protein.p_vir < virlimit): continue
        if virulent != "True":
            if (float(protein.localization[0].reliability) < 7.49) and (protein.p_ad < padlimit): continue
        # proteins with Unknown localization have score==0
        #if (protein.localization[0].localization == "Unknown") and (protein.p_ad < padlimit): continue
        if mouse==True:
            if protein.mouse_peptides_sum > mouse_peptides_sum_limit: continue 
            if len(protein.list_of_peptides_from_comparison_with_mhcpep_mouse) >= 1: continue 
        
        annotation_flag = "False"
        if annotation == "True":
            for annot in exclusion_annotations:
                if annot in str(protein.annotations): 
                    annotation_flag = "True"
        if annotation_flag == "True": continue
        
        final_list.append(protein)     
    return final_list
        
def tuner_variation_2(padlimit=0.89, virlimit=0.95, list_of_proteins=[], 
          transmemb_doms_limit = 3,
          mouse = "True", mouse_peptides_sum_limit = .15, virulent = "True", annotation = "True"):
    
    pre_len = len(list_of_proteins)

    list_of_proteins = select_variation(list_of_proteins, transmemb_doms_limit,
           padlimit, mouse, mouse_peptides_sum_limit, virlimit, virulent, annotation)
    post_len = len(list_of_proteins)
    return round((post_len/pre_len), 2)

In [307]:
# tests on a variation of the new select
# change padlimit mouse true, annotation true, virulent False

PARAMS = [[0.5, 0.5], [0.6, 0.6], [0.7, 0.7], [0.8, 0.8], [0.9, 0.9], [0.95, 0.95],\
          [0.5, 0.6], [0.5, 0.7], [0.5, 0.8], \
          [0.5, 0.9], [0.6, 0.5], [0.7, 0.5], [0.8, 0.5], [0.9, 0.5]] 
PROTEOMES = ['gram_neg', 'UP000000419', 'UP000002676', 'UP000000540', 'UP000029988', 'UP000000579',\
             'gram_pos', 'UP000001978', 'UP000031961', 'UP000001584', 'UP000070260', 'UP000032903']


for param in PARAMS:
    
    outdic = {'n': {'gram_neg': 0, 'UP000000419': 0, 'UP000002676': 0, 'UP000000540': 0, 
                    'UP000029988': 0, 'UP000000579': 0}, 
              'p': {'gram_pos': 0, 'UP000001978': 0, 'UP000031961': 0, 'UP000001584': 0, 
                    'UP000070260': 0, 'UP000032903': 0}}
    
    for proteome in PROTEOMES:
        proteome_path = './tuning/results/' + proteome + '.csv'
        list_of_proteins = protein_loader(proteome_path)
        
        data = tuner_variation_2(padlimit = param[0], virlimit = param[1],\
                                 list_of_proteins=list_of_proteins, mouse='True', 
                     virulent='False', annotation = 'True')
        
        if proteome in outdic['n']:
            outdic['n'][proteome] = data
        else:
            outdic['p'][proteome] = data
    
    print(f'Padlimit: {param[0]}, virlimit: {param[1]}')
    print(f"G_neg and G_pos test antigens recall: {outdic['n']['gram_neg']}, {outdic['p']['gram_pos']}")
    for gram in outdic:
        res = 0
        for proteome in outdic[gram]:
            if proteome != 'gram_neg' and proteome != 'gram_pos':
                res += outdic[gram][proteome] / (len(outdic[gram]) - 1)
        print(f"Gram {gram} medium extraction: {round(res, 2)}")
    print('--------------------------------------')

Padlimit: 0.5, virlimit: 0.5
G_neg and G_pos test antigens recall: 0.77, 0.69
Gram n medium extraction: 0.15
Gram p medium extraction: 0.17
--------------------------------------
Padlimit: 0.6, virlimit: 0.6
G_neg and G_pos test antigens recall: 0.77, 0.69
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.7, virlimit: 0.7
G_neg and G_pos test antigens recall: 0.77, 0.66
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.8, virlimit: 0.8
G_neg and G_pos test antigens recall: 0.74, 0.66
Gram n medium extraction: 0.15
Gram p medium extraction: 0.16
--------------------------------------
Padlimit: 0.9, virlimit: 0.9
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction: 0.15
Gram p medium extraction: 0.15
--------------------------------------
Padlimit: 0.95, virlimit: 0.95
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction:

In [306]:
# tests on a variation of the new select
# change padlimit mouse true, annotation true, virulent true

PARAMS = [[0.5, 0.5], [0.6, 0.6], [0.7, 0.7], [0.8, 0.8], [0.9, 0.9], [0.95, 0.95],\
          [0.5, 0.6], [0.5, 0.7], [0.5, 0.8], \
          [0.5, 0.9], [0.6, 0.5], [0.7, 0.5], [0.8, 0.5], [0.9, 0.5]] 
PROTEOMES = ['gram_neg', 'UP000000419', 'UP000002676', 'UP000000540', 'UP000029988', 'UP000000579',\
             'gram_pos', 'UP000001978', 'UP000031961', 'UP000001584', 'UP000070260', 'UP000032903']


for param in PARAMS:
    
    outdic = {'n': {'gram_neg': 0, 'UP000000419': 0, 'UP000002676': 0, 'UP000000540': 0, 
                    'UP000029988': 0, 'UP000000579': 0}, 
              'p': {'gram_pos': 0, 'UP000001978': 0, 'UP000031961': 0, 'UP000001584': 0, 
                    'UP000070260': 0, 'UP000032903': 0}}
    
    for proteome in PROTEOMES:
        proteome_path = './tuning/results/' + proteome + '.csv'
        list_of_proteins = protein_loader(proteome_path)
        
        data = tuner_variation_2(padlimit = param[0], virlimit = param[1],\
                                 list_of_proteins=list_of_proteins, mouse='True', 
                     virulent='True', annotation = 'True')
        
        if proteome in outdic['n']:
            outdic['n'][proteome] = data
        else:
            outdic['p'][proteome] = data
    
    print(f'Padlimit: {param[0]}, virlimit: {param[1]}')
    print(f"G_neg and G_pos test antigens recall: {outdic['n']['gram_neg']}, {outdic['p']['gram_pos']}")
    for gram in outdic:
        res = 0
        for proteome in outdic[gram]:
            if proteome != 'gram_neg' and proteome != 'gram_pos':
                res += outdic[gram][proteome] / (len(outdic[gram]) - 1)
        print(f"Gram {gram} medium extraction: {round(res, 2)}")
    print('--------------------------------------')

Padlimit: 0.5, virlimit: 0.5
G_neg and G_pos test antigens recall: 0.84, 0.72
Gram n medium extraction: 0.26
Gram p medium extraction: 0.28
--------------------------------------
Padlimit: 0.6, virlimit: 0.6
G_neg and G_pos test antigens recall: 0.84, 0.69
Gram n medium extraction: 0.23
Gram p medium extraction: 0.27
--------------------------------------
Padlimit: 0.7, virlimit: 0.7
G_neg and G_pos test antigens recall: 0.81, 0.66
Gram n medium extraction: 0.22
Gram p medium extraction: 0.26
--------------------------------------
Padlimit: 0.8, virlimit: 0.8
G_neg and G_pos test antigens recall: 0.77, 0.66
Gram n medium extraction: 0.19
Gram p medium extraction: 0.25
--------------------------------------
Padlimit: 0.9, virlimit: 0.9
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction: 0.16
Gram p medium extraction: 0.2
--------------------------------------
Padlimit: 0.95, virlimit: 0.95
G_neg and G_pos test antigens recall: 0.74, 0.62
Gram n medium extraction: 

In [72]:
# examples:

test = RunNerve()
test.args['proteome1'] = './tuning/gram_neg.fasta'
test.args['gram'] = 'n'
test.args.pop('proteome2')
test.args['working_dir'] = './tuning/gram_neg_test'
test.args['virulent'] = False
test.print_args()
test.run()

Arguments are:
{'annotation': True, 'e_value': 1e-10, 'gram': 'n', 'minlength': 9, 'mismatch': 1, 'mouse': True, 'mouse_peptides_sum_limit': 0.15, 'proteome1': './tuning/gram_neg.fasta', 'p_ad_extracellular_filter': 0.38, 'p_ad_no_citoplasm_filter': 0.46, 'padlimit': 0.5, 'razor': True, 'razlen': 50, 'select': True, 'substitution': 3, 'transmemb_doms_limit': 3, 'virlimit': 0.5, 'virulent': False, 'working_dir': './tuning/gram_neg_test', 'NERVE_dir': '../../', 'iFeature_dir': '/iFeature', 'DeepFri_dir': '/DeepFri'}
NERVE executable path is:
../../code/NERVE.py


(b'Start NERVE 1.5\n10% done\n20% done\n30% done\n40% done\n50% done\n60% done\n70% done\n80% done\n90% done\n100% done\nEnd NERVE computation successfully.\n',
 None)

In [156]:
list_of_proteins = protein_loader('./tuning/results/gram_neg.csv')
tuner(list_of_proteins, .5, .5, 3, 0.8, "True", .15, 0.8, "True")

0.8064516129032258

In [347]:
test = RunNerve()
test.args['proteome1'] = './tuning/gram_pos_2.fasta'
test.args['gram'] = 'p'
test.args.pop('proteome2')
test.args['working_dir'] = './tuning/gram_pos_test'
#test.args['mouse'] = "False"
#test.args['virulent'] = "False"
test.print_args()
test.run()

Arguments are:
{'annotation': True, 'e_value': 1e-10, 'gram': 'p', 'minlength': 9, 'mismatch': 1, 'mouse': True, 'mouse_peptides_sum_limit': 0.15, 'proteome1': './tuning/gram_pos_2.fasta', 'p_ad_extracellular_filter': 0.38, 'p_ad_no_citoplasm_filter': 0.46, 'padlimit': 0.5, 'razor': True, 'razlen': 50, 'select': True, 'substitution': 3, 'transmemb_doms_limit': 3, 'virlimit': 0.5, 'virulent': True, 'working_dir': './tuning/gram_pos_test', 'NERVE_dir': '../../', 'iFeature_dir': '/iFeature', 'DeepFri_dir': '/DeepFri'}
NERVE executable path is:
../../code/NERVE.py


(b'Start NERVE 1.5\n10% done\n20% done\n30% done\n40% done\n50% done\n60% done\n70% done\n80% done\n90% done\n100% done\nEnd NERVE computation successfully.\n',
 None)