In [None]:
from pyteomics import pepxml, auxiliary as aux, fasta, achrom, mass, electrochem as ec, parser, mgf, mass, cmass, mzid, protxml
import pandas as pd
from collections import Counter, defaultdict
import random
from os import path, listdir, mkdir
%matplotlib inline
import pylab as plt
from matplotlib.patches import Patch
import numpy as np
import seaborn
import shutil
import glob

In [None]:
# Root folder of the study; it is created below when it does not exist yet.
infolder = '/home/mark/overfit_test/'
# Sub-folder in which the FASTA files of the study are stored.
fasta_folder = path.join(infolder, 'fasta')

# Create the parent first: mkdir() does not create intermediate folders.
for _study_dir in (infolder, fasta_folder):
    if not path.isdir(_study_dir):
        mkdir(_study_dir)

# Put the target FASTA file into fasta_folder and write its file name below.
path_to_input_target_fasta = path.join(fasta_folder, 'sprot_human.fasta')

In [None]:
# Shuffled DECOY database generation that keeps the intersection of input and decoy peptides to a minimum.

def make_shuffle_new(infile, prefix='DECOY_', expasy_r=parser.expasy_rules['trypsin']):
    """Build a protein list made of the input proteins followed by shuffled decoys.

    Every input sequence has 'L' replaced by 'I'. Each protein is shuffled with
    ``fasta.shuffle(..., keep_nterm=True)`` and digested with ``expasy_r``
    (2 missed cleavages, minimum length 6). A decoy peptide that also occurs among
    the peptides of the input database is replaced inside the decoy sequence:

    1. by a re-shuffle of the peptide that keeps its C-terminal residue
       (up to 30 attempts);
    2. only if all re-shuffles collide, by a random sequence of the same length
       that keeps the C-terminal residue (up to 30 attempts).

    Parameters
    ----------
    infile : str or file
        FASTA input, passed to ``fasta.read``.
    prefix : str
        Prefix prepended to the description of every generated decoy protein.
    expasy_r : str
        Cleavage rule passed to ``parser.cleave``.

    Returns
    -------
    list of (description, sequence) tuples
        The input proteins (with L->I) followed by one decoy per input protein.

    Two counters are printed: the number of colliding peptides for which no unique
    replacement was found, and the total number of colliding decoy peptides.
    """
    max_attempts = 30  # attempts per replacement strategy

    # Alphabet for the random fallback. It is sorted so that seeding ``random``
    # gives reproducible peptides (set iteration order is not stable between runs).
    # NOTE(review): K and R are presumably excluded so that a random replacement
    # cannot introduce a new tryptic cleavage site - confirm.
    defaalist = sorted(set(mass.std_aa_mass.keys()) - set('OURK'))

    cnt_shared = 0     # colliding peptides that could not be replaced by a unique one
    cnt_defshared = 0  # decoy peptides that were found among the input peptides
    prots = []
    all_peps = set()
    for p in fasta.read(infile):
        rseq = p[1].replace('L', 'I')
        all_peps.update(parser.cleave(rseq, expasy_r, 2, min_length=6))
        prots.append((p[0], rseq))

    # Iterate over a snapshot of the input proteins: ``prots`` grows below as
    # decoys are appended, and re-parsing the FASTA file is unnecessary.
    for description, rseq in list(prots):
        sh_seq = fasta.shuffle(rseq, keep_nterm=True)
        for pep in parser.cleave(sh_seq, expasy_r, 2, min_length=6):
            if pep not in all_peps:
                continue
            cnt_defshared += 1

            # Strategy 1: re-shuffle the peptide, keeping its C-terminal residue.
            npep = pep
            found = False
            for _attempt in range(max_attempts):
                npep = fasta.shuffle(pep, keep_cterm=True, keep_nterm=False)
                if npep not in all_peps:
                    found = True
                    break

            # Strategy 2: random peptide. It runs only when strategy 1 failed, so a
            # successful re-shuffle is no longer overwritten by a random sequence.
            if not found:
                for _attempt in range(max_attempts):
                    npep = ''.join(random.choice(defaalist) for _pos in range(len(pep) - 1)) + pep[-1]
                    if npep not in all_peps:
                        found = True
                        break

            if not found:
                cnt_shared += 1
            # As before, the last candidate is substituted even when it still collides.
            sh_seq = sh_seq.replace(pep, npep)
        prots.append((prefix + description, sh_seq))
    print(cnt_shared)
    print(cnt_defshared)
    return prots

In [None]:
# PREPARE 9 FASTA FILES using double shuffling. Details can be found in the manuscript.
#
# Round i:
#   1. input FASTA + CHECK_ shuffles           -> <name>_shuffled_CHECK_i.fasta
#   2. that CHECK file + DECOY_ shuffles of it -> <name>_shuffled_DECOY_i.fasta

infile = path_to_input_target_fasta
fasta_stem = infile.split('.fasta')[0]
for i in range(1, 10):
    tmpfile = fasta_stem + '_shuffled_CHECK_%d.fasta' % (i, )
    outfile = fasta_stem + '_shuffled_DECOY_%d.fasta' % (i, )

    prots1 = make_shuffle_new(infile, prefix='CHECK_')
    # The context manager closes (and flushes) the file even if writing fails;
    # the CHECK file must be complete before it is read back below.
    with open(tmpfile, 'w') as check_fasta:
        fasta.write(prots1, output=check_fasta)

    prots2 = make_shuffle_new(tmpfile, prefix='DECOY_')
    with open(outfile, 'w') as decoy_fasta:
        fasta.write(prots2, output=decoy_fasta)

In [None]:
# Count the distinct theoretical tryptic peptides (2 missed cleavages, length >= 6)
# in the first generated decoy database.
path_to_any_decoy_fasta = path_to_input_target_fasta.split('.fasta')[0] + '_shuffled_DECOY_1.fasta'
trypsin_rule = parser.expasy_rules['trypsin']
all_peps = set()
for entry in fasta.read(path_to_any_decoy_fasta):
    protein_peptides = parser.cleave(entry[1], trypsin_rule, 2, min_length=6)
    all_peps.update(protein_peptides)
print(len(all_peps))

In [None]:
# Folder with the spectra files. Only mzML files were used in the study; the search
# cells below process only files ending in '.mzML'.
mgf_folder = path.join(infolder, 'mzml')
# Create the folder on first use.
if not path.isdir(mgf_folder):
    mkdir(mgf_folder)
    
# Please put the spectra files into this folder before running the search cells.

In [None]:
# RUN MSFragger

# MSFragger require file with parameters. It will be automatically taken from the parameters folder,
# but it should be in format name_of_spectra_file.params
# Path to fasta file inside the parameters will be replaced here in the script automatically

msfragger_folder = path.join(infolder, 'msfragger')
if not path.isdir(msfragger_folder):
    mkdir(msfragger_folder)
msfragger_params_folder = path.join(infolder, 'params')

for mgffile in listdir(mgf_folder):
    if mgffile.endswith('.mzML'):
        basic_name = path.splitext(mgffile)[0]
        inmgf = path.join(mgf_folder, mgffile)
        print(inmgf)
        msfragger_params_file = path.join(msfragger_params_folder, basic_name + '_fragger.params')
        print(msfragger_params_file)
        for fastafile in listdir(fasta_folder):
            if '_DECOY_' in fastafile and fastafile.endswith('.fasta'):
                fpath = path.join(fasta_folder, fastafile)
                search_number = fastafile.split('.fasta')[0].split('_')[-1]
                
                
                tmp_fragger_params = path.join(msfragger_params_folder, 'tmp_fragger.params')
                file_writer = open(tmp_fragger_params, 'w')
                for line in open(msfragger_params_file):
                    if line.startswith('database_name'):
                        line = 'database_name = %s\n' % (fpath, )
                    file_writer.write(line)
                file_writer.close()
                !MSFragger $tmp_fragger_params $inmgf
                pepxml_name = path.join(mgf_folder, basic_name + '.pepXML')
                pepxml_name_new = path.join(msfragger_folder, basic_name + '_%s.pep.xml' % (search_number, ))
                shutil.move(pepxml_name, pepxml_name_new)
                break

In [None]:
# RUN X!Tandem
#
# For every .mzML file in mgf_folder and every *_DECOY_* FASTA file in fasta_folder this
# cell calls `runtandem` and then renames <name>.pep.xml and <name>.t.xml in xtandem_folder
# by appending the search number taken from the FASTA file name.

# X!Tandem requires a parameters file. It is taken from the parameters folder and must be
# named <name_of_spectra_file>.xml

xtandem_folder = path.join(infolder, 'xtandem')
if not path.isdir(xtandem_folder):
    mkdir(xtandem_folder)
xtandem_params_folder = path.join(infolder, 'params')
path_to_tandem_exe = 'tandem.exe'

for mgffile in listdir(mgf_folder):
    if mgffile.endswith('.mzML'):
        basic_name = path.splitext(mgffile)[0]
        inmgf = path.join(mgf_folder, mgffile)
        print(inmgf)
        xtandem_params_file = path.join(xtandem_params_folder, basic_name + '.xml')
        print(xtandem_params_file)
        # Same guard as above the loop, repeated for every spectra file.
        if not path.isdir(xtandem_folder):
            mkdir(xtandem_folder)
        for fastafile in listdir(fasta_folder):
            if '_DECOY_' in fastafile and fastafile.endswith('.fasta'):
                fpath = path.join(fasta_folder, fastafile)
                # Last "_"-separated token of the FASTA name, e.g. '3' for *_DECOY_3.fasta
                search_number = fastafile.split('.fasta')[0].split('_')[-1]
                !runtandem --tandem2xml pepxmltk.py --tandem.exe $path_to_tandem_exe $xtandem_params_file $xtandem_folder $fpath $inmgf
                # Rename both outputs so that the next search does not overwrite them.
                pepxml_name = path.join(xtandem_folder, basic_name + '.pep.xml')
                pepxml_name_new = pepxml_name.replace('.pep.xml', '_%s.pep.xml' % (search_number, ))
                txml_name = path.join(xtandem_folder, basic_name + '.t.xml')
                txml_name_new = txml_name.replace('.t.xml', '_%s.t.xml' % (search_number, ))
                shutil.move(pepxml_name, pepxml_name_new)
                shutil.move(txml_name, txml_name_new)

In [None]:
# RUN msgf+
#
# For every .mzML file and every *_DECOY_* FASTA file this cell runs `msgf+` and writes
# <name>_<search_number>.mzid into msgf_folder.

# msgf+ requires a file with modifications. Please put msgf_mods.txt (and
# msgf_mods_confetti.txt for files with 'confetti' in their name) into the parameters folder.

msgf_folder = path.join(infolder, 'msgf_mzml')
if not path.isdir(msgf_folder):
    mkdir(msgf_folder)
mzml_folder = path.join(infolder, 'mzml')

for mgffile in listdir(mzml_folder):
    if mgffile.endswith('.mzML'):
        basic_name = path.splitext(mgffile)[0]

        # Per-dataset settings are selected by substrings of the file name:
        # the value passed to -inst and the modifications file.
        if 'confetti' in basic_name:
            inst_type = 0
            mods_file = path.join(infolder, 'params/msgf_mods_confetti.txt')
        else:
            inst_type = 3
            mods_file = path.join(infolder, 'params/msgf_mods.txt')
        # Value passed to -t (precursor mass tolerance).
        if '20100609' in basic_name:
            ppmacc = '20ppm'
        else:
            ppmacc = '10ppm'

        inmgf = path.join(mzml_folder, mgffile)
        print(inmgf)
        for fastafile in listdir(fasta_folder):
            if '_DECOY_' in fastafile and fastafile.endswith('.fasta'):
                fpath = path.join(fasta_folder, fastafile)
                # Last "_"-separated token of the FASTA name, e.g. '3' for *_DECOY_3.fasta
                search_number = fastafile.split('.fasta')[0].split('_')[-1]
                # NOTE(review): fileroot is assigned but not used in this cell.
                fileroot = basic_name + '_' + search_number
                mzid_name = path.join(msgf_folder, basic_name + '_%s.mzid' % (search_number, ))
                !msgf+ -s $inmgf -d $fpath -o $mzid_name -t $ppmacc -addFeatures 1 -inst $inst_type -mod $mods_file


In [None]:
# RUN comet
#
# For every .mzML file and every *_DECOY_* FASTA file this cell runs `crux comet`, writing
# into comet_folder with the file root <name>_<search_number>.
comet_folder = path.join(infolder, 'comet')
if not path.isdir(comet_folder):
    mkdir(comet_folder)

for mgffile in listdir(mgf_folder):
    if mgffile.endswith('.mzML'):
        basic_name = path.splitext(mgffile)[0]

        # Per-dataset settings are selected by substrings of the file name:
        # the value passed to --add_C_cysteine and the fragment bin tolerance.
        if 'confetti' in basic_name:
            cys_fixed_mod = 125.047679
            frag_bin_tol = 0.3
        else:
            cys_fixed_mod = 57.021464
            frag_bin_tol = 0.05
        # Value passed to --peptide_mass_tolerance.
        if '20100609' in basic_name:
            ppmacc = 20.0
        else:
            ppmacc = 10.0
            
        inmgf = path.join(mgf_folder, mgffile)
        print(inmgf)
        for fastafile in listdir(fasta_folder):
            if '_DECOY_' in fastafile and fastafile.endswith('.fasta'):
                fpath = path.join(fasta_folder, fastafile)
                # Last "_"-separated token of the FASTA name, e.g. '3' for *_DECOY_3.fasta
                search_number = fastafile.split('.fasta')[0].split('_')[-1]
                fileroot = basic_name + '_' + search_number
                !crux comet $inmgf $fpath --peptide_mass_tolerance $ppmacc --peptide_mass_units 2 \
                            --fragment_bin_tol $frag_bin_tol --output-dir $comet_folder --add_C_cysteine $cys_fixed_mod \
                            --fileroot $fileroot --num_output_lines 1 --isotope_error 1

In [None]:
# RUN Identipy
#
# For every .mzML file and every *_DECOY_* FASTA file this cell runs `identipy` and moves
# <name>.pep.xml from the spectra folder to identipy_folder as <name>_<search_number>.pep.xml.
identipy_folder = path.join(infolder, 'identipy')
if not path.isdir(identipy_folder):
    mkdir(identipy_folder)

for mgffile in listdir(mgf_folder):
    if mgffile.endswith('.mzML'):
        
        basic_name = path.splitext(mgffile)[0]
        inmgf = path.join(mgf_folder, mgffile)
        
        # Per-dataset settings are selected by substrings of the file name:
        # the value passed to -fmods and the tolerance passed to -ftol / -deistol.
        if 'confetti' in basic_name:
            cys_fixed_mod = '125.047679@C'
            frag_bin_tol = 0.3
        else:
            cys_fixed_mod = '57.021464@C'
            frag_bin_tol = 0.05
        # Value passed to -ptol (with -punit ppm).
        if '20100609' in basic_name:
            ppmacc = 20.0
        else:
            ppmacc = 10.0
        
        print(inmgf)
        for fastafile in listdir(fasta_folder):
            if '_DECOY_' in fastafile and fastafile.endswith('.fasta'):
                fpath = path.join(fasta_folder, fastafile)
                # Last "_"-separated token of the FASTA name, e.g. '3' for *_DECOY_3.fasta
                search_number = fastafile.split('.fasta')[0].split('_')[-1]
                
                
                !identipy $inmgf -db $fpath\
                -punit ppm -ptol $ppmacc -funit Da -ftol $frag_bin_tol -fminmz 100 -lmin 6 -lmax 60 -massmin 200\
                -massmax 10000 -mc 2 -cmin 1 -prefix DECOY_ -deistol $frag_bin_tol -cmax 9\
                -fmods $cys_fixed_mod -at yes -ime 1
                
                # The result is expected next to the spectra file; move it so that the next
                # search does not overwrite it.
                pepxml_name = path.join(mgf_folder, basic_name + '.pep.xml')
    
                pepxml_name_new = path.join(identipy_folder, basic_name + '.pep.xml').replace('.pep.xml', '_%s.pep.xml' % (search_number, ))
                shutil.move(pepxml_name, pepxml_name_new)

In [None]:
# RUN Scavager for all search engines results
#
# Calls `scavager` on every .pep.xml / .mzid file found in the five search-result folders.
# Requires the search cells above to have been run (they define the folder variables).

decoy_prefix = 'DECOY_'

for ffolder in [
    msfragger_folder,
    xtandem_folder,
    identipy_folder,
    comet_folder,
    msgf_folder
]:
    for z in listdir(ffolder):
        if z.endswith('.pep.xml') or z.endswith('.mzid'):
            intf = path.join(ffolder, z)
            !scavager $intf -prefix $decoy_prefix

In [None]:
# RUN Peptide/ProteinProphet for all search engines results
#
# As written, only ProteinProphet is executed: the PeptideProphet call is commented out,
# so the interact-* PeptideProphet files must already exist in each folder.

# Please, convert msgf .mzid files to .pepXML format using idconvert.exe from ProteoWizard

decoy_prefix = 'DECOY_'
fasta_template = path_to_input_target_fasta.split('.fasta')[0] + '_shuffled_DECOY_%s.fasta'

!philosopher workspace --init

for ffolder in [
    msfragger_folder,
    xtandem_folder,
    comet_folder,
    msgf_folder
]:
    for z in listdir(ffolder):
        # Skip files that are themselves PeptideProphet output (interact-*).
        if (z.endswith('.pep.xml') or z.endswith('.pepXML')) and not z.startswith('interact'):

            # Search number = last "_"-separated token before the first '.' of the file name;
            # it selects the FASTA file used for that search.
            basenum = z.split('.')[0].split('_')[-1]
            infasta = fasta_template % (basenum, )
            
            intf = path.join(ffolder, z)
            # Disabled PeptideProphet call (infasta and intf are only used here):
#             !philosopher peptideprophet --accmass\
#             --database $infasta\
#             --decoy DECOY_ --decoyprobs --enzyme Trypsin --minpeplen 6 --expectscore $intf
            
            pepprophet_file = path.join(ffolder, 'interact-' + z)
#             protprophet_file = path.join(ffolder, 'interact-' + z.replace('.pep.xml', '').replace('.pepXML', ''))
            # A bare file name is passed as --output; the resulting .prot.xml is then moved
            # from the current working directory into the search-engine folder.
            protprophet_file = 'interact-' + z.replace('.pep.xml', '').replace('.pepXML', '')
            !philosopher proteinprophet $pepprophet_file --output $protprophet_file --nooccam
            protprophet_file = protprophet_file + '.prot.xml'
            shutil.move(protprophet_file, path.join(ffolder, protprophet_file))
            
#             !echo proteinprophet $pepprophet_file --output $protprophet_file --nooccam

In [None]:
# RUN Percolator for msgf+ results

decoy_prefix = 'DECOY_'

for z in listdir(msgf_folder):
    if z.endswith('.mzid'):
        intf = path.join(msgf_folder, z)
        outpin = path.join(msgf_folder, z.replace('.mzid', '.pin'))
        !msgf2pin $intf -P $decoy_prefix -o $outpin
        outtarget = path.join(msgf_folder, z.replace('.mzid', '.target'))
        outdecoy = path.join(msgf_folder, z.replace('.mzid', '.decoy'))
        !percolator $outpin -U -m $outtarget -M $outdecoy
        
        outtarget2 = outtarget.replace('.target', '.target2')
        out = open(outtarget2, 'w')
        for x in open(outtarget):
            out.write('\t'.join(x.split('\t')[:5]) + '\t' + ';'.join(x.split('\t')[5:]) + '\n')
        out.close()

In [None]:
# RUN Percolator for X!Tandem output

decoy_prefix = 'DECOY_'

for z in listdir(xtandem_folder):
    if z.endswith('.t.xml'):
        intf = path.join(xtandem_folder, z)
        outpin = path.join(xtandem_folder, z.replace('.t.xml', '.pin'))
        !tandem2pin $intf -P $decoy_prefix -o $outpin
        outtarget = path.join(xtandem_folder, z.replace('.t.xml', '.target'))
        outdecoy = path.join(xtandem_folder, z.replace('.t.xml', '.decoy'))
        !percolator $outpin -U -m $outtarget -M $outdecoy
        
        outtarget2 = outtarget.replace('.target', '.target2')
        out = open(outtarget2, 'w')
        for x in open(outtarget):
            out.write('\t'.join(x.split('\t')[:5]) + '\t' + ';'.join(x.split('\t')[5:]) + '\n')
        out.close()

In [None]:
# RUN Percolator for comet output
#
# Runs `crux percolator` on every *.target.pep.xml file in comet_folder and writes the
# results to the percolator_comet folder.
# Uses decoy_prefix as defined in an earlier cell.

perc_out = path.join(infolder, 'percolator_comet')
if not path.isdir(perc_out):
    mkdir(perc_out)

for z in listdir(comet_folder):
    if z.endswith('.target.pep.xml'):
        # File name up to (not including) its last '_', with '.mgf' appended.
        # NOTE(review): inmgf is not passed to the command below.
        inmgf = path.join(mgf_folder, z[::-1].split('_', 1)[-1][::-1] + '.mgf')
        incomet = path.join(comet_folder, z)
        fileroot = z.replace('.comet.target.pep.xml', '')
        !crux percolator $incomet --decoy-prefix $decoy_prefix --output-dir $perc_out \
                        --fileroot $fileroot


In [None]:
# RUN Percolator for Identipy output

identipy_folder = path.join(infolder, 'identipy')
decoy_prefix = 'DECOY_'

for z in listdir(identipy_folder):
    if z.endswith('.pep.xml'):
        intf = path.join(identipy_folder, z)
        !identipy2pin $intf
        outtarget = path.join(identipy_folder, z.replace('.pep.xml', '.target'))
        outdecoy = path.join(identipy_folder, z.replace('.pep.xml', '.decoy'))
        outpin = path.join(identipy_folder, z.replace('.pep.xml', '.pin'))
        !percolator $outpin -U -m $outtarget -M $outdecoy
        
        outtarget2 = outtarget.replace('.target', '.target2')
        out = open(outtarget2, 'w')
        for x in open(outtarget):
            out.write('\t'.join(x.split('\t')[:5]) + '\t' + ';'.join(x.split('\t')[5:]) + '\n')
        out.close()

In [None]:
# RUN Percolator for MSFragger output

msfragger_folder = path.join(infolder, 'msfragger')
decoy_prefix = 'DECOY_'

for z in listdir(msfragger_folder):
    if z.endswith('.pep.xml'):
        intf = path.join(msfragger_folder, z)
        !identipy2pin $intf
        outtarget = path.join(msfragger_folder, z.replace('.pep.xml', '.target'))
        outdecoy = path.join(msfragger_folder, z.replace('.pep.xml', '.decoy'))
        outpin = path.join(msfragger_folder, z.replace('.pep.xml', '.pin'))
        !percolator $outpin -U -m $outtarget -M $outdecoy
        
        outtarget2 = outtarget.replace('.target', '.target2')
        out = open(outtarget2, 'w')
        for x in open(outtarget):
            out.write('\t'.join(x.split('\t')[:5]) + '\t' + ';'.join(x.split('\t')[5:]) + '\n')
        out.close()

In [None]:
# RUN Q-ranker for comet output
#
# Runs `crux q-ranker` on every *.target.txt file in comet_folder together with the
# matching mzXML spectra file. Uses decoy_prefix as defined in an earlier cell.
mzxml_folder = path.join(infolder, 'mzxml')
if not path.isdir(mzxml_folder):
    mkdir(mzxml_folder)

# Q-ranker requires mzXML files. Please convert the spectra files to mzXML format and put
# them into the mzxml folder.

q_ranker_out = path.join(infolder, 'qranker')
if not path.isdir(q_ranker_out):
    mkdir(q_ranker_out)

for z in listdir(comet_folder):
    if z.endswith('.target.txt'):
        # Spectra file name = result file name up to (not including) its last '_', plus '.mzXML'.
        inmgf = path.join(mzxml_folder, z[::-1].split('_', 1)[-1][::-1] + '.mzXML')# + '.mgf')
        incomet = path.join(comet_folder, z)
        fileroot = z.replace('.comet.target.txt', '')
        !crux q-ranker $inmgf $incomet --decoy-prefix $decoy_prefix --output-dir $q_ranker_out \
                        --fileroot $fileroot

In [None]:
# RUN MPscore1 for MSFragger, X!Tandem and Identipy output
# MPscore1 requires a parameters file. Please put mpscore1.cfg (and mpscore1_confetti.cfg
# for files with 'confetti' in their name) into the parameters folder.
#
# The .pep.xml files of each engine are first copied into a separate *_mpscore1 folder,
# and MPscore is then run on the copies.

path_to_mpscore_1 = '/home/mark/work/PycharmProjects/mp-score/MPscore.py'

params_folder = path.join(infolder, 'params')
fasta_template = path_to_input_target_fasta.split('.fasta')[0] + '_shuffled_DECOY_%s.fasta'

# Working copy of the MSFragger results.
msfragger_mpscore1_folder = path.join(infolder, 'msfragger_mpscore1')
if not path.isdir(msfragger_mpscore1_folder):
    mkdir(msfragger_mpscore1_folder)
for file in glob.glob(path.join(msfragger_folder, '*.pep.xml')):
    shutil.copy(file, msfragger_mpscore1_folder)

# Working copy of the X!Tandem results.
xtandem_mpscore1_folder = path.join(infolder, 'xtandem_mpscore1')
if not path.isdir(xtandem_mpscore1_folder):
    mkdir(xtandem_mpscore1_folder)
for file in glob.glob(path.join(xtandem_folder, '*.pep.xml')):
    shutil.copy(file, xtandem_mpscore1_folder)

# Working copy of the Identipy results.
identipy_mpscore1_folder = path.join(infolder, 'identipy_mpscore1')
if not path.isdir(identipy_mpscore1_folder):
    mkdir(identipy_mpscore1_folder)
for file in glob.glob(path.join(identipy_folder, '*.pep.xml')):
    shutil.copy(file, identipy_mpscore1_folder)


for ffolder in [
    msfragger_mpscore1_folder,
    xtandem_mpscore1_folder,
    identipy_mpscore1_folder
]:
    for z in listdir(ffolder):
        if z.endswith('.pep.xml'):
            intf = path.join(ffolder, z)
            # Search number = last "_"-separated token before the first '.' of the file name;
            # it selects the FASTA file used for that search.
            basenum = z.split('.')[0].split('_')[-1]
            infasta = fasta_template % (basenum, )
            if 'confetti' in z:
                inparams = path.join(params_folder, 'mpscore1_confetti.cfg')
            else:
                inparams = path.join(params_folder, 'mpscore1.cfg')

            # The script is started with the python2 interpreter.
            !python2 $path_to_mpscore_1 $intf $infasta $inparams

In [None]:
# combine all results into dict

# Please, put PeptideProphet output files to 4 folders below:

prophet_comet_folder = path.join(infolder, 'prophet_comet')
prophet_msgf_folder = path.join(infolder, 'prophet_msgf')
prophet_fragger_folder = path.join(infolder, 'prophet_msfragger')
prophet_tandem_folder = path.join(infolder, 'prophet_xtandem')

decoy_prefix = 'DECOY_'

results_all = defaultdict(dict)

# X!Tandem def
def is_decoy_tandem(proteins):
    return all(z.startswith(decoy_prefix) for z in proteins)

def is_check_tandem(proteins):
    return all(z.startswith('CHECK_') for z in proteins)

for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    
    fdrs_list = []
    ids_list = []
    labelname = 'X-D'
    
    for z in listdir(xtandem_folder):
        if basic_name in z and z.endswith('.pep.xml'):
            df00 = pepxml.DataFrame(path.join(xtandem_folder, z))
            df00['decoy'] = df00['protein'].apply(is_decoy_tandem)
            df00_f = aux.filter(df00, fdr=0.01, key='expect', is_decoy='decoy', correction=1, remove_decoy=True)
            tot = df00_f.shape[0]
            dec = df00_f[df00_f['protein'].apply(is_check_tandem)].shape[0]
            fdrs_list.append(dec * 100 / tot)
            ids_list.append(tot)
    print(basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = dict()
    results_all[basic_name][labelname]['fdr_mean'] = np.mean(fdrs_list)
    results_all[basic_name][labelname]['ids_mean'] = np.mean(ids_list)
    results_all[basic_name][labelname]['fdr_std'] = np.std(fdrs_list)
    results_all[basic_name][labelname]['ids_std'] = np.std(ids_list)
    

# Comet def
def is_decoy_tandem(proteins):
    return all(z.startswith(decoy_prefix) for z in proteins)

def is_check_tandem(proteins):
    return all(z.startswith('CHECK_') for z in proteins)

for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    
    fdrs_list = []
    ids_list = []
    labelname = 'C-D'
    
    for z in listdir(comet_folder):
        if basic_name in z and z.endswith('.pep.xml'):
            df00 = pepxml.DataFrame(path.join(comet_folder, z))
            df00 = df00[~pd.isna(df00['peptide'])]
            df00['decoy'] = df00['protein'].apply(is_decoy_tandem)
            df00_f = aux.filter(df00, fdr=0.01, key='expect', is_decoy='decoy', correction=1, remove_decoy=True)
            tot = df00_f.shape[0]
            dec = df00_f[df00_f['protein'].apply(is_check_tandem)].shape[0]
            fdrs_list.append(dec * 100 / tot)
            ids_list.append(tot)
    print(basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = dict()
    results_all[basic_name][labelname]['fdr_mean'] = np.mean(fdrs_list)
    results_all[basic_name][labelname]['ids_mean'] = np.mean(ids_list)
    results_all[basic_name][labelname]['fdr_std'] = np.std(fdrs_list)
    results_all[basic_name][labelname]['ids_std'] = np.std(ids_list)
    
# MSFragger def
def is_decoy_tandem(proteins):
    return all(z.startswith(decoy_prefix) for z in proteins)

def is_check_tandem(proteins):
    return all(z.startswith('CHECK_') for z in proteins)

for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    
    fdrs_list = []
    ids_list = []
    labelname = 'F-D'
    
    for z in listdir(msfragger_folder):
        if basic_name in z and z.endswith('.pep.xml'):
            df00 = pepxml.DataFrame(path.join(msfragger_folder, z))
            df00['decoy'] = df00['protein'].apply(is_decoy_tandem)
            df00_f = aux.filter(df00, fdr=0.01, key='expect', is_decoy='decoy', correction=1, remove_decoy=True)
            tot = df00_f.shape[0]
            dec = df00_f[df00_f['protein'].apply(is_check_tandem)].shape[0]
            fdrs_list.append(dec * 100 / tot)
            ids_list.append(tot)
    print(basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = dict()
    results_all[basic_name][labelname]['fdr_mean'] = np.mean(fdrs_list)
    results_all[basic_name][labelname]['ids_mean'] = np.mean(ids_list)
    results_all[basic_name][labelname]['fdr_std'] = np.std(fdrs_list)
    results_all[basic_name][labelname]['ids_std'] = np.std(ids_list)
    
# Identipy def
def is_decoy_tandem(proteins):
    return all(z.startswith(decoy_prefix) for z in proteins)

def is_check_tandem(proteins):
    return all(z.startswith('CHECK_') for z in proteins)

for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    
    fdrs_list = []
    ids_list = []
    labelname = 'I-D'
    
    for z in listdir(identipy_folder):
        if basic_name in z and z.endswith('.pep.xml'):
            df00 = pepxml.DataFrame(path.join(identipy_folder, z))
            df00['decoy'] = df00['protein'].apply(is_decoy_tandem)
            df00_f = aux.filter(df00, fdr=0.01, key='expect', is_decoy='decoy', correction=1, remove_decoy=True)
            tot = df00_f.shape[0]
            dec = df00_f[df00_f['protein'].apply(is_check_tandem)].shape[0]
            fdrs_list.append(dec * 100 / tot)
            ids_list.append(tot)
    print(basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = dict()
    results_all[basic_name][labelname]['fdr_mean'] = np.mean(fdrs_list)
    results_all[basic_name][labelname]['ids_mean'] = np.mean(ids_list)
    results_all[basic_name][labelname]['fdr_std'] = np.std(fdrs_list)
    results_all[basic_name][labelname]['ids_std'] = np.std(ids_list)
    
# MSGF def
def is_decoy_tandem(proteins):
    return all(z.startswith(decoy_prefix) for z in proteins)

def is_check_tandem(proteins):
    return all(z.startswith('CHECK_') for z in proteins)

for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    
    fdrs_list = []
    ids_list = []
    labelname = 'M-D'
    
    for z in listdir(msgf_folder):
        if basic_name in z and z.endswith('.mzid'):
            df00 = mzid.DataFrame(path.join(msgf_folder, z))
            df00['decoy'] = df00['protein description'].apply(is_decoy_tandem)
            df00_f = aux.filter(df00, fdr=0.01, key='MS-GF:EValue', is_decoy='decoy', correction=1, remove_decoy=True)
            tot = df00_f.shape[0]
            dec = df00_f[df00_f['protein description'].apply(is_check_tandem)].shape[0]
            fdrs_list.append(dec * 100 / tot)
            ids_list.append(tot)
    print(basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = dict()
    results_all[basic_name][labelname]['fdr_mean'] = np.mean(fdrs_list)
    results_all[basic_name][labelname]['ids_mean'] = np.mean(ids_list)
    results_all[basic_name][labelname]['fdr_std'] = np.std(fdrs_list)
    results_all[basic_name][labelname]['ids_std'] = np.std(ids_list)
    
# # xtandem percolator

def is_check_percolator_tandem(proteins):
#     return 'CHECK_' in proteins
    return all('CHECK_' in z for z in proteins.split(';'))

for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    
    fdrs_list = []
    ids_list = []
    labelname = 'X-P'
    for z in listdir(xtandem_folder):
        if basic_name in z and z.endswith('.target2'):
            df00 = pd.read_table(path.join(xtandem_folder, z))
            df00_f = df00[df00['q-value'] <= 0.01]
            tot = df00_f.shape[0]
            dec = df00_f[df00_f['proteinIds'].apply(is_check_percolator_tandem)].shape[0]
            fdrs_list.append(dec * 100 / tot)
            ids_list.append(tot)
    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = dict()
    results_all[basic_name][labelname]['fdr_mean'] = np.mean(fdrs_list)
    results_all[basic_name][labelname]['ids_mean'] = np.mean(ids_list)
    results_all[basic_name][labelname]['fdr_std'] = np.std(fdrs_list)
    results_all[basic_name][labelname]['ids_std'] = np.std(ids_list)
    
    
# # msgf percolator

def is_check_percolator_tandem(proteins):
#     return 'CHECK_' in proteins
    return all('CHECK_' in z for z in proteins.split(';'))

for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    
    fdrs_list = []
    ids_list = []
    labelname = 'M-P'
    for z in listdir(msgf_folder):
        if basic_name in z and z.endswith('.target2'):
            df00 = pd.read_table(path.join(msgf_folder, z))
            df00_f = df00[df00['q-value'] <= 0.01]
            tot = df00_f.shape[0]
            dec = df00_f[df00_f['proteinIds'].apply(is_check_percolator_tandem)].shape[0]
            fdrs_list.append(dec * 100 / tot)
            ids_list.append(tot)
    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = dict()
    results_all[basic_name][labelname]['fdr_mean'] = np.mean(fdrs_list)
    results_all[basic_name][labelname]['ids_mean'] = np.mean(ids_list)
    results_all[basic_name][labelname]['fdr_std'] = np.std(fdrs_list)
    results_all[basic_name][labelname]['ids_std'] = np.std(ids_list)

# MSFragger percolator

def is_check_percolator_tandem(proteins):
#     return 'CHECK_' in proteins
    return all('CHECK_' in z for z in proteins.split(';'))

for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    
    fdrs_list = []
    ids_list = []
    labelname = 'F-P'
    for z in listdir(msfragger_folder):
        if basic_name in z and z.endswith('.target2'):
            df00 = pd.read_table(path.join(msfragger_folder, z))
            df00_f = df00[df00['q-value'] <= 0.01]
            tot = df00_f.shape[0]
            dec = df00_f[df00_f['proteinIds'].apply(is_check_percolator_tandem)].shape[0]
            fdrs_list.append(dec * 100 / tot)
            ids_list.append(tot)
    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = dict()
    results_all[basic_name][labelname]['fdr_mean'] = np.mean(fdrs_list)
    results_all[basic_name][labelname]['ids_mean'] = np.mean(ids_list)
    results_all[basic_name][labelname]['fdr_std'] = np.std(fdrs_list)
    results_all[basic_name][labelname]['ids_std'] = np.std(ids_list)
    
# Scavager MSFragger

def is_check_mpscore_tandem(proteins):
    return all('CHECK_' in z for z in eval(proteins))

for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    
    fdrs_list = []
    ids_list = []
    labelname = 'F-M'
    for z in listdir(msfragger_folder):
        if basic_name in z and z.endswith('_PSMs.tsv'):
            df00 = pd.read_table(path.join(msfragger_folder, z))
            tot = df00.shape[0]
            dec = df00[df00['protein'].apply(is_check_mpscore_tandem)].shape[0]
            fdrs_list.append(dec * 100 / tot)
            ids_list.append(tot)
    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = dict()
    results_all[basic_name][labelname]['fdr_mean'] = np.mean(fdrs_list)
    results_all[basic_name][labelname]['ids_mean'] = np.mean(ids_list)
    results_all[basic_name][labelname]['fdr_std'] = np.std(fdrs_list)
    results_all[basic_name][labelname]['ids_std'] = np.std(ids_list)
    
# Scavager msgf

def is_check_mpscore_tandem(proteins):
    return all('CHECK_' in z for z in eval(proteins))

for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    
    fdrs_list = []
    ids_list = []
    labelname = 'M-M'
    for z in listdir(msgf_folder):
        if basic_name in z and z.endswith('_PSMs.tsv'):
            df00 = pd.read_table(path.join(msgf_folder, z))
#             df00 = df00[df00['length']>7]
#             df00_f = df00[df00['q-value'] <= 0.01]
            tot = df00.shape[0]
            dec = df00[df00['protein'].apply(is_check_mpscore_tandem)].shape[0]
            fdrs_list.append(dec * 100 / tot)
            ids_list.append(tot)
    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = dict()
    results_all[basic_name][labelname]['fdr_mean'] = np.mean(fdrs_list)
    results_all[basic_name][labelname]['ids_mean'] = np.mean(ids_list)
    results_all[basic_name][labelname]['fdr_std'] = np.std(fdrs_list)
    results_all[basic_name][labelname]['ids_std'] = np.std(ids_list)
    
# Scavager tandem

def is_check_mpscore_tandem(proteins):
    """Tell whether every protein listed for a row is a CHECK_ entry.

    Parameters
    ----------
    proteins : str
        Text form of a Python list of protein names, e.g. "['CHECK_a', 'sp|b']".

    Returns
    -------
    bool
        True when 'CHECK_' occurs in every listed name (True for an empty list).

    Raises
    ------
    ValueError, SyntaxError
        If `proteins` is not a plain Python literal.
    """
    # Parse as a literal only: the text comes from a results file and must
    # never be executed, which eval() would do. Local import keeps the
    # function independent of the notebook's import cell.
    from ast import literal_eval
    return all('CHECK_' in z for z in literal_eval(proteins))

# 'X-M' workflow: for each spectra file, read every matching *_PSMs.tsv table
# from xtandem_folder and record the row count and the percentage of rows whose
# proteins are all CHECK_ entries (stored under the 'fdr_*' keys).
for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    labelname = 'X-M'
    fdrs_list, ids_list = [], []

    for z in listdir(xtandem_folder):
        # Skip anything that is not a PSM table of the current dataset.
        if not (basic_name in z and z.endswith('_PSMs.tsv')):
            continue
        df00 = pd.read_table(path.join(xtandem_folder, z))
        tot = len(df00)
        dec = len(df00[df00['protein'].apply(is_check_mpscore_tandem)])
        fdrs_list.append(dec * 100 / tot)
        ids_list.append(tot)

    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = {
        'fdr_mean': np.mean(fdrs_list),
        'ids_mean': np.mean(ids_list),
        'fdr_std': np.std(fdrs_list),
        'ids_std': np.std(ids_list),
    }
    
# Scavager comet

def is_check_mpscore_tandem(proteins):
    """Tell whether every protein listed for a row is a CHECK_ entry.

    Parameters
    ----------
    proteins : str
        Text form of a Python list of protein names, e.g. "['CHECK_a', 'sp|b']".

    Returns
    -------
    bool
        True when 'CHECK_' occurs in every listed name (True for an empty list).

    Raises
    ------
    ValueError, SyntaxError
        If `proteins` is not a plain Python literal.
    """
    # Parse as a literal only: the text comes from a results file and must
    # never be executed, which eval() would do. Local import keeps the
    # function independent of the notebook's import cell.
    from ast import literal_eval
    return all('CHECK_' in z for z in literal_eval(proteins))

# 'C-M' workflow: for each spectra file, read every matching *_PSMs.tsv table
# from comet_folder and record the row count and the percentage of rows whose
# proteins are all CHECK_ entries (stored under the 'fdr_*' keys).
for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    labelname = 'C-M'
    fdrs_list, ids_list = [], []

    for z in listdir(comet_folder):
        # Skip anything that is not a PSM table of the current dataset.
        if not (basic_name in z and z.endswith('_PSMs.tsv')):
            continue
        df00 = pd.read_table(path.join(comet_folder, z))
        tot = len(df00)
        dec = len(df00[df00['protein'].apply(is_check_mpscore_tandem)])
        fdrs_list.append(dec * 100 / tot)
        ids_list.append(tot)

    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = {
        'fdr_mean': np.mean(fdrs_list),
        'ids_mean': np.mean(ids_list),
        'fdr_std': np.std(fdrs_list),
        'ids_std': np.std(ids_list),
    }
    
    
# # comet qranker

def is_check_qranker(proteins):
    """Return True only if each comma-separated protein id starts with 'CHECK_'.

    `proteins` is a single string of protein ids joined by ','. An empty
    string yields False, because its only (empty) id lacks the prefix.
    """
    for protein_id in proteins.split(','):
        if not protein_id.startswith('CHECK_'):
            return False
    return True

# 'C-Q' workflow: for each spectra file, read every matching
# *.target.psms.txt table from q_ranker_out, keep rows with
# 'q-ranker q-value' <= 0.01 and record their count and the percentage whose
# protein ids are all CHECK_ entries (stored under the 'fdr_*' keys).
for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    labelname = 'C-Q'
    fdrs_list, ids_list = [], []

    for z in listdir(q_ranker_out):
        # Skip anything that is not a target PSM table of the current dataset.
        if not (basic_name in z and z.endswith('.target.psms.txt')):
            continue
        df1 = pd.read_table(path.join(q_ranker_out, z))
        df1_t = df1.loc[df1['q-ranker q-value'] <= 0.01, ['protein id']]
        tot = len(df1_t)
        dec = len(df1_t[df1_t['protein id'].apply(is_check_qranker)])
        fdrs_list.append(dec * 100 / tot)
        ids_list.append(tot)

    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = {
        'fdr_mean': np.mean(fdrs_list),
        'ids_mean': np.mean(ids_list),
        'fdr_std': np.std(fdrs_list),
        'ids_std': np.std(ids_list),
    }
    
    

# # comet percolator

def is_check_qranker(proteins):
    """Return True only if each comma-separated protein id starts with 'CHECK_'.

    `proteins` is a single string of protein ids joined by ','. An empty
    string yields False, because its only (empty) id lacks the prefix.
    """
    protein_ids = proteins.split(',')
    return not any(not protein_id.startswith('CHECK_') for protein_id in protein_ids)

# 'C-P' workflow: for each spectra file, read every matching
# *percolator.target.psms.txt table from perc_out, keep rows with
# 'percolator q-value' <= 0.01 and record their count and the percentage whose
# protein ids are all CHECK_ entries (stored under the 'fdr_*' keys).
for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    labelname = 'C-P'
    fdrs_list, ids_list = [], []

    for z in listdir(perc_out):
        # Skip anything that is not a target PSM table of the current dataset.
        if not (basic_name in z and z.endswith('percolator.target.psms.txt')):
            continue
        df1 = pd.read_table(path.join(perc_out, z))
        df1_t = df1.loc[df1['percolator q-value'] <= 0.01, ['protein id']]
        tot = len(df1_t)
        dec = len(df1_t[df1_t['protein id'].apply(is_check_qranker)])
        fdrs_list.append(dec * 100 / tot)
        ids_list.append(tot)

    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = {
        'fdr_mean': np.mean(fdrs_list),
        'ids_mean': np.mean(ids_list),
        'fdr_std': np.std(fdrs_list),
        'ids_std': np.std(ids_list),
    }
    
    
# # Identipy percolator

def is_check_percolator_identipy(proteins):
    """Return True only if 'CHECK_' occurs in each ';'-separated protein id.

    `proteins` is a single string of protein ids joined by ';'. An empty
    string yields False, because its only (empty) id does not contain the tag.
    """
    for protein_id in proteins.split(';'):
        if 'CHECK_' not in protein_id:
            return False
    return True

# 'I-P' workflow: for each spectra file, read every matching *.target2 table
# from identipy_folder, keep rows with 'q-value' <= 0.01 and record their
# count and the percentage whose 'proteinIds' are all CHECK_ entries
# (stored under the 'fdr_*' keys).
for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    labelname = 'I-P'
    fdrs_list, ids_list = [], []

    for z in listdir(identipy_folder):
        # Skip anything that is not a '.target2' table of the current dataset.
        if not (basic_name in z and z.endswith('.target2')):
            continue
        df00 = pd.read_table(path.join(identipy_folder, z))
        df00_f = df00[df00['q-value'] <= 0.01]
        tot = len(df00_f)
        dec = len(df00_f[df00_f['proteinIds'].apply(is_check_percolator_identipy)])
        fdrs_list.append(dec * 100 / tot)
        ids_list.append(tot)

    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = {
        'fdr_mean': np.mean(fdrs_list),
        'ids_mean': np.mean(ids_list),
        'fdr_std': np.std(fdrs_list),
        'ids_std': np.std(ids_list),
    }
    
# Scavager Identipy

def is_check_mpscore_identipy(proteins):
    """Tell whether every protein listed for a row is a CHECK_ entry.

    Parameters
    ----------
    proteins : str
        Text form of a Python list of protein names, e.g. "['CHECK_a', 'sp|b']".

    Returns
    -------
    bool
        True when 'CHECK_' occurs in every listed name (True for an empty list).

    Raises
    ------
    ValueError, SyntaxError
        If `proteins` is not a plain Python literal.
    """
    # Parse as a literal only: the text comes from a results file and must
    # never be executed, which eval() would do. Local import keeps the
    # function independent of the notebook's import cell.
    from ast import literal_eval
    return all('CHECK_' in z for z in literal_eval(proteins))

# 'I-M' workflow: for each spectra file, read every matching *_PSMs.tsv table
# from identipy_folder and record the row count and the percentage of rows
# whose proteins are all CHECK_ entries (stored under the 'fdr_*' keys).
for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    labelname = 'I-M'
    fdrs_list, ids_list = [], []

    for z in listdir(identipy_folder):
        # Skip anything that is not a PSM table of the current dataset.
        if not (basic_name in z and z.endswith('_PSMs.tsv')):
            continue
        df00 = pd.read_table(path.join(identipy_folder, z))
        tot = len(df00)
        dec = len(df00[df00['protein'].apply(is_check_mpscore_identipy)])
        fdrs_list.append(dec * 100 / tot)
        ids_list.append(tot)

    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = {
        'fdr_mean': np.mean(fdrs_list),
        'ids_mean': np.mean(ids_list),
        'fdr_std': np.std(fdrs_list),
        'ids_std': np.std(ids_list),
    }
    
# # Prophet X!Tandem    
    
def is_check_prophet_tandem(proteins):
    """Return True if all proteins of the first search hit start with 'CHECK_'.

    `proteins` is a mapping with a 'search_hit' list; only its first element
    is inspected, and each item of that hit's 'proteins' list must carry a
    'protein' name.
    """
    first_hit = proteins['search_hit'][0]
    for entry in first_hit['proteins']:
        if not entry['protein'].startswith('CHECK_'):
            return False
    return True
    
def is_decoy_prophet_tandem(proteins):
    """Return True if all proteins of the first search hit start with 'DECOY_'.

    `proteins` is a mapping with a 'search_hit' list; only its first element
    is inspected, and each item of that hit's 'proteins' list must carry a
    'protein' name.
    """
    first_hit = proteins['search_hit'][0]
    for entry in first_hit['proteins']:
        if not entry['protein'].startswith('DECOY_'):
            return False
    return True

def score(x):
    """Return the 'peptideprophet_result' probability of the first search hit as a float."""
    first_hit = x['search_hit'][0]
    first_analysis = first_hit['analysis_result'][0]
    return float(first_analysis['peptideprophet_result']['probability'])

# 'X-Pr' workflow: for each spectra file, filter every matching .pep.xml in
# prophet_tandem_folder to 1% FDR (decoys removed) and record the number of
# surviving PSMs and the percentage of them that map only to CHECK_ proteins.
for mgffile in listdir(mgf_folder):
    # Results are keyed by the dataset name exactly as the spectra file spells
    # it, i.e. the same key the other workflows use for results_all.
    basic_name = path.splitext(mgffile)[0]
    # Output files in the prophet folder are matched on the lower-cased name.
    basic_name_lower = basic_name.lower()

    fdrs_list = []
    ids_list = []
    labelname = 'X-Pr'

    for z in listdir(prophet_tandem_folder):
        if basic_name_lower in z and z.endswith('.pep.xml'):
            # Load all PSMs, then let the context manager close the file.
            with pepxml.read(path.join(prophet_tandem_folder, z)) as reader:
                ids = list(reader)
            a_f = aux.filter(ids, fdr=0.01, key=score, is_decoy=is_decoy_prophet_tandem, reverse=True, remove_decoy=True, correction=1, formula=1)
            tot = len(a_f)
            dec = sum(is_check_prophet_tandem(psm) for psm in a_f)
            fdrs_list.append(dec * 100 / tot)
            ids_list.append(tot)
    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = dict()
    results_all[basic_name][labelname]['fdr_mean'] = np.mean(fdrs_list)
    results_all[basic_name][labelname]['ids_mean'] = np.mean(ids_list)
    results_all[basic_name][labelname]['fdr_std'] = np.std(fdrs_list)
    results_all[basic_name][labelname]['ids_std'] = np.std(ids_list)
    
# # Prophet MSFragger   
    
def is_check_prophet_fragger(proteins):
    """Return True if all proteins of the first search hit start with 'CHECK_'.

    `proteins` is a mapping with a 'search_hit' list; only its first element
    is inspected, and each item of that hit's 'proteins' list must carry a
    'protein' name.
    """
    first_hit = proteins['search_hit'][0]
    for entry in first_hit['proteins']:
        if not entry['protein'].startswith('CHECK_'):
            return False
    return True
    
def is_decoy_prophet_fragger(proteins):
    """Return True if all proteins of the first search hit start with 'DECOY_'.

    `proteins` is a mapping with a 'search_hit' list; only its first element
    is inspected, and each item of that hit's 'proteins' list must carry a
    'protein' name.
    """
    first_hit = proteins['search_hit'][0]
    for entry in first_hit['proteins']:
        if not entry['protein'].startswith('DECOY_'):
            return False
    return True

def score(x):
    """Return the 'peptideprophet_result' probability of the first search hit as a float."""
    first_hit = x['search_hit'][0]
    first_analysis = first_hit['analysis_result'][0]
    return float(first_analysis['peptideprophet_result']['probability'])

# 'F-Pr' workflow: for each spectra file, filter every matching .pep.xml in
# prophet_fragger_folder to 1% FDR (decoys removed) and record the number of
# surviving PSMs and the percentage of them that map only to CHECK_ proteins.
for mgffile in listdir(mgf_folder):
    # Results are keyed by the dataset name exactly as the spectra file spells
    # it, i.e. the same key the other workflows use for results_all.
    basic_name = path.splitext(mgffile)[0]
    # Output files in the prophet folder are matched on the lower-cased name.
    basic_name_lower = basic_name.lower()

    fdrs_list = []
    ids_list = []
    labelname = 'F-Pr'

    for z in listdir(prophet_fragger_folder):
        if basic_name_lower in z and z.endswith('.pep.xml'):
            # Load all PSMs, then let the context manager close the file.
            with pepxml.read(path.join(prophet_fragger_folder, z)) as reader:
                ids = list(reader)
            a_f = aux.filter(ids, fdr=0.01, key=score, is_decoy=is_decoy_prophet_fragger, reverse=True, remove_decoy=True, correction=1, formula=1)
            tot = len(a_f)
            dec = sum(is_check_prophet_fragger(psm) for psm in a_f)
            fdrs_list.append(dec * 100 / tot)
            ids_list.append(tot)
    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = dict()
    results_all[basic_name][labelname]['fdr_mean'] = np.mean(fdrs_list)
    results_all[basic_name][labelname]['ids_mean'] = np.mean(ids_list)
    results_all[basic_name][labelname]['fdr_std'] = np.std(fdrs_list)
    results_all[basic_name][labelname]['ids_std'] = np.std(ids_list)
    
# # Prophet comet   

def score(x):
    """Return the 'peptideprophet_result' probability of the first search hit as a float."""
    first_hit = x['search_hit'][0]
    first_analysis = first_hit['analysis_result'][0]
    return float(first_analysis['peptideprophet_result']['probability'])

# 'C-Pr' workflow: for each spectra file, filter every matching .pep.xml in
# prophet_comet_folder to 1% FDR (decoys removed) and record the number of
# surviving PSMs and the percentage of them that map only to CHECK_ proteins.
for mgffile in listdir(mgf_folder):
    # Results are keyed by the dataset name exactly as the spectra file spells
    # it, i.e. the same key the other workflows use for results_all.
    basic_name = path.splitext(mgffile)[0]
    # Output files in the prophet folder are matched on the lower-cased name.
    basic_name_lower = basic_name.lower()

    fdrs_list = []
    ids_list = []
    labelname = 'C-Pr'

    for z in listdir(prophet_comet_folder):
        if basic_name_lower in z and z.endswith('.pep.xml'):
            # Load all PSMs, then let the context manager close the file.
            with pepxml.read(path.join(prophet_comet_folder, z)) as reader:
                ids = list(reader)
            a_f = aux.filter(ids, fdr=0.01, key=score, is_decoy=is_decoy_prophet_fragger, reverse=True, remove_decoy=True, correction=1, formula=1)
            tot = len(a_f)
            dec = sum(is_check_prophet_fragger(psm) for psm in a_f)
            fdrs_list.append(dec * 100 / tot)
            ids_list.append(tot)
    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = dict()
    results_all[basic_name][labelname]['fdr_mean'] = np.mean(fdrs_list)
    results_all[basic_name][labelname]['ids_mean'] = np.mean(ids_list)
    results_all[basic_name][labelname]['fdr_std'] = np.std(fdrs_list)
    results_all[basic_name][labelname]['ids_std'] = np.std(ids_list)
    
# # Prophet msgf   

def score(x):
    """Return the 'peptideprophet_result' probability of the first search hit as a float."""
    first_hit = x['search_hit'][0]
    first_analysis = first_hit['analysis_result'][0]
    return float(first_analysis['peptideprophet_result']['probability'])

# 'M-Pr' workflow: for each spectra file, filter every matching .pep.xml in
# prophet_msgf_folder to 1% FDR (decoys removed) and record the number of
# surviving PSMs and the percentage of them that map only to CHECK_ proteins.
for mgffile in listdir(mgf_folder):
    # Results are keyed by the dataset name exactly as the spectra file spells
    # it, i.e. the same key the other workflows use for results_all.
    basic_name = path.splitext(mgffile)[0]
    # Output files in the prophet folder are matched on the lower-cased name.
    basic_name_lower = basic_name.lower()

    fdrs_list = []
    ids_list = []
    labelname = 'M-Pr'

    for z in listdir(prophet_msgf_folder):
        if basic_name_lower in z and z.endswith('.pep.xml'):
            # Load all PSMs, then let the context manager close the file.
            with pepxml.read(path.join(prophet_msgf_folder, z)) as reader:
                ids = list(reader)
            a_f = aux.filter(ids, fdr=0.01, key=score, is_decoy=is_decoy_prophet_fragger, reverse=True, remove_decoy=True, correction=1, formula=1)
            tot = len(a_f)
            dec = sum(is_check_prophet_fragger(psm) for psm in a_f)
            fdrs_list.append(dec * 100 / tot)
            ids_list.append(tot)
    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = dict()
    results_all[basic_name][labelname]['fdr_mean'] = np.mean(fdrs_list)
    results_all[basic_name][labelname]['ids_mean'] = np.mean(ids_list)
    results_all[basic_name][labelname]['fdr_std'] = np.std(fdrs_list)
    results_all[basic_name][labelname]['ids_std'] = np.std(ids_list)


# MPscore1 X!Tandem

def is_check_mpscore_tandem(proteins):
    """Return True if 'CHECK_' occurs in every non-empty ';'-separated name.

    Empty pieces (e.g. after a trailing ';') are ignored, so a string with no
    non-empty names yields True.
    """
    for name in proteins.split(';'):
        if name and 'CHECK_' not in name:
            return False
    return True

# 'X-M1' workflow: for each spectra file, read every matching *_PSMs.tsv table
# from xtandem_mpscore1_folder and record the row count and the percentage of
# rows whose 'proteins' are all CHECK_ entries (stored under the 'fdr_*' keys).
for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    labelname = 'X-M1'
    fdrs_list, ids_list = [], []

    for z in listdir(xtandem_mpscore1_folder):
        # Skip anything that is not a PSM table of the current dataset.
        if not (basic_name in z and z.endswith('_PSMs.tsv')):
            continue
        df00 = pd.read_table(path.join(xtandem_mpscore1_folder, z))
        tot = len(df00)
        dec = len(df00[df00['proteins'].apply(is_check_mpscore_tandem)])
        fdrs_list.append(dec * 100 / tot)
        ids_list.append(tot)

    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = {
        'fdr_mean': np.mean(fdrs_list),
        'ids_mean': np.mean(ids_list),
        'fdr_std': np.std(fdrs_list),
        'ids_std': np.std(ids_list),
    }
    
# MPscore1 MSFragger

# 'F-M1' workflow: for each spectra file, read every matching *_PSMs.tsv table
# from msfragger_mpscore1_folder and record the row count and the percentage of
# rows whose 'proteins' are all CHECK_ entries (stored under the 'fdr_*' keys).
for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    labelname = 'F-M1'
    fdrs_list, ids_list = [], []

    for z in listdir(msfragger_mpscore1_folder):
        # Skip anything that is not a PSM table of the current dataset.
        if not (basic_name in z and z.endswith('_PSMs.tsv')):
            continue
        df00 = pd.read_table(path.join(msfragger_mpscore1_folder, z))
        tot = len(df00)
        dec = len(df00[df00['proteins'].apply(is_check_mpscore_tandem)])
        fdrs_list.append(dec * 100 / tot)
        ids_list.append(tot)

    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = {
        'fdr_mean': np.mean(fdrs_list),
        'ids_mean': np.mean(ids_list),
        'fdr_std': np.std(fdrs_list),
        'ids_std': np.std(ids_list),
    }
    
# MPscore1 IdentiPy

# 'I-M1' workflow: for each spectra file, read every matching *_PSMs.tsv table
# from identipy_mpscore1_folder and record the row count and the percentage of
# rows whose 'proteins' are all CHECK_ entries (stored under the 'fdr_*' keys).
for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    labelname = 'I-M1'
    fdrs_list, ids_list = [], []

    for z in listdir(identipy_mpscore1_folder):
        # Skip anything that is not a PSM table of the current dataset.
        if not (basic_name in z and z.endswith('_PSMs.tsv')):
            continue
        df00 = pd.read_table(path.join(identipy_mpscore1_folder, z))
        tot = len(df00)
        dec = len(df00[df00['proteins'].apply(is_check_mpscore_tandem)])
        fdrs_list.append(dec * 100 / tot)
        ids_list.append(tot)

    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all[basic_name][labelname] = {
        'fdr_mean': np.mean(fdrs_list),
        'ids_mean': np.mean(ids_list),
        'fdr_std': np.std(fdrs_list),
        'ids_std': np.std(ids_list),
    }

In [None]:
# Display titles for the subplots, keyed by dataset base name (the keys of
# results_all). Datasets missing here get a None title via title_map.get().
title_map = {
    'confetti_trypsin_01': 'Guo et al.',
    '20100609_Velos1_TaGe_SA_293_4': 'Geiger et al.',
    'olsen_100ng_30min_15k_01': 'Kelstrup et al.'
}

# One figure with a 3x1 grid; `i` counts the subplots drawn so far, so more
# than three (dataset, val_type) combinations will not fit the grid.
plt.figure(dpi=600)
i = 0
for dataset, workflows in results_all.items():
    print(dataset)
    # Only 'fdr' is plotted; the 'ids' branch below is unreachable with this list.
    for val_type in ['fdr',]:
        X_labels = []
        Y_vals = []
        Y_std = []
        # Workflows are drawn in alphabetical label order.
        for workflow, res in sorted(workflows.items()):
            X_labels.append(workflow)
            # NOTE(review): mean and std are doubled before plotting — presumably
            # because CHECK_ matches account for only half of the false matches;
            # confirm against the study design.
            Y_vals.append(res['%s_mean' % (val_type, )] * 2)
            Y_std.append(res['%s_std' % (val_type, )] * 2)
        
        if val_type == 'ids':
            # Express each value as a percent gain over the smallest value
            # among workflows sharing the same label prefix (text before '-').
            min_Y_vals = dict()
            for lbl, val in zip(X_labels, Y_vals):
                se = lbl.split('-')[0]
                min_Y_vals[se] = min(min_Y_vals.get(se, 1e6), val)
#             min_Y_val = min(Y_vals)
            print(min_Y_vals)
            Y_vals = [(x - min_Y_vals[lbl.split('-')[0]])/min_Y_vals[lbl.split('-')[0]]*100
                      for x, lbl in zip(Y_vals, X_labels)]
            Y_std = [0 for x in Y_std]
            
        X_array = np.array(range(len(X_labels)))
        # Bar colour is chosen by the first letter of the workflow label.
        colors_dict = {
            'C': 'y',
            'X': 'b',
            'I': 'g',
            'M': 'k',
            'F': 'm',
        }
        plt.subplot(3, 1, i+1)
        i += 1
        if val_type == 'fdr':
            # Red reference line at 1.0 on the y axis.
            plt.hlines(1.0, -0.17, X_array[-1] + 0.17, color='r')
            # NOTE(review): separator and caption positions are hard-coded; they
            # assume a specific number and order of workflows per label prefix —
            # verify whenever workflows are added or removed.
            plt.vlines(4.5, 0, 2.5, linestyles='--')
            plt.vlines(9.5, 0, 2.5, linestyles='--')
            plt.vlines(13.5, 0, 2.5, linestyles='--')
            plt.vlines(17.5, 0, 2.5, linestyles='--')
            plt.text(0.5, 1.7, 'Comet')
            plt.text(5.5, 1.7, 'MSFragger')
            plt.text(10.25, 1.7, 'IdentiPy')
            plt.text(14.5, 1.7, 'MSGF+')
            plt.text(18.25, 1.7, 'X!Tandem')
        plt.bar(X_array, Y_vals, yerr=Y_std, width=0.33, color=[colors_dict[zz[0]] for zz in X_labels])
        # Tick text is the label part after the last '-', with 'M' shown as 'S'
        # and 'M1' shown as 'M'.
        x_ticks_arr = [z.split('-')[-1] for z in X_labels]
        x_ticks_arr = [zzz if zzz != 'M' else 'S' for zzz in x_ticks_arr]
        x_ticks_arr = [zzz if zzz != 'M1' else 'M' for zzz in x_ticks_arr]
#         print(x_ticks_arr)
        plt.xticks(X_array, x_ticks_arr, size=10)
        plt.yticks([0,1,2])
        plt.ylim(0, 2.5)
            
        plt.ylabel('FDR, %')
        plt.title(title_map.get(dataset), size=9)
        plt.tight_layout()

In [None]:
# Run Percolator on every .pin file found in the X!Tandem, IdentiPy, MSGF+ and
# MSFragger result folders, writing a '.proteins_target' file next to each input.

decoy_prefix = 'DECOY_'

# Built from the fasta_folder value set in an earlier cell (it is re-assigned
# to the same path two lines below). '%s' is filled with the number taken from
# the .pin file name.
fasta_template = path.join(fasta_folder, 'sprot_human_shuffled_DECOY_%s.fasta')

mgf_folder = path.join(infolder, 'mzml')
fasta_folder = path.join(infolder, 'fasta')

xtandem_folder = path.join(infolder, 'xtandem')
msgf_folder = path.join(infolder, 'msgf_mzml')
identipy_folder = path.join(infolder, 'identipy')
comet_folder = path.join(infolder, 'comet')
msfragger_folder = path.join(infolder, 'msfragger')
q_ranker_out = path.join(infolder, 'qranker')
perc_out = path.join(infolder, 'percolator_comet')

for ffolder in [
    xtandem_folder,
    identipy_folder,
    msgf_folder,
    msfragger_folder,
]:
    for z in listdir(ffolder):
        if z.endswith('.pin'):
            # Last '_'-separated token before the first '.' of the file name;
            # it selects the matching shuffled-decoy FASTA.
            basenum = z.split('.')[0].split('_')[-1]
            infasta = fasta_template % (basenum, )
            intf = path.join(ffolder, z)
            # str.replace swaps every '.pin' occurrence in the full path.
            outpin = intf.replace('.pin', '.proteins_target')
            # NOTE(review): the decoy pattern is hard-coded as DECOY_ here instead
            # of using decoy_prefix. Per Percolator's CLI, -A enables protein
            # inference, -l names the protein results file, -f the FASTA and
            # -r the peptide results file — confirm against the installed version.
            !percolator $intf -A -l $outpin -P DECOY_ -f $infasta -r /dev/null 

In [None]:
# Run crux percolator with protein-level output on every Comet
# '*.target.pep.xml' result, writing into the 'percolator_comet_proteins' folder.
perc_out = path.join(infolder, 'percolator_comet_proteins')
if not path.isdir(perc_out):
    mkdir(perc_out)

for z in listdir(comet_folder):
    if z.endswith('.target.pep.xml'):
        # Everything before the last '_' of the file name, plus '.mgf'.
        # NOTE(review): inmgf is computed but not passed to the command below.
        inmgf = path.join(mgf_folder, z[::-1].split('_', 1)[-1][::-1] + '.mgf')
        incomet = path.join(comet_folder, z)
        # Only names containing '.comet.target.pep.xml' are shortened; any other
        # '.target.pep.xml' name is used unchanged as the file root.
        fileroot = z.replace('.comet.target.pep.xml', '')
        # NOTE(review): the crux binary path is an absolute, machine-specific path.
        !/home/mark/crux/bin/crux percolator $incomet --decoy-prefix $decoy_prefix --output-dir $perc_out \
                        --fileroot $fileroot --protein T --overwrite T --fido-empirical-protein-q T --protein-report-duplicates T

In [None]:
# Prefix that marks decoy entries in the searched databases.
decoy_prefix = 'DECOY_'

# Result folders of the individual search engines inside the study directory.
comet_folder = path.join(infolder, 'comet')
identipy_folder = path.join(infolder, 'identipy')
msfragger_folder = path.join(infolder, 'msfragger')
msgf_folder = path.join(infolder, 'msgf_mzml')
xtandem_folder = path.join(infolder, 'xtandem')

# Output folders of the post-search tools run on the Comet results.
perc_out = path.join(infolder, 'percolator_comet_proteins')
q_ranker_out = path.join(infolder, 'qranker')

# Protein-level summary: results_all_proteins[dataset][workflow] -> stats dict.
results_all_proteins = defaultdict(dict)

# Scavager tandem

def is_check_mpscore_tandem(proteins):
    """Return True if the protein name `proteins` (a str) starts with 'CHECK_'."""
    check_prefix = 'CHECK_'
    return proteins.startswith(check_prefix)

# 'X-M' workflow, protein level: for each spectra file, read every matching
# *_proteins.tsv table from xtandem_folder and record the row count and the
# percentage of rows whose 'dbname' is a CHECK_ entry ('fdr_*' keys).
for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    labelname = 'X-M'
    fdrs_list, ids_list = [], []

    for z in listdir(xtandem_folder):
        # Skip anything that is not a protein table of the current dataset.
        if not (basic_name in z and z.endswith('_proteins.tsv')):
            continue
        df00 = pd.read_table(path.join(xtandem_folder, z))
        tot = len(df00)
        dec = len(df00[df00['dbname'].apply(is_check_mpscore_tandem)])
        fdrs_list.append(dec * 100 / tot)
        ids_list.append(tot)

    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all_proteins[basic_name][labelname] = {
        'fdr_mean': np.mean(fdrs_list),
        'ids_mean': np.mean(ids_list),
        'fdr_std': np.std(fdrs_list),
        'ids_std': np.std(ids_list),
    }
    

# Scavager msgf

def is_check_mpscore_tandem(proteins):
    """Return True if the protein name `proteins` (a str) starts with 'CHECK_'."""
    check_prefix = 'CHECK_'
    return proteins.startswith(check_prefix)

# 'M-M' workflow, protein level: for each spectra file, read every matching
# *_proteins.tsv table from msgf_folder and record the row count and the
# percentage of rows whose 'dbname' is a CHECK_ entry ('fdr_*' keys).
for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    labelname = 'M-M'
    fdrs_list, ids_list = [], []

    for z in listdir(msgf_folder):
        # Skip anything that is not a protein table of the current dataset.
        if not (basic_name in z and z.endswith('_proteins.tsv')):
            continue
        df00 = pd.read_table(path.join(msgf_folder, z))
        tot = len(df00)
        dec = len(df00[df00['dbname'].apply(is_check_mpscore_tandem)])
        fdrs_list.append(dec * 100 / tot)
        ids_list.append(tot)

    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all_proteins[basic_name][labelname] = {
        'fdr_mean': np.mean(fdrs_list),
        'ids_mean': np.mean(ids_list),
        'fdr_std': np.std(fdrs_list),
        'ids_std': np.std(ids_list),
    }
    
# Scavager fragger

def is_check_mpscore_tandem(proteins):
    """Return True if the protein name `proteins` (a str) starts with 'CHECK_'."""
    check_prefix = 'CHECK_'
    return proteins.startswith(check_prefix)

# 'F-M' workflow, protein level: for each spectra file, read every matching
# *_proteins.tsv table from msfragger_folder and record the row count and the
# percentage of rows whose 'dbname' is a CHECK_ entry ('fdr_*' keys).
for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    labelname = 'F-M'
    fdrs_list, ids_list = [], []

    for z in listdir(msfragger_folder):
        # Skip anything that is not a protein table of the current dataset.
        if not (basic_name in z and z.endswith('_proteins.tsv')):
            continue
        df00 = pd.read_table(path.join(msfragger_folder, z))
        tot = len(df00)
        dec = len(df00[df00['dbname'].apply(is_check_mpscore_tandem)])
        fdrs_list.append(dec * 100 / tot)
        ids_list.append(tot)

    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all_proteins[basic_name][labelname] = {
        'fdr_mean': np.mean(fdrs_list),
        'ids_mean': np.mean(ids_list),
        'fdr_std': np.std(fdrs_list),
        'ids_std': np.std(ids_list),
    }
    
    
# Scavager identipy

def is_check_mpscore_tandem(proteins):
    """Return True if the protein name `proteins` (a str) starts with 'CHECK_'."""
    check_prefix = 'CHECK_'
    return proteins.startswith(check_prefix)

# 'I-M' workflow, protein level: for each spectra file, read every matching
# *_proteins.tsv table from identipy_folder and record the row count and the
# percentage of rows whose 'dbname' is a CHECK_ entry ('fdr_*' keys).
for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    labelname = 'I-M'
    fdrs_list, ids_list = [], []

    for z in listdir(identipy_folder):
        # Skip anything that is not a protein table of the current dataset.
        if not (basic_name in z and z.endswith('_proteins.tsv')):
            continue
        df00 = pd.read_table(path.join(identipy_folder, z))
        tot = len(df00)
        dec = len(df00[df00['dbname'].apply(is_check_mpscore_tandem)])
        fdrs_list.append(dec * 100 / tot)
        ids_list.append(tot)

    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all_proteins[basic_name][labelname] = {
        'fdr_mean': np.mean(fdrs_list),
        'ids_mean': np.mean(ids_list),
        'fdr_std': np.std(fdrs_list),
        'ids_std': np.std(ids_list),
    }
    
    
# Scavager comet

def is_check_mpscore_tandem(proteins):
    """Return True if the protein name `proteins` (a str) starts with 'CHECK_'."""
    check_prefix = 'CHECK_'
    return proteins.startswith(check_prefix)

# 'C-M' workflow, protein level: for each spectra file, read every matching
# *_proteins.tsv table from comet_folder and record the row count and the
# percentage of rows whose 'dbname' is a CHECK_ entry ('fdr_*' keys).
for mgffile in listdir(mgf_folder):
    basic_name = path.splitext(mgffile)[0]
    labelname = 'C-M'
    fdrs_list, ids_list = [], []

    for z in listdir(comet_folder):
        # Skip anything that is not a protein table of the current dataset.
        if not (basic_name in z and z.endswith('_proteins.tsv')):
            continue
        df00 = pd.read_table(path.join(comet_folder, z))
        tot = len(df00)
        dec = len(df00[df00['dbname'].apply(is_check_mpscore_tandem)])
        fdrs_list.append(dec * 100 / tot)
        ids_list.append(tot)

    print(labelname, basic_name, len(fdrs_list), np.mean(ids_list), np.std(ids_list), np.mean(fdrs_list), np.std(fdrs_list))
    results_all_proteins[basic_name][labelname] = {
        'fdr_mean': np.mean(fdrs_list),
        'ids_mean': np.mean(ids_list),
        'fdr_std': np.std(fdrs_list),
        'ids_std': np.std(ids_list),
    }
            
# # Prophet X!Tandem    
    
# Folders holding the Prophet outputs, one per search engine.
# NOTE(review): the protein-level loops directly below list xtandem_folder and
# comet_folder for '.prot.xml' files rather than these folders — confirm which
# location is intended.
prophet_comet_folder = path.join(infolder, 'prophet_comet')
prophet_fragger_folder = path.join(infolder, 'prophet_msfragger')
prophet_msgf_folder = path.join(infolder, 'prophet_msgf')
prophet_tandem_folder = path.join(infolder, 'prophet_xtandem')

def is_check_prophet_tandem(proteins):
    """Return a boolean Series flagging rows whose 'protein_name' starts with 'CHECK_'.

    `proteins` is a table with a string column 'protein_name' (it is called
    with the filtered DataFrame in this cell).
    """
    protein_names = proteins['protein_name']
    return protein_names.str.startswith('CHECK_')
    
def is_decoy_prophet_tandem(proteins):
    """Tell whether one record's protein name starts with ``DECOY_``.

    Unlike the ``CHECK_`` helper next to it, this works on a single record:
    ``proteins['protein_name']`` must be a plain string.

    Parameters
    ----------
    proteins : mapping or row with a ``'protein_name'`` string entry

    Returns
    -------
    bool
        True when the name begins with ``'DECOY_'``.
    """
    name = proteins['protein_name']
    return name.startswith('DECOY_')

# Protein-level statistics for the '.prot.xml' results of each search engine.
# The four workflows differ only in their label and in the folder that is
# scanned, so they are driven from one table instead of four copied loops.
# NOTE(review): the files are read from the search-engine folders, not from
# the prophet_*_folder paths defined above -- confirm this is intended.
prophet_runs = (
    ('X-Pr', xtandem_folder),
    ('C-Pr', comet_folder),
    ('F-Pr', msfragger_folder),
    ('M-Pr', msgf_folder),
)

for labelname, prot_xml_folder in prophet_runs:
    for mgffile in listdir(mgf_folder):
        # The MGF file name without extension selects the matching result files.
        basic_name = path.splitext(mgffile)[0]

        fdrs_list = []
        ids_list = []

        for z in listdir(prot_xml_folder):
            if basic_name in z and z.endswith('.prot.xml'):
                ids = protxml.DataFrame(path.join(prot_xml_folder, z))
                # Keep entries passing 1 % FDR on 'confidence' (higher is
                # better, hence reverse=True); DECOY_ entries are dropped.
                a_f = aux.filter(ids, fdr=0.01, key='confidence', is_decoy=is_decoy_prophet_tandem,
                                 reverse=True, remove_decoy=True, correction=1, formula=1)
                tot = len(a_f)
                dec = sum(is_check_prophet_tandem(a_f))
                # Nothing passed the filter -> no false discoveries; report 0 %
                # instead of failing with ZeroDivisionError.
                fdrs_list.append(dec * 100 / tot if tot else 0.0)
                ids_list.append(tot)

        # Normalise the key used in the results table: restore the original
        # capitalisation of one lower-cased dataset name and drop everything up
        # to and including 'teract-' if present.
        if basic_name == '20100609_velos1_tage_sa_293_4':
            basic_name = '20100609_Velos1_TaGe_SA_293_4'
        basic_name = basic_name.split('teract-')[-1]

        # Compute each statistic once; np.mean/np.std of an empty list yield
        # nan, which happens when no '.prot.xml' file matched this dataset.
        stats = {
            'fdr_mean': np.mean(fdrs_list),
            'ids_mean': np.mean(ids_list),
            'fdr_std': np.std(fdrs_list),
            'ids_std': np.std(ids_list),
        }
        print(labelname, basic_name, len(fdrs_list), stats['ids_mean'], stats['ids_std'], stats['fdr_mean'], stats['fdr_std'])
        results_all_proteins[basic_name][labelname] = stats

In [None]:

# Protein-level statistics for the Percolator target-protein tables.
# Every workflow runs the same computation; only the label, the folder that is
# scanned and the file-name suffix differ, so they are driven from one table.
percolator_runs = (
    ('X-P', xtandem_folder, '.proteins_target'),                # X!Tandem
    ('M-P', msgf_folder, '.proteins_target'),                   # msgf
    ('C-P', perc_out, '.percolator.target.proteins.txt'),       # Comet
    ('I-P', identipy_folder, '.proteins_target'),               # Identipy
    ('F-P', msfragger_folder, '.proteins_target'),              # MSFragger
)

for labelname, proteins_folder, proteins_suffix in percolator_runs:
    for mgffile in listdir(mgf_folder):
        # The MGF file name without extension selects the matching tables.
        basic_name = path.splitext(mgffile)[0]

        fdrs_list = []
        ids_list = []
        for z in listdir(proteins_folder):
            if basic_name in z and z.endswith(proteins_suffix):
                df00 = pd.read_table(path.join(proteins_folder, z))
                # Keep proteins reported at q-value <= 1 %.
                df00_f = df00[df00['q-value'] <= 0.01]
                tot = df00_f.shape[0]
                dec = df00_f[df00_f['ProteinId'].apply(is_check_percolator_tandem)].shape[0]
                # Nothing passed the threshold -> no false discoveries; report
                # 0 % instead of failing with ZeroDivisionError.
                fdrs_list.append(dec * 100 / tot if tot else 0.0)
                ids_list.append(tot)

        # Compute each statistic once; np.mean/np.std of an empty list yield
        # nan, which happens when no table matched this dataset.
        stats = {
            'fdr_mean': np.mean(fdrs_list),
            'ids_mean': np.mean(ids_list),
            'fdr_std': np.std(fdrs_list),
            'ids_std': np.std(ids_list),
        }
        print(labelname, basic_name, len(fdrs_list), stats['ids_mean'], stats['ids_std'], stats['fdr_mean'], stats['fdr_std'])
        results_all_proteins[basic_name][labelname] = stats
    

In [None]:
# Bar chart of the measured protein-level FDR of every workflow, one stacked
# panel per dataset, with a red reference line at 1 % (legend: 'expected FDR').
plt.figure(dpi=600)
i = 0  # number of panels drawn so far

val_types = ['fdr']  # the 'ids' branch below is kept but currently not plotted

# One row per panel. Three rows reproduce the original layout; more are
# allocated when needed, because a fixed 3-row grid makes plt.subplot raise
# ValueError as soon as a fourth panel is requested.
n_rows = max(3, len(results_all_proteins) * len(val_types))

# Bar colour per search engine, keyed by the first letter of the workflow label.
colors_dict = {
    'C': 'y',
    'X': 'b',
    'I': 'g',
    'M': 'k',
    'F': 'm',
}

for dataset, workflows in results_all_proteins.items():
    print(dataset)
    for val_type in val_types:
        X_labels = []
        Y_vals = []
        Y_std = []
        for workflow, res in sorted(workflows.items()):
            X_labels.append(workflow)
            # NOTE(review): mean and std are doubled before plotting --
            # presumably CHECK_ entries cover only half of the false
            # identifications; confirm against the database construction.
            Y_vals.append(res['%s_mean' % (val_type, )] * 2)
            Y_std.append(res['%s_std' % (val_type, )] * 2)

        if val_type == 'ids':
            # Express each value as the percentage gain over the smallest
            # value obtained with the same search engine (label prefix).
            min_Y_vals = dict()
            for lbl, val in zip(X_labels, Y_vals):
                se = lbl.split('-')[0]
                min_Y_vals[se] = min(min_Y_vals.get(se, 1e6), val)
            print(min_Y_vals)
            Y_vals = [(x - min_Y_vals[lbl.split('-')[0]])/min_Y_vals[lbl.split('-')[0]]*100
                      for x, lbl in zip(Y_vals, X_labels)]
            Y_std = [0] * len(Y_std)

        X_array = np.array(range(len(X_labels)))

        plt.subplot(n_rows, 1, i+1)
        i += 1

        # Keep the handle of the reference line so the legend can reuse it.
        fdr_line = None
        if val_type == 'fdr':
            fdr_line = plt.hlines(1.0, -0.17, X_array[-1] + 0.17, color='r')
        plt.bar(X_array, Y_vals, yerr=Y_std, width=0.33, color=[colors_dict[zz[0]] for zz in X_labels])

        # Tick text is the part of the label after the dash, with two
        # renames: 'M' is shown as 'S' and 'M1' is shown as 'M'.
        x_ticks_arr = [z.split('-')[-1] for z in X_labels]
        x_ticks_arr = [zzz if zzz != 'M' else 'S' for zzz in x_ticks_arr]
        x_ticks_arr = [zzz if zzz != 'M1' else 'M' for zzz in x_ticks_arr]
        plt.xticks(X_array, x_ticks_arr, size=10)

        if i == 1:
            # Only the first panel gets a fixed y-range and the legends.
            plt.ylim(0, 3)

            if fdr_line is None:
                # No reference line was drawn on this panel; create one so the
                # 'expected FDR' legend entry still has a handle.
                fdr_line = plt.hlines(1.0, -0.17, X_array[-1] + 0.17, color='r')

            legend_elements1 = [
                Patch(facecolor='xkcd:dirty yellow', edgecolor='y',
                      label='Color Patch'),
                Patch(facecolor='xkcd:pinkish purple', edgecolor='m',
                      label='Color Patch'),
            ]
            legend_elements2 = [
                Patch(facecolor='green', edgecolor='g',
                      label='Color Patch'),
                # Reuse the line drawn above instead of calling plt.hlines
                # again, which would add a second, overlapping line.
                fdr_line,
            ]
            legend_elements3 = [
                Patch(facecolor='black', edgecolor='k',
                      label='Color Patch'),
                Patch(facecolor='blue', edgecolor='b',
                      label='Color Patch'),
            ]

            legend_names1 = ['Comet', 'MSFragger']
            legend_names2 = ['IdentiPy', 'expected FDR']
            legend_names3 = ['MSGF+', 'X!Tandem']

            # Each plt.legend call replaces the axes' current legend, so the
            # first two are re-attached as plain artists after the third.
            legend1 = plt.legend(legend_elements1, legend_names1, loc=2, prop={'size': 8})
            legend2 = plt.legend(legend_elements2, legend_names2, loc=9, prop={'size': 8})
            plt.legend(legend_elements3, legend_names3, loc=1, prop={'size': 8})
            plt.gca().add_artist(legend1)
            plt.gca().add_artist(legend2)

        plt.ylabel('real FDR, %')
        plt.title(title_map.get(dataset), size=9)
        plt.tight_layout()