In [3]:
from collections import defaultdict

import pandas as pd

In [4]:
def add_protein_groups(df, df_ms1_path, sort_random=True):
    """Flag protein group leaders in ``df`` with a greedy peptide-cover pass.

    Proteins are selected one at a time: on every step the protein with the
    highest current score (number of its peptides not yet claimed by an
    earlier leader, plus an optional MS1 bonus) becomes a group leader and
    all of its peptides are marked as claimed. The loop stops when every
    peptide has been claimed.

    Args:
        df: pandas dataframe with the columns "dbname" (protein name) and
            "peptides set" (collection of peptide sequences identified for
            this protein in the MS/MS analysis).
        df_ms1_path: path to the tab-separated table
            ``*_proteins_full_noexclusion.tsv`` produced by DirectMS1; its
            "score" and "dbname" columns are read. If falsy, only peptide
            counts are used for ranking. Otherwise each protein's score is
            increased by ``score / max(score)`` taken from that table
            (0 for proteins absent from it).
        sort_random: if True the rows are shuffled before processing,
            otherwise they are processed sorted by "dbname". This fixes the
            insertion order of the score dictionary handed to
            ``keywithmaxval``; presumably that helper returns the first key
            among equal maxima, which is what makes ties random or
            alphabetical -- verify against its definition.

    Side effects:
        Adds two columns to ``df`` in place:
        "groupleader" - True for proteins chosen as group leaders;
        "all proteins" - ';'-separated, alphabetically sorted names of all
        proteins sharing at least one peptide with this protein (the protein
        itself included).

    NOTE(review): the original header comment lists the dataframe as the
    function's output, but no ``return`` is visible in this part of the
    file -- confirm whether callers rely on a returned value.
    """
    pept_prots = defaultdict(set)   # peptide -> proteins that contain it
    prot_prots = defaultdict(set)   # protein -> proteins sharing a peptide
    prot_pepts = dict()             # protein -> its peptide collection

    if sort_random:
        ordered = df.sample(frac=1)
    else:
        ordered = df.sort_values(by='dbname')
    iter_list = ordered.reset_index(drop=True)[['peptides set', 'dbname']].values

    for peptides, dbname in iter_list:
        prot_pepts[dbname] = peptides
        for peptide in peptides:
            pept_prots[peptide].add(dbname)
    for prots in pept_prots.values():
        for dbname in prots:
            prot_prots[dbname].update(prots)

    # prot_pepts_count  - ranking score (peptide count + optional MS1 bonus)
    # prot_pepts_count2 - plain count of still-unclaimed peptides
    prot_pepts_count2 = {prot: len(pepts) for prot, pepts in prot_pepts.items()}
    if not df_ms1_path:
        prot_pepts_count = dict(prot_pepts_count2)
    else:
        df_ms1 = pd.read_csv(df_ms1_path, sep='\t')
        ms1s = {
            prot: float(score)
            for score, prot in df_ms1[['score', 'dbname']].values
        }
        # An empty MS1 table or an all-zero score column must not crash the
        # normalisation; in that case no bonus is applied.
        max_k = max(ms1s.values(), default=0)
        prot_pepts_count = dict()
        for prot, pepts in prot_pepts.items():
            bonus = ms1s.get(prot, 0) / max_k if max_k else 0
            prot_pepts_count[prot] = len(pepts) + bonus

    tostay = set()
    while pept_prots:
        # Drop proteins with no unclaimed peptides *before* choosing, so a
        # protein that explains nothing (e.g. an empty peptide set that only
        # carries an MS1 bonus) can never be picked as a leader.
        for prot in [p for p, left in prot_pepts_count2.items() if left == 0]:
            del prot_pepts_count[prot]
            del prot_pepts_count2[prot]

        bestprot = keywithmaxval(prot_pepts_count)
        tostay.add(bestprot)
        for pep in prot_pepts[bestprot]:
            # Peptides already claimed by an earlier leader are no longer in
            # pept_prots; pop() with a default skips them without re-creating
            # the key.
            for prot in pept_prots.pop(pep, ()):
                prot_pepts_count[prot] -= 1
                prot_pepts_count2[prot] -= 1

    df['groupleader'] = df['dbname'].isin(tostay)
    # sorted() keeps the column reproducible: set iteration order of strings
    # changes between interpreter runs.
    df['all proteins'] = df['dbname'].apply(
        lambda x: ';'.join(sorted(prot_prots[x]))
    )