In [69]:

from glob import glob
from collections import defaultdict

def sort_key(f):
    num_attributes = sum([
        0,
        "h2b" in f,
        "h2d" in f,
        "cd4" in f,
        "cd8" in f,
    ])
    return ("bcell" in f), ("cd4" in f), num_attributes

for length in [15, 21, 27]:

    dataframes = []
    seen_peptides = set()
    peptide_to_sources = defaultdict(list)
    for f in reversed(sorted(glob("selected-*%dmer.csv" % length), key=sort_key,)):
        # if "cd4" not in f and "cd8" not in f:
        #    continue
        df_full = pd.read_csv(f)
        peptides = df_full.Sequence.values
        seen_already = [p in seen_peptides for p in peptides]
        n_old = sum(seen_already)
        n_new = len(df_full) - n_old
        print("%s: %d peptides (%d new)" % (f, len(df_full), n_new))

        for p in peptides:
            peptide_to_sources[p].append(f)

        df_sub = df_full[["Sequence", "Protein", "Start", "End"]].copy()
        df_sub["B-cell Epitope"] = [x if type(x) is str else "" for x in df_full["full_bcell_epitope"]]
        df_sub["HLA-I alleles"] = [x if type(x) is str else "" for x in df_full["HLA-I_haplotypes"]]
        df_sub["HLA-II alleles"] = [x if type(x) is str else "" for x in df_full["HLA-II_haplotypes"]]

        df_sub["HLA-I coverage"] = df_full['HLA-I_pop_freq_rounded']
        df_sub["HLA-II coverage"] = df_full['HLA-II_pop_freq_rounded']
        df_sub['Class I H2-b'] = ["+" if x else "-" for x in df_full["has mouse MHC-I b"]]
        df_sub['Class II H2-b'] = ["+" if x else "-" for x in df_full["has mouse MHC-II b"]]
        df_sub['Class I H2-d'] = ["+" if x else "-" for x in df_full["has mouse MHC-I d"]]
        df_sub['Class II H2-d'] = ["+" if x else "-" for x in df_full["has mouse MHC-II d"]]

        seen_peptides.update(peptides)
        dataframes.append(df_sub[~np.array(seen_already)])
    df = pd.concat(dataframes); 
    df["sources"] = [";".join(peptide_to_sources[p]) for p in df.Sequence]
    df["num_sources"] = [len(peptide_to_sources[p]) for p in df.Sequence]
    df = df.set_index("Sequence")
    print("==> Total %dmer vaccine peptides: %d" % (length, len(df),))
    df.to_csv("final-vaccine-peptides-%dmer.csv" % length)
    
    h2b_mask = (df["Class I H2-b"] == "+") | (df["Class II H2-b"] == "+")
    df_h2b = df[h2b_mask]
    print("==> %d/%d with H2-b predicted ligands" % (len(df_h2b), len(df)))
    df_h2b.to_csv("final-vaccine-peptides-h2b-%dmer.csv" % length)
    
    h2d_mask = (df["Class I H2-d"] == "+") | (df["Class II H2-d"] == "+")
    df_h2d = df[h2d_mask]
    print("==> %d/%d with H2-d predicted ligands" % (len(df_h2d), len(df)))
    df_h2d.to_csv("final-vaccine-peptides-h2d-%dmer.csv" % length)
    
    murine_mask = h2d_mask | h2b_mask
    df_murine = df[murine_mask]
    print("==> %d/%d with any murine predicted ligands" % (len(df_murine), len(df)))
    df_murine.to_csv("final-vaccine-peptides-murine-%dmer.csv" % length)
    

selected-bcell-cd4-h2b-15mer.csv: 2 peptides (2 new)
selected-bcell-cd4-cd8-15mer.csv: 5 peptides (5 new)
selected-bcell-cd4-15mer.csv: 6 peptides (1 new)
selected-bcell-cd8-h2b-h2d-15mer.csv: 1 peptides (0 new)
selected-bcell-cd8-h2b-15mer.csv: 3 peptides (3 new)
selected-bcell-cd8-h2d-15mer.csv: 2 peptides (0 new)
selected-bcell-cd8-15mer.csv: 6 peptides (2 new)
selected-bcell-15mer.csv: 4 peptides (0 new)
selected-tcell-cd4-cd8-h2b-h2d-15mer.csv: 2 peptides (2 new)
selected-tcell-cd4-cd8-h2b-15mer.csv: 3 peptides (2 new)
selected-tcell-cd4-cd8-h2d-15mer.csv: 4 peptides (3 new)
selected-tcell-cd4-h2b-h2d-15mer.csv: 3 peptides (2 new)
selected-tcell-cd4-cd8-15mer.csv: 5 peptides (2 new)
selected-tcell-cd4-h2b-15mer.csv: 3 peptides (1 new)
selected-tcell-cd4-h2d-15mer.csv: 3 peptides (0 new)
selected-tcell-cd4-15mer.csv: 3 peptides (0 new)
selected-tcell-cd8-h2b-h2d-15mer.csv: 3 peptides (2 new)
selected-tcell-cd8-h2d-15mer.csv: 3 peptides (2 new)
selected-tcell-cd8-h2b-15mer.csv: 3 pe

In [70]:
df

Unnamed: 0_level_0,Protein,Start,End,B-cell Epitope,HLA-I alleles,HLA-II alleles,HLA-I coverage,HLA-II coverage,Class I H2-b,Class II H2-b,Class I H2-d,Class II H2-d,sources,num_sources
Sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
FRKSNLKPFERDISTEIYQAGSTPCNG,S,456,482,RKSNLKPFERDISTEIY,"HLA-C*07:02,HLA-B*07:02,HLA-A*11:01","HLA-DQA1*05:05/DQB1*03:01,HLA-DRB1*04:01",0.46,0.3,-,+,-,-,selected-bcell-cd4-h2b-27mer.csv,1
IYKTPPIKDFGGFNFSQILPDPSKPSK,S,788,814,PSKP,"HLA-A*03:01,HLA-A*24:02","HLA-DRB1*04:01,HLA-DRB1*01:01",0.35,0.23,-,+,-,-,selected-bcell-cd4-h2b-27mer.csv;selected-bcel...,2
PSKPSKRSFIEDLLFNKVTLADAGFIK,S,809,835,PSKP,"HLA-A*02:01,HLA-A*11:01,HLA-B*08:01,HLA-A*03:01","HLA-DRB1*13:01,HLA-DRB1*01:01,HLA-DRB1*07:01",0.66,0.4,+,-,-,+,selected-bcell-cd4-h2d-27mer.csv;selected-bcel...,5
YLYRLFRKSNLKPFERDISTEIYQAGS,S,451,477,RKSNLKPFERDISTEIY,"HLA-C*07:02,HLA-B*07:02,HLA-A*11:01,HLA-B*08:0...","HLA-DRB1*04:01,HLA-DRB1*13:01,HLA-DRB1*11:01,H...",0.78,0.46,+,-,-,-,selected-bcell-cd4-cd8-27mer.csv;selected-bcel...,3
PQTLEILDITPCSFGGVSVITPGTNTS,S,579,605,QTLE,HLA-C*05:01,HLA-DQA1*05:05/DQB1*03:01,0.13,0.21,-,-,-,-,selected-bcell-cd4-cd8-27mer.csv;selected-bcel...,3
TFVSGNCDVVIGIVNNTVYDPLQPELD,S,1120,1146,QPELD,HLA-C*04:01,HLA-DRB1*01:01,0.23,0.13,-,-,+,-,selected-bcell-cd4-cd8-27mer.csv,1
LQPELDSFKEELDKYFKNHTSPDVDLG,S,1141,1167,QPELD,,"HLA-DRB1*01:01,HLA-DRB1*04:01,HLA-DRB1*07:01",0.0,0.41,-,-,-,-,selected-bcell-cd4-27mer.csv,1
GNYNYLYRLFRKSNLKPFERDISTEIY,S,447,473,RKSNLKPFERDISTEIY,"HLA-C*07:02,HLA-A*24:02,HLA-B*07:02,HLA-A*11:0...","HLA-DRB1*11:01,HLA-DRB1*13:01,HLA-DRB1*07:01",0.82,0.38,+,-,+,-,selected-bcell-cd8-h2b-h2d-27mer.csv;selected-...,4
VYDPLQPELDSFKEELDKYFKNHTSPD,S,1137,1163,QPELD,HLA-C*04:01,,0.23,0.0,-,-,+,-,selected-bcell-cd8-h2d-27mer.csv;selected-bcel...,3
KFLPFQQFGRDIADTTDAVRDPQTLEI,S,558,584,QTLE,,,0.0,0.0,-,-,-,-,selected-bcell-27mer.csv,1


Unnamed: 0_level_0,Protein,Start,End,B-cell Epitope,HLA-I alleles,HLA-II alleles,HLA-I coverage,HLA-II coverage,Class I H2-b,Class II H2-b,Class I H2-d,Class II H2-d,sources,num_sources
Sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
FRKSNLKPFERDISTEIYQAGSTPCNG,S,456,482,RKSNLKPFERDISTEIY,"HLA-C*07:02,HLA-B*07:02,HLA-A*11:01","HLA-DQA1*05:05/DQB1*03:01,HLA-DRB1*04:01",0.46,0.3,-,+,-,-,selected-bcell-cd4-h2b-27mer.csv,1
IYKTPPIKDFGGFNFSQILPDPSKPSK,S,788,814,PSKP,"HLA-A*03:01,HLA-A*24:02","HLA-DRB1*04:01,HLA-DRB1*01:01",0.35,0.23,-,+,-,-,selected-bcell-cd4-h2b-27mer.csv;selected-bcel...,2
PSKPSKRSFIEDLLFNKVTLADAGFIK,S,809,835,PSKP,"HLA-A*02:01,HLA-A*11:01,HLA-B*08:01,HLA-A*03:01","HLA-DRB1*13:01,HLA-DRB1*01:01,HLA-DRB1*07:01",0.66,0.4,+,-,-,+,selected-bcell-cd4-h2d-27mer.csv;selected-bcel...,5
YLYRLFRKSNLKPFERDISTEIYQAGS,S,451,477,RKSNLKPFERDISTEIY,"HLA-C*07:02,HLA-B*07:02,HLA-A*11:01,HLA-B*08:0...","HLA-DRB1*04:01,HLA-DRB1*13:01,HLA-DRB1*11:01,H...",0.78,0.46,+,-,-,-,selected-bcell-cd4-cd8-27mer.csv;selected-bcel...,3
PQTLEILDITPCSFGGVSVITPGTNTS,S,579,605,QTLE,HLA-C*05:01,HLA-DQA1*05:05/DQB1*03:01,0.13,0.21,-,-,-,-,selected-bcell-cd4-cd8-27mer.csv;selected-bcel...,3
TFVSGNCDVVIGIVNNTVYDPLQPELD,S,1120,1146,QPELD,HLA-C*04:01,HLA-DRB1*01:01,0.23,0.13,-,-,+,-,selected-bcell-cd4-cd8-27mer.csv,1
LQPELDSFKEELDKYFKNHTSPDVDLG,S,1141,1167,QPELD,,"HLA-DRB1*01:01,HLA-DRB1*04:01,HLA-DRB1*07:01",0.0,0.41,-,-,-,-,selected-bcell-cd4-27mer.csv,1
GNYNYLYRLFRKSNLKPFERDISTEIY,S,447,473,RKSNLKPFERDISTEIY,"HLA-C*07:02,HLA-A*24:02,HLA-B*07:02,HLA-A*11:0...","HLA-DRB1*11:01,HLA-DRB1*13:01,HLA-DRB1*07:01",0.82,0.38,+,-,+,-,selected-bcell-cd8-h2b-h2d-27mer.csv;selected-...,4
VYDPLQPELDSFKEELDKYFKNHTSPD,S,1137,1163,QPELD,HLA-C*04:01,,0.23,0.0,-,-,+,-,selected-bcell-cd8-h2d-27mer.csv;selected-bcel...,3
KFLPFQQFGRDIADTTDAVRDPQTLEI,S,558,584,QTLE,,,0.0,0.0,-,-,-,-,selected-bcell-27mer.csv,1


In [61]:
df.to_latex("table.tex")