In [7]:

from glob import glob
from collections import defaultdict

def sort_key(f):
    num_attributes = sum([
        0,
        "h2b" in f,
        "h2d" in f,
        "cd4" in f,
        "cd8" in f,
    ])
    return ("bcell" in f), ("cd4" in f), num_attributes

def save_dataframe_as_fasta(df, prefix):
    with open(prefix + ".fa", "w") as f:
        for seq, protein, start in zip(df.Sequence, df.Protein, df.Start):
            f.write(">SARS2-%s-%d-%d-%dmer\n%s\n" % (
                protein,
                start,
                start + len(seq) - 1,
                len(seq),
                seq,
            ))
    
def save_dataframe(df, prefix):
    df = df.copy()
    df = df.sort_values([ "Protein", "Start", "B-cell Epitope",], ascending=True)
    save_dataframe_as_fasta(df, prefix)
    df = df.set_index("Sequence")
    df.to_csv(prefix + ".csv")
    df.to_html(prefix + ".html")
    # prepare table for LaTeX
    
    del df["HLA-I alleles"] 
    del df["HLA-II alleles"]
    del df["sources"]
    del df["num_sources"]
    df = df.rename(columns={
        "Class I H2-b": "H2-b I", 
        "Class II H2-b": "H2-b II", 
        "Class I H2-d": "H2-d I", 
        "Class II H2-d": "H2-d II", 
    })
    df.to_latex(prefix + ".tex")
    
    

for length in [15, 21, 27]:

    dataframes = []
    seen_peptides = set()
    peptide_to_sources = defaultdict(list)
    for f in reversed(sorted(glob("selected-*%dmer.csv" % length), key=sort_key,)):
        # if "cd4" not in f and "cd8" not in f:
        #    continue
        df_full = pd.read_csv(f)
        peptides = df_full.Sequence.values
        seen_already = [p in seen_peptides for p in peptides]
        n_old = sum(seen_already)
        n_new = len(df_full) - n_old
        # print("%s: %d peptides (%d new)" % (f, len(df_full), n_new))

        for p in peptides:
            peptide_to_sources[p].append(f)

        df_sub = df_full[["Sequence", "Protein", "Start", "End"]].copy()
        df_sub["B-cell Epitope"] = [x if type(x) is str else "" for x in df_full["full_bcell_epitope"]]
        df_sub["HLA-I alleles"] = [x.replace(",", " ").replace("HLA-", "") if type(x) is str else "" for x in df_full["HLA-I_haplotypes"]]
        df_sub["HLA-II alleles"] = [x.replace(",", " ").replace("HLA-", "") if type(x) is str else "" for x in df_full["HLA-II_haplotypes"]]

        df_sub["HLA-I coverage"] = df_full['HLA-I_pop_freq_rounded']
        df_sub["HLA-II coverage"] = df_full['HLA-II_pop_freq_rounded']
        df_sub['Class I H2-b'] = ["+" if x else "-" for x in df_full["has mouse MHC-I b"]]
        df_sub['Class II H2-b'] = ["+" if x else "-" for x in df_full["has mouse MHC-II b"]]
        df_sub['Class I H2-d'] = ["+" if x else "-" for x in df_full["has mouse MHC-I d"]]
        df_sub['Class II H2-d'] = ["+" if x else "-" for x in df_full["has mouse MHC-II d"]]

        seen_peptides.update(peptides)
        dataframes.append(df_sub[~np.array(seen_already)])
    df = pd.concat(dataframes); 
    df["sources"] = [";".join(peptide_to_sources[p]) for p in df.Sequence]
    df["num_sources"] = [len(peptide_to_sources[p]) for p in df.Sequence]
    
    print("==> Total %dmer vaccine peptides: %d" % (length, len(df),))
    save_dataframe (df, "final-vaccine-peptides-%dmer" % length)
    
    h2b_mask = (df["Class I H2-b"] == "+") | (df["Class II H2-b"] == "+")
    df_h2b = df[h2b_mask]
    print("==> %d/%d with H2-b predicted ligands" % (len(df_h2b), len(df)))
    save_dataframe(df_h2b, "final-vaccine-peptides-h2b-%dmer" % length)
    
    h2d_mask = (df["Class I H2-d"] == "+") | (df["Class II H2-d"] == "+")
    df_h2d = df[h2d_mask]
    print("==> %d/%d with H2-d predicted ligands" % (len(df_h2d), len(df)))
    save_dataframe(df_h2d, "final-vaccine-peptides-h2d-%dmer" % length)
    
    murine_mask = h2d_mask | h2b_mask
    df_murine = df[murine_mask]
    print("==> %d/%d with any murine predicted ligands" % (len(df_murine), len(df)))
    save_dataframe(df_murine, "final-vaccine-peptides-murine-%dmer" % length)
    
    

==> Total 15mer vaccine peptides: 36
==> 23/36 with H2-b predicted ligands
==> 18/36 with H2-d predicted ligands
==> 26/36 with any murine predicted ligands
==> Total 21mer vaccine peptides: 30
==> 22/30 with H2-b predicted ligands
==> 18/30 with H2-d predicted ligands
==> 27/30 with any murine predicted ligands
==> Total 27mer vaccine peptides: 27
==> 19/27 with H2-b predicted ligands
==> 17/27 with H2-d predicted ligands
==> 23/27 with any murine predicted ligands


In [8]:
df

Unnamed: 0,Sequence,Protein,Start,End,B-cell Epitope,HLA-I alleles,HLA-II alleles,HLA-I coverage,HLA-II coverage,Class I H2-b,Class II H2-b,Class I H2-d,Class II H2-d,sources,num_sources
0,FRKSNLKPFERDISTEIYQAGSTPCNG,S,456,482,FRKSNLKPFERDISTEIY,C*07:02 B*07:02 A*11:01,DQA1*05:05/DQB1*03:01 DRB1*04:01,0.46,0.3,-,+,-,-,selected-bcell-cd4-h2b-27mer.csv;selected-bcel...,2
1,IYKTPPIKDFGGFNFSQILPDPSKPSK,S,788,814,PSKP,A*03:01 A*24:02,DRB1*04:01 DRB1*01:01,0.35,0.23,-,+,-,-,selected-bcell-cd4-h2b-27mer.csv;selected-bcel...,3
0,PSKPSKRSFIEDLLFNKVTLADAGFIK,S,809,835,PSKP,A*02:01 A*11:01 B*08:01 A*03:01,DRB1*13:01 DRB1*01:01 DRB1*07:01,0.66,0.4,+,-,-,+,selected-bcell-cd4-h2d-27mer.csv;selected-bcel...,6
0,YLYRLFRKSNLKPFERDISTEIYQAGS,S,451,477,FRKSNLKPFERDISTEIY,C*07:02 B*07:02 A*11:01 B*08:01 A*03:01 C*06:0...,DRB1*04:01 DRB1*13:01 DRB1*11:01 DRB1*07:01,0.78,0.46,+,-,-,-,selected-bcell-cd4-cd8-27mer.csv;selected-bcel...,3
1,PQTLEILDITPCSFGGVSVITPGTNTS,S,579,605,QTLE,C*05:01,DQA1*05:05/DQB1*03:01,0.13,0.21,-,-,-,-,selected-bcell-cd4-cd8-27mer.csv;selected-bcel...,3
3,TFVSGNCDVVIGIVNNTVYDPLQPELD,S,1120,1146,QPELD,C*04:01,DRB1*01:01,0.23,0.13,-,-,+,-,selected-bcell-cd4-cd8-27mer.csv,1
3,LQPELDSFKEELDKYFKNHTSPDVDLG,S,1141,1167,QPELD,,DRB1*01:01 DRB1*04:01 DRB1*07:01,0.0,0.41,-,-,-,-,selected-bcell-cd4-27mer.csv;selected-bcell-va...,2
0,GNYNYLYRLFRKSNLKPFERDISTEIY,S,447,473,FRKSNLKPFERDISTEIY,C*07:02 A*24:02 B*07:02 A*11:01 B*08:01 A*03:0...,DRB1*11:01 DRB1*13:01 DRB1*07:01,0.82,0.38,+,-,+,-,selected-bcell-cd8-h2b-h2d-27mer.csv;selected-...,4
1,VYDPLQPELDSFKEELDKYFKNHTSPD,S,1137,1163,QPELD,C*04:01,,0.23,0.0,-,-,+,-,selected-bcell-cd8-h2d-27mer.csv;selected-bcel...,3
1,KFLPFQQFGRDIADTTDAVRDPQTLEI,S,558,584,QTLE,,,0.0,0.0,-,-,-,-,selected-bcell-27mer.csv,1


In [61]:
df.to_latex("table.tex")