In [1]:

from glob import glob
from collections import defaultdict

def parse_fasta(filename):
    result = {}
    with open(filename) as f:
        current_id = None
        lines = []
        for l in f:
            l = l.strip()
            if not l:
                continue
            elif l.startswith(">"):
                if current_id:
                    result[current_id] = "".join(lines)
                current_id = l[1:].split("|")[0]
                lines = []
            else:
                lines.append(l)
        if current_id:
            result[current_id] = "".join(lines)
        return result

proteome = parse_fasta("../Figures/COVID/AA_sequence.txt")

def sort_key(f):
    num_attributes = sum([
        0,
        "h2b" in f,
        "h2d" in f,

    ])
    return ("bcell" in f), ("cd4" in f), ("cd8" in f), num_attributes

def save_dataframe_as_fasta(df, prefix):
    with open(prefix + ".fa", "w") as f:
        for seq, protein, start in zip(df.Sequence, df.Protein, df.Start):
            f.write(">SARS2-%s-%d-%d-%dmer\n%s\n" % (
                protein,
                start,
                start + len(seq) - 1,
                len(seq),
                seq,
            ))
            
            
def filename_to_symbol(source):
    bcell = "bcell" in source
    cd4 = "cd4" in source
    cd8 = "cd8" in source
    h2b  = "h2b" in source
    h2d = "h2d" in source

    if bcell:
        if cd4 and cd8:
             c = "\\setlength{\\fboxsep}{0.5pt} \\boxed{\\circledast}"
               
        elif cd4:
            c = "\\boxcircle"
        elif cd8:
            c = "\\boxast"
        else:
            c = "\\boxempty"
    else:
        if cd4 and cd8:
            c = "\\circledast"
        elif cd4:
            c = "\\circ"
        elif cd8:
            c = "\\ast"
        else:
            raise ValueError("Expected either CD4 or CD8")

    if h2b and h2d:
        c += "^{bd}"
    elif h2b:
        c += "^b"
    elif h2d:
        c += "^d"
    return c
    
def sources_to_hieroglyphics(sources_string):
    sources = sources_string.split(";")
    characters = []
    for source in sorted(sources, key=sort_key):
        characters.append(filename_to_symbol(source))
    if len(characters) > 4:
        symbol_string = "\Centerstack{ "
        i_values = range(len(characters) // 4 + 1)
        for i in i_values:
            subchars = characters[i * 4: (i + 1) * 4]
            if subchars:
                symbol_string += " $%s$" % (" ".join(subchars))
            if i != i_values[-1]:
                symbol_string += " \\\\ "
        symbol_string += " }"
    else:
        symbol_string = "$ %s $" % (" ".join(characters))
        
    return symbol_string
    

def format_epitope(s):
    if not s:
        return ""
    start = proteome["S"].index(s) + 1
    end = proteome["S"].index(s) + len(s)
    return "S$_{%d-%d}$" % (start, end)
    # return "\\texttt{{\\scriptsize %d-}%s{\\scriptsize -%d}}" % (
    #    start,
    #    s,
    #    end
    #)

def save_dataframe_as_latex_table(df, filename):
    df["Selection Sets"] = df.sources

    # prepare table for LaTeX
    del df["sources"]
    del df["HLA-I alleles"] 
    del df["HLA-II alleles"]
    del df["num_sources"]
    class1_cov_column = "\Centerstack{HLA-I \\\\ Coverage}"
    class2_cov_column = "\Centerstack{HLA-II \\\\ Coverage}"
    df = df.rename(columns={
        "HLA-I coverage": class1_cov_column,
        "HLA-II coverage": class2_cov_column,
        "Class I H2-b": "H2$^{b}$ I", 
        "Class II H2-b": "H2$^{b}$ II", 
        "Class I H2-d": "H2$^{d}$ I", 
        "Class II H2-d": "H2$^{d}$ II", 
        #"B-cell Epitope": "B-cell Epitope Region",
        
    })
    df = df.reset_index()
    
    df.index = df.index + 1
    with pd.option_context("max_colwidth", 1000):
        df.to_latex(
            filename,
            formatters={
                "Selection Sets": sources_to_hieroglyphics,
                "Sequence": lambda s: "\\texttt{%s}" % s,
                "B-cell Epitope": format_epitope,
                class1_cov_column: lambda x: "%0.1f\\%%" % (100 * x),
                class2_cov_column: lambda x: "%0.1f\\%%" % (100 * x),
            },
            column_format="r" + "c" * len(df.columns),
            escape=False,
        )
    
def save_dataframe(df, prefix):
    df = df.copy()
    df["has_bcell"] = df["B-cell Epitope"].str.len() > 0
    df = df.sort_values([ "has_bcell", "Protein", "Start", ], ascending=True)
    del df["has_bcell"]
    save_dataframe_as_fasta(df, prefix)
    df = df.set_index("Sequence")
    df.to_csv(prefix + ".csv")
    df.to_html(prefix + ".html")
    save_dataframe_as_latex_table(df,  prefix + ".tex")
    


    

for length in [15, 21, 27]:
    dataframes = []
    seen_peptides = set()
    peptide_to_sources = defaultdict(list)
    for f in reversed(sorted(glob("selected-*cell*%dmer.csv" % length), key=sort_key,)):
        # if "cd4" not in f and "cd8" not in f:
        #    continue
        df_full = pd.read_csv(f)
        peptides = df_full.Sequence.values
        seen_already = [p in seen_peptides for p in peptides]
        n_old = sum(seen_already)
        n_new = len(df_full) - n_old
        # print("%s: %d peptides (%d new)" % (f, len(df_full), n_new))

        for p in peptides:
            peptide_to_sources[p].append(f)

        df_sub = df_full[["Sequence", "Protein", "Start", "End"]].copy()
        df_sub["B-cell Epitope"] = [x if type(x) is str else "" for x in df_full["full_bcell_epitope"]]
        df_sub["HLA-I alleles"] = [x.replace(",", " ").replace("HLA-", "") if type(x) is str else "" for x in df_full["HLA-I_haplotypes"]]
        df_sub["HLA-II alleles"] = [x.replace(",", " ").replace("HLA-", "") if type(x) is str else "" for x in df_full["HLA-II_haplotypes"]]

        df_sub["HLA-I coverage"] = df_full['HLA-I_pop_freq_rounded']
        df_sub["HLA-II coverage"] = df_full['HLA-II_pop_freq_rounded']
        df_sub['Class I H2-b'] = ["+" if x else "-" for x in df_full["has mouse MHC-I b"]]
        df_sub['Class II H2-b'] = ["+" if x else "-" for x in df_full["has mouse MHC-II b"]]
        df_sub['Class I H2-d'] = ["+" if x else "-" for x in df_full["has mouse MHC-I d"]]
        df_sub['Class II H2-d'] = ["+" if x else "-" for x in df_full["has mouse MHC-II d"]]

        seen_peptides.update(peptides)
        dataframes.append(df_sub[~np.array(seen_already)])
    df = pd.concat(dataframes); 
    df["sources"] = [";".join(peptide_to_sources[p]) for p in df.Sequence]
    df["num_sources"] = [len(peptide_to_sources[p]) for p in df.Sequence]
    
    print("==> Total %dmer vaccine peptides: %d" % (length, len(df),))
    save_dataframe (df, "final-vaccine-peptides-%dmer" % length)
    
    h2b_mask = (df["Class I H2-b"] == "+") | (df["Class II H2-b"] == "+")
    df_h2b = df[h2b_mask]
    print("==> %d/%d with H2-b predicted ligands" % (len(df_h2b), len(df)))
    save_dataframe(df_h2b, "final-vaccine-peptides-h2b-%dmer" % length)
    
    h2d_mask = (df["Class I H2-d"] == "+") | (df["Class II H2-d"] == "+")
    df_h2d = df[h2d_mask]
    print("==> %d/%d with H2-d predicted ligands" % (len(df_h2d), len(df)))
    save_dataframe(df_h2d, "final-vaccine-peptides-h2d-%dmer" % length)
    
    murine_mask = h2d_mask | h2b_mask
    df_murine = df[murine_mask]
    print("==> %d/%d with any murine predicted ligands" % (len(df_murine), len(df)))
    save_dataframe(df_murine, "final-vaccine-peptides-murine-%dmer" % length)
    
    

==> Total 15mer vaccine peptides: 36
==> 29/36 with H2-b predicted ligands
==> 22/36 with H2-d predicted ligands
==> 29/36 with any murine predicted ligands
==> Total 21mer vaccine peptides: 22
==> 17/22 with H2-b predicted ligands
==> 13/22 with H2-d predicted ligands
==> 18/22 with any murine predicted ligands
==> Total 27mer vaccine peptides: 19
==> 17/19 with H2-b predicted ligands
==> 14/19 with H2-d predicted ligands
==> 17/19 with any murine predicted ligands


In [2]:
df

Unnamed: 0,Sequence,Protein,Start,End,B-cell Epitope,HLA-I alleles,HLA-II alleles,HLA-I coverage,HLA-II coverage,Class I H2-b,Class II H2-b,Class I H2-d,Class II H2-d,sources,num_sources
0,YLYRLFRKSNLKPFERDISTEIYQAGS,S,451,477,FRKSNLKPFERDISTEIY,C*07:02 B*07:02 A*11:01 B*08:01 A*03:01 C*06:0...,DRB1*04:01 DRB1*13:01 DRB1*11:01 DRB1*07:01,0.78,0.46,+,-,-,-,selected-bcell-cd4-cd8-27mer.csv;selected-bcel...,3
1,PQTLEILDITPCSFGGVSVITPGTNTS,S,579,605,QTLE,C*05:01,DQA1*05:05/DQB1*03:01,0.13,0.21,-,-,-,-,selected-bcell-cd4-cd8-27mer.csv;selected-bcel...,3
2,PSKPSKRSFIEDLLFNKVTLADAGFIK,S,809,835,PSKP,A*02:01 A*11:01 B*08:01 A*03:01,DRB1*13:01 DRB1*01:01 DRB1*07:01,0.66,0.4,+,-,-,+,selected-bcell-cd4-cd8-27mer.csv;selected-bcel...,5
0,FRKSNLKPFERDISTEIYQAGSTPCNG,S,456,482,FRKSNLKPFERDISTEIY,C*07:02 B*07:02 A*11:01,DQA1*05:05/DQB1*03:01 DRB1*04:01,0.46,0.3,-,+,-,-,selected-bcell-cd4-h2b-27mer.csv,1
1,IYKTPPIKDFGGFNFSQILPDPSKPSK,S,788,814,PSKP,A*03:01 A*24:02,DRB1*04:01 DRB1*01:01,0.35,0.23,-,+,-,-,selected-bcell-cd4-h2b-27mer.csv;selected-bcel...,2
0,GNYNYLYRLFRKSNLKPFERDISTEIY,S,447,473,FRKSNLKPFERDISTEIY,C*07:02 A*24:02 B*07:02 A*11:01 B*08:01 A*03:0...,DRB1*11:01 DRB1*13:01 DRB1*07:01,0.82,0.38,+,-,+,-,selected-bcell-cd8-h2b-h2d-27mer.csv;selected-...,4
1,KFLPFQQFGRDIADTTDAVRDPQTLEI,S,558,584,QTLE,,,0.0,0.0,-,-,-,-,selected-bcell-27mer.csv,1
0,GLTVLPPLLTDEMIAQYTSALLAGTIT,S,857,883,,A*02:01 C*03:04 A*01:01 B*35:01,DQA1*05:05/DQB1*03:01 DRB1*01:01 DRB1*07:01 DR...,0.66,0.73,+,+,+,+,selected-tcell-cd4-cd8-h2b-h2d-27mer.csv;selec...,4
1,FAYTKRNVIPTITQMNLKYAISAKNRA,orf1ab,4920,4946,,A*01:01 B*08:01 C*03:04 C*06:02 C*07:01 C*07:0...,DRB1*01:01 DRB1*04:01 DRB1*07:01 DRB1*11:01 DR...,0.86,0.62,+,+,+,+,selected-tcell-cd4-cd8-h2b-h2d-27mer.csv;selec...,2
2,TLMIERFVSLAIDAYPLTKHPNQEYAD,orf1ab,5245,5271,,A*03:01 A*11:01 A*02:01 B*08:01 C*03:04 C*07:0...,DQA1*05:01/DQB1*02:01 DRB1*03:01 DRB1*04:01 DR...,0.81,0.71,+,+,+,+,selected-tcell-cd4-cd8-h2b-h2d-27mer.csv;selec...,2


In [3]:
old_peptides = set([l.strip() for l in open("order-old-sorted.txt")])
new_peptides = set([l.strip() for l in open("order-new-sorted.txt")])


In [4]:
{p for p in new_peptides if p not in old_peptides}

{'GAAAYYVGYLQPRTFLLKYNENGTITD', 'SETKCTLKSFTVEKGIYQTSNFRVQPT'}

In [5]:
{p for p in old_peptides if p not in new_peptides}

{'GKGYHLMSFPQSAPHGVVFLHVTYVPA',
 'LQPELDSFKEELDKYFKNHTSPDVDLG',
 'SLIDLQELGKYEQYIKWPWYIWLGFIA',
 'TFVSGNCDVVIGIVNNTVYDPLQPELD',
 'VYDPLQPELDSFKEELDKYFKNHTSPD'}

In [6]:
!xelatex tables.tex

This is XeTeX, Version 3.14159265-2.6-0.999992 (TeX Live 2020) (preloaded format=xelatex)
 restricted \write18 enabled.
entering extended mode
(./tables.tex
LaTeX2e <2020-02-02> patch level 5
L3 programming layer <2020-03-06>
(/usr/local/texlive/2020/texmf-dist/tex/latex/base/article.cls
Document Class: article 2019/12/20 v1.4l Standard LaTeX document class
(/usr/local/texlive/2020/texmf-dist/tex/latex/base/size10.clo))
(/usr/local/texlive/2020/texmf-dist/tex/latex/geometry/geometry.sty
(/usr/local/texlive/2020/texmf-dist/tex/latex/graphics/keyval.sty)
(/usr/local/texlive/2020/texmf-dist/tex/generic/iftex/ifvtex.sty
(/usr/local/texlive/2020/texmf-dist/tex/generic/iftex/iftex.sty)))
(/usr/local/texlive/2020/texmf-dist/tex/latex/base/inputenc.sty


) (/usr/local/texlive/2020/texmf-dist/tex/latex/graphics/color.sty
(/usr/local/texlive/2020/texmf-dist/tex/latex/graphics-cfg/color.cfg)
(/usr/local/texlive/2020/texmf-dist/tex/latex/graphics-def/xetex.def))
(/usr/local/texl

In [7]:
!open tables.pdf