In [1]:
from collections import defaultdict
from glob import glob

import pandas as pd
# Maps each epitope column of the input "selected*<N>mer.csv" files to the
# boolean flag column it produces in the combined table. The order here is the
# order of the flag columns in the output CSVs.
EPITOPE_COLUMNS = {
    "HLA-I_peptides": "human_class1",
    "HLA-II_peptides": "human_class2",
    "Mouse MHC-I b": "h2b_class1",
    "Mouse MHC-II b": "h2b_class2",
    "Mouse MHC-I d": "h2d_class1",
    "Mouse MHC-II d": "h2d_class2",
}

# Output file-name suffix -> "any_*" columns that must all be True for a row to
# be written to that file. An empty list means "write every row".
SUBSET_FILTERS = {
    "": [],
    "-murine": ["any_murine"],
    "-human": ["any_human"],
    "-h2b": ["any_h2b"],
    "-h2d": ["any_h2d"],
    "-h2b-h2d": ["any_h2b", "any_h2d"],
    "-human-h2d": ["any_human", "any_h2d"],
    "-human-h2b": ["any_human", "any_h2b"],
    "-human-h2b-h2d": ["any_human", "any_h2b", "any_h2d"],
}


def build_epitope_table(frames, vaccine_peptide_length):
    """Combine per-vaccine-peptide epitope lists into one row per epitope.

    Parameters
    ----------
    frames : iterable of pandas.DataFrame
        Each frame needs a "Sequence" column plus every column named in
        ``EPITOPE_COLUMNS``; those columns hold comma-separated peptide
        strings (missing values are treated as "no peptides").
    vaccine_peptide_length : int
        Stored verbatim in the "vaccine_peptide_length" column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct peptide, sorted by (length, sequence), with the
        ";"-joined vaccine peptides that contain it, one boolean column per
        entry of ``EPITOPE_COLUMNS`` and the derived ``any_human``,
        ``any_h2b``, ``any_h2d`` and ``any_murine`` columns.
    """
    # flag name -> {epitope -> set of vaccine peptides it was listed under}
    carriers_by_flag = {
        flag: defaultdict(set) for flag in EPITOPE_COLUMNS.values()
    }
    for frame in frames:
        frame = frame.fillna("")
        for column, flag in EPITOPE_COLUMNS.items():
            carriers = carriers_by_flag[flag]
            for vaccine_peptide, cell in zip(frame["Sequence"], frame[column]):
                for epitope in cell.split(","):
                    epitope = epitope.strip()
                    # "".split(",") yields [""], so blank tokens are dropped
                    # here instead of being inserted and filtered out later.
                    if epitope:
                        carriers[epitope].add(vaccine_peptide)

    all_peptides_list = sorted(
        set().union(*carriers_by_flag.values()), key=lambda p: (len(p), p))

    # .get() is used (not indexing) so the defaultdicts are not populated with
    # empty entries, which would corrupt the membership flags below.
    sources = {
        epitope: set().union(
            *(carriers.get(epitope, ()) for carriers in carriers_by_flag.values()))
        for epitope in all_peptides_list
    }

    columns = {
        "peptide": all_peptides_list,
        "length": [len(p) for p in all_peptides_list],
        "vaccine_peptide_length": [vaccine_peptide_length] * len(all_peptides_list),
        "vaccine_peptides": [";".join(sorted(sources[p])) for p in all_peptides_list],
    }
    for flag, carriers in carriers_by_flag.items():
        columns[flag] = [p in carriers for p in all_peptides_list]
    table = pd.DataFrame(columns)

    # An empty table would otherwise get object-dtype flag columns, which do
    # not behave as boolean masks when selecting rows.
    flag_names = list(carriers_by_flag)
    table[flag_names] = table[flag_names].astype(bool)

    table["any_human"] = table["human_class1"] | table["human_class2"]
    table["any_h2b"] = table["h2b_class1"] | table["h2b_class2"]
    table["any_h2d"] = table["h2d_class1"] | table["h2d_class2"]
    table["any_murine"] = table["any_h2b"] | table["any_h2d"]
    return table


def select_epitope_subset(table, required_flags):
    """Return the rows of ``table`` where every column in ``required_flags`` is True.

    An empty ``required_flags`` returns ``table`` unchanged.
    """
    if not required_flags:
        return table
    return table[table[list(required_flags)].all(axis=1)]


for length in [15, 21, 27]:
    input_paths = sorted(glob("selected*%dmer.csv" % length))
    if not input_paths:
        # Writing header-only files here would hide a wrong working directory
        # and could overwrite good outputs from an earlier run.
        print("No selected*%dmer.csv files found; skipping %dmer outputs"
              % (length, length))
        continue

    # dtype=str keeps every cell a string (or NaN), so .split() is always valid.
    df_combined = build_epitope_table(
        (pd.read_csv(path, dtype=str) for path in input_paths), length)

    for suffix, required_flags in SUBSET_FILTERS.items():
        select_epitope_subset(df_combined, required_flags).to_csv(
            "minimal-epitopes-in-%dmer-vaccine-peptides%s.csv" % (length, suffix),
            index=False)
    
    
    

In [2]:
# Print only the first CSV column (peptide) of the full 27mer epitope table.
!cut -d, -f1 minimal-epitopes-in-27mer-vaccine-peptides.csv




peptide
LLFNKVTL
MIERFVSL
NLKTLLSL
RLFRKSNL
RLRAKHYV
TDEMIAQY
YANRNRFL
AEAELAKNV
AEIRASANL
AELAKNVSL
AIDAYPLTK
ANRNRFLYI
ATAEAELAK
ATIPIQASL
AYANRNRFL
EVITFDNLK
FAYANRNRF
FAYTKRNVI
FIEDLLFNK
FRKSNLKPF
FVSLAIDAY
GLTVLPPLL
HAASGNLLL
HLDGEVITF
IAQYTSALL
ILDITPCSF
IPIQASLPF
IPTITQMNL
ITFDNLKTL
KSNLKPFER
LLFNKVTLA
LMIERFVSL
LPPLLTDEM
LRAKHYVYI
LSFKELLVY
LTDEMIAQY
LVATAEAEL
LVYAADPAM
MIAQYTSAL
NLKYAISAK
NRFLYIIKL
NRNRFLYII
NVIPTITQM
NYNYLYRLF
REVRTIKVF
RFLYIIKLI
RLFRKSNLK
RLQSLQTYV
RLRAKHYVY
RLSFKELLV
SEFSSLPSY
SLREVRTIK
SRLSFKELL
SSLPSYAAF
STFNVPMEK
TFDNLKTLL
TLADAGFIK
VPMEKLKTL
VVNARLRAK
VYIGDPAQL
WLIVGVALL
YAADPAMHA
YANRNRFLY
YRLFRKSNL
ASEFSSLPSY
FAYANRNRFL
FIEDLLFNKV
FLYIIKLIFL
GEVITFDNLK
GNYNYLYRLF
ILPDPSKPSK
IYKTPPIKDF
LAIDAYPLTK
LLTDEMIAQY
LTDEMIAQYT
LYIIKLIFLW
MHAASGNLLL
NVPMEKLKTL
PTITQMNLKY
RFLYIIKLIF
RLRAKHYVYI
RLSFKELLVY
SFIEDLLFNK
STFNVPMEKL
SVVNARLRAK
TLMIERFVSL
TPSDFVRATA
VPMEKLKTLV
VTLADA

In [3]:
# Print only the first CSV column (peptide) of the murine 27mer epitope table.
!cut -d, -f1 minimal-epitopes-in-27mer-vaccine-peptides-murine.csv



peptide
LLFNKVTL
MIERFVSL
RLFRKSNL
YANRNRFL
ANRNRFLYI
ATIPIQASL
AYANRNRFL
FAYANRNRF
FAYTKRNVI
IAQYTSALL
ITFDNLKTL
LMIERFVSL
MIAQYTSAL
NRFLYIIKL
NYNYLYRLF
RFLYIIKLI
RLQSLQTYV
SSLPSYAAF
TFDNLKTLL
VYIGDPAQL
YANRNRFLY
FAYANRNRFL
GNYNYLYRLF
MHAASGNLLL
TLMIERFVSL
YANRNRFLYI
YVYIGDPAQL
AADPAMHAASGNLLL
AAFATAQEAYEQAVA
ADPAMHAASGNLLLD
AKHYVYIGDPAQLPA
ASEFSSLPSYAAFAT
ATPSDFVRATATIPI
DEMIAQYTSALLAGT
DFVRATATIPIQASL
EDLLFNKVTLADAGF
EFSSLPSYAAFATAQ
EKLKTLVATAEAELA
ELLVYAADPAMHAAS
EMIAQYTSALLAGTI
ERFVSLAIDAYPLTK
FGGFNFSQILPDPSK
FKELLVYAADPAMHA
FNVPMEKLKTLVATA
FSSLPSYAAFATAQE
FVRATATIPIQASLP
GFNFSQILPDPSKPS
GGFNFSQILPDPSKP
IASEFSSLPSYAAFA
IERFVSLAIDAYPLT
ISTEIYQAGSTPCNG
ITQMNLKYAISAKNR
KELLVYAADPAMHAA
KLKTLVATAEAELAK
KRNVIPTITQMNLKY
KTLLSLREVRTIKVF
KTLVATAEAELAKNV
LIRAAEIRASANLAA
LKTLLSLREVRTIKV
LKTLVATAEAELAKN
LLFNKVTLADAGFIK
LLQFAYANRNRFLYI
LLVYAADPAMHAASG
LMIERFVSLAIDAYP
LPSYAAFATAQEAYE
LQTYVTQQLIRAAEI
LSFKELLVYAADPAM
LTDEMIAQYTSA

In [4]:

# Print only the first CSV column (peptide) of the human + H-2b + H-2d table.
!cut -d, -f1 minimal-epitopes-in-27mer-vaccine-peptides-human-h2b-h2d.csv

# Count the lines of that listing; the "peptide" header row is included.
!cut -d, -f1 minimal-epitopes-in-27mer-vaccine-peptides-human-h2b-h2d.csv | wc -l


peptide
ANRNRFLYI
FAYANRNRF
FAYTKRNVI
IAQYTSALL
LMIERFVSL
SSLPSYAAF
FAYANRNRFL
YANRNRFLYI
ASEFSSLPSYAAFAT
ATPSDFVRATATIPI
DEMIAQYTSALLAGT
DFVRATATIPIQASL
EFSSLPSYAAFATAQ
EKLKTLVATAEAELA
ELLVYAADPAMHAAS
EMIAQYTSALLAGTI
ERFVSLAIDAYPLTK
FKELLVYAADPAMHA
FSSLPSYAAFATAQE
FVRATATIPIQASLP
IASEFSSLPSYAAFA
IERFVSLAIDAYPLT
ITQMNLKYAISAKNR
KELLVYAADPAMHAA
KLKTLVATAEAELAK
KTLVATAEAELAKNV
LIRAAEIRASANLAA
LKTLVATAEAELAKN
LLQFAYANRNRFLYI
LLVYAADPAMHAASG
LPSYAAFATAQEAYE
LSFKELLVYAADPAM
LTDEMIAQYTSALLA
LVYAADPAMHAASGN
MEKLKTLVATAEAEL
MIAQYTSALLAGTIT
MIERFVSLAIDAYPL
PSDFVRATATIPIQA
PSYAAFATAQEAYEQ
QLIRAAEIRASANLA
QQLIRAAEIRASANL
RATATIPIQASLPFG
SDFVRATATIPIQAS
SEFSSLPSYAAFATA
SFKELLVYAADPAMH
SLPSYAAFATAQEAY
SSLPSYAAFATAQEA
TDEMIAQYTSALLAG
TITQMNLKYAISAKN
TPSDFVRATATIPIQ
TQMNLKYAISAKNRA
TQQLIRAAEIRASAN
VRATATIPIQASLPF
VTQQLIRAAEIRASA
YVTQQLIRAAEIRAS
      56


In [5]:

# Print only the first CSV column (peptide) of the H-2d 27mer epitope table.
!cut -d, -f1 minimal-epitopes-in-27mer-vaccine-peptides-h2d.csv

# Count the lines of that listing; the "peptide" header row is included.
!cut -d, -f1 minimal-epitopes-in-27mer-vaccine-peptides-h2d.csv | wc -l


peptide
ANRNRFLYI
AYANRNRFL
FAYANRNRF
FAYTKRNVI
IAQYTSALL
LMIERFVSL
NYNYLYRLF
RFLYIIKLI
RLQSLQTYV
SSLPSYAAF
TFDNLKTLL
VYIGDPAQL
FAYANRNRFL
MHAASGNLLL
YANRNRFLYI
YVYIGDPAQL
ASEFSSLPSYAAFAT
ATPSDFVRATATIPI
DEMIAQYTSALLAGT
DFVRATATIPIQASL
EDLLFNKVTLADAGF
EFSSLPSYAAFATAQ
EKLKTLVATAEAELA
ELLVYAADPAMHAAS
EMIAQYTSALLAGTI
ERFVSLAIDAYPLTK
FKELLVYAADPAMHA
FNVPMEKLKTLVATA
FSSLPSYAAFATAQE
FVRATATIPIQASLP
IASEFSSLPSYAAFA
IERFVSLAIDAYPLT
ITQMNLKYAISAKNR
KELLVYAADPAMHAA
KLKTLVATAEAELAK
KRNVIPTITQMNLKY
KTLLSLREVRTIKVF
KTLVATAEAELAKNV
LIRAAEIRASANLAA
LKTLLSLREVRTIKV
LKTLVATAEAELAKN
LLFNKVTLADAGFIK
LLQFAYANRNRFLYI
LLVYAADPAMHAASG
LMIERFVSLAIDAYP
LPSYAAFATAQEAYE
LQTYVTQQLIRAAEI
LSFKELLVYAADPAM
LTDEMIAQYTSALLA
LVYAADPAMHAASGN
MEKLKTLVATAEAEL
MIAQYTSALLAGTIT
MIERFVSLAIDAYPL
NVPMEKLKTLVATAE
NYDLSVVNARLRAKH
PMEKLKTLVATAEAE
PSDFVRATATIPIQA
PSYAAFATAQEAYEQ
QLIRAAEIRASANLA
QQLIRAAEIRASANL
QSLQTYVTQQLIRAA
QTYVTQQLIRAAEIR
RATATIPIQASLPFG
SDFVRATATIPI

In [6]:

# Print only the first CSV column (peptide) of the human + H-2d 27mer table.
!cut -d, -f1 minimal-epitopes-in-27mer-vaccine-peptides-human-h2d.csv

# Count the lines of that listing; the "peptide" header row is included.
!cut -d, -f1 minimal-epitopes-in-27mer-vaccine-peptides-human-h2d.csv | wc -l


peptide
ANRNRFLYI
AYANRNRFL
FAYANRNRF
FAYTKRNVI
IAQYTSALL
LMIERFVSL
NYNYLYRLF
RFLYIIKLI
RLQSLQTYV
SSLPSYAAF
TFDNLKTLL
VYIGDPAQL
FAYANRNRFL
MHAASGNLLL
YANRNRFLYI
YVYIGDPAQL
ASEFSSLPSYAAFAT
ATPSDFVRATATIPI
DEMIAQYTSALLAGT
DFVRATATIPIQASL
EDLLFNKVTLADAGF
EFSSLPSYAAFATAQ
EKLKTLVATAEAELA
ELLVYAADPAMHAAS
EMIAQYTSALLAGTI
ERFVSLAIDAYPLTK
FKELLVYAADPAMHA
FNVPMEKLKTLVATA
FSSLPSYAAFATAQE
FVRATATIPIQASLP
IASEFSSLPSYAAFA
IERFVSLAIDAYPLT
ITQMNLKYAISAKNR
KELLVYAADPAMHAA
KLKTLVATAEAELAK
KRNVIPTITQMNLKY
KTLLSLREVRTIKVF
KTLVATAEAELAKNV
LIRAAEIRASANLAA
LKTLLSLREVRTIKV
LKTLVATAEAELAKN
LLFNKVTLADAGFIK
LLQFAYANRNRFLYI
LLVYAADPAMHAASG
LMIERFVSLAIDAYP
LPSYAAFATAQEAYE
LQTYVTQQLIRAAEI
LSFKELLVYAADPAM
LTDEMIAQYTSALLA
LVYAADPAMHAASGN
MEKLKTLVATAEAEL
MIAQYTSALLAGTIT
MIERFVSLAIDAYPL
NVPMEKLKTLVATAE
NYDLSVVNARLRAKH
PMEKLKTLVATAEAE
PSDFVRATATIPIQA
PSYAAFATAQEAYEQ
QLIRAAEIRASANLA
QQLIRAAEIRASANL
QSLQTYVTQQLIRAA
QTYVTQQLIRAAEIR
RATATIPIQASLPFG
SDFVRATATIPI