## 1. Download epitopes from IEDB.org

http://www.iedb.org/

Choose the following options in the search form:
* Epitope: Linear Epitopes
* Assay: Positive Assays Only, MHC Ligand Assays
* MHC Restriction: MHC Class I
* Host: Human

Hit the "search" button. Once the search is complete, hit the "Export Epitopes Results" button to get a CSV file.

## 2. Clean up IEDB Data

In [1]:
import pandas as pd
import numpy as np

# Directory holding the IEDB export and every file derived from it
data_dir = "data/epitope_protein/"
# Inclusive bounds on the epitope lengths that are kept
max_epitope_len = 25
min_epitope_len = 5

# Load the CSV exported from IEDB. `read_csv(..., index_col=0, parse_dates=True)`
# is the documented replacement for the deprecated `DataFrame.from_csv`.
# NOTE(review): the original comment said this skips the first line, but `[2:]`
# drops the first TWO rows -- confirm against the layout of the export.
epi = pd.read_csv(data_dir + "epitope_table_export_1477949116.csv",
                  index_col=0, parse_dates=True)[2:]
# Rename the columns to remove whitespace
epi.columns = ['type', 'epitope_sequence', 'start', 'end', 'chebi', 'syn',
               'protein', 'protein_id', 'organism', 'oid', 'comments']
# Remove GI entries that start with prefix "SRC". The comparison with False
# (rather than `~`) also discards rows whose protein_id is missing, because
# NaN == False evaluates to False.
epi = epi[epi.protein_id.str.startswith("SRC") == False]
# Remove entries with '+' notation (note: looking into this, e.g. "PLNISLGDVVLY + DEAM(N3)")
epi = epi[epi.epitope_sequence.str.find('+') == -1]
# Remove the "GI:" prefix from the GIs provided by IEDB. `assign` builds a new
# frame instead of writing into the filtered one, which pandas may flag as a
# chained assignment.
epi = epi.assign(protein_id=epi.protein_id.str.replace("GI:", ""))
# Drop any epitopes that are not the desired length and keep only the columns
# needed downstream
epitope_lengths = epi.epitope_sequence.str.len()
iedb_epitopes = epi.loc[(epitope_lengths >= min_epitope_len)
                        & (epitope_lengths <= max_epitope_len),
                        ["epitope_sequence", "protein_id", "start", "end"]]
# Create a file with a list of unique protein (antigen) IDs for use with BLAST
antigen_ids = iedb_epitopes["protein_id"].unique()
np.savetxt(data_dir + "protein_ids.txt", antigen_ids, fmt="%s")

num_iedb_epitopes = iedb_epitopes.shape[0]
num_iedb_proteins = antigen_ids.shape[0]

# Single-argument print(...) emits the same text on Python 2 and Python 3
print("There are %d IEDB MHC-1 epitopes from %d unique proteins (antigens)." % (num_iedb_epitopes, num_iedb_proteins))

  if self.run_code(code, result):


There are 95489 IEDB MHC-1 epitopes from 32234 unique proteins (antigens).


## 3. BLAST the Swiss-Prot sequence database

Download [the latest Swiss-Prot BLAST database from NCBI](ftp://ftp.ncbi.nlm.nih.gov/blast/db/) and save it to a local directory.

In [2]:
# Update this to point to your SWISS protein database location.
# The blastdbcmd cell below looks for the `swissprot.00` database inside this
# directory.
swiss_db_dir = "/workspace/blast/swiss/"

In [3]:
from itertools import groupby

# https://www.biostars.org/p/710/
def fasta_iter(fasta_name):
    """
    Given the path to a FASTA file, yield (header, sequence) tuples.

    The header is the first line of each ">" run with the leading ">" and
    surrounding whitespace removed. The sequence is the concatenation of the
    stripped lines that follow it, up to the next header.

    A trailing header that has no sequence lines after it ends the iteration
    without being yielded.

    The file is closed once the generator is exhausted or closed.
    """
    with open(fasta_name) as fh:
        # groupby splits the file into alternating runs of header lines and
        # sequence lines; ditch the boolean key and keep only the runs.
        faiter = (group for _, group in groupby(fh, lambda line: line[0] == ">"))
        for header_lines in faiter:
            # drop the ">"
            header = next(header_lines)[1:].strip()
            # The run after a header holds its sequence lines. Use a default
            # so a missing run ends the generator cleanly instead of leaking
            # StopIteration (a RuntimeError under PEP 479).
            sequence_lines = next(faiter, None)
            if sequence_lines is None:
                return
            # join all sequence lines to one.
            seq = "".join(s.strip() for s in sequence_lines)
            yield header, seq

In [4]:
protein_fasta = data_dir + "proteins.fasta"

print "Using BLAST to find the sequence for all the GIs"
print "Writing proteins to %s" % (protein_fasta)
blast = !blastdbcmd -db $swiss_db_dir/swissprot.00 -entry_batch $data_dir/protein_ids.txt \
                -out $protein_fasta
data = []
for (header, seq) in fasta_iter(protein_fasta):
    header_fields = header.split('|')
    assert(header_fields[0] == 'gi')
    gi = header_fields[1]
    data.append((gi, seq))
swiss_prot_db = pd.DataFrame(data, columns=['protein_id', 'protein_sequence'])
num_swiss_proteins = swiss_prot_db.shape[0]
print "Found %d of the %d IEDB proteins in the Swiss-Prot sequence database" % (num_swiss_proteins, num_iedb_proteins)

Using BLAST to find the sequence for all the GIs
Writing proteins to data/epitope_protein/proteins.fasta
Found 10114 of the 32234 IEDB proteins in the Swiss-Prot sequence database


## 4. Merge the IEDB epitope and Swiss-Prot sequence data

In [5]:
# Right-join the IEDB epitopes onto the Swiss-Prot sequences by protein ID, so
# only proteins found in Swiss-Prot are kept, then discard every row that
# still has a missing value.
iedb_swiss_merged = iedb_epitopes.merge(swiss_prot_db, on="protein_id", how="right").dropna()
# The epitopes get sorted by position later on, so make both position columns
# numeric.
for position_column in ("start", "end"):
    iedb_swiss_merged[position_column] = pd.to_numeric(iedb_swiss_merged[position_column])

In [6]:
# Preview the per-residue cut scores of a single protein (the loop stops after
# its first iteration). Score meaning:
#   0.5 - nothing known about this position
#   0   - position lies inside a known epitope
#   1   - position immediately after the last residue of an epitope
chunk_len = 20
for (protein_id, row_ids) in iedb_swiss_merged.groupby("protein_id").groups.items():
    epitope_rows = iedb_swiss_merged.loc[row_ids]
    # Grab the protein sequence from the first row
    protein_sequence = epitope_rows['protein_sequence'].iloc[0]
    # We don't know anything about the cuts to start: 50/50 chance of a cut
    cut_scores = [0.5] * len(protein_sequence)
    sorted_epitopes = epitope_rows.sort_values(by="start")
    ends = []
    for (epitope_sequence, start, end) in sorted_epitopes[["epitope_sequence", "start", "end"]].itertuples(index=False):
        # IEDB positions are 1-based and inclusive; convert them to a 0-based
        # half-open [start, end) slice.
        start = int(start) - 1
        end = int(end)
        # Double check that the start and end positions are correct
        (a, b) = (protein_sequence[start:end], epitope_sequence)
        assert a == b, "'%s' != '%s'" % (a, b)
        for position in range(start, end):
            cut_scores[position] = 0
        ends.append(end)
    # Mark the ends last due to overlapping epitopes
    for end in ends:
        # An epitope that finishes on the protein's last residue has no
        # following position to mark; indexing it would raise IndexError.
        if end < len(cut_scores):
            cut_scores[end] = 1

    # Show the sequence and its scores side by side, chunk_len residues at a time
    cut_score_chunks = [cut_scores[i:i+chunk_len] for i in range(0, len(cut_scores), chunk_len)]
    protein_sequence_chunks = [protein_sequence[i:i+chunk_len] for i in range(0, len(protein_sequence), chunk_len)]
    for (scores, peptide) in zip(cut_score_chunks, protein_sequence_chunks):
        print(peptide)
        print(scores)

    # Only the first protein is previewed
    break

MKKKLVVLGLLAVVLVLVIV
[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
GLCLWLPSASKEPDNHVYTR
[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0.5]
AAVAADAKQCSKIGRDALRD
[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
GGSAVDAAIAALLCVGLMNA
[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
HSMGIGGGLFLTIYNSTTRK
[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
AEVINAREVAPRLAFATMFN
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0.5, 0.5, 0.5, 0.5]
SSEQSQKGGLSVAVPGEIRG
[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
YELAHQRHGRLPWARLFQPS
[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
IQLARQGFPVGKGLAAALEN
[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,

In [7]:
# Writes each protein with its epitopes aligned underneath it, e.g.
# Pro: MVNLLQIVRDHWVHVLVPMGFVIGCYLDRKSDERLTAFRNKSMLFKRELQPSEEVTWK
# Epi:       IVRDHWVHV
# Epi:                                               RELQPSEEVTW
def write_epitope_protein_alignments(outfile):
    """
    Write a text alignment of every protein and its epitopes to `outfile`.

    Reads the module-level `iedb_swiss_merged` frame. For each protein the
    sequence is written in chunks of 128 characters, each on a "Pro: " line.
    Under every chunk there is one "Epi: " line per epitope (ordered by start
    position) holding the same character window of the epitope padded with
    spaces up to its start offset. That line is blank where the epitope does
    not overlap the chunk. A line of dashes separates the proteins.

    Raises AssertionError if an epitope does not match the protein sequence
    at its recorded start/end positions.
    """
    output_width = 128
    # The context manager closes the file even when an assertion below fires
    with open(outfile, "w") as alignments:
        for (protein_id, row_ids) in iedb_swiss_merged.groupby("protein_id").groups.items():
            epitope_rows = iedb_swiss_merged.loc[row_ids]
            # Grab the protein sequence from the first row
            protein_sequence = epitope_rows['protein_sequence'].iloc[0]
            sorted_epitopes = epitope_rows.sort_values(by="start")
            epitope_sequences = []
            for (epitope_sequence, start, end) in sorted_epitopes[["epitope_sequence", "start", "end"]].itertuples(index=False):
                # Positions are 1-based and inclusive; convert them to a
                # 0-based half-open [start, end) slice.
                start = int(start) - 1
                end = int(end)
                # Double check that the start and end positions are correct
                (a, b) = (protein_sequence[start:end], epitope_sequence)
                assert a == b, "'%s' != '%s'" % (a, b)
                # Left-pad so the epitope lines up under its protein residues
                epitope_sequences.append(' ' * start + epitope_sequence)
            for offset in range(0, len(protein_sequence), output_width):
                alignments.write("Pro: %s\n" % (protein_sequence[offset:offset+output_width]))
                for padded_epitope in epitope_sequences:
                    alignments.write("Epi: %s\n" % (padded_epitope[offset:offset+output_width]))
            # Separator as wide as a full line: 5 prefix chars + output_width
            alignments.write('-' * (output_width + 5) + "\n")
        
#write_epitope_protein_alignments(data_dir + "epitope_protein_alignments.txt")