## Download epitopes from IEDB.org

http://www.iedb.org/

Choose the following options in the search form:
* Epitope: Linear Epitopes
* Assay: Positive Assays Only, MHC Ligand Assays
* MHC Restriction: MHC Class I
* Host: Human

Hit the "search" button. Once the search is complete, hit the "Export Epitopes Results" button to get a CSV file.

## Clean up IEDB Data

In [1]:
import pandas as pd
import numpy as np

data_dir = "data/epitope_protein/"

# Load the IEDB CSV export, using its first column as the index, then discard
# the first two rows of the resulting frame ([2:]).
# NOTE(review): presumably those rows are extra header lines in the export -- confirm.
# pd.read_csv(..., index_col=0, parse_dates=True) is the documented replacement
# for the deprecated pd.DataFrame.from_csv(...).
epi = pd.read_csv(data_dir + "epitope_table_export_1477949116.csv", index_col=0, parse_dates=True)[2:]
# Rename the columns to short identifiers without whitespace
epi.columns = ['type', 'epitope_sequence', 'start', 'end', 'chebi', 'syn', 'protein', 'protein_id', 'organism', 'oid', 'comments']
# Keep only rows whose protein_id does not start with the prefix "SRC".
# The explicit `== False` comparison also drops rows with a missing protein_id.
# .copy() yields an independent frame so the column assignment below does not
# write into a slice of the original frame (avoids SettingWithCopyWarning).
epi = epi[epi.protein_id.str.startswith("SRC") == False].copy()
# Remove the "GI:" prefix from the GIs provided by IEDB
epi["protein_id"] = epi.protein_id.str.replace("GI:", "")
# Keep only epitopes that are 8 to 12 amino acids long (inclusive), and only
# the columns needed downstream
epitope_length = epi.epitope_sequence.str.len()
iedb_epitopes = epi.loc[(epitope_length >= 8) & (epitope_length <= 12),
                        ["epitope_sequence", "protein_id", "start", "end"]]
# Create a file with a list of unique protein (antigen) IDs for use with BLAST
antigen_ids = iedb_epitopes["protein_id"].unique()
np.savetxt(data_dir + "protein_ids.txt", antigen_ids, fmt="%s")

num_iedb_epitopes = iedb_epitopes.shape[0]
num_iedb_proteins = antigen_ids.shape[0]

# Single parenthesized argument: prints identically under Python 2 and Python 3
print("There are %d IEDB MHC-1 epitopes from %d unique proteins (antigens)." % (num_iedb_epitopes, num_iedb_proteins))

  if self.run_code(code, result):


There are 88793 IEDB MHC-1 epitopes from 31375 unique proteins (antigens).


## Query the Swiss-Prot BLAST database to retrieve the protein sequences

In [2]:
from itertools import groupby

# https://www.biostars.org/p/710/
def fasta_iter(fasta_name):
    """
    Given the path to a FASTA file, yield (header, sequence) tuples.

    The header is the first header line of each record with the leading ">"
    and surrounding whitespace removed. The sequence is all following
    non-header lines, stripped and joined into a single string.

    A header at the end of the file that has no sequence lines after it is
    not yielded. The file is closed once the generator is exhausted or closed.
    """
    with open(fasta_name) as fh:
        # groupby alternates between runs of header lines and runs of
        # sequence lines; ditch the boolean key and keep only the runs.
        groups = (group for _, group in groupby(fh, lambda line: line[0] == ">"))
        for header_lines in groups:
            # drop the ">" (must be read before advancing `groups`, which
            # invalidates the current run)
            header = next(header_lines)[1:].strip()
            try:
                seq_lines = next(groups)
            except StopIteration:
                # Trailing header without a sequence: end the generator
                # cleanly instead of leaking StopIteration (PEP 479).
                return
            # join all sequence lines to one.
            yield header, "".join(s.strip() for s in seq_lines)

In [3]:
print "Using BLAST to find the sequence for all the GIs"
blast = !blastdbcmd -db /workspace/blast/swiss/swissprot.00 -entry_batch data/epitope_protein/protein_ids.txt \
                -out data/epitope_protein/proteins.fasta
print "BLAST Finished.\nProcessing data..."
data = []
for (header, seq) in fasta_iter(data_dir + "proteins.fasta"):
    header_fields = header.split('|')
    assert(header_fields[0] == 'gi')
    gi = header_fields[1]
    data.append((gi, seq))

swiss_prot_db = pd.DataFrame(data, columns=['protein_id', 'protein_sequence'])
num_swiss_proteins = swiss_prot_db.shape[0]
print "Found %d of the %d IEDB proteins in the SWISS protein database" % (num_swiss_proteins, num_iedb_proteins)
print "Done."

Using BLAST to find the sequence for all the GIs
BLAST Finished.
Processing data...
Found 9968 of the 31375 IEDB proteins in the SWISS protein database
Done.


## Merge the IEDB epitope and SWISS protein data

In [4]:
# Join the IEDB epitopes onto the SWISS protein sequences. The right join keeps
# only epitopes whose protein_id is present in swiss_prot_db; dropna() then
# removes any row that still has a missing value. Since we're going to sort on
# position, "start" and "end" are converted to numeric columns.
iedb_swiss_merged = (
    pd.merge(iedb_epitopes, swiss_prot_db, on="protein_id", how="right")
    .dropna()
    .assign(
        start=lambda merged: pd.to_numeric(merged["start"]),
        end=lambda merged: pd.to_numeric(merged["end"]),
    )
)

In [5]:
# Write one block per protein: the protein sequence on the first line, then
# each of its epitopes on its own line, indented so that it lines up under its
# 1-based "start" position in the protein sequence.
# The `with` block closes the file even if writing fails part-way through.
with open(data_dir + "epitope_protein_alignments.txt", "w") as alignments:
    # Iterating the groupby directly yields each protein's rows in sorted
    # protein_id order, so the output file is deterministic.
    for protein_id, epitope_rows in iedb_swiss_merged.groupby("protein_id"):
        # Grab the protein sequence from the first row (identical in every row
        # of the group, since it was merged in by protein_id)
        protein_sequence = epitope_rows["protein_sequence"].iloc[0]
        alignments.write(protein_sequence + "\n")
        sorted_epitopes = epitope_rows.sort_values(by="start")
        for epitope_sequence, start in sorted_epitopes[["epitope_sequence", "start"]].itertuples(index=False):
            # start is 1-based, so pad with (start - 1) spaces
            alignments.write(" " * (int(start) - 1))
            alignments.write(epitope_sequence + "\n")
        