## Download epitopes from IEDB.org

http://www.iedb.org/

Choose the following options on the search page:
* Epitope: Linear Epitopes
* Assay: Positive Assays Only, MHC Ligand Assays
* MHC Restriction: MHC Class I
* Host: Human

Hit the "search" button. Once the search is complete, hit the "Export Epitopes Results" button to get a CSV file.

In [1]:
import pandas as pd
import numpy as np

data_dir = "data/epitope_protein/"

# Load the CSV exported from IEDB.
# read_csv(index_col=0, parse_dates=True) is the documented replacement for the
# deprecated DataFrame.from_csv (removed in pandas 1.0) and works on old pandas too.
# The [2:] slice drops the first two parsed rows.
# NOTE(review): the original comment said "skipping the first line" although the
# slice removes two rows -- confirm against the header layout of the export.
epi = pd.read_csv(data_dir + "epitope_table_export_1477949116.csv",
                  index_col=0, parse_dates=True)[2:]
# Rename the columns to remove whitespace
epi.columns = ['type', 'epitope_sequence', 'start', 'end', 'chebi', 'syn',
               'antigen', 'antigenid', 'organism', 'oid', 'comments']
# Remove GI entries that start with prefix "SRC".
# `== False` (rather than `~`) is kept on purpose: rows with a missing antigen ID,
# for which startswith yields NaN, are dropped as well.
# .copy() makes the filtered frame independent, so the column assignment below
# does not raise SettingWithCopyWarning.
epi = epi[epi.antigenid.str.startswith("SRC") == False].copy()
# Remove the "GI:" prefix from the GIs provided by IEDB
epi["antigenid"] = epi.antigenid.str.replace("GI:", "")
# Keep only epitopes that are [8-12] amino acids long (length computed once)
seq_len = epi.epitope_sequence.str.len()
id_seq_antigen = epi.loc[(seq_len >= 8) & (seq_len <= 12),
                         ["epitope_sequence", "antigenid"]]
# Save the data to a CSV file
id_seq_antigen.to_csv(data_dir + "epitope_sequence_antigen.csv")
# Create a file with a list of unique antigen IDs (one per line)
antigen_ids = id_seq_antigen["antigenid"].unique()
np.savetxt(data_dir + "antigen_ids.txt", antigen_ids, fmt="%s")

  if self.run_code(code, result):


In [2]:
from itertools import groupby

# https://www.biostars.org/p/710/
def fasta_iter(fasta_name):
    """
    Given the path to a FASTA file, yield (header, sequence) tuples.

    The header is the first line of each run of ">" lines, with the leading
    ">" and surrounding whitespace removed. The sequence is all following
    non-header lines, stripped and joined into one string.

    The file is expected to start with a header line. A trailing header
    that has no sequence lines after it is not yielded.

    The file is closed when the generator is exhausted or closed.
    """
    with open(fasta_name) as fh:
        # ditch the boolean (x[0]) and just keep the header or sequence since
        # we know they alternate.
        faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
        for header in faiter:
            # drop the ">" (builtin next() works on Python 2.6+ and 3)
            header = next(header)[1:].strip()
            # the group after a header group holds its sequence lines
            seq_lines = next(faiter, None)
            if seq_lines is None:
                # header at end of file with no sequence: stop cleanly instead
                # of letting StopIteration escape the generator
                return
            # join all sequence lines to one.
            seq = "".join(s.strip() for s in seq_lines)
            yield header, seq

In [3]:
print "Using BLAST to find the sequence for all the GIs"
blast = !blastdbcmd -db /workspace/blast/swiss/swissprot.00 -entry_batch data/epitope_protein/antigen_ids.txt \
                -out data/epitope_protein/antigens.fasta
print "BLAST Finished.\nProcessing data..."
data = []
for (header, seq) in fasta_iter(data_dir + "antigens.fasta"):
    header_fields = header.split('|')
    assert(header_fields[0] == 'gi')
    gi = header_fields[1]
    data.append((gi, seq))

gi_seq = pd.DataFrame(data, columns=['antigenid', 'antigen_sequence'])
print "Done."

Using BLAST to find the sequence for all the GIs
BLAST Finished.
Processing data...
Done.


In [8]:
# Merge the epitope table from IEDB with the antigen sequences from Swiss-Prot.
# how="left" keeps every epitope row; an epitope whose antigen ID has no match
# in gi_seq gets a missing antigen_sequence.
merged = pd.merge(id_seq_antigen, gi_seq, on="antigenid", how="left")
# Save the merged table to a CSV file
merged.to_csv(data_dir + "epitope_protein_sequences.csv")