In [1]:
import pandas as pd

# Input spreadsheet of T-cell epitope records, and the CSV path derived from it
# that receives the corrected table later in the notebook.
input_name = "2021-01-06-published-sars-cov2-tcell-epitopes.xlsx"
output_name = input_name.replace(".xlsx", "-corrected-protein-names.csv")
df = pd.read_excel(input_name)

In [2]:
# Number of rows read from the spreadsheet.
print(df.shape[0])

973


In [3]:
# Row count for each distinct value of the Species column.
df["Species"].value_counts()

Human    958
Mouse     15
Name: Species, dtype: int64

In [4]:
# Parser state for the FASTA file below:
#   orf1ab_nsp_to_seq  record ID -> full sequence string
#   last_id            ID of the record currently being read (None before the first header)
#   buffer             sequence lines collected so far for that record
orf1ab_nsp_to_seq = {}
last_id = None
buffer = []


def add_last_seq_and_clear_buffer():
    """Flush the buffered lines into ``orf1ab_nsp_to_seq`` under ``last_id``.

    Does nothing when no record is open and the buffer is empty.

    Raises:
        ValueError: if sequence lines were buffered before any header was seen.
    """
    if not last_id:
        if buffer:
            raise ValueError("Unexpected data in buffer")
        return
    orf1ab_nsp_to_seq[last_id] = "".join(buffer)
    buffer.clear()


with open("orf1ab-non-structural-proteins.fasta") as fasta_file:
    for raw_line in fasta_file:
        if raw_line.startswith(">"):
            # A new header closes the previous record; the record ID is the
            # second whitespace-separated token of the header line.
            add_last_seq_and_clear_buffer()
            last_id = raw_line.split()[1]
            continue
        residues = raw_line.strip()
        if residues:
            buffer.append(residues)
    # Flush the final record, which has no following header to trigger it.
    add_last_seq_and_clear_buffer()

orf1ab_nsp_to_len = {name: len(seq) for name, seq in orf1ab_nsp_to_seq.items()}
orf1ab_nsp_to_len

{'nsp1': 180,
 'nsp2': 638,
 'nsp3': 1945,
 'nsp4': 500,
 'nsp5': 306,
 'nsp6': 290,
 'nsp7': 83,
 'nsp8': 198,
 'nsp9': 113,
 'nsp10': 139,
 'nsp12': 932,
 'nsp13': 601,
 'nsp14': 527,
 'nsp15': 346,
 'nsp16': 298}

In [5]:
# Row count for each distinct value of the Protein column, before any correction.
df["Protein"].value_counts()

S         355
nsp3      163
N         127
M          87
ORF3a      55
nsp12      55
nsp4       45
ORF1ab     38
ORF8       20
nsp16      11
nsp6        5
nsp13       5
ORF7a       3
ORF3        2
ORF1a       1
nsp7        1
Name: Protein, dtype: int64

In [6]:
# Show the rows whose Protein is exactly "ORF1a".
df.loc[df["Protein"] == "ORF1a"]

Unnamed: 0,Source,Species,Exposure,Assay,Protein,Start,End,Length,# Responding,# Total,% Responding,Peptide,MHC,Peptide Selection Method,Allele Attribution,Response type,T-cell Stimulation,Notes
111,Schulien,Human,Infection,pMHC Multimer,ORF1a,2332,2340,9,10.0,11.0,0.909091,ILFTRFFYV,A*02:01,Predicted Binding Affinity,Biological,CD8,,


In [7]:
# Sum of the lengths of every sequence loaded from the FASTA file.
total_ORF1ab_amino_acids = sum(orf1ab_nsp_to_len.values())

In [8]:
# Display the summed length of all loaded nsp sequences.
total_ORF1ab_amino_acids

7096

In [9]:
# nsp1 .. nsp16 in numeric order, leaving out nsp11.
nsps_in_order = ["nsp%d" % number for number in range(1, 17) if number != 11]

In [10]:
# Concatenate the individual nsp sequences, in order, into one reference string.
ORF1ab_seq = "".join(orf1ab_nsp_to_seq[name] for name in nsps_in_order)

In [11]:
# NOTE(review): this loop only binds start/end for each row whose Protein starts
# with "ORF1a" and does nothing else with them — looks like leftover scratch
# work; confirm and delete.
for _, row in df[df["Protein"].str.startswith("ORF1a")].iterrows():
    start, end = row.Start, row.End

    

In [12]:
# Length of the concatenated nsp reference sequence.
ORF1ab_len = len(ORF1ab_seq)

In [13]:
# Two lookup tables over the concatenated reference sequence:
#   base0_coord_to_nsp_name[i]   name of the nsp covering 0-based position i
#   amino_acids_before_nsp[name] number of residues that precede that nsp
base0_coord_to_nsp_name = [None] * ORF1ab_len
amino_acids_before_nsp = {}
last_pos = 0
for nsp_name in nsps_in_order:
    n = orf1ab_nsp_to_len[nsp_name]
    amino_acids_before_nsp[nsp_name] = last_pos
    # Label the whole stretch occupied by this nsp in one slice assignment.
    base0_coord_to_nsp_name[last_pos:last_pos + n] = [nsp_name] * n
    last_pos += n

In [14]:
def drop_invisible_chars(s):
    """Return ``s`` with surrounding whitespace and all non-ASCII characters removed.

    Whenever characters are dropped, the conversion is reported on stdout with
    the before/after values and lengths.

    Args:
        s: the string to clean.

    Returns:
        The stripped string containing only code points 0..127.
    """
    s = s.strip()
    # strip out weird unicode spaces
    # ASCII covers code points 0..127 inclusive, so everything from U+0080
    # upwards is dropped.
    s_without_unicode = "".join(c for c in s if ord(c) < 128)
    if s != s_without_unicode:
        print("Converted %s (%d chars) => %s (%d chars)" % (
            s, len(s),
            s_without_unicode, len(s_without_unicode)))
    return s_without_unicode

def convert_string_to_number(s):
    """Coerce a spreadsheet cell value to a Python ``int``.

    Args:
        s: an integer, a float, or a string. Numeric values may be Python
            built-ins or NumPy/pandas scalars such as ``numpy.int64``.
            Strings are cleaned with ``drop_invisible_chars`` before parsing.

    Returns:
        The value as a built-in ``int``. Floats are truncated by ``int()``.

    Raises:
        TypeError: if ``s`` is neither a string nor a real number (booleans
            are rejected too, since they are never valid coordinates).
        ValueError: if a string does not parse as an integer, or a float is NaN.
        OverflowError: if a float is infinite.
    """
    # Local import: the notebook has no shared import cell this helper could rely on.
    import numbers

    if isinstance(s, str):
        return int(drop_invisible_chars(s))
    # isinstance + numbers.Real accepts NumPy scalars, which an exact
    # `type(s) is int` / `type(s) is float` comparison rejects.
    if isinstance(s, bool) or not isinstance(s, numbers.Real):
        raise TypeError(
            "Expected int, float or str but got %s: %r" % (type(s).__name__, s))
    return int(s)
        
# Normalize every record and re-express ORF1a/ORF1ab coordinates (or nsp
# coordinates that turn out to be ORF1ab coordinates) relative to the
# individual nsp they fall in. Coordinates are treated as 1-based and
# inclusive throughout, hence the [start - 1:end] slices.
rows = []
for row_num, row in df.iterrows():
    # Work on a private copy from the start: pandas documents that the object
    # yielded by iterrows() should not be modified in place.
    row = row.copy()

    original_start = row.Start
    original_end = row.End
    original_protein = row.Protein
    start = convert_string_to_number(original_start)
    end = convert_string_to_number(original_end)
    protein = drop_invisible_chars(original_protein)
    row["Start"] = start
    row["End"] = end
    row["Protein"] = protein
    row["Original Location"] = "%s %s-%d" % (protein, start, end)

    if protein.startswith("nsp"):
        max_len = orf1ab_nsp_to_len[protein]
        # A peptide may end exactly on the last residue (end == max_len), so
        # only positions strictly beyond the protein length are out of range.
        if start > max_len or end > max_len:
            print("Row #%d has %s %d-%d but %s len = %d" % (
                    row_num + 1,
                    protein,
                    start,
                    end,
                    protein,
                    max_len
            ))
            # Fall through to the ORF1ab branch to try these as ORF1ab coordinates.
            protein = "ORF1ab"
        elif orf1ab_nsp_to_seq[protein][start - 1:end] != row.Peptide:
            print("Row #%d has wrong peptide ('%s') for %s %d-%d" % (
                    row_num + 1,
                    row.Peptide,
                    protein,
                    start,
                    end,
            ))
            protein = "ORF1ab"

    if protein.startswith("ORF1a"):
        # make sure peptide matches ORF1ab reference if it has ORF1ab coordinates
        # (checked before the table lookups so that bad coordinates produce this
        # descriptive error rather than an IndexError)
        slice_peptide = ORF1ab_seq[start - 1:end]
        if row.Peptide != slice_peptide:
            raise ValueError("Wrong entry row #%d from %s (%s %d-%d), expected '%s' but got '%s'" % (
                row_num + 1,
                row.Source,
                protein,
                start,
                end,
                slice_peptide,
                row.Peptide))

        nsp_name_of_start = base0_coord_to_nsp_name[start - 1]
        nsp_name_of_end = base0_coord_to_nsp_name[end - 1]

        row["Protein"] = nsp_name_of_start
        row["Start"] = start - amino_acids_before_nsp[nsp_name_of_start]
        row["End"] = end - amino_acids_before_nsp[nsp_name_of_start]

        if nsp_name_of_end != nsp_name_of_start:
            # The peptide crosses a boundary between two nsps. It is still
            # assigned to the nsp containing its start, so its End lies past
            # that protein's last residue; report it instead of hiding it.
            print("Row #%d (%s %d-%d) spans %s and %s, assigned to %s %d-%d" % (
                    row_num + 1,
                    protein,
                    start,
                    end,
                    nsp_name_of_start,
                    nsp_name_of_end,
                    row["Protein"],
                    row["Start"],
                    row["End"],
            ))
        else:
            # Track the remapped name so the check below also covers rows
            # that were just remapped, not only rows that were already correct.
            protein = nsp_name_of_start

    # make sure that peptide matches reference nsp proteins after remapping of coordinates
    if protein.startswith("nsp"):
        nsp_seq = orf1ab_nsp_to_seq[protein]
        slice_peptide = nsp_seq[row.Start - 1:row.End]
        peptide = row.Peptide
        if peptide != slice_peptide:
            raise ValueError("Wrong entry after mapping in row #%d from %s (%s %d-%d, originally %s), expected '%s' but got '%s'" % (
                row_num + 1,
                row.Source,
                row.Protein,
                row.Start,
                row.End,
                row["Original Location"],
                slice_peptide,
                row.Peptide,
                ))
    rows.append(row)

print("Processed %d records" % (len(rows),))

df_corrected = pd.DataFrame.from_records(rows)

# this column is incorrectly sometimes between [0,1] and other times between [0, 100]
# so recompute it uniformly as a fraction from the two count columns
df_corrected["% Responding"] = df_corrected["# Responding"] / df_corrected["# Total"]


Converted ​269 (4 chars) => 269 (3 chars)
Converted ​919 (4 chars) => 919 (3 chars)
Row #164 has wrong peptide ('HTTDPSFLGRY') for nsp3 1636-1646
Row #165 has wrong peptide ('TTDPSFLGRYM') for nsp3 1637-1647
Row #174 has wrong peptide ('ITFDNLKTL') for nsp3 1551-1559
Row #180 has wrong peptide ('TDNYITTY') for nsp3 1322-1329
Row #185 has nsp4 2898-2905 but nsp4 len = 500
Row #186 has nsp4 3136-3144 but nsp4 len = 500
Row #202 has nsp12 4586-4594 but nsp12 len = 932
Row #203 has nsp12 5220-5229 but nsp12 len = 932
Row #228 has nsp4 3155-3163 but nsp4 len = 500
Row #229 has nsp16 6875-6886 but nsp16 len = 298
Row #232 has nsp16 6913-6922 but nsp16 len = 298
Row #234 has nsp4 3199-3207 but nsp4 len = 500
Row #240 has nsp4 2788-2795 but nsp4 len = 500
Row #241 has nsp3 2017-2028 but nsp3 len = 1945
Row #242 has nsp12 4645-4657 but nsp12 len = 932
Row #253 has wrong peptide ('TTDPSFLGRY') for nsp3 1637-1646
Row #255 has nsp12 4892-4901 but nsp12 len = 932
Row #257 has nsp3 2495-2504 but nsp

In [15]:
# Write the corrected table to the output CSV, omitting the DataFrame index.
df_corrected.to_csv(output_name, index=False)

In [16]:
# Row count for each distinct Protein value in the corrected table.
df_corrected["Protein"].value_counts()

S        355
nsp3     177
N        127
M         87
nsp12     58
ORF3a     55
nsp4      47
ORF8      20
nsp16     11
nsp13     10
nsp6       6
nsp2       4
nsp9       4
nsp8       3
ORF7a      3
nsp7       2
ORF3       2
nsp14      1
nsp5       1
Name: Protein, dtype: int64

In [17]:
# Bucket each record by protein and by the start position rounded down to a
# multiple of window_size, then list the 20 most populated buckets.
window_size = 10
start_positions = df_corrected["Start"].map(int)
rounded_start = (start_positions / window_size).map(int) * window_size
window_labels = df_corrected["Protein"] + "-" + rounded_start.map(str)
window_labels.value_counts().head(20)

N-320        14
N-100        13
S-1200       11
M-170        10
N-360         9
ORF3a-200     8
N-310         8
S-260         8
S-710         8
nsp3-810      8
S-1050        8
S-860         8
S-680         7
M-130         7
S-160         7
S-440         7
N-300         7
S-30          7
S-370         6
nsp3-1500     6
dtype: int64

In [18]:
# Row count per Source among records whose Species is "Human". The mask is built
# from df_corrected itself, so the filter does not rely on df and df_corrected
# happening to share the same index.
df_corrected.loc[df_corrected["Species"] == "Human", "Source"].value_counts()

Tarke          803
Peng            46
Schulien        36
Ferretti        29
Nelde           17
Shomuradova     13
Le Bert          8
Chour            2
Snyder           2
Minervina        2
Name: Source, dtype: int64