In [5]:
from collections import OrderedDict

# Column names, in the order the fields appear on each catalog line.
cols = [
    "cell-line", "gene", "chr", "pos", "mut", "allele",
    "mut_aff", "mut_pr", "mut_seq",
    "wt_aff", "wt_pr", "wt_seq",
]

# One (initially empty) list of values per column, keyed in column order.
data = OrderedDict((name, []) for name in cols)

with open("neoepitope_catalog_20151210.csv") as f:
    for line_number, raw_line in enumerate(f, start=1):
        # Lines are split on bare commas, so the comma in front of
        # "CloneE61" is swapped for a dash first; the newline is dropped too.
        line = raw_line.replace(",CloneE61", "-CloneE61").replace("\n", "")
        parts = line.split(",")
        n_fields = len(parts)
        if n_fields not in (12, 13):
            # Unexpected field count: report the offending line and stop
            # reading; rows collected so far are kept.
            print("Bad data '%s' with %d parts on line %d"
                  % (line, n_fields, line_number))
            break
        if n_fields == 13:
            # position is sometimes doubled: discard the extra field (index 4)
            del parts[4]
        for col, value in zip(cols, parts):
            data[col].append(value)

# Every value is kept as the raw string read from the file.
df = pd.DataFrame(data)

In [6]:
# Number of rows in the parsed catalog frame.
len(df)

1514091

In [7]:
# Display the "wt_seq" column (presumably the wild-type peptide sequences;
# confirm against the catalog's documentation).
df.wt_seq

0          LKSYSGDVTAVEQPI
1          SSWAAQETCYQLRYT
2          SWAAQETCYQLRYTG
3          WAAQETCYQLRYTGE
4          AAQETCYQLRYTGEG
5          AQETCYQLRYTGEGH
6          QETCYQLRYTGEGHQ
7          ETCYQLRYTGEGHQD
8          TCYQLRYTGEGHQDW
9          CYQLRYTGEGHQDWK
10         YQLRYTGEGHQDWKV
11         QLRYTGEGHQDWKVL
12         LRYTGEGHQDWKVLE
13         RYTGEGHQDWKVLEP
14         YTGEGHQDWKVLEPP
15         TGEGHQDWKVLEPPL
16         SSWAAQETCYQLRYT
17         SWAAQETCYQLRYTG
18         WAAQETCYQLRYTGE
19         AAQETCYQLRYTGEG
20         AQETCYQLRYTGEGH
21         QETCYQLRYTGEGHQ
22         ETCYQLRYTGEGHQD
23         TCYQLRYTGEGHQDW
24         CYQLRYTGEGHQDWK
25         YQLRYTGEGHQDWKV
26         QLRYTGEGHQDWKVL
27         LRYTGEGHQDWKVLE
28         RYTGEGHQDWKVLEP
29         YTGEGHQDWKVLEPP
                ...       
1514061        LLRQRLEREAR
1514062           IRMEATRV
1514063         SYKSITTDDW
1514064           SPLPRGII
1514065          KAFCLEALK
1514066           ILMPLLKK
1

In [8]:
# Write one row per cell line listing its distinct alleles whose name starts
# with "HLA-", sorted and joined with ";".
with open("cell-line-alleles.csv", "w") as f:
    f.write("cell-line,alleles\n")
    for cell_line, group in df.groupby("cell-line"):
        hla_alleles = sorted(
            name for name in group["allele"].unique()
            if name.startswith("HLA-")
        )
        # Emit the row pieces in order: name, separator, allele list, newline.
        f.writelines((cell_line, ",", ";".join(hla_alleles), "\n"))
        

In [9]:
# Save the parsed frame as CSV; index=False leaves the row index out of the file.
df.to_csv("fixed-neoepitope-catalog.csv", index=False)

In [10]:
# Print the contents of cell-line-alleles.csv via the shell for a visual check.
!cat cell-line-alleles.csv

cell-line,alleles
143B,HLA-A*02:11;HLA-B*52:01;HLA-C*12:02
2313287,HLA-A*25:01;HLA-B*18:01;HLA-C*12:03
253J,HLA-A*01:01;HLA-A*03:01;HLA-B*08:01;HLA-B*51:01;HLA-C*07:01;HLA-C*12:03
253JBV,HLA-A*01:01;HLA-A*03:01;HLA-B*08:01;HLA-B*51:01;HLA-C*07:06;HLA-C*12:03
42MGBA,HLA-A*33:03;HLA-B*39:01;HLA-C*12:03
501A,HLA-A*02:01;HLA-A*03:01;HLA-B*07:02;HLA-B*14:02;HLA-C*07:02;HLA-C*08:02
537MEL,HLA-A*01:01;HLA-A*26:01;HLA-B*15:32;HLA-B*44:02;HLA-C*01:02;HLA-C*05:01
59M,HLA-A*03:01;HLA-A*11:01;HLA-B*35:01;HLA-B*44:02;HLA-C*04:01;HLA-C*05:01
624mel,HLA-A*02:01;HLA-A*03:01;HLA-B*07:02;HLA-B*14:02;HLA-C*07:02;HLA-C*08:02
639V,HLA-A*01:01;HLA-A*02:01;HLA-B*44:02;HLA-B*57:01;HLA-C*05:01;HLA-C*06:02
647V,HLA-A*02:01;HLA-B*27:05;HLA-B*40:01;HLA-C*02:02;HLA-C*03:04
697,HLA-A*02:01;HLA-A*25:01;HLA-B*07:02;HLA-B*15:01
769P,HLA-A*03:01;HLA-A*24:02;HLA-B*07:02;HLA-C*07:02
786O,HLA-A*03:01;HLA-B*07:02;HLA-B*44:02;HLA-C*05:01;HLA-C*07:02
8305C,HLA-A*11:01;HLA-B*15:01;HLA-B*54:01;HLA-C*01:02;HLA-C*