In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

In [3]:
# Load the STCRDab summary export (tab-separated) and report its (rows, columns) shape.
export = pd.read_csv('../data/2023-02-27_tcrdb_summary_all.tsv', sep = '\t')
print(export.shape)

(918, 39)


In [4]:
# Spot-check the export row(s) for PDB 3pl6 before any filtering is applied.
export.loc[export['pdb'] == '3pl6']

Unnamed: 0,pdb,Bchain,Achain,Dchain,Gchain,TCRtype,model,antigen_chain,antigen_type,antigen_name,...,authors,resolution,method,r_free,r_factor,affinity,affinity_method,affinity_temperature,affinity_pmid,engineered
239,3pl6,D,C,,,abTCR,0,,,,...,"Sethi, D.K., Wucherpfennig, K.W.",2.55,X-RAY DIFFRACTION,0.258,0.232,,,,,True


In [5]:
# Stage 1: discard gamma-delta receptors.
no_gamma_delta = export.loc[export.TCRtype != 'gdTCR']
print(no_gamma_delta.shape)

# Stage 2: keep only entries where both the alpha and the beta chain are annotated.
paired_alpha_beta = no_gamma_delta.dropna(subset=['Bchain', 'Achain'], how='any')
print(paired_alpha_beta.shape)

# Stage 3: the gamma/delta-specific columns carry no information any more.
gamma_delta_columns = ['Dchain', 'Gchain', 'gamma_subgroup', 'delta_subgroup', 'gamma_organism', 'delta_organism']
alpha_beta_only = paired_alpha_beta.drop(columns=gamma_delta_columns).reset_index(drop=True)
print(alpha_beta_only.shape)

# Stage 4: keep MHC class I, MHC class II or unannotated entries (missing mhc_type is kept,
# because a missing value is not one of the excluded labels).
excluded_mhc_types = ['CD1', 'GA', 'GB', 'MR1']
df4 = alpha_beta_only.loc[~alpha_beta_only.mhc_type.isin(excluded_mhc_types)]
df4.to_csv('../data/tcrdb_export_cleaned.csv')

(900, 39)
(804, 39)
(804, 33)


In [6]:
# Read the VDJdb export and expand its JSON-encoded `Meta` column into regular columns.
info = pd.read_csv('../data/vdj-export-03022023.tsv', sep='\t')

# One parsed Meta dictionary per row label of `info`.
meta_by_row = {row_label: json.loads(info.Meta.loc[row_label]) for row_label in info.index.values}
df = pd.DataFrame.from_dict(meta_by_row, orient='index')

# Attach the expanded Meta columns to the original rows (aligned on the row index).
merged = info.merge(df, left_index=True, right_index=True)

# Keep only the records whose 'structure.id' is not the empty string.
structures = merged.loc[merged['structure.id'].ne('')]
structures.to_csv('../data/vdjdb_structure_info.csv')

- The chunk below cleans up the STCRDab export .tsv so that only one chain combination is kept per structure.
- It also extracts the sequences for each of your structures.
- It uses custom functions, which it looks for in a functions/ folder within the repo.

In [7]:
from Bio import PDB
import os
# The project helpers are imported as the `functions` package, which is looked up relative
# to the working directory. Only step up one level when that folder is not already visible,
# so that re-running this cell does not keep climbing the directory tree.
if not os.path.isdir('functions'):
    os.chdir('..')
import functions.myfunctions as mf
import functions.sequencefunctions as sf
import functions.structurefunctions as stf
from datetime import date
import pymol

In [8]:
# Folder used as PyMOL's `fetch_path` (where fetched structures are stored).
structures_dir1 = 'data/structures/structures_from_pdb/'
# Folder holding the files read as `<pdb>.pdb` with Biopython in the per-PDB loop.
structures_dir = 'data/structures/stcrdab_all_structures/imgt/'
# Folder passed to stf.get_missing_residues_from_header to look up missing residues.
structures_dir2 = 'data/structures/stcrdab_all_structures/raw/'

In [9]:
# Work on copies so the manual corrections below leave df4 / structures untouched.
datalist = df4.copy()
vdjdb_info = structures.copy()

In the chunk of code below, I use the information from vdjdb, pymol fetch and the stcrdab imgt-numbered structures to cross-reference the sequences and make sure I am extracting the correct information. A few sequences are manually corrected, and a few have been dropped to make my life easier.

Note: my handling of epitope checks is slightly different from my handling of TCR chain checks, so when choosing the best pdb the epitope does not play a part.

In [10]:
## a little manual dataset cleanup
# please note that some of these could probably be kept (especially the ones where the
# epitope is not in the imgt file), with some substantial rewriting of code

def _row_labels(pdb_ids, achain=None):
    """Return the `datalist` index labels of the rows for the given PDB id(s).

    pdb_ids: a single PDB id (str) or a list of PDB ids.
    achain:  when given, only rows whose Achain equals this value are returned.
    """
    if isinstance(pdb_ids, str):
        selected = datalist.pdb == pdb_ids
    else:
        selected = datalist.pdb.isin(pdb_ids)
    if achain is not None:
        selected = selected & (datalist.Achain == achain)
    return datalist.loc[selected].index.values.tolist()


# Individual chain pairings that are excluded (the other pairings of the same pdb may stay).
idx = _row_labels('5yxu', achain='F')
idx1 = _row_labels('6rp9', achain='K')
idx5 = _row_labels('2wbj', achain='G')
idx6 = _row_labels('4qrp', achain='K')

# Whole structures excluded for a structure-specific reason.
idx2 = _row_labels('4c56')
idx3 = _row_labels('4ei6')   # weird, one chain is missing one res (can't figure out why)
idx4 = _row_labels('2p1y')   # singlechain, but chains are weird in datalist
idx7 = _row_labels('3d39')   # complexed with an epitope that has a fluoride attached
idx8 = _row_labels('6bga')   # it has a loaded peptide and a velcro peptide
idx9 = _row_labels('7sg2')   # vdjdb and structure epitope seq don't correspond for any obvious reason
idx11 = _row_labels('2xn9')  # with enterotoxin
idx25 = _row_labels(['4y19', '4y1a'])          # reversed polarity
idx26 = _row_labels(['5xos','5xot', '5xov'])   # Vdelta with Vbeta
idx39 = _row_labels('6zky')  # added cysteine trap
idx40 = _row_labels('6zkz')  # added cysteine trap
# NOTE(review): '2nx9' looks like a transposition of '2xn9' (already covered by idx11) — confirm.
idx41 = _row_labels(['2xna', '2nx9', '4udu', '5fk9', '5fka'])  # with staph enterotoxin
idx42 = _row_labels(['4iiq', '4lcc', '4l8s'])  # MAIT TCRs with MR1
idx43 = _row_labels('2icw')  # TCR with superantigen
idx44 = _row_labels(['6v13', '6v15', '6v18', '6v19', '6v1a', '6v0y'])  # citrulline in peptide
idx45 = _row_labels('3d3v')  # modified residue in peptide - please note vdjdb entry for this is wrong!

# Structures excluded because the epitope cannot be found in the imgt-numbered file.
idx10 = _row_labels('2wbj')
idx12 = _row_labels('3c5z')
idx13 = _row_labels('3c60')
idx14 = _row_labels('3c6l')
idx15 = _row_labels('3o6f')
idx16 = _row_labels('3pl6')
idx17 = _row_labels('3rdt')
idx18 = _row_labels('3t0e')
idx19 = _row_labels('4grl')
idx20 = _row_labels('4may')
idx21 = _row_labels('4p23')
idx22 = _row_labels('4p4k')
idx23 = _row_labels('4p46')
idx24 = _row_labels('4p5t')
idx27 = _row_labels('6dfs')
idx28 = _row_labels('6dfw')
idx29 = _row_labels('6dfx')
idx30 = _row_labels('6mkd')
idx31 = _row_labels('6mkr')
idx32 = _row_labels('6mng')
idx33 = _row_labels('6mnm')
idx34 = _row_labels('6mnn')
idx35 = _row_labels('6mno')
idx36 = _row_labels('6xc9')
idx37 = _row_labels('6xco')
idx38 = _row_labels('6xcp')


# Manual antigen-chain corrections: every row of the given pdb receives the same chain id.
# NOTE(review): 3pl6 is also listed in idx16 above, so its correction only matters if
# idx16 is not dropped afterwards — confirm whether it is still needed.
antigen_chain_fixes = [
    ('6vma', 'C'),
    ('6vm7', 'C'),
    ('6vm8', 'C'),
    ('6vmc', 'C'),
    ('3pl6', 'D'),
    ('7l1d', 'C'),
    ('7rrg', 'C'),
]
for pdb_id, antigen_chain_id in antigen_chain_fixes:
    datalist.loc[datalist['pdb'] == pdb_id, 'antigen_chain'] = antigen_chain_id

# 6p64: one value per matching row, in row order. The assignment only works while the number
# of 6p64 rows equals the number of values listed here.
datalist.loc[datalist['pdb'] == '6p64', 'antigen_chain'] = ['C','H']

In [11]:
# 1kb5: blank the antigen chain annotation on the first row found for this pdb.
idx0 = datalist.loc[datalist.pdb == '1kb5'].index.values.tolist()
first_1kb5_label = idx0[0]
datalist.at[first_1kb5_label, 'antigen_chain'] = np.nan

# Gather every label group collected during the manual cleanup and drop them in one go.
excluded_groups = (
    idx, idx1, idx2, idx3, idx4, idx5, idx6, idx7, idx8, idx9, idx10, idx11,
    idx12, idx13, idx14, idx15, idx16, idx17, idx18, idx19, idx20, idx21, idx22, idx23, idx24,
    idx25, idx26, idx27, idx28, idx29, idx30, idx31, idx32, idx33, idx34, idx35, idx36,
    idx37, idx38, idx39, idx40, idx41, idx42, idx43, idx44, idx45,
)
excluded_labels = [label for group in excluded_groups for label in group]
datalist_clean = datalist.drop(index=excluded_labels)

In [12]:
# Biopython parser; used in the per-PDB loop to read the imgt-numbered structure files.
p = PDB.PDBParser()
# Set PyMOL's fetch_path to the local structures folder (quiet=0 echoes the new value).
pymol.cmd.set('fetch_path', structures_dir1, quiet=0)
# NOTE(review): presumably keeps the mmCIF data of fetched objects available so that
# stf.pymol_missing_residues can read the unobserved-residue records — confirm.
pymol.cmd.set('cif_keepinmemory')

 Setting: fetch_path set to ../data/structures_from_pdb/.


In [13]:
# Per-PDB validation loop.
#
# For every pdb id left in `datalist_clean` this cell:
#   1. collects the annotated alpha / beta / antigen chain ids (single-chain TCRs are skipped),
#   2. compares the missing-residue table from the file header with the one obtained via
#      PyMOL and discards chain pairings whose chains disagree,
#   3. takes the per-chain sequences from PyMOL and extracts the V-region / CDR3 with `sf`,
#   4. cross-checks CDR3 and epitope against vdjdb when the structure is listed there,
#   5. records, per (alpha, beta, antigen) chain combination, whether the sequence read from
#      the imgt-numbered file agrees with the PyMOL sequence (`pdbnotes`),
#   6. stores the V-region and epitope sequences per pdb (`seqs`).
#
# Results left in the notebook namespace: seqs, pdbnotes, rows_to_remove, singlechain,
# missing_residues_inAorB, no_vdjdb_val.

datalist_unique_pdbs = pd.DataFrame() # keep the most complete of each for avg (not filled in this cell)
seqs = {}
# removed_pdbs = []
rows_to_remove = []
singlechain = ['2p1y']
missing_residues_inAorB = []
no_vdjdb_val = []
pdbnotes = {}
i = 1
j = 1

# 'structure.id' is never modified inside the loop, so the lookup set is built once.
vdjdb_structure_ids = set(vdjdb_info['structure.id'])

for num, pdb in enumerate(sorted(set(datalist_clean.pdb))):

    # get information about pdb and handle special cases
    pdb_rows = datalist_clean.loc[datalist_clean.pdb == pdb]
    achain = list(pdb_rows['Achain'])
    bchain = list(pdb_rows['Bchain'])
    echain = list(pdb_rows['antigen_chain'])

    if pdb in ['4z7w', '2gj6']:
        # 4z7w has the ag encoded by two chains because it's a peptide + sugar (I think...)
        # only the first chain id before the '|' separator is kept
        echain = [x.split('|')[0].strip() for x in echain]

    if [x.upper() for x in achain] == [x.upper() for x in bchain]:
        print('Single chain TCR, ', pdb, ', total count: ', j)
        j += 1
        singlechain.append(pdb)
        continue

    chain_pairs = list(zip(achain, bchain, echain))

    s = pdb + '.pdb'
    print(num, pdb)

    pymol.cmd.fetch(pdb)

    # Missing residues: header-derived table ("manual") versus PyMOL-derived table ("automatic").
    manual = stf.get_missing_residues_from_header(structures_dir2, pdb)
    if manual is not None:
        print('checking out missing residues')
        manual.SSSEQI = manual.SSSEQI.astype('str')
        # automatic = stf.bioPDB_missing_residues(structures_dir2, pdb)
        automatic = stf.pymol_missing_residues(pdb)
        automatic.SSSEQI = automatic.SSSEQI.astype('str')
        if not manual.equals(automatic):
            print(pdb, ' - missing residues lists do not correspond')
            # chains (column C) of the header rows whose residue number differs from PyMOL's
            problematic_chains = set(manual.loc[~manual.SSSEQI.eq(automatic.SSSEQI)].C)
            print(manual)
            print(automatic)
            achain = [x for x in achain if x not in problematic_chains]
            bchain = [x for x in bchain if x not in problematic_chains]
            chain_pairs = [x for x in chain_pairs if x[0] in achain and x[1] in bchain]
            if achain and bchain and chain_pairs:
                print(pdb, ' has missing residues, but not in all A or B')
                print('allowed pairs: ', chain_pairs)
                affected = pdb_rows.Achain.isin(problematic_chains) | pdb_rows.Bchain.isin(problematic_chains)
                rows_to_remove += pdb_rows.loc[affected].index.values.tolist()
            else:
                print(pdb, ' has problematic missing residues, skipped')
                missing_residues_inAorB.append(pdb)
                # release the fetched object before moving on, as is done for processed structures
                pymol.cmd.delete(pdb)
                continue
    else:
        automatic = stf.pymol_missing_residues(pdb)
        # The header reported nothing, so PyMOL must report nothing either. An identity check
        # is used because comparing None with a DataFrame via == has no single truth value.
        assert automatic is None, 'ERROR: Pymol finds missing residues, ' + pdb

    ## expected sequence (pymol includes the missing residues)
    # if you only extract sequence from the renumbered pdb, it might miss unresolved residues

    pymolseqs = pymol.cmd.get_fastastr(pdb).split('>')
    pymol.cmd.delete(pdb)
    pymolseqs = [x.replace('\n', '') for x in pymolseqs if x!='']
    # each record is '<4-char pdb id>_<chain><sequence>': character 5 is the chain id
    pymolseqs_d = {x[5:6]:x[6:].strip('\n') for x in pymolseqs}

    if pdb == '6bj2':
        # alpha starts with PYROGLUTAMIC ACID
        pymolseqs_d['D'] = pymolseqs_d['D'].strip('?')
    if pdb == '6vm8':
        pymolseqs_d['D'] = pymolseqs_d['D'].strip('?') # remove CITRATE ANION
    #['6vm8', '5xot', '6bj2']:

    # renumber pymol sequences to find sequence of v region only

    pymol_imgt_a = {a:sf.get_vregion_seq(pymolseqs_d[a]) for a in achain}
    pymol_imgt_a_gaps = {a:sf.get_vregion_seq_with_gaps(pymolseqs_d[a]) for a in achain}
    pymol_imgt_b = {b:sf.get_vregion_seq(pymolseqs_d[b]) for b in bchain}
    pymol_imgt_b_gaps = {b:sf.get_vregion_seq_with_gaps(pymolseqs_d[b]) for b in bchain}

    # sanity check that all pymol sequences are the same across copies of a chain;
    # each assertion reports the dictionary it actually checked
    assert len(set(pymol_imgt_a.values())) == 1, pymol_imgt_a
    assert len(set(pymol_imgt_b.values())) == 1, pymol_imgt_b
    assert len(set(pymol_imgt_a_gaps.values())) == 1, pymol_imgt_a_gaps
    assert len(set(pymol_imgt_b_gaps.values())) == 1, pymol_imgt_b_gaps

    ## check cdr3s are what I expect them to be, since I can get that info from vdjdb

    in_vdjdb = pdb in vdjdb_structure_ids
    if in_vdjdb:
        cdr3a = vdjdb_info.loc[(vdjdb_info['structure.id'] == pdb) & (vdjdb_info.Gene == 'TRA'), 'CDR3'].tolist()
        cdr3b = vdjdb_info.loc[(vdjdb_info['structure.id'] == pdb) & (vdjdb_info.Gene == 'TRB'), 'CDR3'].tolist()

        # print(cdr3a)

        assert len(set(cdr3a)) == 1
        assert len(set(cdr3b)) == 1

        assert cdr3a[0] == sf.get_cdr3_seq(pymolseqs_d[achain[0]])
        assert cdr3b[0] == sf.get_cdr3_seq(pymolseqs_d[bchain[0]])
    else:
        print('cdr3 sequence cannot be validated as not in vdjdb')
        # recorded once per pdb; the epitope check below does not append it again
        no_vdjdb_val.append(pdb)

    ## look at imgt pdb file
    ## make a note of which chains have the correct amino acids in the v region
    struc = p.get_structure(pdb, structures_dir + s)
    sa = {a:sf.get_chain_seq(struc[0][a]) for a in achain}
    sb = {b:sf.get_chain_seq(struc[0][b]) for b in bchain}

    # handle epitope (less concerned about this)
    # pymol_e: epitope per antigen chain from the PyMOL sequence
    # se:      epitope per antigen chain from the imgt-numbered file
    # NOTE: str.strip(chars) removes any of the given characters from both ends; it does not
    # remove a literal prefix/suffix. The per-pdb tweaks below rely on that behaviour.

    pymol_e = {}
    se = {}

    for e in echain:
        if pd.isna(e):
            pymol_e[e] = ''
            se[e] = ''
        else:
            print('getting sequence from chain ', e)
            if pdb == '1ymm':
                # flexible linker sequence
                se[e] = sf.get_chain_seq(struc[0][e]).replace('RGGSGGGGG', '')
                pymol_e[e] = pymolseqs_d[e].strip('RGGSGGGGG')
            elif pdb == '1zgl':
                # flexible linker sequence
                se[e] = sf.get_chain_seq(struc[0][e]).strip('G')
                pymol_e[e] = pymolseqs_d[e].strip('G')
                if e == 'I':
                    se[e] = 'V' + se[e]
            elif pdb == '2pxy':
                # leftover leader sequence
                # NOTE(review): the original had a second, unreachable `elif pdb == '2pxy'`
                # branch reading chains 'C' / 'F' directly; if it was meant for another pdb
                # id it needs to be re-added under that id.
                se[e] = sf.get_chain_seq(struc[0][e]).strip('HS')
                pymol_e[e] = pymolseqs_d[e].strip('HS')
            elif pdb == '3mbe':
                # this has an extra residue which is not solved in the pdb sequence
                pymol_e[e] = pymolseqs_d[e].strip('N')
                se[e] = sf.get_chain_seq(struc[0][e])
            elif pdb == '4gg6':
                # the core sequence is binding P1-P9, which is the same across. Changing to pass test
                pymol_e[e] = pymolseqs_d[e].strip('QQYP') + 'P'
                se[e] = sf.get_chain_seq(struc[0][e])
            elif (pdb == '4ozg') and (e=='J'):
                pymol_e[e] = pymolseqs_d[e]
                se[e] = sf.get_chain_seq(struc[0][e]) + 'S' # final S missing in one of the two chains
            elif (pdb == '4ozh') and (e=='I'):
                pymol_e[e] = pymolseqs_d[e]
                se[e] = sf.get_chain_seq(struc[0][e]) + 'S' # final S missing in one of the two chains
                vdjdb_info.loc[(vdjdb_info['structure.id'] == pdb), 'Epitope'] = 'APQPELPYPQPGS' # final S missing in vdjdb
            elif (pdb == '4ozi') and (e=='J'):
                pymol_e[e] = pymolseqs_d[e]
                se[e] = sf.get_chain_seq(struc[0][e]) + 'GS' # final GS missing in one of the two chains
                vdjdb_info.loc[(vdjdb_info['structure.id'] == pdb), 'Epitope'] = 'QPFPQPELPYPGS' # final S missing in vdjdb
            elif pdb == '4p2o':
                # class II peptide, again the middle is the same and wobbly at the end
                pymol_e[e] = pymolseqs_d[e].strip('AD')
                se[e] = sf.get_chain_seq(struc[0][e])
            elif pdb == '4p2q':
                # class II peptide, again the middle is the same and wobbly at the end
                pymol_e[e] = pymolseqs_d[e]
                if e not in ['H', 'C']:
                    se[e] = sf.get_chain_seq(struc[0][e]) + 'G'
                else:
                    se[e] = sf.get_chain_seq(struc[0][e])
            elif pdb == '4z7u':
                # class II peptide, again the middle is the same and wobbly at the end
                vdjdb_info.loc[(vdjdb_info['structure.id'] == pdb), 'Epitope'] = 'APSGEGSFQPSQENPQ'
                pymol_e[e] = pymolseqs_d[e].strip('GS')
                se[e] = sf.get_chain_seq(struc[0][e])
                if e == 'I':
                    se[e] = 'A' + se[e]
            elif pdb == '4z7v':
                # class II peptide, again the middle is the same and wobbly at the end
                pymol_e[e] = pymolseqs_d[e][2:-3]
                se[e] = sf.get_chain_seq(struc[0][e])
            elif pdb == '4z7w':
                # class II peptide, again the middle is the same and wobbly at the end
                pymol_e[e] = pymolseqs_d[e][1:-2]
                se[e] = sf.get_chain_seq(struc[0][e])
            elif pdb == '6u3n':
                pymol_e[e] = pymolseqs_d[e].strip('GSGGSIEGR') # remove leftover linker seq
                se[e] = sf.get_chain_seq(struc[0][e])
            elif pdb == '6u3o':
                pymol_e[e] = pymolseqs_d[e][:-7] # remove leftover linker seq
                se[e] = sf.get_chain_seq(struc[0][e])
            elif pdb == '7sg1':
                pymol_e[e] = pymolseqs_d[e]
                se[e] = 'L'+sf.get_chain_seq(struc[0][e])
            elif pdb == '7t2b':
                pymol_e[e] = pymolseqs_d[e]
                se[e] = sf.get_chain_seq(struc[0][e])
                if e == 'M':
                    se[e] = 'A'+se[e]
                se[e] = 'G'+se[e]
            elif pdb == '7t2c':
                pymol_e[e] = pymolseqs_d[e]
                se[e] = 'GA'+sf.get_chain_seq(struc[0][e])+'E'
            elif pdb == '7t2d':
                pymol_e[e] = pymolseqs_d[e]
                se[e] = 'G'+sf.get_chain_seq(struc[0][e])
            elif (pdb == '7z50') and (e != 'W'):
                pymol_e[e] = pymolseqs_d[e]
                se[e] = sf.get_chain_seq(struc[0][e]) + 'GG'
            else:
                se[e] = sf.get_chain_seq(struc[0][e])
                pymol_e[e] = pymolseqs_d[e]

    # after the per-pdb tweaks every antigen chain of a structure must give the same epitope
    assert len(set(pymol_e.values())) == 1, pymol_e
    assert len(set(se.values())) == 1, se
    epitope_pymol = list(pymol_e.values())[0]
    epitope_struc = list(se.values())[0]

    if in_vdjdb:
        epitope = vdjdb_info.loc[(vdjdb_info['structure.id'] == pdb), 'Epitope'].unique().tolist()
        if pdb == '1zgl':
            # remove G from linker
            epitope = [epitope[0].strip('G')]
        assert len(set(epitope)) == 1
        print('epitope: ', epitope[0], ', pymol: ', epitope_pymol, ', structure: ', epitope_struc)
        assert epitope[0] == epitope_struc == epitope_pymol
    else:
        print('epitope sequence cannot be validated as not in vdjdb')
        print('pymol: ', epitope_pymol, ', structure: ', epitope_struc)
        assert epitope_struc == epitope_pymol

    assert len(epitope_pymol) < 30, epitope_pymol

    # per chain: (sequence from the imgt file, does it agree with the PyMOL-derived sequence?)
    # alpha/beta agree when one sequence contains the other; the epitope must match exactly
    sa_ann = {a:(sa[a], sa[a] in pymol_imgt_a[a] or pymol_imgt_a[a] in sa[a]) for a in sa.keys()}
    sb_ann = {b:(sb[b], sb[b] in pymol_imgt_b[b] or pymol_imgt_b[b] in sb[b]) for b in sb.keys()}
    se_ann = {e:(se[e], se[e] == pymol_e[e]) for e in se.keys()}
    pdbnotes[pdb] = {(a,b,e):(sa_ann[a][1], sb_ann[b][1], se_ann[e][1]) for (a, b, e) in chain_pairs}

    # write down if in complex or not

    mhc = set(pdb_rows['mhc_type'].isna())
    assert len(mhc) == 1

    # 1 when an mhc_type is annotated, 0 otherwise. Named `in_complex` so the builtin
    # `complex` is not shadowed. NOTE(review): the value is not stored anywhere yet.
    in_complex = 1-int(list(mhc)[0])

    # save V region sequence to dictionary

    seqs[pdb] = {'alpha_aa_imgt':list(set(pymol_imgt_a.values()))[0], 'alpha_aa_imgt_withGaps':list(set(pymol_imgt_a_gaps.values()))[0],
                'beta_aa_imgt':list(set(pymol_imgt_b.values()))[0], 'beta_aa_imgt_withGaps':list(set(pymol_imgt_b_gaps.values()))[0], 'epitope_aa':list(set(pymol_e.values()))[0]}


0 1ao7
 ExecutiveLoad-Detail: Detected mmCIF
checking out missing residues
getting sequence from chain  C
epitope:  LLFGYPVYV , pymol:  LLFGYPVYV , structure:  LLFGYPVYV
1 1bd2
 ExecutiveLoad-Detail: Detected mmCIF
checking out missing residues
getting sequence from chain  C
epitope:  LLFGYPVYV , pymol:  LLFGYPVYV , structure:  LLFGYPVYV
Single chain TCR,  1bwm , total count:  1
3 1d9k
 ExecutiveLoad-Detail: Detected mmCIF
No missing residues
No missing residue information from pymol
 Executive-Details: key '_pdbx_unobs_or_zero_occ_residues.auth_comp_id' not in cif data for object '1d9k'.
 Executive-Details: key '_pdbx_unobs_or_zero_occ_residues.auth_asym_id' not in cif data for object '1d9k'.
 Executive-Details: key '_pdbx_unobs_or_zero_occ_residues.auth_seq_id' not in cif data for object '1d9k'.
getting sequence from chain  Q
getting sequence from chain  P
epitope:  GNSHRGAIEWEGIESG , pymol:  GNSHRGAIEWEGIESG , structure:  GNSHRGAIEWEGIESG
4 1fo0
 ExecutiveLoad-Detail: Detected mmCIF

In [14]:
# Row labels flagged in the per-PDB loop: pairings that involve a chain whose
# missing-residue lists did not correspond.
print(rows_to_remove)

[336]


In [15]:
# One row per validated pdb, one column per extracted sequence.
sequences = pd.DataFrame.from_dict(seqs, orient='index')

# Discard the rows flagged during validation, then attach the sequences by pdb id.
# The default inner join keeps only pdbs that have an entry in `seqs`.
rows_kept = datalist_clean.drop(rows_to_remove)
data_with_seq = rows_kept.merge(sequences, left_on='pdb', right_index=True)
print(data_with_seq)

      pdb Bchain Achain TCRtype  model antigen_chain antigen_type  \
0    7q99      E      D   abTCR      0             C      peptide   
1    7q9a      E      D   abTCR      0             C      peptide   
3    2z31      B      A   abTCR      0             P      peptide   
4    2ak4      E      D   abTCR      0             C      peptide   
42   2ak4      J      I   abTCR      0             H      peptide   
..    ...    ...    ...     ...    ...           ...          ...   
791  6fr5      B      A   abTCR      0           NaN          NaN   
792  5men      E      D   abTCR      0             C      peptide   
793  1ao7      E      D   abTCR      0             C      peptide   
794  6fup      B      A   abTCR      0           NaN          NaN   
795  4jff      E      D   abTCR      0             C      peptide   

                                antigen_name antigen_het_name mhc_type  ...  \
0    asn-leu-ser-ala-leu-gly-ile-phe-ser-thr              NaN      MH1  ...   
1    leu-leu-

In [18]:
# Rather than dropping duplicates, rows with identical sequences are grouped, so it stays
# visible when the same receptor/epitope combination occurs in several pdbs.
# Missing values and empty strings are written as 'NA' before grouping.
sequence_keys = ['alpha_aa_imgt', 'beta_aa_imgt', 'alpha_aa_imgt_withGaps', 'beta_aa_imgt_withGaps', 'epitope_aa']
data_with_seq_nodupl = (
    data_with_seq
    .fillna('NA')
    .replace('', 'NA')
    .drop(columns=['TCRtype'])
    .groupby(sequence_keys)
    .agg(list)          # every remaining column becomes a list of the grouped values
    .reset_index()
)
print(data_with_seq_nodupl)

                                         alpha_aa_imgt  \
0    ADSVTQTGGQVALSEEDFLTIHCNYSASGYPALFWYVQYPGEGPQF...   
1    AEVEQDPGPLSVPEGAIVSLNCTYSNSAFQYFMWYRQYSRKGPELL...   
2    AKEVEQNSGPLSVPEGAIASLNCTYSDRGSQSFFWYRQYSGKSPEL...   
3    AKTTQPDSMESTEGETVHLPCSHATISGNEYIYWYRQVPLQGPEYV...   
4    AKTTQPISMDSYEGQEVNITCNHNDIATSDYIMWYQQFPNQGPRFI...   
..                                                 ...   
312  SQQGEEDPQALSIQEGENATMNCSYKTSINNLQWYRQNSGRGLVHL...   
313  SQQGEEDPQALSIQEGENATMNCSYKTSINNLQWYRQNSGRGLVHL...   
314  SVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGPQLLL...   
315  SVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGPQLLL...   
316  TQLLEQSPQFLSIQEGENLTVYCNSSSVFSSLQWYRQEPGEGPVLL...   

                                          beta_aa_imgt  \
0    AVTQSPRNKVAVTGEKVTLSCNQTNNHNNMYWYRQDTGHGLRLIYY...   
1    AGVIQSPRHEVTEMGQQVTLRCKPISGHDYLFWYRQTMMRGLELLI...   
2    IAGITQAPTSQILAAGRRMTLRCTQDMRHNAMYWYRQDLGLGLRLI...   
3    AVTQSPRNKVTVTGGNVTLSCRQTNSHNYMYWYRQDTGHGLRLIHY...   
4    DTAVSQTP

In [19]:
# Collapse set-valued cells: a set with several members becomes a list, a one-member set
# becomes that member, and every other value is kept as it is.
# NOTE(review): the grouping step aggregates with `list`, so no set-valued cells are expected
# here and this loop presumably leaves the table unchanged — confirm whether `set` was intended.
for column in data_with_seq_nodupl.columns:
    collapsed = []
    for value in data_with_seq_nodupl[column]:
        if type(value) == set:
            members = list(value)
            collapsed.append(members if len(value) > 1 else members[0])
        else:
            collapsed.append(value)
    data_with_seq_nodupl[column] = collapsed

# Turn the 'NA' placeholder cells back into empty strings.
data_with_seq_nodupl = data_with_seq_nodupl.replace('NA','')

In [20]:
# Inspect the grouped table after the 'NA' placeholders were replaced by empty strings.
print(data_with_seq_nodupl)

                                         alpha_aa_imgt  \
0    ADSVTQTGGQVALSEEDFLTIHCNYSASGYPALFWYVQYPGEGPQF...   
1    AEVEQDPGPLSVPEGAIVSLNCTYSNSAFQYFMWYRQYSRKGPELL...   
2    AKEVEQNSGPLSVPEGAIASLNCTYSDRGSQSFFWYRQYSGKSPEL...   
3    AKTTQPDSMESTEGETVHLPCSHATISGNEYIYWYRQVPLQGPEYV...   
4    AKTTQPISMDSYEGQEVNITCNHNDIATSDYIMWYQQFPNQGPRFI...   
..                                                 ...   
312  SQQGEEDPQALSIQEGENATMNCSYKTSINNLQWYRQNSGRGLVHL...   
313  SQQGEEDPQALSIQEGENATMNCSYKTSINNLQWYRQNSGRGLVHL...   
314  SVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGPQLLL...   
315  SVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGPQLLL...   
316  TQLLEQSPQFLSIQEGENLTVYCNSSSVFSSLQWYRQEPGEGPVLL...   

                                          beta_aa_imgt  \
0    AVTQSPRNKVAVTGEKVTLSCNQTNNHNNMYWYRQDTGHGLRLIYY...   
1    AGVIQSPRHEVTEMGQQVTLRCKPISGHDYLFWYRQTMMRGLELLI...   
2    IAGITQAPTSQILAAGRRMTLRCTQDMRHNAMYWYRQDLGLGLRLI...   
3    AVTQSPRNKVTVTGGNVTLSCRQTNSHNYMYWYRQDTGHGLRLIHY...   
4    DTAVSQTP

In [21]:
# Choose one (alpha, beta, antigen) chain combination per pdb and keep its table row.
#
# `pdbnotes[pdb]` maps each chain combination to three flags: alpha agrees, beta agrees,
# epitope agrees. Combinations are tried in their stored order against these tiers; the
# first combination satisfying the first tier that has any match wins:
#   1. alpha, beta and epitope all agree
#   2. alpha and beta agree
#   3. alpha or beta agrees
#   4. otherwise the first combination listed
best_combo = {}
best_rows = []

preference_tiers = (
    lambda flags: flags[0] and flags[1] and flags[2],
    lambda flags: flags[0] and flags[1],
    lambda flags: flags[0] or flags[1],
)

for pdb, notes in pdbnotes.items():
    # `notes` rather than `info`, so the vdjdb table named `info` is not overwritten
    print(notes)

    chosen = None
    for satisfies_tier in preference_tiers:
        chosen = next((mix for mix, flags in notes.items() if satisfies_tier(flags)), None)
        if chosen is not None:
            break
    if chosen is None:
        # nothing agrees: fall back to the first combination, keyed by that same combination
        # (the previous code paired the first combination's flags with a leftover loop variable)
        chosen = next(iter(notes))
    best_combo[pdb] = {chosen: notes[chosen]}

    print(best_combo[pdb])
    best_alpha, best_beta = chosen[0], chosen[1]
    # match on both chain ids so an alpha chain listed with several beta chains stays unambiguous
    df_line = data_with_seq.loc[
        (data_with_seq.pdb == pdb)
        & (data_with_seq.Achain == best_alpha)
        & (data_with_seq.Bchain == best_beta)
    ].reset_index(drop = True)
    assert df_line.shape[0] == 1
    best_rows.append(df_line)

# concatenate once instead of growing the frame inside the loop
best_pdb_set = pd.concat(best_rows) if best_rows else pd.DataFrame()
best_pdb_set.to_csv('data/tcrdbset_unique_pdbs_best_chains.csv')

{('D', 'E', 'C'): (True, False, True)}
{('D', 'E', 'C'): (True, False, True)}
{('D', 'E', 'C'): (True, False, True)}
{('D', 'E', 'C'): (True, False, True)}
{('E', 'F', 'Q'): (True, True, True), ('A', 'B', 'P'): (True, True, True)}
{('E', 'F', 'Q'): (True, True, True)}
{('A', 'B', 'P'): (True, True, True)}
{('A', 'B', 'P'): (True, True, True)}
{('D', 'E', 'C'): (True, True, True)}
{('D', 'E', 'C'): (True, True, True)}
{('A', 'B', 'P'): (True, False, True), ('C', 'D', 'Q'): (True, False, True)}
{('A', 'B', 'P'): (True, False, True)}
{('D', 'E', 'C'): (True, True, True)}
{('D', 'E', 'C'): (True, True, True)}
{('A', 'B', nan): (True, True, True)}
{('A', 'B', nan): (True, True, True)}
{('D', 'E', nan): (True, False, True)}
{('D', 'E', nan): (True, False, True)}
{('D', 'E', 'Q'): (True, True, True), ('A', 'B', 'P'): (True, True, True)}
{('D', 'E', 'Q'): (True, True, True)}
{('D', 'E', 'C'): (True, True, True)}
{('D', 'E', 'C'): (True, True, True)}
{('A', 'B', 'P'): (True, False, True), ('C',

In [22]:
# Start from the best-chain table; `all_pdbs` is taken before the helper calls replace it.
pdbs_shorter_seqs = best_pdb_set.copy()
all_pdbs = pdbs_shorter_seqs['pdb']

# Run the duplicate search for the alpha chain first, then for the beta chain on its result.
all_similarities_alpha, pdbs_shorter_seqs = sf.find_more_duplicates(
    all_pdbs, seqs, pdbs_shorter_seqs, 'alpha'
)
all_similarities_beta, pdbs_shorter_seqs = sf.find_more_duplicates(
    all_pdbs, seqs, pdbs_shorter_seqs, 'beta'
)

# Group rows sharing both shortened sequences; the other columns become lists.
shorter_keys = ['alpha_aa_imgt_shorter', 'beta_aa_imgt_shorter']
pdbs_shorter_seqs_nodupl = (
    pdbs_shorter_seqs
    .fillna('NA')
    .replace('', 'NA')
    .drop(columns=['TCRtype'])
    .groupby(shorter_keys)
    .agg(list)
    .reset_index()
)

1
2
3
1
2
3


In [23]:
# Collapse set-valued cells: a set with several members becomes a list, a one-member set
# becomes that member, and every other value is kept as it is.
# NOTE(review): the grouping step aggregates with `list`, so no set-valued cells are expected
# here and this loop presumably leaves the table unchanged — confirm whether `set` was intended.
for column in pdbs_shorter_seqs_nodupl.columns:
    collapsed = []
    for value in pdbs_shorter_seqs_nodupl[column]:
        if type(value) == set:
            members = list(value)
            collapsed.append(members if len(value) > 1 else members[0])
        else:
            collapsed.append(value)
    pdbs_shorter_seqs_nodupl[column] = collapsed

# Turn the 'NA' placeholder cells back into empty strings.
pdbs_shorter_seqs_nodupl = pdbs_shorter_seqs_nodupl.replace('NA','')

In [24]:
# Inspect the receptors grouped by their shortened alpha/beta sequences.
print(pdbs_shorter_seqs_nodupl)

                                 alpha_aa_imgt_shorter  \
0    AKTTQPDSMESTEGETVHLPCSHATISGNEYIYWYRQVPLQGPEYV...   
1    AKTTQPISMDSYEGQEVNITCNHNDIATSDYIMWYQQFPNQGPRFI...   
2    AKTTQPTSMDCAEGRAANLPCNHSTISGNEYVYWYRQIHSQGPQYI...   
3    AQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...   
4    AQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...   
..                                                 ...   
209  SQQGEEDPQALSIQEGENATMNCSYKTSINNLQWYRQNSGRGLVHL...   
210  SQQGEEDPQALSIQEGENATMNCSYKTSINNLQWYRQNSGRGLVHL...   
211  SVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGPQLLL...   
212  SVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGPQLLL...   
213  TQLLEQSPQFLSIQEGENLTVYCNSSSVFSSLQWYRQEPGEGPVLL...   

                                  beta_aa_imgt_shorter           pdb  Bchain  \
0    AVTQSPRNKVTVTGGNVTLSCRQTNSHNYMYWYRQDTGHGLRLIHY...        [6l9l]     [H]   
1    DTAVSQTPKYLVRQTGKNESLKCEQNLGHNAMYWYKQDSKKLLKIM...        [7byd]     [J]   
2    TGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLT...  [3dx9, 

In [25]:
# Write the per-pdb table as returned by sf.find_more_duplicates ...
pdbs_shorter_seqs.to_csv('data/tcrdb_export_cleaned_withseqs.csv')
# ... and the table grouped by the shortened alpha/beta sequences.
pdbs_shorter_seqs_nodupl.to_csv('data/tcrdbset_unique_receptors.csv')