# import conga package

In [1]:
# import conga: you would need to point this to wherever you downloaded the repository
# this is the top level folder for the repository (ie, it should contain scripts/ conga/ examples/)
path_to_conga = '/home/pbradley/gitrepos/conga/'
import os
import sys
sys.path.append(path_to_conga)
import conga
import numpy as np # not needed for tcrdist
import pandas as pd # not needed for tcrdist


# create tcrdist calculator object

In [2]:
# create tcrdist calculator for human alpha-beta tcrs
# other possible values for organism are
# 'mouse', 'human_gd', 'mouse_gd', 'human_ig'
organism = 'human'
# the calculator object is called directly on a pair of tcrs in the cells below
tcrdist = conga.tcrdist.tcr_distances.TcrDistCalculator(organism)


# do some calculations

In [3]:
# within conga the tcrs are stored as columns in the AnnData.obs array;
# for calculations they are pulled out into a list of tuples [tcr1, tcr2, ...]
# in which each tcr has the form
#   ((va, ja, cdr3a, cdr3a_nucseq), (vb, jb, cdr3b, cdr3b_nucseq))
# TCRdist only needs the V gene and the CDR3, so here the J gene is None
# and the nucleotide sequence is left out
tcr1 = (
    ('TRAV1-1*01', None, 'CAVEALTGGGNKLTF'),  # alpha chain
    ('TRBV5-6*01', None, 'CASSAYTSGPKEQYF'),  # beta chain
)
tcr2 = (
    ('TRAV1-1*01', None, 'CAVPGITGGGNKLTF'),  # alpha chain
    ('TRBV5-4*01', None, 'CASSLEQGPLQYF'),  # beta chain
)

tcrdist(tcr1, tcr2)

133.0

In [4]:
# or we can do single-chain distances
# each chain is a (v_gene, j_gene, cdr3) tuple; the J gene is left as None here
tcrb1 = ('TRBV5-6*01', None, 'CASSAYTSGPKEQYF')
tcrb2 = ('TRBV5-4*01', None, 'CASSLEQGPLQYF')

tcrdist.single_chain_distance(tcrb1,tcrb2)

103.0

# how do we figure out which V genes are understood by TCRdist?

In [5]:
# the information on genes and cdrs is stored in the all_genes dictionary,
# which is indexed first by organism and then by gene name.
# for tcrdist the V genes need to be in this dictionary. Note that they
# have allele information (eg '*01'). If 10x doesn't provide that
# we can just append '*01' to the V gene.
# use the same organism the calculator was created with, so the genes listed
# here are the ones that calculator understands
info = conga.tcrdist.all_genes.all_genes[organism]
info.keys()


dict_keys(['TRAV1-1*01', 'TRAV1-1*02', 'TRAV1-2*01', 'TRAV1-2*02', 'TRAV10*01', 'TRAV11*01', 'TRAV12-1*01', 'TRAV12-1*02', 'TRAV12-2*01', 'TRAV12-2*02', 'TRAV12-2*03', 'TRAV12-3*01', 'TRAV12-3*02', 'TRAV13-1*01', 'TRAV13-1*02', 'TRAV13-1*03', 'TRAV13-2*01', 'TRAV13-2*02', 'TRAV14/DV4*01', 'TRAV14/DV4*02', 'TRAV14/DV4*03', 'TRAV14/DV4*04', 'TRAV16*01', 'TRAV17*01', 'TRAV18*01', 'TRAV19*01', 'TRAV2*01', 'TRAV2*02', 'TRAV20*01', 'TRAV20*02', 'TRAV20*03', 'TRAV20*04', 'TRAV21*01', 'TRAV21*02', 'TRAV22*01', 'TRAV23/DV6*01', 'TRAV23/DV6*02', 'TRAV23/DV6*03', 'TRAV23/DV6*04', 'TRAV24*01', 'TRAV24*02', 'TRAV25*01', 'TRAV26-1*01', 'TRAV26-1*02', 'TRAV26-1*03', 'TRAV26-2*01', 'TRAV26-2*02', 'TRAV27*01', 'TRAV27*02', 'TRAV27*03', 'TRAV29/DV5*01', 'TRAV29/DV5*02', 'TRAV3*01', 'TRAV30*01', 'TRAV30*02', 'TRAV30*03', 'TRAV30*04', 'TRAV34*01', 'TRAV35*01', 'TRAV35*02', 'TRAV36/DV7*01', 'TRAV36/DV7*02', 'TRAV36/DV7*03', 'TRAV36/DV7*04', 'TRAV38-1*01', 'TRAV38-1*02', 'TRAV38-1*03', 'TRAV38-1*04', 'TRAV3

In [6]:
# every value in info is an object holding details about one gene:
# list its attributes, then show the chain, region and cdrs of one V gene
trav12_info = info['TRAV1-2*01']
print(dir(trav12_info))
print(trav12_info.chain, trav12_info.region, trav12_info.cdrs)
      

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'alseq', 'cdr_columns', 'cdrs', 'chain', 'count_rep', 'id', 'mm1_rep', 'nucseq', 'nucseq_offset', 'organism', 'protseq', 'region', 'rep']
A V ['TSG......FNG', 'NVL....DGL', 'SRSKGY', 'CAVR.....']


In [7]:
# the information in the all_genes dictionary is read from this tsv file
# (os.path.join is used so that it does not matter whether path_to_conga
# ends with a '/')
tsvfile = os.path.join(path_to_conga, 'conga', 'tcrdist', 'db', 'combo_xcr.tsv')
df = pd.read_csv(tsvfile, sep='\t')
df.head()


Unnamed: 0,id,organism,chain,region,nucseq,frame,aligned_protseq,cdr_columns,cdrs
0,TRAV1*01,mouse,A,V,ggacagggcgtggagcagcctgacaacttgatgtctgtagagggaa...,1,GQGVEQ.P.DNLMSVEGTFARVNCTYSTSG......FNGLSWYQQR...,28-39;57-66;82-88;106-111,TSG......FNG;VVL....DGL;SRSN.GY;CAVR..
1,TRAV1*02,mouse,A,V,ggacagggtgtggagcagcctgccaaattgatgtctgtggagggaa...,1,GQGVEQ.P.AKLMSVEGTFARVNCTYSTSG......FNGLSWYQQR...,28-39;57-66;82-88;106-111,TSG......FNG;VVL....DGL;SRSN.GY;CAVR..
2,TRAV10*01,mouse,A,V,ggagagaaggtcgagcaacacgagtctacactgagtgttcgagagg...,1,GEKVEQHE.STLSVREGDSAVINCTYTDTA......SSYFPWYKQE...,28-39;57-66;82-88;106-111,DTA......SSY;IRSN...VDR;DKKA.KR;CAAS..
3,TRAV10*02,mouse,A,V,ggagagaaggtcgagcaacatgagtctacactgagtgttcgagagg...,1,GEKVEQHE.STLSVREGDSAVINCTYTDTA......SSYFPWYKQE...,28-39;57-66;82-88;106-111,DTA......SSY;IRSN...VDR;DKKA.KR;CAAS..
4,TRAV10*03,mouse,A,V,ggagagaaggtcgagcaacacgagtctacacttagtgttcaagagg...,1,GEKVEQHE.STLSVQEGDSAVINCTYTDTA......SSYFPWYKQE...,28-39;57-66;82-88;106-111,DTA......SSY;IRSN...VDR;DKKA.KR;CAAA..


In [8]:
# look at the first row on its own so that each column is listed with its value
df.iloc[0]

id                                                          TRAV1*01
organism                                                       mouse
chain                                                              A
region                                                             V
nucseq             ggacagggcgtggagcagcctgacaacttgatgtctgtagagggaa...
frame                                                              1
aligned_protseq    GQGVEQ.P.DNLMSVEGTFARVNCTYSTSG......FNGLSWYQQR...
cdr_columns                                28-39;57-66;82-88;106-111
cdrs                          TSG......FNG;VVL....DGL;SRSN.GY;CAVR..
Name: 0, dtype: object

# compute a tcrdist matrix: python

In [9]:
# let's say we had a long list of tcrs

tcrs = [(('TRAV1-1*01',
   'TRAJ10*01',
   'CAVEALTGGGNKLTF',
   'tgcgctgtggaggcactcacgggaggaggaaacaaactcaccttt'),
  ('TRBV5-6*01',
   'TRBJ2-7*01',
   'CASSAYTSGPKEQYF',
   'tgtgccagcagcgcgtacactagcggacctaaagagcagtacttc')),
 (('TRAV1-1*01',
   'TRAJ10*01',
   'CAVPGITGGGNKLTF',
   'tgcgctgtgcccggaatcacgggaggaggaaacaaactcaccttt'),
  ('TRBV5-4*01',
   'TRBJ2-5*01',
   'CASSLEQGPLQYF',
   'tgtgccagcagcctggagcagggaccccttcagtacttc')),
 (('TRAV1-1*01',
   'TRAJ10*01',
   'CAVRDLGLTGGGNKLTF',
   'tgcgctgtgagagatctggggctcacgggaggaggaaacaaactcaccttt'),
  ('TRBV9*01',
   'TRBJ2-2*01',
   'CASSVEKRGGAGELFF',
   'tgtgccagcagcgtagagaagcgggggggtgccggggagctgtttttt')),
 (('TRAV1-1*01',
   'TRAJ10*01',
   'CAVRGTGGGNKLTF',
   'tgtgctgtgagaggtacgggaggaggaaacaaactcaccttt'),
  ('TRBV2*01',
   'TRBJ2-7*01',
   'CASSEAPTGWEQYF',
   'tgtgccagcagtgaagccccgacagggtgggagcagtacttc')),
 (('TRAV1-1*01',
   'TRAJ10*01',
   'CAVRVGGGNKLTF',
   'tgcgctgtgagggtgggaggaggaaacaaactcaccttt'),
  ('TRBV11-2*01',
   'TRBJ1-1*01',
   'CASSLDDVGGGFMNTEAFF',
   'tgtgccagcagcttagacgatgttggaggggggttcatgaacactgaagctttcttt')),
 (('TRAV1-1*01',
   'TRAJ10*01',
   'CAVSTGGGNKLTF',
   'tgcgctgtgagcacgggaggaggaaacaaactcaccttt'),
  ('TRBV20-1*01',
   'TRBJ2-1*01',
   'CSARARQGLFLNEQFF',
   'tgcagtgctagagcgagacaggggcttttcttgaatgagcagttcttc')),
 (('TRAV1-1*01',
   'TRAJ11*01',
   'CAALGGYSTLTF',
   'tgcgctgccctcggaggatacagcaccctcaccttt'),
  ('TRBV4-1*01',
   'TRBJ2-2*01',
   'CASSHSPGLAGGHTGELFF',
   'tgcgccagcagccactcgccgggactagcgggagggcacaccggggagctgtttttt')),
 (('TRAV1-1*01',
   'TRAJ11*01',
   'CAVRDKNSGYSTLTF',
   'tgcgctgtgagagataagaattcaggatacagcaccctcaccttt'),
  ('TRBV6-3*01',
   'TRBJ1-2*01',
   'CASTSADGYGYTF',
   'tgtgccagcacctcggcggacgggtatggctacaccttc')),
 (('TRAV1-1*01',
   'TRAJ11*01',
   'CAVRSDSGYSTLTF',
   'tgcgctgtgagaagcgattcaggatacagcaccctcaccttt'),
  ('TRBV15*01',
   'TRBJ2-7*01',
   'CATTPEGAPYEQYF',
   'tgtgccaccaccccggagggagcgccctacgagcagtacttc')),
 (('TRAV1-1*01',
   'TRAJ12*01',
   'CAAEGMDSSYKLIF',
   'tgcgctgccgaagggatggatagcagctataaattgatcttc'),
  ('TRBV5-6*01',
   'TRBJ2-5*01',
   'CASSLGTGTSGPQETQYF',
   'tgtgccagcagcctggggacaggaacatcgggacctcaagagacccagtacttc')),
 (('TRAV1-1*01',
   'TRAJ12*01',
   'CAGIQDSSYKLIF',
   'tgcgccgggatacaagatagcagctataaattgatcttc'),
  ('TRBV20-1*01',
   'TRBJ2-1*01',
   'CSARAKRGLFNEQFF',
   'tgcagtgctagagcaaagcggggcctcttcaatgagcagttcttc')),
 (('TRAV1-1*01',
   'TRAJ12*01',
   'CAKVDSSYKLIF',
   'tgcgctaaagtggatagcagctataaattgatcttc'),
  ('TRBV20-1*01',
   'TRBJ1-1*01',
   'CSARRPGQEVTEAFF',
   'tgcagtgctaggaggccgggacaggaggtcactgaagctttcttt')),
 (('TRAV1-1*01',
   'TRAJ12*01',
   'CASPGGDSSYKLIF',
   'tgcgcttccccggggggggatagcagctacaaattgatcttc'),
  ('TRBV19*01',
   'TRBJ2-2*01',
   'CASSPSEANTGELFF',
   'tgtgccagtagtccatcggaggcgaacaccggggagctgtttttt')),
 (('TRAV1-1*01',
   'TRAJ12*01',
   'CAVEKDSSYKLIF',
   'tgcgctgtggaaaaggatagcagctataaattgatcttc'),
  ('TRBV12-4*01',
   'TRBJ2-7*01',
   'CASKREDYEQYF',
   'tgtgccagcaagagggaggactacgagcagtacttc')),
 (('TRAV1-1*01',
   'TRAJ12*01',
   'CAVLHSSYKLIF',
   'tgcgctgtgctgcatagcagctataaattgatcttc'),
  ('TRBV7-9*01',
   'TRBJ2-7*01',
   'CASSSPGTGTSYEQYF',
   'tgtgccagcagctccccagggacagggacttcctacgagcagtacttc')),
 (('TRAV1-1*01',
   'TRAJ12*01',
   'CAVRDGKSGDSSYKLIF',
   'tgcgctgtgagagatggaaagagcggggatagcagctataaattgatcttc'),
  ('TRBV19*01',
   'TRBJ2-5*01',
   'CASSSGSGPSQETQYF',
   'tgtgccagtagttcaggtagcgggccgtcccaagagacccagtacttc')),
 (('TRAV1-1*01',
   'TRAJ12*01',
   'CAVRDPVDSSYKLIF',
   'tgcgctgtgagagatccggtggatagcagctataaattgatcttc'),
  ('TRBV6-3*01',
   'TRBJ2-5*01',
   'CASSPEREGSEETQYF',
   'tgtgccagcagtccggaacgggaggggagtgaagagacccagtacttc')),
 (('TRAV1-1*01',
   'TRAJ12*01',
   'CAVREDSSYKLIF',
   'tgcgctgtgagagaggatagcagctataaattgatcttc'),
  ('TRBV7-2*01',
   'TRBJ2-2*01',
   'CASSLGPQGTGELFF',
   'tgtgccagcagcttaggcccccagggaaccggggagctgtttttt')),
 (('TRAV1-1*01',
   'TRAJ12*01',
   'CAVSRMDSSYKLIF',
   'tgcgctgtgtcccggatggatagcagctataaattgatcttc'),
  ('TRBV19*01',
   'TRBJ2-7*01',
   'CASSHLLSTVDYEQYF',
   'tgtgccagtagtcacctcctatcgacagtggactacgagcagtacttc')),
 (('TRAV1-1*01',
   'TRAJ13*01',
   'CAASGGYQKVTF',
   'tgcgctgcgtctgggggttaccagaaagttaccttt'),
  ('TRBV28*01',
   'TRBJ1-5*01',
   'CASSSGADSLYQPQHF',
   'tgtgccagcagcagcggggctgacagtttatatcagccccagcatttt'))]
len(tcrs)

20

In [10]:
# build the full pairwise distance matrix in plain python:
# row i holds the distances from tcrs[i] to every tcr in the list
D = np.array(
    [[tcrdist(row_tcr, col_tcr) for col_tcr in tcrs] for row_tcr in tcrs]
).reshape(len(tcrs), len(tcrs))
D

array([[  0., 133., 174., 176., 249., 219., 295., 260., 217., 204., 243.,
        267., 239., 225., 223., 272., 254., 229., 230., 246.],
       [133.,   0., 189., 152., 243., 247., 295., 229., 219., 208., 268.,
        277., 233., 216., 247., 281., 247., 241., 254., 256.],
       [174., 189.,   0., 208., 254., 271., 291., 285., 251., 273., 283.,
        298., 265., 277., 264., 277., 249., 223., 283., 306.],
       [176., 152., 208.,   0., 216., 225., 267., 204., 181., 248., 249.,
        258., 220., 193., 195., 256., 237., 208., 229., 241.],
       [249., 243., 254., 216.,   0., 235., 253., 293., 272., 261., 289.,
        259., 268., 259., 237., 292., 281., 228., 274., 273.],
       [219., 247., 271., 225., 235.,   0., 274., 290., 254., 291., 138.,
        165., 260., 239., 248., 281., 281., 253., 248., 236.],
       [295., 295., 291., 267., 253., 274.,   0., 286., 241., 274., 283.,
        274., 219., 279., 234., 315., 304., 236., 282., 239.],
       [260., 229., 285., 204., 293., 290

# compute a distance matrix using the (faster) C++ code 

In [11]:
# or we can use this wrapper around the C++ implementation if we have a longer list
# or really want it to be fast
# pass the same organism the python calculator was built with, so that the two
# matrices are computed for the same set of genes
D2 = conga.preprocess.calc_tcrdist_matrix_cpp(tcrs, organism)
# this should be True; array_equal also requires the two shapes to match
# instead of broadcasting a mismatched result
np.array_equal(D2, D)

util.run_command: cmd= /home/pbradley/gitrepos/conga/tcrdist_cpp/bin/find_neighbors -f tmp_tcrdists5345_tcrs.tsv --only_tcrdists -d /home/pbradley/gitrepos/conga/tcrdist_cpp/db/tcrdist_info_human.txt -o tmp_tcrdists5345


True