In [1]:
import csv

Define functions:

In [2]:
def revcomp(seq):
  """Return the reverse complement of a DNA sequence.

  Walks `seq` from its last base to its first and swaps each base for its
  pairing partner (A<->T, C<->G). Only uppercase A, C, G and T are known to
  the lookup table; any other character raises KeyError.
  """
  partner_of = dict(zip('ACGT', 'TGCA'))
  complemented = []
  for base in reversed(seq):
    complemented.append(partner_of[base])
  return "".join(complemented)

These are the 20 sub-barcodes that will be used:

In [3]:
# The 20 four-base sub-barcodes, in alphabetical order; one row per first base.
subbarcodes = [
  'ACGT', 'ACTG', 'AGCT', 'AGTC', 'ATCG',
  'CAGT', 'CGAT', 'CGTA', 'CTGA',
  'GACT', 'GATC', 'GCTA', 'GTAC', 'GTCA',
  'TACG', 'TAGC', 'TCAG', 'TCGA', 'TGAC', 'TGCA',
]

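Read in the data from Pryor et al. (https://doi.org/10.1371/journal.pone.0238592; their .xlsx was saved as .csv) and build a nested dictionary that maps each overhang (the `Overhang` column) to a dictionary of the remaining columns, with the values converted to integers: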

In [4]:
# Load the ligation table into a nested dictionary:
#   efficiencies[overhang][column header] -> integer value of that CSV cell.
# newline='' follows the csv module's guidance for files handed to a reader,
# so the reader itself (not the text layer) interprets line endings.
with open('ligation_efficiency.csv', newline='') as f:
  reader = csv.DictReader(f)
  efficiencies = {}
  for row in reader:
    # Remove the row label in one step so only the numeric columns remain.
    overhang = row.pop('Overhang')
    # CSV cells arrive as strings; convert every remaining value to int.
    efficiencies[overhang] = {partner: int(count) for partner, count in row.items()}

Find the efficiency of each sub-barcode ligating to its reverse complement to make sure that none of the sub-barcodes have low ligation efficiency. Also compare to the worst and best efficiencies seen.

In [5]:
# Table value for each chosen sub-barcode paired with its own reverse complement.
for sub in subbarcodes:
  partner = revcomp(sub)
  print(f'{sub}: {efficiencies[sub][partner]}')

# The same self-pairing value for every overhang in the table, sorted ascending
# so that both tails can be sliced off directly.
self_efficiencies = sorted(
  partners[revcomp(overhang)] for overhang, partners in efficiencies.items()
)

lowest_ten = self_efficiencies[:10]
highest_ten = self_efficiencies[-10:]
mean_efficiency = sum(self_efficiencies) / len(self_efficiencies)
print(f'Worst efficiencies: {lowest_ten}')
print(f'Best efficiencies: {highest_ten}')
print(f'Average efficiency: {mean_efficiency}')

ACGT: 582
ACTG: 595
AGCT: 456
AGTC: 606
ATCG: 570
CAGT: 595
CGAT: 570
CGTA: 507
CTGA: 712
GACT: 606
GATC: 616
GCTA: 524
GTAC: 504
GTCA: 564
TACG: 507
TAGC: 524
TCAG: 712
TCGA: 708
TGAC: 564
TGCA: 522
Worst efficiencies: [240, 252, 252, 265, 265, 272, 272, 273, 273, 280]
Best efficiencies: [718, 747, 747, 748, 748, 754, 766, 766, 789, 789]
Average efficiency: 514.7578125


All sub-barcodes are close to or above the average efficiency across all overhangs (about 515): the lowest, AGCT at 456, is still far above the worst overhangs (240–280).

Now find the sub-barcode pairs that are most likely to incorrectly ligate with each other:

In [9]:
# For every ordered pair of distinct sub-barcodes, look up the first barcode's
# row at the column for the second barcode's reverse complement, i.e. a partner
# that belongs to a different sub-barcode.
scores = [
  (barcode, other, efficiencies[barcode][revcomp(other)])
  for barcode in subbarcodes
  for other in subbarcodes
  if other != barcode
]

# Ascending sort on the looked-up value, so the final 12 entries are the
# highest-scoring pairs.
sorted_scores = sorted(scores, key=lambda entry: entry[2])
print(sorted_scores[-12:])

[('TGCA', 'TGAC', 0), ('ACGT', 'CAGT', 1), ('ACTG', 'ACGT', 1), ('AGCT', 'TGCA', 1), ('ATCG', 'TACG', 1), ('CGTA', 'CGAT', 1), ('GATC', 'GTAC', 1), ('GTAC', 'GATC', 1), ('TGCA', 'AGCT', 1), ('ATCG', 'CGAT', 2), ('GACT', 'GATC', 2), ('GATC', 'AGTC', 2)]


In [7]:
for barcode in subbarcodes:
  other_barcodes = [i for i in subbarcodes if i != barcode]