# Analysing the Hamming distance between the ONT barcodes
Written by Jason A. Hendry

In [None]:
import os
import sys
import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import re
from Bio import SeqIO
from scipy.spatial.distance import hamming
import matplotlib.pyplot as plt

In [None]:
# Where the barcode reference lives: the directory first, then the FASTA file name
bc_dir, bc_file = "data", "barcodes_masked.fasta"

# Identify relevant barcodes

In [None]:
# Parallel lists: bc_seq[i] is the sequence of the record whose id is bc_id[i]
bc_seq, bc_id = [], []

# A record is kept when its name is "BC" followed by exactly two digits
barcode_pattern = re.compile(r"BC\d{2}$")
fasta_path = os.path.join(bc_dir, bc_file)

# Walk through every FASTA record, skipping anything that is not a barcode entry
for rec in SeqIO.parse(fasta_path, "fasta"):
    if barcode_pattern.match(rec.name) is None:
        continue
    bc_seq.append(rec.seq)
    bc_id.append(rec.id)

In [None]:
#Calculate the length of every barcode
# (each sequence is measured individually; measuring only the first one
#  would make the equal-length check below pass unconditionally)
lengths = [len(seq) for seq in bc_seq]

#Check lengths are the same otherwise unable to calculate hamming distance
if not lengths:
    # Nothing matched the barcode filter, so there is nothing to compare
    print("No barcodes found")
elif max(lengths) == min(lengths):
    print("All barcodes have the same length (%d bp)" % lengths[0])
else:
    print("Mixed length barcodes")

# Create pairwise distance matrix
We want to compute the Hamming distance between each barcode and every other barcode.

In [None]:
#Number of barcodes
n_bc = len(bc_seq)
#Square array for pairwise comparisons, pre-filled with np.nan.
#Only the diagonal and the upper triangle are written below, so every
#unordered pair is stored once and the lower triangle stays nan.
bc_hamm = np.full((n_bc, n_bc), np.nan)

#Cycle through each of the barcodes
for count in range(n_bc):
    #Cycle through this barcode itself and each of the remaining barcodes
    for comp in range(count, n_bc):
        #scipy's hamming() returns the *fraction* of mismatching positions,
        #so scale it back up by the barcode length to get a mismatch count
        frac = hamming(bc_seq[count], bc_seq[comp])
        #Round to the nearest whole number: fraction * length can carry
        #floating point error, which would upset the later equality
        #comparisons and "%d" formatting of these counts
        bc_hamm[count, comp] = np.rint(frac * len(bc_seq[count]))

# Visual matrix

In [None]:
# Draw the distance matrix as a heat-map on a 10 x 10 inch figure,
# with a colour bar giving the scale of the plotted values
heat_fig, heat_ax = plt.subplots(figsize=(10, 10))
heat_img = heat_ax.imshow(bc_hamm)
heat_fig.colorbar(heat_img)
heat_ax.set_title('Pairwise matrix of Hamming distance between all ONT native barcodes')

# Summary Stats

In [None]:
#Replace the self-comparisons (the diagonal) with nan so they do not take part
#in the summary stats. Only the diagonal is masked, rather than every zero, so
#that a genuine distance of 0 between two different barcodes is still reported.
bc_hamm_na = bc_hamm.copy()
np.fill_diagonal(bc_hamm_na, np.nan)

print("Summary stats:")
print("   Min: %d" % np.nanmin(bc_hamm_na))
print("   Max: %d" % np.nanmax(bc_hamm_na))
#The median of whole-number distances can end in .5; "%g" shows that instead of
#truncating it, and still prints a whole-number median without decimals
print("   Median: %g" % np.nanmedian(bc_hamm_na))

In [None]:
# Collapse the 2-D matrix into a single 1-D vector so that every stored
# distance becomes one observation for the plot
data_flat = bc_hamm_na.flatten()

# Density-scaled histogram with a KDE curve drawn on top
# NOTE(review): seaborn documents displot as returning a FacetGrid, so `ax`
# is probably not an Axes object despite its name - confirm before using it
ax = sns.displot(data=data_flat, stat="density", kde=True)

# Fetch the current axes from pyplot and label the plot through them
main_axis = plt.gca()
main_axis.set_title("Normal distribution of hamming distances between all ONT native barcodes")
main_axis.set_xlabel("Hamming distance")

# Identify closest barcodes

In [None]:
# Smallest distance in the matrix, ignoring nan entries
closest = np.nanmin(bc_hamm_na)

# Index arrays of every cell holding that smallest distance
ham_min = np.where(bc_hamm_na == closest)

# First array: row indices; second array: column indices
x, y = ham_min

In [None]:
print("Minimum Hamming Distances identified between:")
#Report each pair of barcodes sitting at the minimum distance.
#The labels come from the ids collected while reading the FASTA file rather
#than from the matrix position, so they stay correct if the barcodes are not
#numbered contiguously or not stored in numerical order.
for row, col in zip(x, y):
    print("   %s and %s" % (bc_id[row], bc_id[col]))

# Conclusions
The above barcode pairs are the most likely to be mistaken for one another; however, with a minimum Hamming distance of 12 for 24 bp barcodes (i.e. a 50% difference), this seems unlikely.

We could explore whether there are amplicons that have 3' barcode-like sequence, and whether it is likely that the 5' end of the barcode could be missed. For example, if:

BC01: <font color='green'>ACGTAC</font><br>
Amp1: CGTAGG<br>

Would expect to create a sequence <font color='green'>ACGTAC</font>CGTAGG<br>

However, given the sequence of
BC02: <font color='red'>TACCGT</font><br>

Is it possible that, if we were to lose the first few bases of the read (i.e. <font color='green'>ACG</font>), it would be assigned to BC02? i.e.:<br>
<font color='red'>TACCGT</font>AGG<br>
