# Edit Distance

In [96]:
# Locate the FASTA genome excerpt that is loaded into the search text 't' below

# Name of the FASTA file; a relative path is resolved against the working directory
filename = 'chr1.GRCh38.excerpt.fasta'

def readGenome(filename):
    """Return the sequence stored in a FASTA file as a single string.

    Lines whose first character is '>' are header lines and are skipped.
    Every other line has its trailing whitespace removed and is appended
    in file order.
    """
    pieces = []
    with open(filename, 'r') as fasta:
        for record_line in fasta:
            # Header lines carry genome information, not sequence data.
            if record_line[0] == '>':
                continue
            pieces.append(record_line.rstrip())
    return ''.join(pieces)

# Text 't' to search: the genome sequence read from the FASTA file
t = readGenome(filename)

# Uncomment to replace 't' with a short text for quick testing
# t = 'TATTGGCTATACGGTT'

# Pattern 'p' to match against 't'
p = 'GCTGATCGATCGTACG'

In [97]:
# Read sequences and qualities from a FASTQ file

# Name of the FASTQ file; this rebinds 'filename', which held the FASTA name before
filename = 'ERR266411_1.for_asm.fastq'

def readFastq(filename):
    """Read a FASTQ file and return (sequences, qualities).

    The file is consumed four lines at a time: name, bases, placeholder,
    qualities. Only the bases and quality lines are kept, each with its
    trailing whitespace removed. Reading stops at the first record whose
    bases line is empty, which is also what happens at end of file.
    """
    sequences, qualities = [], []
    with open(filename, 'r') as fq:
        while True:
            # Pull one whole record; the name and placeholder lines are discarded.
            _name, raw_bases, _plus, raw_qual = (fq.readline() for _ in range(4))
            bases = raw_bases.rstrip()
            if not bases:
                break
            sequences.append(bases)
            qualities.append(raw_qual.rstrip())
    return sequences, qualities

# Load every read's base sequence and quality string from the FASTQ file
sequences, qualities = readFastq(filename)

In [98]:
# Length of the pattern 'p'
pattern_len = len(p)
# Smallest edit distance found; None until one has been computed
min_dist = None

def editDistance(x, y):
    """Return the edit distance between x and y.

    Insertions, deletions and substitutions each cost 1. The dynamic
    programming table is filled row by row; only the previous row is needed
    to compute the next, so just two rows are kept.
    """
    # Row 0: turning the empty prefix of x into y[:j] takes j insertions.
    prev_row = list(range(len(y) + 1))
    for i, x_char in enumerate(x, start=1):
        # Column 0: turning x[:i] into the empty string takes i deletions.
        row = [i]
        for j, y_char in enumerate(y, start=1):
            substitution = 0 if x_char == y_char else 1
            row.append(min(row[j - 1] + 1,                   # horizontal step
                           prev_row[j] + 1,                  # vertical step
                           prev_row[j - 1] + substitution))  # diagonal step
        prev_row = row
    # The last cell of the last row is the distance between all of x and all of y.
    return prev_row[-1]

def approximateMatchDistance(p, t):
    """Return the smallest edit distance between p and any substring of t.

    Uses the edit-distance recurrence with one change: the cell for the
    empty prefix of p is 0 at every position of t, so a match may begin at
    any offset of t at no cost. The answer is the minimum over all end
    positions in t of the cell for the complete pattern.

    Substrings of every length are considered, not only those as long as p,
    so matches that need insertions or deletions are scored correctly. Only
    one column of len(p) + 1 cells is kept at a time.
    """
    # Column for the empty prefix of t: matching p[:i] against nothing costs i.
    column = list(range(len(p) + 1))
    best = column[-1]
    for t_char in t:
        # Top cell is 0: the match is free to start at this position of t.
        new_column = [0]
        for i, p_char in enumerate(p, start=1):
            substitution = 0 if p_char == t_char else 1
            new_column.append(min(new_column[i - 1] + 1,
                                  column[i] + 1,
                                  column[i - 1] + substitution))
        column = new_column
        # Bottom cell: best alignment of all of p ending at this position of t.
        if column[-1] < best:
            best = column[-1]
    return best


# Edit distance of the best approximate match of 'p' anywhere in 't'
min_dist = approximateMatchDistance(p, t)

print(min_dist)

3


In [99]:
def read_fastq(filename):
    """Return the list of base sequences found in a FASTQ file.

    Records are read as groups of four lines (name, sequence, plus line,
    quality line); only the sequence line is kept, with trailing whitespace
    removed. Reading stops at the first record whose sequence line is
    empty, which is also what happens at end of file.
    """
    sequences = []
    with open(filename, 'r') as fh:
        while True:
            # One record is four consecutive lines; index 1 holds the bases.
            record = [fh.readline() for _ in range(4)]
            bases = record[1].rstrip()
            if not bases:
                break
            sequences.append(bases)
    return sequences

# Load all read sequences from the FASTQ file and report how many were read
reads = read_fastq("ERR266411_1.for_asm.fastq")
print(f"Total reads loaded: {len(reads)}")

def overlap(a, b, min_length=30):
    """Return the length of the longest suffix of a that is a prefix of b.

    Only overlaps of at least min_length characters count; 0 is returned
    when there is none. This includes the case where b is shorter than
    min_length, in which no overlap can be long enough.
    """
    seed = b[:min_length]
    start = a.find(seed)
    while start != -1:
        olen = len(a) - start
        # Later occurrences only give shorter suffixes, so once the suffix
        # is below min_length nothing further can qualify.
        if olen < min_length:
            return 0
        # Occurrences are visited left to right, so the first suffix that is
        # a prefix of b is also the longest one.
        if b.startswith(a[start:]):
            return olen
        start = a.find(seed, start + 1)
    return 0

from collections import defaultdict

def build_kmer_index(reads, k):
    """Map every length-k substring to the set of reads that contain it.

    Returns a defaultdict(set) keyed by k-mer. Reads shorter than k
    contribute no entries.
    """
    kmer_to_reads = defaultdict(set)
    for read in reads:
        window_count = len(read) - k + 1
        for offset in range(window_count):
            kmer_to_reads[read[offset:offset + k]].add(read)
    return kmer_to_reads

def find_all_overlaps(reads, k=30):
    """Return {(a, b): length} for every ordered pair of distinct reads
    where a suffix of a matches a prefix of b over at least k characters.

    A k-mer index built with build_kmer_index() limits the reads compared
    against a to those containing a's last k characters.
    """
    kmer_to_reads = build_kmer_index(reads, k)
    found = {}
    for left in reads:
        # Any read overlapping 'left' by >= k must contain left's final k-mer.
        for right in kmer_to_reads.get(left[-k:], set()):
            if left == right:
                continue
            length = overlap(left, right, k)
            if length >= k:
                found[(left, right)] = length
    return found

# Compute the overlaps first; 'overlaps' is not defined anywhere else,
# so the report below would raise NameError without this call.
overlaps = find_all_overlaps(reads)
print(f"Total overlaps found: {len(overlaps)}")

# Count reads with at least one outgoing edge
outgoing = {a for (a, _b) in overlaps}
print(f"Reads with at least one outgoing edge: {len(outgoing)}")

Total reads loaded: 10000
Total overlaps found: 27044
Reads with at least one outgoing edge: 4353


In [2]:
from collections import defaultdict

def readFastq(filename):
    """Return the base sequences of a FASTQ file as a list of strings.

    Each record is four lines: header, sequence, plus line, quality line.
    Only the sequence line is kept, with trailing whitespace removed.
    Reading ends at end of file or at the first record whose sequence line
    is empty.
    """
    sequences = []
    with open(filename, 'r') as fq:
        # readline() returns '' only at end of file, which ends the iteration.
        for _header in iter(fq.readline, ''):
            seq = fq.readline().rstrip()
            fq.readline()  # plus line, not needed
            fq.readline()  # quality line, not needed
            if not seq:
                break
            sequences.append(seq)
    return sequences

def overlap(a, b, min_length=30):
    """Return the length of the longest suffix of a that matches a prefix
    of b and is at least min_length long, or 0 if there is none.
    """
    seed = b[:min_length]
    pos = a.find(seed)
    while pos != -1:
        tail = a[pos:]
        # Occurrences are visited left to right, so suffix lengths only
        # shrink: the first qualifying suffix is the longest one.
        if len(tail) >= min_length and b.startswith(tail):
            return len(tail)
        pos = a.find(seed, pos + 1)
    return 0

def find_overlaps(reads, k=30):
    """Find suffix/prefix overlaps of length >= k between distinct reads.

    Returns (overlaps, reads_unique). reads_unique is the input with
    duplicates removed, in first-seen order. overlaps maps (i, j), a pair
    of indices into reads_unique, to the length of the longest suffix of
    read i that is a prefix of read j.

    Every k-mer of every read is indexed, not only each read's prefix. When
    read a overlaps read b by m > k characters, a's last k characters sit
    at offset m - k inside b rather than at its start, so a prefix-only
    index would miss that pair.
    """
    # Deduplicate while keeping first-seen order so indices are reproducible
    # from run to run (set iteration order is not).
    reads_unique = list(dict.fromkeys(reads))

    # k-mer -> indices of the reads that contain it anywhere
    kmer_index = defaultdict(set)
    for i, read in enumerate(reads_unique):
        for pos in range(len(read) - k + 1):
            kmer_index[read[pos:pos + k]].add(i)

    overlaps = {}
    for i, read_a in enumerate(reads_unique):
        suffix = read_a[-k:]
        # Only reads containing a's final k-mer can overlap it by >= k.
        for j in kmer_index.get(suffix, ()):
            if i == j:
                continue
            olen = overlap(read_a, reads_unique[j], min_length=k)
            if olen > 0:
                overlaps[(i, j)] = olen

    return overlaps, reads_unique

# === Main execution ===
# The FASTQ file must be present in the working directory.
filename = 'ERR266411_1.for_asm.fastq'
reads = readFastq(filename)
overlap_map, unique_reads = find_overlaps(reads, k=30)

# First index of each overlap pair, i.e. reads with an outgoing overlap
outgoing_nodes = {src for src, _dst in overlap_map}

print(f"Total reads loaded: {len(reads)}")
print(f"Unique reads after deduplication: {len(unique_reads)}")
print(f"Total overlaps found (length >= 30): {len(overlap_map)}")
print(f"Reads with at least one outgoing overlap: {len(outgoing_nodes)}")


Total reads loaded: 10000
Unique reads after deduplication: 10000
Total overlaps found (length >= 30): 27044
Reads with at least one outgoing overlap: 4353
