In [16]:
import pandas as pd
from pathlib import Path
from reader import FastqProcessor, SequenceMatch
from HeatMapVis import SequenceVisualizer
import matplotlib.pyplot as plt
import numpy as np
from typing import List
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

In [17]:
# Complement table for DNA, built once at definition time instead of on every
# call. Covers the four bases plus the IUPAC ambiguity codes, in both cases.
_DNA_COMPLEMENT = str.maketrans(
    "ACGTRYSWKMBDHVNacgtryswkmbdhvn",
    "TGCAYRSWMKVHDBNtgcayrswmkvhdbn",
)


def reverse_complement(seq: str) -> str:
    """Generate the reverse complement of a DNA sequence.

    Args:
        seq: DNA sequence. Upper- and lower-case bases and IUPAC ambiguity
            codes (R, Y, S, W, K, M, B, D, H, V, N) are complemented with
            their case preserved.

    Returns:
        The complemented sequence in reverse order. Characters with no entry
        in the complement table (e.g. gaps) are kept as-is, only reversed.
    """
    return seq.translate(_DNA_COMPLEMENT)[::-1]

In [18]:
def record_reverse_complement(record: SeqRecord) -> SeqRecord:
    """
    Return a new SeqRecord with the reverse complement of the given sequence.

    The new record's id is the original id with "_RC" appended and its
    description is the original description with " reverse complement"
    appended. Per-letter annotations are reversed so that each value still
    lines up with the base it describes; the input record is not modified.
    """
    # Per-letter annotations are positional, so they must be flipped along
    # with the sequence. Slicing also yields fresh objects, so the new record
    # does not share annotation containers with the input record.
    reversed_annotations = {
        key: values[::-1] for key, values in record.letter_annotations.items()
    }
    return SeqRecord(
        seq=record.seq.reverse_complement(),
        id=record.id + "_RC",
        description=record.description + " reverse complement",
        letter_annotations=reversed_annotations,
    )

In [33]:
# Input reads for this run.
fastq_file = Path("data_samples/new_sequencing_may2025/reads/barcode14.fastq")

# Query sequences keyed by "<name>|<side>". Each "RC*" entry is the exact
# reverse complement of one of the forward "Left"/"Right" entries.
#
# "RCLeft|R" was previously the 9-nt "CGAGAGCAG"; the reverse complement of
# "Left|R" (CCTGCTCTCG) is the 10-nt "CGAGAGCAGG", so the missing final base
# has been restored.
#
# NOTE(review): the side labels are not assigned consistently. "RCRight|R" is
# the reverse complement of "Right|L" (sides swapped), while "RCLeft|R" is the
# reverse complement of "Left|R" (sides kept). An earlier draft of this dict
# swapped sides for both. Confirm which convention downstream code expects.
query_dict = {
    "RCRight|R": "CTTTAGGGCC",
    "RCRight|L": "TCGTGCTAAG",
    "RCLeft|R": "CGAGAGCAGG",
    "RCLeft|L": "ATCCATGAAG",
    "Left|L": "CTTCATGGAT",
    "Left|R": "CCTGCTCTCG",
    "Right|L": "GGCCCTAAAG",
    "Right|R": "CTTAGCACGA",
}

In [34]:
# Process FASTQ file: build a processor from the input path and the query
# sequences, then run it over the file.
# NOTE(review): FastqProcessor is defined in the local `reader` module; the
# result is presumably one entry per read — confirm against reader.py.
processor = FastqProcessor(fastq_file, query_dict)
sequence_matches = processor.process_file()

Processing FASTQ: 100%|██████████| 7325/7325 [00:00<00:00, 82938.60it/s]


In [35]:
# One row per element of `sequence_matches`, stored as its string form.
# NOTE(review): the rendered preview suggests each element is a list of
# SequenceMatch objects, so the CSV holds reprs, not structured columns.
results = [{"Sequence": str(match)} for match in sequence_matches]

# Pinning the column keeps the CSV header present even with no matches.
df = pd.DataFrame(results, columns=["Sequence"])

output_csv = Path(
    "data_samples/new_sequencing_may2025/positions/barcode14/fuzzy_matches/"
    "barcode14_primer_positions.csv"
)
# to_csv does not create missing directories; make sure the target exists.
output_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_csv, index=False)

In [36]:
# Preview the first 100 rows of the match table.
df.head(100)

Unnamed: 0,Sequence
0,"[SequenceMatch(seq_id=0, query_name='Left|R', ..."
1,"[SequenceMatch(seq_id=1, query_name='RCLeft|L'..."
2,"[SequenceMatch(seq_id=2, query_name='Left|L', ..."
3,"[SequenceMatch(seq_id=3, query_name='RCRight|L..."
4,"[SequenceMatch(seq_id=4, query_name='Right|R',..."
...,...
95,"[SequenceMatch(seq_id=95, query_name='Left|R',..."
96,"[SequenceMatch(seq_id=96, query_name='Right|L'..."
97,"[SequenceMatch(seq_id=97, query_name='RCRight|..."
98,"[SequenceMatch(seq_id=98, query_name='RCLeft|L..."


In [37]:
# (rows, columns) of the match table.
df.shape

(7325, 1)