In [1]:
import pandas as pd
from compressor import (IntelligentCompressor, KMeansClustering, HierarchicalClustering, DBSCANClustering, \
    # Word2VecEmbedding,
                        HDBSCANClustering, BIRCHClustering, OPTICSClustering, GMMClustering, MeanShiftClustering, \
    AffinityPropagationClustering, SpectralClusteringMethod)
from pathlib import Path
from reader import FastqProcessor, SequenceMatch
from HeatMapVis import SequenceVisualizer
import matplotlib.pyplot as plt
import numpy as np
from typing import List
from Bio import SeqIO
from fuzzy_searcher import FuzzySearcher
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Embeddings import (Word2VecEmbedding,
                        WeightedWord2VecEmbedding,
                        PositionalEncodingWord2VecEmbedding,
                        Bio2VecEmbedding,
                        LSTMEmbedding,
                        TransformerSequenceEmbedding)

In [2]:
def reverse_complement(seq: str) -> str:
    """Return the reverse complement of a DNA sequence.

    Upper- and lower-case bases are both complemented (case is preserved),
    and IUPAC ambiguity codes are mapped to their complementary codes.
    Any other character (e.g. a gap) is left as-is and only reversed.

    Args:
        seq: DNA sequence as a plain string.

    Returns:
        The complemented sequence, read in the opposite direction.
    """
    # Each code maps to the code for the complementary base set
    # (R<->Y, K<->M, B<->V, D<->H; S, W and N are self-complementary).
    bases = "ACGTRYSWKMBDHVN"
    partners = "TGCAYRSWMKVHDBN"
    complement = str.maketrans(bases + bases.lower(), partners + partners.lower())
    return seq.translate(complement)[::-1]

def record_reverse_complement(record: SeqRecord) -> SeqRecord:
    """
    Return a new SeqRecord with the reverse complement of the given sequence.

    The new record's id gets an "_RC" suffix and its description gets
    " reverse complement" appended. Per-letter annotations are reversed so
    that each value stays attached to the base it describes.

    Args:
        record: The record to reverse-complement; it is not modified.

    Returns:
        A new SeqRecord; other fields (name, features, annotations) are not
        carried over and take the SeqRecord defaults.
    """
    # Per-letter annotations are positional, so they have to be flipped along
    # with the sequence. Slicing also copies, so the new record does not share
    # the underlying sequences with the input record.
    flipped_annotations = {
        key: values[::-1] for key, values in record.letter_annotations.items()
    }
    return SeqRecord(
        seq=record.seq.reverse_complement(),
        id=record.id + "_RC",
        description=record.description + " reverse complement",
        letter_annotations=flipped_annotations,
    )

In [3]:
# Input FASTQ. Set the NANOCLUST_FASTQ environment variable to point at a
# different file; when it is unset the original author-local path is used.
import os

fastq_file = Path(
    os.environ.get(
        "NANOCLUST_FASTQ",
        "/Users/maria/PycharmProjects/oxo_dG/input_files/2026_cassette/fastq/calls.sorted.fastq",
    )
)

In [4]:
# Query name -> sequence to search for in each read.
# The "C" entries are reverse complements of the forward "A" entries in
# mirrored order: A1C = revcomp(A3), A2C = revcomp(A2), A3C = revcomp(A1).
# S2 is likewise the reverse complement of S1, and L is a run of six T's.
query_dict = dict(
    A1="GATCAGTCCGATATC",
    A2="TCGACATGCTAGTGC",
    A3="GCTATCGGATACGTC",
    S1="ATGACTGCCA",
    L="TTTTTT",
    S2="TGGCAGTCAT",
    A1C="GACGTATCCGATAGC",
    A2C="GCACTAGCATGTCGA",
    A3C="GATATCGGACTGATC",
)

In [5]:
# Search settings: matches are written to a Parquet file in the working
# directory. NOTE(review): 0.9 is presumably the minimum similarity for a
# hit to be reported -- confirm against FuzzySearcher.
matches_path = Path("matches.parquet")
min_similarity = 0.9

processor = FuzzySearcher(
    similarity_threshold=min_similarity,
    output_parquet=matches_path,
    query_dict=query_dict,
    fastq_path=fastq_file,
)

In [6]:
# Run the search over the FASTQ. Per the recorded output, this scanned about
# 2.47M records in just under 4 minutes and saved the hits to matches.parquet.
sequence_matches = processor.search()

Scanning FASTQ: 2465275it [03:46, 10878.83it/s]


✅ Finished. Parquet saved to: matches.parquet


In [8]:
import pyarrow.parquet as pq


In [9]:
# Load the saved matches file back in as an Arrow table for inspection.
table = pq.read_table("matches.parquet")


In [10]:
# Column names and types of the matches table.
table.schema

seq_id: string
query_name: string
position: int32
length: int16
score: float

In [11]:
# Total number of match rows (one row per query hit, not per read).
table.num_rows

19229227

In [12]:
# Preview the first 10 rows; only this slice is converted to pandas.
table.slice(0, 10).to_pandas()

Unnamed: 0,seq_id,query_name,position,length,score
0,5284e66b-b7b4-44db-9949-f73969d28dbd,A2,27,14,1.0
1,5284e66b-b7b4-44db-9949-f73969d28dbd,A3,51,14,1.0
2,5284e66b-b7b4-44db-9949-f73969d28dbd,S1,66,9,1.0
3,5284e66b-b7b4-44db-9949-f73969d28dbd,L,76,5,1.0
4,5284e66b-b7b4-44db-9949-f73969d28dbd,L,77,5,1.0
5,5284e66b-b7b4-44db-9949-f73969d28dbd,S2,82,9,1.0
6,5284e66b-b7b4-44db-9949-f73969d28dbd,A1C,92,14,1.0
7,5284e66b-b7b4-44db-9949-f73969d28dbd,A2C,116,14,1.0
8,b6afcfc1-adfe-4ff3-af82-e4f82ae27091,A1,0,14,1.0
9,b6afcfc1-adfe-4ff3-af82-e4f82ae27091,A3,48,14,1.0


In [13]:
!pip install duckdb

Collecting duckdb
  Obtaining dependency information for duckdb from https://files.pythonhosted.org/packages/9f/01/b19f532ee7340ef11c3363300f677074d7d2bf03af5ac76efacf03b4dd76/duckdb-1.4.4-cp39-cp39-macosx_10_9_x86_64.whl.metadata
  Downloading duckdb-1.4.4-cp39-cp39-macosx_10_9_x86_64.whl.metadata (4.3 kB)
Downloading duckdb-1.4.4-cp39-cp39-macosx_10_9_x86_64.whl (15.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: duckdb
Successfully installed duckdb-1.4.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Users/maria/PycharmProjects/NanoClustVis/.venv/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [15]:
import duckdb

In [16]:
# 10-row preview again, this time letting DuckDB read the Parquet file directly.
# NOTE(review): LIMIT without ORDER BY -- the rows returned are presumably the
# first ones in the file, but SQL does not guarantee that.
preview_sql = """
SELECT *
FROM 'matches.parquet'
LIMIT 10
"""
df = duckdb.query(preview_sql).to_df()

In [17]:
# df holds at most 10 rows (the query that built it uses LIMIT 10), so
# head(100) simply displays the whole frame.
df.head(100)

Unnamed: 0,seq_id,query_name,position,length,score
0,5284e66b-b7b4-44db-9949-f73969d28dbd,A2,27,14,1.0
1,5284e66b-b7b4-44db-9949-f73969d28dbd,A3,51,14,1.0
2,5284e66b-b7b4-44db-9949-f73969d28dbd,S1,66,9,1.0
3,5284e66b-b7b4-44db-9949-f73969d28dbd,L,76,5,1.0
4,5284e66b-b7b4-44db-9949-f73969d28dbd,L,77,5,1.0
5,5284e66b-b7b4-44db-9949-f73969d28dbd,S2,82,9,1.0
6,5284e66b-b7b4-44db-9949-f73969d28dbd,A1C,92,14,1.0
7,5284e66b-b7b4-44db-9949-f73969d28dbd,A2C,116,14,1.0
8,b6afcfc1-adfe-4ff3-af82-e4f82ae27091,A1,0,14,1.0
9,b6afcfc1-adfe-4ff3-af82-e4f82ae27091,A3,48,14,1.0


In [18]:
# Row count computed by DuckDB straight from the Parquet file; the recorded
# result (19229227) agrees with table.num_rows.
duckdb.query("""
SELECT count(*) FROM 'matches.parquet'
""").show()

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│     19229227 │
└──────────────┘

