In [1]:
from utils import *
from speedup import *
from generate_kmer import *
# NOTE(review): not referenced by any cell visible in this part of the notebook; presumably a
# cap on how many input files a loader reads — confirm where it is consumed, or remove it.
max_num_of_files = None

def check_sam_data(sam_data):
    """Plot per-reference read counts and the MAPQ distribution of mapped reads.

    Parameters
    ----------
    sam_data : iterable of dict-like
        Each read must provide ``'contig_name'`` and ``'map_quality'``. Reads whose
        contig name is ``'*'`` are skipped.
        NOTE(review): both fields are assumed to be numeric for the remaining reads
        (the histograms and the bin arithmetic below need numbers) — confirm against
        what read_sam stores.

    Returns
    -------
    None
        Two figures are shown and the left edge of the tallest contig-histogram bin
        is printed. If no read passes the filter, a message is printed and nothing
        is plotted.
    """
    mapped = [
        [read['contig_name'], read['map_quality']]
        for read in sam_data
        if read['contig_name'] != '*'
    ]
    if not mapped:
        # An empty list turns into a 1-D array, so the column slicing below would raise.
        print("No mapped reads to plot.")
        return

    result = np.array(mapped)
    contigs = result[:, 0]
    map_qualities = result[:, 1]

    plt.figure(figsize=(6,4))
    plt.hist(contigs, bins=499, label=f'{len(np.unique(contigs))}/500 reads')
    #plt.title('PCR dataset')
    plt.xlabel('Sequence number')
    plt.ylabel('Count')
    plt.legend(loc='center right')
    plt.show()
    bin_counts, bin_edges = np.histogram(contigs, bins=499)
    print(f"highest peak at position={int(bin_edges[np.argmax(bin_counts)])}")

    # max - min is 0 when every read shares one MAPQ value, and a histogram
    # needs at least one bin, so clamp the bin count to 1.
    mapq_bins = max(int(max(map_qualities) - min(map_qualities)), 1)

    plt.figure(figsize=(6,4))
    plt.hist(map_qualities, bins=mapq_bins)
    #plt.title('dataset')
    plt.xlabel('MapQuality')
    plt.ylabel('Count')
    plt.yscale('log')
    plt.show()

def write_fastq(sequences, qualities, output_file):
    """Write paired sequences and quality entries to ``output_file`` in FASTQ layout.

    Records are named ``seq_1``, ``seq_2``, ... in input order. Every sequence must
    provide ``astype(str)`` (e.g. a NumPy array of bases); its elements are joined
    without a separator. Every quality entry is written through string formatting.
    Pairing stops at the shorter input, and an existing file is overwritten.
    """
    with open(output_file, "w") as fastq:
        for number, (bases, quality) in enumerate(zip(sequences, qualities), start=1):
            print(f"@seq_{number}", file=fastq)
            print(''.join(bases.astype(str)), file=fastq)
            print("+", file=fastq)
            print(f"{quality}", file=fastq)



## Long-read, high-accuracy alignment with minimap2
```python
%%bash
# Align every FASTQ file in READS_DIR against REF with minimap2 (-x lr:hq preset)
# and write one SAM file per input into OUT_DIR.
# Abort on the first failing command so a failed alignment does not leave a
# truncated SAM behind while the loop silently carries on.
set -euo pipefail

minimap2="/v/scratch/tools/minimap2/minimap2"

REF="references/ref_sequences_xlsx.fasta"
READS_DIR="/v/volumes/nanopore/timin_uracil/oligos/timin20250417/basecall/20250507/bam_to_fastq"
OUT_DIR="/v/projects/nanopore/balazs/data/new_dataset/thymine_250417/minimap2/dorado_bam2fastq_alignments"

mkdir -p "$OUT_DIR"

for READ_FILE in "$READS_DIR"/*.fastq; do
    # When nothing matches, the pattern stays literal; skip it instead of
    # handing minimap2 a file that does not exist.
    [ -e "$READ_FILE" ] || continue
    BASENAME=$(basename "$READ_FILE" .fastq)
    "$minimap2" -x lr:hq --frag=yes -a "$REF" "$READ_FILE" > "$OUT_DIR/${BASENAME}.sam"
done
```


In [2]:
dorado_alignments_path_mm2 = "/v/projects/nanopore/balazs/data/new_dataset/thymine_250417/minimap2/dorado_bam2fastq_alignments/*.sam"
dorado_alignments_files_mm2 = sorted(glob(dorado_alignments_path_mm2))

# glob() silently returns [] for a wrong or unmounted path; stop here with a clear
# message instead of handing an empty file list to read_sam.
if not dorado_alignments_files_mm2:
    raise FileNotFoundError(f"No SAM files match {dorado_alignments_path_mm2!r}")

# Load only records with SAM flag 4 (unmapped), restricted by min_length=490 and
# max_length=600; the commented-out keyword arguments are further filters left disabled.
dorado_sam_data_mm2 = read_sam(dorado_alignments_files_mm2,
                               verbose=True,
                               has_movetable=False,
                               get_ReadQuality=True,
                               #min_MAPQ=30, 
                               min_length=490,
                               max_length=600,
                               #max_deletion=50,
                               #max_insertion=50,
                               #max_indel_frequency=0.4,
                               possible_sam_flags={4}, ## possible in the dataset: [0,4,16] == [forward, unmapped, reverse]
                              )

Loading files: 100%|██████████| 1/1 [00:39<00:00, 39.09s/file]

Number of reads stored: 386489
Number of reads dropped: 1007952





In [3]:
# Extract the 'sequence' and 'read_quality' features of the loaded reads, in that order.
sequences, qualities = (
    get_feature_from_sam_data(dorado_sam_data_mm2, feature)
    for feature in ('sequence', 'read_quality')
)

In [4]:
# Export of the loaded reads to FASTQ, left disabled.
# NOTE(review): the file name below is spelled "unmapped_sequnces", while a later cell reads
# ".../unmapped_self_aligning/unmapped_sequences.sam" (the SAM name is derived from the FASTQ
# basename) — confirm the intended spelling before re-enabling this call.
#write_fastq(sequences, qualities, 
#            "/v/projects/nanopore/balazs/data/new_dataset/thymine_250417/unmapped_sequences/unmapped_sequnces.fastq")

In [5]:
from collections import Counter

# One plain string per read, so that identical reads can be tallied.
sequence_strings = ["".join(bases.astype(str)) for bases in sequences]

# Tally duplicates; most_common() lists (sequence, count) pairs from the most to the
# least frequent, keeping first-seen order among equal counts.
counts = Counter(sequence_strings)
sorted_counts = counts.most_common()

In [6]:
# Show the six most frequent reads as "<count> <prefix>...<suffix>".
for seq, count in sorted_counts[:6]:
    prefix = seq[:5]
    suffix = seq[len(seq)-5:]
    print(f"{count} {prefix}...{suffix}")

1 TAAGG...TGGTT
1 GTTAT...TGGTG
1 TAAGG...TTCAT
1 TTATG...CGTTG
1 TAAGG...TTACC
1 TGTTT...GTTTA


---

# Self-alignment of the unmapped sequences

```python
%%bash
# Align every FASTQ file in READS_DIR against itself with minimap2 (-x ava-ont preset)
# and write one SAM file per input into OUT_DIR.
# Abort on the first failing command so a failed alignment does not leave a
# truncated SAM behind while the loop silently carries on.
set -euo pipefail

minimap2="/v/scratch/tools/minimap2/minimap2"

READS_DIR="/v/projects/nanopore/balazs/data/new_dataset/thymine_250417/unmapped_sequences"
OUT_DIR="/v/projects/nanopore/balazs/data/new_dataset/thymine_250417/minimap2/unmapped_self_aligning"

mkdir -p "$OUT_DIR"

for READ_FILE in "$READS_DIR"/*.fastq; do
    # When nothing matches, the pattern stays literal; skip it instead of
    # handing minimap2 a file that does not exist.
    [ -e "$READ_FILE" ] || continue
    BASENAME=$(basename "$READ_FILE" .fastq)
    "$minimap2" -x ava-ont --frag=yes -a "$READ_FILE" "$READ_FILE" > "$OUT_DIR/${BASENAME}.sam"
done
```

In [7]:
# SAM file written by the self-alignment step for the unmapped reads.
unmapped_seq_fname = "/v/projects/nanopore/balazs/data/new_dataset/thymine_250417/minimap2/unmapped_self_aligning/unmapped_sequences.sam"

# The optional filters (min_MAPQ, min_length, max_length, max_deletion, max_insertion,
# max_indel_frequency) are deliberately not passed here.
# Flags possible in the dataset: [0,4,16] == [forward, unmapped, reverse].
unmapped_seq_sam_data = read_sam(
    unmapped_seq_fname,
    verbose=True,
    has_movetable=False,
    possible_sam_flags={0,4,16},
)

Loading files: 100%|██████████| 1/1 [02:34<00:00, 154.13s/file]

Number of reads stored: 521242
Number of reads dropped: 2503818





```python
# One-off conversion: join each read's base array into a plain string and cache the
# result as an object-dtype .npy file so it can be reloaded without re-parsing the SAM.
reads = np.array(
    [''.join(read['sequence'].astype(str)) for read in tqdm(unmapped_seq_sam_data)],
    dtype=object,
)
np.save('/v/projects/nanopore/balazs/data/new_dataset/thymine_250417/minimap2/unmapped_self_aligning/unmapped_sequences.npy', reads)
```

In [8]:
# allow_pickle=True is required to load object-dtype arrays, which NumPy stores via pickle.
# Only load files produced by this project: unpickling can execute arbitrary code.
reads = np.load(
    '/v/projects/nanopore/balazs/data/new_dataset/thymine_250417/minimap2/unmapped_self_aligning/unmapped_sequences.npy',
    allow_pickle=True,
)

In [18]:
# Thin the reads to every 600th entry and write them as FASTA records
# named unmapped_seq0, unmapped_seq1, ... (numbered after thinning).
with open('/v/projects/nanopore/balazs/data/new_dataset/thymine_250417/minimap2/unmapped_self_aligning/unmapped_sequneces_smaller.fasta', 'w') as fasta:
    fasta.writelines(
        f">unmapped_seq{index}\n{read}\n"
        for index, read in enumerate(reads[::600])
    )

In [19]:
# Write the first 300 reads as FASTA records named unmapped_seq0 ... unmapped_seq299
# (fewer if the array holds fewer reads).
with open('/v/projects/nanopore/balazs/data/new_dataset/thymine_250417/minimap2/unmapped_self_aligning/unmapped_sequneces_first300.fasta', 'w') as fasta:
    fasta.writelines(
        f">unmapped_seq{index}\n{read}\n"
        for index, read in enumerate(reads[:300])
    )

In [23]:
# Print the sequence of every fifth record at indices 0, 5, ..., 40,
# each followed by a blank line.
for i in range(0, 41, 5):
    bases = unmapped_seq_sam_data[i]['sequence']
    print(i, '-', ''.join(bases.astype(str)), end='\n\n')

0 - TAAGGGAATAGAGCCATACGGAAATGTTGAATACTTCATACTCTATTTTGAATATTATTGAAGCATTTATCAGGGTTATCTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCGAAAAGTGCCACCTCATAGTCGCTAGCTGTACAAAAAAGCAGGCTTTAAAGGAACCAATTCAGTCGACTGGATCCTCTTGTGGAAAGGACGAAACGCCGTGTACTGCTCCAGCATAGCTCTTAAACAAGCGGGCCGCTCGAGGTACCTCTCTACATATGACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTTACCATAGGCTCCGCCCCCTCCGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAGTTCCGACGGACTATCAAGATTGCCCGGCGTTTCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCAGCAATTGGTT

5 - TGTTTTGGCCTATTACTGGACCGTTGCGTATTGCTGACAGGTATCCGGTAAGCGGCAGGGTCGGAACAGGAGAGCGCACAGAGGGAGCTTCCGGGGGAAACGCTGGTATCTTTATAGTCCTGTCGGGTTTCGCCACCTCTGACTGAGCGTCGATTTTTGTGATGCTCGTCAGGGGGGCGGAGCCTATGGAAAACGCCAGCAACGCAGCCTCGGTTCCTGGCCTTTTGCTGGCCTTTTGCTCACATGTCATATGTAGAGAGGTACCTCGAGCGGCCCAGACTTCTTTAAGAGCTATGCTGGAAACAGTACACGGTGTTTCGTCCTTTCCACAGAGGATCCAGTCGACTGATTTGGTTCCTTAAAGCCTGCTTTTACAGCTAGCGACGTCAGGTGGCACTTTTGGGGAAATGTGCGCGGAACCCCTATTTGTTTATTTTTCTAAATACA