# [SAM file format](https://www.metagenomics.wiki/tools/samtools/bam-sam-file-format)
```python
%%bash
# Print the help text of the samtools binary found on PATH.
samtools --help
```

# Minimap2 - [Manual Page](https://lh3.github.io/minimap2/minimap2.html)

```python
%%bash
# Print the help text of the locally installed minimap2 binary.
/v/scratch/tools/minimap2/minimap2 --help
```

---
# Create SAM files from `timin0421` data

In [None]:
%%bash
# Name-sort (samtools sort -n) every *.bam in BAM_FILES and write the result,
# under the same file name, into the temporary directory SORTED_BAM_FILES.
#
# Fail fast: a later cell deletes the temporary directories, so a sort that
# failed silently would otherwise go unnoticed.
set -euo pipefail

BAM_FILES="/v/projects/nanopore/agnes/signal_visualization/data/timin0421/pass/"
SORTED_BAM_FILES="/v/projects/nanopore/balazs/data/thymine_0421/temp_bam/"
mkdir -p -- "$SORTED_BAM_FILES"

for BAM in "${BAM_FILES}"*.bam; do
    # Without nullglob an unmatched pattern is kept as a literal string;
    # skip it instead of handing a non-existent "*.bam" to samtools.
    [[ -e "$BAM" ]] || continue
    BAM_BASE=$(basename -- "$BAM" .bam)
    # Options go before the input file, as in the documented usage line.
    samtools sort -n -o "${SORTED_BAM_FILES}${BAM_BASE}.bam" "$BAM"
done

In [None]:
import os
import pysam
from glob import glob

def filename_from_path(path):
    """Return the last '/'-separated component of ``path``.

    A string without any '/' is returned unchanged; a string that ends in
    '/' yields the empty string.
    """
    _, _, last_component = path.rpartition('/')
    return last_component

# Convert every file in the temporary BAM directory into a FASTQ file that
# contains only the unmapped reads, with each read's BAM tags appended to
# the FASTQ header line.
bam_files = glob("/v/projects/nanopore/balazs/data/thymine_0421/temp_bam/*")
fastq_path = "/v/projects/nanopore/balazs/data/thymine_0421/temp_fasq/"
# exist_ok=True so that re-running the cell after a partial run does not raise.
os.makedirs(fastq_path, exist_ok=True)

for bam in bam_files:
    # Swap the input extension for ".fastq" (e.g. "x.bam" -> "x.fastq")
    # without assuming the extension is exactly three characters long.
    fastq_name = os.path.splitext(filename_from_path(bam))[0] + ".fastq"

    # check_sq=False: do not require @SQ header lines when opening the file
    # (presumably because the input is unaligned basecaller output -- confirm).
    # Both handles are context-managed so they are closed even on error.
    with pysam.AlignmentFile(bam, "rb", check_sq=False) as bam_file:
        with open(fastq_path + fastq_name, "w") as fq_out:
            for read in bam_file:
                if not read.is_unmapped:
                    continue
                # get_tags() returns a list of (tag, value) tuples.
                tags = read.get_tags()
                # NOTE(review): tags are emitted as "TAG:VALUE", without the
                # SAM type field ("TAG:TYPE:VALUE"). The alignment cell runs
                # minimap2 with -y, which copies this comment into the SAM
                # output -- confirm that downstream tools accept this form.
                metadata = "\t".join(f"{tag}:{value}" for tag, value in tags)
                # Four-line FASTQ record: header, sequence, separator, quality.
                fq_out.write(f"@{read.query_name} {metadata}\n")
                fq_out.write(f"{read.query_sequence}\n")
                fq_out.write("+\n")
                fq_out.write(f"{read.qual}\n")

In [None]:
%%bash
# Align every *.fastq in FASTQ_FILES against REF_SEQUENCE with minimap2 and
# write one SAM file per input as ${SAM_PATH}mapped_<name>.sam.
#
# Fail fast: a later cell deletes the FASTQ directory, so an alignment that
# failed silently would otherwise leave a truncated SAM and no input to retry.
set -euo pipefail

minimap2=/v/scratch/tools/minimap2/minimap2

REF_SEQUENCE="/v/volumes/nanopore/ref/p300_ID3.fasta"
FASTQ_FILES="/v/projects/nanopore/balazs/data/thymine_0421/temp_fasq/"
SAM_PATH="/v/projects/nanopore/balazs/data/thymine_0421/"

for FASTQ in "${FASTQ_FILES}"*.fastq; do
    # Without nullglob an unmatched pattern is kept as a literal string;
    # skip it so no "mapped_*.sam" file is created from a non-existent input.
    [[ -e "$FASTQ" ]] || continue
    FASTQ_BASE=$(basename -- "$FASTQ" .fastq)
    # -a: SAM output, -x map-ont: Nanopore preset, -y: copy the FASTQ header
    # comment into the output, -t 8: eight threads.
    "$minimap2" -ax map-ont -y -t 8 "$REF_SEQUENCE" "$FASTQ" > "${SAM_PATH}mapped_${FASTQ_BASE}.sam"
done

In [None]:
%%bash
# Remove the two temporary directories (sorted BAMs first, then FASTQs).
for tmp_dir in \
    "/v/projects/nanopore/balazs/data/thymine_0421/temp_bam/" \
    "/v/projects/nanopore/balazs/data/thymine_0421/temp_fasq/"
do
    rm -r "$tmp_dir"
done

In [1]:
%%bash
# Show the first four lines of the mapped SAM file.
head -n 4 "/v/projects/nanopore/balazs/data/thymine_0421/mapped_basecalls.sam"

@HD	VN:1.6	SO:unsorted	GO:query
@SQ	SN:p300_ID3	LN:1164
@PG	ID:minimap2	PN:minimap2	VN:2.28-r1209	CL:minimap2 -ax map-ont -y -t 8 /v/volumes/nanopore/ref/p300_ID3.fasta /v/projects/nanopore/balazs/data/thymine_0421/fasq/basecalls.fastq
000b042c-f087-49bf-8ed3-e0b385e3bb5b	0	p300_ID3	4	60	32S48M1D54M2D33M1D54M1D4M1D6M4D60M1D53M5D8M1I21M2D40M1D13M3D4M2I40M1I2M4D4M2D107M1D79M1D4M2D9M1D35M2D153M2I21M1I24M1I77M1D43M1D40M3D24M1D16M1I39M10S	*	0	0	AATGTACTTCGTTCCAGTTACGTATTGCTCTTATGCTACCAAATGCTGCAGGCATGGTTCCAGTTTCCATGAATCCAGGGCTAACATGGGACAGCCGCAGCCAGGAATGACTTCTAATGGCCCTCTACCTGACCAGTATGATCCGTGGCAGTGTGCCAAGCCAGATGTTACTCGAATAACTCCACAATCTGGTTTGAATCAATTTGGCCAGATGAGCATGGCCAGCCCCTAACCCCGGCAAGCCCCTCCTCTTCAGCACCATGGACGATTGGCTCAACCTGGAGCTCTCAACCGCCTATGGGCTATGGGCCTCGTATGCAACAGCCTTCCAACCAGGGCCAGTTCTCAGACTCCAGTTCCCATCACAGGGAATGGGCAACAAATATCCCTTTGGCTCCGTCCAGCGGTCAAGCTCAGTGTCTCAAGCTTTGTCTCTAGTTCTTCCTGCCCGGTGAACTCTCCTATAATGCCCCTCACTCAGGAGCCACATTCACTGTCCCCAGCTTCCTCAACCAGCTCTTCATCAGAATTCACCCTCGCCTGTACCTAGTCGTAC

# Create SAM files from `uracil0504` data

In [None]:
%%bash
# Name-sort (samtools sort -n) every *.bam in BAM_FILES and write the result,
# under the same file name, into the temporary directory SORTED_BAM_FILES.
#
# Fail fast: a later cell deletes the temporary directories, so a sort that
# failed silently would otherwise go unnoticed.
set -euo pipefail

BAM_FILES="/v/projects/nanopore/agnes/signal_visualization/uracil0504/guppy/pass/"
SORTED_BAM_FILES='/v/projects/nanopore/balazs/data/uracil_0504/temp_bam/'
mkdir -p -- "$SORTED_BAM_FILES"

for BAM in "${BAM_FILES}"*.bam; do
    # Without nullglob an unmatched pattern is kept as a literal string;
    # skip it instead of handing a non-existent "*.bam" to samtools.
    [[ -e "$BAM" ]] || continue
    BAM_BASE=$(basename -- "$BAM" .bam)
    # Options go before the input file, as in the documented usage line.
    samtools sort -n -o "${SORTED_BAM_FILES}${BAM_BASE}.bam" "$BAM"
done

In [None]:
import os
import pysam
from glob import glob

def filename_from_path(path):
    """Return the last '/'-separated component of ``path``.

    A string without any '/' is returned unchanged; a string that ends in
    '/' yields the empty string.
    """
    _, _, last_component = path.rpartition('/')
    return last_component

# Convert every file in the temporary BAM directory into a FASTQ file that
# contains only the unmapped reads, with each read's BAM tags appended to
# the FASTQ header line.
bam_files = glob("/v/projects/nanopore/balazs/data/uracil_0504/temp_bam/*")
fastq_path = "/v/projects/nanopore/balazs/data/uracil_0504/temp_fasq/"
# exist_ok=True so that re-running the cell after a partial run does not raise.
os.makedirs(fastq_path, exist_ok=True)

for bam in bam_files:
    # Swap the input extension for ".fastq" (e.g. "x.bam" -> "x.fastq")
    # without assuming the extension is exactly three characters long.
    fastq_name = os.path.splitext(filename_from_path(bam))[0] + ".fastq"

    # check_sq=False: do not require @SQ header lines when opening the file
    # (presumably because the input is unaligned basecaller output -- confirm).
    # Both handles are context-managed so they are closed even on error.
    with pysam.AlignmentFile(bam, "rb", check_sq=False) as bam_file:
        with open(fastq_path + fastq_name, "w") as fq_out:
            for read in bam_file:
                if not read.is_unmapped:
                    continue
                # get_tags() returns a list of (tag, value) tuples.
                tags = read.get_tags()
                # NOTE(review): tags are emitted as "TAG:VALUE", without the
                # SAM type field ("TAG:TYPE:VALUE"). The alignment cell runs
                # minimap2 with -y, which copies this comment into the SAM
                # output -- confirm that downstream tools accept this form.
                metadata = "\t".join(f"{tag}:{value}" for tag, value in tags)
                # Four-line FASTQ record: header, sequence, separator, quality.
                fq_out.write(f"@{read.query_name} {metadata}\n")
                fq_out.write(f"{read.query_sequence}\n")
                fq_out.write("+\n")
                fq_out.write(f"{read.qual}\n")

In [None]:
%%bash
# Align every *.fastq in FASTQ_FILES against REF_SEQUENCE with minimap2 and
# write one SAM file per input as ${SAM_PATH}mapped_<name>.sam.
#
# Fail fast: a later cell deletes the FASTQ directory, so an alignment that
# failed silently would otherwise leave a truncated SAM and no input to retry.
set -euo pipefail

minimap2=/v/scratch/tools/minimap2/minimap2

REF_SEQUENCE="/v/volumes/nanopore/ref/p300_ID3.fasta"
FASTQ_FILES="/v/projects/nanopore/balazs/data/uracil_0504/temp_fasq/"
SAM_PATH="/v/projects/nanopore/balazs/data/uracil_0504/"

for FASTQ in "${FASTQ_FILES}"*.fastq; do
    # Without nullglob an unmatched pattern is kept as a literal string;
    # skip it so no "mapped_*.sam" file is created from a non-existent input.
    [[ -e "$FASTQ" ]] || continue
    FASTQ_BASE=$(basename -- "$FASTQ" .fastq)
    # -a: SAM output, -x map-ont: Nanopore preset, -y: copy the FASTQ header
    # comment into the output, -t 8: eight threads.
    "$minimap2" -ax map-ont -y -t 8 "$REF_SEQUENCE" "$FASTQ" > "${SAM_PATH}mapped_${FASTQ_BASE}.sam"
done

In [None]:
%%bash
# Remove the two temporary directories (sorted BAMs first, then FASTQs).
for tmp_dir in \
    '/v/projects/nanopore/balazs/data/uracil_0504/temp_bam/' \
    "/v/projects/nanopore/balazs/data/uracil_0504/temp_fasq/"
do
    rm -r "$tmp_dir"
done

In [None]:
%%bash
# Show the first four lines of the mapped SAM file.
head -n 4 "/v/projects/nanopore/balazs/data/uracil_0504/mapped_basecalls.sam"