<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/Supp_Fig_2/1_align_zebov_subset_kraken2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Align a subset of the macaque PBMC Zaire ebolavirus (ZEBOV) dataset using Kraken2 (standard nucleotide alignment) and generate BAM files to visualize the alignment

In [None]:
# Install Kraken2 v1.0.2 (defining version for reproducibility)
!git clone https://github.com/DerrickWood/kraken2.git --branch v2.1.2
!cd kraken2 && ./install_kraken2.sh ./

kraken2 = "/content/kraken2/kraken2"
kraken2_build = "/content/kraken2/kraken2-build"

In [None]:
# Number of threads used for alignment
# (passed to kraken2-build, Kraken2 and Bowtie2 in the cells below)
threads = 2

### Download raw sequencing file and subset to first 100,000,000 reads


In [None]:
!pip install -q ffq
import json

out = "data.json"
!ffq SRR12698539 --ftp -o $out

f = open(out)
data = json.load(f)
f.close()

print(len(data))

for dataset in data:
    url = dataset["url"]
    !curl -O $url

In [None]:
fastq = "SRR12698539_2.fastq.gz"
test_fastq_gz = "SRR12698539_2_short.fastq.gz"
test_fastq = "SRR12698539_2_short.fastq"

# Create new file keeping only first X reads
!zcat $fastq | head -400000000 > $test_fastq_gz

# Unzip file for use with Kraken2
!gunzip $test_fastq_gz

### Run Kraken2

Build the Kraken2 viral index and add the ZEBOV genome to the standard viral reference library (otherwise ZEBOV will not be detected). The Zaire ebolavirus (ZEBOV) genome ViralProj14703 (linked to NC_002549.1) was downloaded from https://www.ncbi.nlm.nih.gov/data-hub/genome/?taxon=186538.

In [None]:
# Folder of the custom Kraken2 database (relative to the working directory);
# it is passed as --db to kraken2-build and kraken2.
# NOTE(review): this is not inside the "kraken2" folder created by the git clone — confirm the location is intended
krakendb = "kraken2-2.1.2/krakendb"

In [None]:
# Download ZEBOV genome (GCA_000848505.1, ViralProj14703) from the LSCHWCP_2023 GitHub repository
# into the current working directory
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/Supp_Fig_2/GCA_000848505.1_ViralProj14703_genomic.fna

In [None]:
# Download the taxonomy into the database folder
!$kraken2_build --db $krakendb --download-taxonomy

# Download the standard viral reference library.
# NOTE: the fix described at https://github.com/DerrickWood/kraken2/issues/292#issuecomment-1206837801
# is NOT applied by this notebook; apply it manually first so the following line works
!$kraken2_build --db $krakendb --download-library viral

# Add ZEBOV genome to the library (otherwise ZEBOV will not be detected)
!$kraken2_build --db $krakendb --add-to-library GCA_000848505.1_ViralProj14703_genomic.fna

# Build the database using the configured number of threads
!$kraken2_build --db $krakendb --build --threads $threads

Align sequencing reads to custom Kraken2 reference index:

In [None]:
# Parent folder for all results; the cells below write into its "kraken" subfolder
outfolder = "zebov_subset_alignment"

In [None]:
!$kraken2 \
    --db $krakendb \
    --threads $threads \
    --minimum-hit-groups 3 \
    --report-minimizer-data \
    --report $outfolder/kraken/SRR12698503.k2report \
    $test_fastq > $outfolder/kraken/SRR12698503.kraken2

### Extract Kraken reads
The extract_kraken_reads.py script (from the KrakenTools GitHub repo) extracts reads that matched a particular species, identified by the taxonomy ID that is provided with the -t parameter:

In [None]:
# Define ZEBOV taxonomy ID (passed to extract_kraken_reads.py via -t).
# NOTE(review): the extracted-reads file name contains "tid10298", which differs from this ID — confirm the label
ebov_tid = 186538

In [None]:
# Download script
!curl -O https://raw.githubusercontent.com/jenniferlu717/KrakenTools/master/extract_kraken_reads.py

!/usr/bin/time -v \
    python extract_kraken_reads.py \
    -k $outfolder/kraken/SRR12698503.kraken2 \
    --include-children \
    -s $test_fastq \
    -t $ebov_tid \
    -r $outfolder/kraken/SRR12698503.k2report \
    -o $outfolder/kraken/SRR12698503_EBOV.tid10298.1.fa

### Align extracted reads to the ZEBOV genome using Bowtie2

In [None]:
# Install bowtie2
!wget https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.2.5/bowtie2-2.2.5-linux-x86_64.zip
!unzip bowtie2-2.2.5-linux-x86_64.zip
bowtie2_build = "bowtie2-2.2.5-linux-x86_64/bowtie2-build"
bowtie2 = "bowtie2-2.2.5-linux-x86_64/bowtie2"

Generate Bowtie2 genome index:

In [None]:
# Folder that holds the Bowtie2 index files (written and read with the "ebov" basename below)
b_index = "b_index"

In [None]:
!$bowtie2_build \
    GCA_000848505.1_ViralProj14703_genomic.fna \
    $b_index/ebov

Align extracted reads to ZEBOV genome:

In [None]:
!$bowtie2 \
    -x $b_index/ebov \
    -f -p $threads \
    -U $outfolder/kraken/SRR12698503_EBOV.tid10298.1.fa \
    -S $outfolder/kraken/SRR12698503_EBOV_aligned.sam

### Use SAMtools to convert the SAM files to sorted BAM files

In [None]:
# Install SAMtools
!wget https://github.com/samtools/samtools/releases/download/1.6/samtools-1.6.tar.bz2
!tar -vxjf samtools-1.6.tar.bz2
!cd samtools-1.6; make
samtools = "samtools-1.6/samtools"

gcc -g -Wall -O2 -I. -Ihtslib-1.6  -c -o bam_tview_curses.o bam_tview_curses.c
[01m[Kbam_tview_curses.c:[m[K In function ‘[01m[Kcurses_mvprintw[m[K’:
   82 |     [01;35m[Kmvprintw[m[K(y,x,str);
      |     [01;35m[K^~~~~~~~[m[K
gcc -g -Wall -O2 -I. -Ihtslib-1.6  -c -o bam_tview_html.o bam_tview_html.c
gcc -g -Wall -O2 -I. -Ihtslib-1.6  -c -o bam_lpileup.o bam_lpileup.c
gcc -g -Wall -O2 -I. -Ihtslib-1.6  -c -o bam_quickcheck.o bam_quickcheck.c
gcc -g -Wall -O2 -I. -Ihtslib-1.6  -c -o bam_addrprg.o bam_addrprg.c
gcc -g -Wall -O2 -I. -Ihtslib-1.6  -c -o bam_markdup.o bam_markdup.c
gcc -g -Wall -O2 -I. -Ihtslib-1.6  -c -o bam_aux.o bam_aux.c
gcc -g -Wall -O2 -I. -Ihtslib-1.6  -c -o bam.o bam.c
gcc -g -Wall -O2 -I. -Ihtslib-1.6  -c -o bam_import.o bam_import.c
gcc -g -Wall -O2 -I. -Ihtslib-1.6  -c -o sam.o sam.c
gcc -g -Wall -O2 -I. -Ihtslib-1.6  -c -o sam_header.o sam_header.c
gcc -g -Wall -O2 -I. -Ihtslib-1.6  -c -o bam_plbuf.o bam_plbuf.c
ar -csru libbam.a bam_aux.o bam.o

In [None]:
!$samtools view \
    -bS -F4 $outfolder/kraken/SRR12698503_EBOV_aligned.sam \
    > $outfolder/kraken/SRR12698503_EBOV_aligned.bam

In [None]:
!$samtools sort \
    $outfolder/kraken/SRR12698503_EBOV_aligned.bam \
    -o $outfolder/kraken/SRR12698503_EBOV_sorted.bam

In [None]:
!$samtools index \
    $outfolder/kraken/SRR12698503_EBOV_sorted.bam