<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/align_macaque_PBMC_data/8_virus_bwa/1_remove_host_reads_with_bwa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install [bwa](https://github.com/lh3/bwa)

In [None]:
# Fetch the bwa sources and compile them inside the cloned directory
!git clone https://github.com/lh3/bwa.git
!make -C bwa
# Relative path to the freshly built bwa executable
bwa = "bwa/bwa"

# Install [seqtk](https://github.com/lh3/seqtk)

In [None]:
# Fetch the seqtk sources and compile them inside the cloned directory
!git clone https://github.com/lh3/seqtk.git
!make -C seqtk
# Relative path to the freshly built seqtk executable
seqtk = "seqtk/seqtk"

# Install [Samtools](https://www.htslib.org/)

In [None]:
# Download the samtools 1.6 release tarball, unpack it and build it in place
!wget https://github.com/samtools/samtools/releases/download/1.6/samtools-1.6.tar.bz2
!tar -vxjf samtools-1.6.tar.bz2
!cd samtools-1.6; make
# The tarball unpacks into "samtools-1.6/" (the directory the build above runs in),
# so that is where the compiled executable is located.
samtools = "samtools-1.6/samtools"

# Create bwa index using macaque and dog genomes

In [None]:
# gget is not installed by any other cell of this notebook
!pip install -q gget

# Download the macaque and dog reference genomes (Ensembl release 110, DNA only)
!gget ref -w dna -r 110 -d macaca_mulatta
!gget ref -w dna -r 110 -d canis_lupus_familiaris

# Concatenate both genomes into a single reference file.
# NOTE: the inputs are gzip files and `cat` does not decompress them, so the
# combined file is still gzip-compressed despite its ".fa" name.
# NOTE(review): bwa is expected to read gzip-compressed FASTA transparently - confirm.
combined_genomes = "combined_genomes.dna.toplevel.fa"
!cat "Macaca_mulatta.Mmul_10.dna.toplevel.fa.gz" "Canis_lupus_familiaris.ROS_Cfam_1.0.dna.toplevel.fa.gz" > $combined_genomes

In [None]:
%%time
# Build the bwa index for the combined macaque + dog reference
!{bwa} index {combined_genomes}

# Align sequences

In [None]:
!pip install -q ffq
import json
import os
import glob
from tqdm import tqdm

# Layout passed as `bar_format` to the tqdm progress bars in this notebook:
# description, bar, item counter, then elapsed and remaining time.
TQDM_BAR_FORMAT = (
    "{l_bar}{bar}| "
    "{n_fmt}/{total_fmt} "
    "[elapsed: {elapsed} remaining: {remaining}]"
)

Download raw sequencing data:

In [None]:
# Use ffq to get all FTP download links and store them as JSON
out = "GSE158390_data.json"
!ffq GSE158390 --ftp -o $out

# Load the ffq output; the context manager closes the file even if parsing fails
with open(out) as f:
    data = json.load(f)

# Number of entries returned by ffq
print(len(data))

# # Download the complete dataset (106 paired fastqs containing a total of 30 billion reads)
# for dataset in data:
#     url = dataset["url"]
#     !curl -O $url

# Download only two datasets to demonstrate this notebook
for dataset in data[:2]:
    url = dataset["url"]
    # -O saves the file under its remote name in the current directory
    !curl -O $url

Align data to the combined macaque and dog genome using bwa:

In [None]:
bwa_outfolder = "bwa_unmapped_reads"
# The filtered FASTQ files are written to "<bwa_outfolder>/raw" by the cells below,
# so create both levels here. exist_ok=True keeps this cell safe to re-run.
os.makedirs(f"{bwa_outfolder}/raw", exist_ok=True)

# Files ending in "_2.fastq.gz" in the working directory; these are the ones aligned
# with bwa. Sorted so the processing order is reproducible.
filenames = sorted(glob.glob("*_2.fastq.gz"))

len(filenames)

In [None]:
# Number of threads to use during alignment
# (passed to `bwa mem -t` and `samtools view --threads` in the alignment cell)
threads = 8

In [None]:
%%time
# For every "_2" FASTQ file: align it to the combined reference with bwa, list the
# names of the reads that did NOT map, and write a new FASTQ holding only those reads.

# The filtered FASTQ files go to "<bwa_outfolder>/raw"; make sure it exists, since a
# shell redirect cannot create missing directories.
raw_outfolder = f"{bwa_outfolder}/raw"
os.makedirs(raw_outfolder, exist_ok=True)

# Use the samtools binary built in this notebook when it exists at the configured
# path; otherwise fall back to whatever `samtools` is available on PATH.
samtools_bin = samtools if os.path.exists(samtools) else "samtools"

# Samples that already have an alignment file from a previous run
processed_srrs = {
    os.path.basename(path).split(".sam.gz")[0]
    for path in glob.glob(f"{bwa_outfolder}/*.sam.gz")
}

with tqdm(total=len(filenames), bar_format=TQDM_BAR_FORMAT) as pbar:
    pbar.set_description("Files processed")
    for file in filenames:
        # File name without directory and without the ".fastq.gz" extension
        sample = os.path.basename(file).split(".fastq.gz")[0]

        # Check if file was already processed
        if sample not in processed_srrs:
            print(sample)
            bwa_out = f"{bwa_outfolder}/{sample}.sam.gz"
            sam_out = f"{bwa_outfolder}/{sample}_unmapped.txt"
            new_file = f"{raw_outfolder}/{sample}.fastq"

            # Align to the reference that was indexed above (`combined_genomes`)
            # and store the SAM output gzip-compressed
            !$bwa mem \
                -t $threads \
                $combined_genomes \
                $file | gzip -3 > $bwa_out

            # Get names of all UNMAPPED reads using samtools
            # (-f 4 keeps records with the "unmapped" flag; column 1 is the read name)
            !$samtools_bin view \
                --threads $threads \
                -f 4 \
                $bwa_out | cut -f 1 > $sam_out

            # Create new fastq file including only UNMAPPED sequences using seqtk
            !$seqtk subseq $file $sam_out > $new_file

        # Advance for skipped files as well so the bar reaches its total
        pbar.update(n=1)

Also remove the reads that mapped to the host genomes from the corresponding barcode files, keeping only the unmapped reads:

In [None]:
%%time
# For every "_1" (barcode) FASTQ file, keep only the reads whose names are listed in
# the unmapped-read list written for the matching "_2" file.
barcode_files = sorted(glob.glob("*_1.fastq.gz"))

# Output directory for the filtered FASTQ files; a shell redirect cannot create it
raw_outfolder = f"{bwa_outfolder}/raw"
os.makedirs(raw_outfolder, exist_ok=True)

# The bar total must match the list that is actually iterated
with tqdm(total=len(barcode_files), bar_format=TQDM_BAR_FORMAT) as pbar:
    pbar.set_description("Barcode files processed")

    for bc_file in barcode_files:
        # "<run>_1.fastq.gz" -> "<run>"; strip only the trailing suffix so that a
        # "_1" elsewhere in the name is left untouched
        run = os.path.basename(bc_file)[: -len("_1.fastq.gz")]
        sam_out = f"{bwa_outfolder}/{run}_2_unmapped.txt"
        new_bc_file = f"{raw_outfolder}/{run}_1.fastq"

        if os.path.exists(sam_out):
            # Create new fastq file including only unmapped sequences
            !$seqtk subseq $bc_file $sam_out > $new_bc_file
        else:
            # No read-name list means the matching "_2" file was not processed
            print(f"Skipping {bc_file}: {sam_out} not found")

        pbar.update(n=1)