<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/align_macaque_PBMC_data/6_virus_host_captured/1_align_host_captured.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Capture host reads before aligning sequencing reads to PalmDB with kallisto translated search

In [None]:
# Install kallisto from source
!git clone -q https://github.com/pachterlab/kallisto.git
!cd kallisto && mkdir build && cd build && cmake .. && make

# Install bustools from source
!git clone -q https://github.com/BUStools/bustools.git
!cd bustools && mkdir build && cd build && cmake .. && make

# Define paths to kallisto and bustools binaries
kallisto = "/content/kallisto/build/src/kallisto"
bustools = "/content/bustools/build/src/bustools"

In [None]:
# Download the customized transcripts to gene mapping
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_clustered_t2g.txt
# Download the RdRP amino acid sequences
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_rdrp_seqs.fa

# Local names of the two files downloaded above:
# virus_fasta is the input for the virus index, virus_t2g is passed to bustools count (-g)
virus_fasta = "palmdb_rdrp_seqs.fa"
virus_t2g = "palmdb_clustered_t2g.txt"

In [None]:
# Number of threads to use in alignment
# (passed as -t to kb ref, kallisto index, kallisto bus and bustools sort)
threads = 2

### Download raw sequencing data

In [None]:
!pip install -q ffq
import json

out = "GSE158390_data.json"

# # Download the complete dataset (106 paired fastqs containing a total of 30 billion reads)
# !ffq GSE158390 --ftp -o $out

# Download only two fastq pairs to demonstrate this notebook
!ffq SRR12698499 SRR12698500 --ftp -o $out

f = open(out)
data = json.load(f)
f.close()

print(len(data))

for dataset in data:
    url = dataset["url"]
    !curl -O $url

Generate sample batch file to align all fastq files simultaneously:

In [None]:
import glob

# Compressed fastq files in the working directory, in sorted order
fastqs = sorted(glob.glob("*.fastq.gz"))

# Get sample names: the part of the file name before the first underscore.
# The de-duplicated names are sorted so the order of the batch file is the
# same on every run; iterating over a bare set gives a run-dependent order.
samples = sorted({fastq.split("_")[0] for fastq in fastqs})

# Generate sample batch file: one tab-separated line per sample
# (sample name, first fastq, second fastq)
sample_batch_file = "batch.txt"
with open(sample_batch_file, "w") as batch_file:
    for sample in samples:
        fastq1 = sample + "_1.fastq.gz"
        fastq2 = sample + "_2.fastq.gz"
        batch_file.write(sample + "\t" + fastq1 + "\t" + fastq2 + "\n")

### Align to host with `-n` flag to keep track of aligned reads:

In [None]:
# Download macaque and dog reference genomes and gtf files
!pip install -q gget
!gget ref -w dna,gtf -r 110 -d macaca_mulatta
!gget ref -w dna,gtf -r 110 -d canis_lupus_familiaris

# Names of the downloaded files (the "110" in the gtf names matches the `-r 110` request above).
# These are the genome / annotation inputs of the host reference built with `kb ref`.
macaque_fasta = "Macaca_mulatta.Mmul_10.dna.toplevel.fa.gz"
macaque_gtf = "Macaca_mulatta.Mmul_10.110.gtf.gz"
canine_fasta = "Canis_lupus_familiaris.ROS_Cfam_1.0.dna.toplevel.fa.gz"
canine_gtf = "Canis_lupus_familiaris.ROS_Cfam_1.0.110.gtf.gz"

In [None]:
index = "host_index.idx"
t2g = "host_t2g.txt"
fa = "host_transcriptome.fa"

!kb ref \
    -t $threads \
    -i $index \
    -g $t2g \
    -f1 $fa \
    $canine_fasta,$macaque_fasta $canine_gtf,$macaque_gtf

In [None]:
# Output directory of the host alignment (passed to kallisto bus as -o)
host_out_folder = "virus_host_capture_alignment_results/host"

In [None]:
%%time
# Align all samples listed in the batch file (-B) to the host index in a single run.
# -n is the flag that keeps track of the aligned reads; the host/virus split further
# down relies on it.
# NOTE(review): -x 0,0,12:0,12,20:1,0,0 presumably takes the barcode (positions 0-12)
# and the UMI (positions 12-20) from the first fastq and the biological read from the
# second one -- confirm against the kallisto manual for this chemistry.
!$kallisto bus \
        -n \
        -i $index \
        -o $host_out_folder \
        -t $threads \
        -B $sample_batch_file \
        --batch-barcodes \
        -x 0,0,12:0,12,20:1,0,0

### Align to optimized PalmDB with `-n` flag to keep track of aligned reads:

In [None]:
# Output directory of the virus (PalmDB) alignment; the per-sample bus files and
# count matrices produced further down are written here as well
out_folder = "virus_host_capture_alignment_results/virus"

In [None]:
# Generate virus reference index
# The reference (virus_fasta) holds the RdRP amino acid sequences downloaded earlier,
# hence the --aa flag.
virus_index = "virus_index.idx"
!$kallisto index \
    --aa \
    -t $threads \
    -i $virus_index \
    $virus_fasta

In [None]:
%%time
# Align the same batch of samples to the virus index.
# Same options as the host alignment, plus --aa (the index was built with --aa);
# -n again keeps track of the aligned reads so the two alignments can be compared.
!$kallisto bus \
      -n \
      --aa \
      -i $virus_index \
      -o $out_folder \
      -t $threads \
      -B $sample_batch_file \
      --batch-barcodes \
      -x 0,0,12:0,12,20:1,0,0

### Split virus matrix into reads seen in host and those not seen in host
This has to be done for each batch (SRR library) separately, since the read numbers recorded by the `-n` flag restart for each batch:

In [None]:
from tqdm import tqdm

In [None]:
# Read the sample barcodes written by the host alignment, one entry per line
sb_file = host_out_folder + "/matrix.sample.barcodes"

with open(sb_file) as barcode_handle:
    sample_barcodes = barcode_handle.read().splitlines()

# Preview of the first entries
sample_barcodes[:10]

In [None]:
%%time
sample_barcodes_star = f"{host_out_folder}/matrix.sample.barcodes.tmp"

TQDM_BAR_FORMAT = (
    "{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]"
)

with tqdm(total=len(sample_barcodes), bar_format=TQDM_BAR_FORMAT) as pbar:
    for sb in sample_barcodes:
        with open(sample_barcodes_star, "w") as sbs_file:
            sbs_file.write(sb + "*")

        print(f"Capture reads for batch {sb} for virus and host alignments...")
        # Capture records for this batch from virus and host bus files
        !$bustools capture \
            $out_folder/output.bus \
            --capture $sample_barcodes_star \
            --barcode \
            -o $out_folder/output_sample_tmp.bus

        !$bustools capture \
            $host_out_folder/output.bus \
            --capture $sample_barcodes_star \
            --barcode \
            -o $host_out_folder/output_sample_tmp.bus

        print(f"Split viral alignments into host/masked for batch {sb}...")
        # Capture records from virus reads also mapped to host
        bus_name = f"output_{sb}_host.bus"
        !$bustools capture \
            $out_folder/output_sample_tmp.bus \
            --capture <($bustools text -pf $host_out_folder/output_sample_tmp.bus | cut -d$'\t' -f5)  \
            --flags \
            -o $out_folder/$bus_name

        # Capture records from virus reads that did not map to host
        bus_name = f"output_{sb}_virus.bus"
        !$bustools capture \
            $out_folder/output_sample_tmp.bus \
            --capture <($bustools text -pf $host_out_folder/output_sample_tmp.bus | cut -d$'\t' -f5)  \
            --complement \
            --flags \
            -o $out_folder/$bus_name

        pbar.update(n=1)

### Clean up cell barcodes using host cell whitelist generated by bustools

In [None]:
# Download cell barcode onlist generated during alignment to host
# (saved as bustools_onlist.txt and passed to `bustools correct -w` below)
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/align_macaque_PBMC_data/bustools_onlist.txt

In [None]:
import os

In [None]:
%%time
# For every sample barcode and for both read sets produced by the split above
# ("host" = virus reads also seen in host, "virus" = virus reads not seen in host):
# correct barcodes -> drop the flag column -> sort -> count per gene.
# The output_temp1/2/3.bus files are scratch files that are overwritten in each iteration.
with tqdm(total=len(sample_barcodes), bar_format=TQDM_BAR_FORMAT) as pbar:
    for sb in sample_barcodes:
        for ent in ["host", "virus"]:
            bus_name = f"output_{sb}_{ent}.bus"

            # Correct barcodes (no need to sort first)
            # This needs to be done before removing the flag column since adding the sample barcodes to the cell barcodes will confuse bustools correct
            !$bustools correct \
                -w bustools_onlist.txt \
                -o $out_folder/output_temp1.bus \
                $out_folder/$bus_name

            # Remove flag column so bustools count does not get confused
            # -a retains the sample barcodes and adds them to the normal cell barcodes
            # (only the first four columns of the text dump are kept and converted back to bus)
            !$bustools text \
                -apf $out_folder/output_temp1.bus | cut -d$'\t' -f1,2,3,4 \
                | $bustools fromtext -o $out_folder/output_temp2.bus -

            # Sort and count
            !$bustools sort \
                -m 4G \
                -t $threads \
                -o $out_folder/output_temp3.bus \
                $out_folder/output_temp2.bus

            # Create the parent directory of the count output for this sample / read set
            os.makedirs(f"{out_folder}/{sb}/{ent}", exist_ok=True)
            # Gene-level counts using the virus transcript-to-gene mapping and the
            # matrix.ec / transcripts.txt files found in the virus alignment folder
            !$bustools count \
                --genecounts \
                -o $out_folder/$sb/$ent/bustools_count/ \
                -g $virus_t2g \
                -e $out_folder/matrix.ec \
                -t $out_folder/transcripts.txt \
                $out_folder/output_temp3.bus

        # The progress bar advances once per sample barcode, after both read sets are done
        pbar.update(n=1)