<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/Supp_Fig_2/2_align_zebov_subset_kallisto_translated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Align a subset of the macaque PBMC Zaire ebolavirus (ZEBOV) dataset using kallisto translated search and generate bam files to visualize the alignment

In [1]:
# Number of threads used for alignment
# (passed as -t to `kallisto index` / `kallisto bus` and as -p to bowtie2 in later cells)
threads = 2

### Download raw sequencing file and subset to first 100,000,000 reads

In [2]:
!pip install -q ffq
import json

out = "data.json"
!ffq SRR12698539 --ftp -o $out

f = open(out)
data = json.load(f)
f.close()

print(len(data))

for dataset in data:
    url = dataset["url"]
    !curl -O $url

[2023-12-13 04:56:29,302]    INFO Parsing run SRR12698539
2
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 6625M  100 6625M    0     0  40.3M      0  0:02:44  0:02:44 --:--:-- 42.3M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 20.6G  100 20.6G    0     0  42.2M      0  0:08:19  0:08:19 --:--:-- 41.2M


In [3]:
fastq = "SRR12698539_2.fastq.gz"
test_fastq = "SRR12698539_2_short.fastq"

# Create new file keeping only first X reads
!zcat $fastq | head -400000000 > $test_fastq

# Align using kallisto translated search

In [4]:
# Install kallisto from source
!git clone -q https://github.com/pachterlab/kallisto.git
!cd kallisto && mkdir build && cd build && cmake .. && make

# Install bustools from source
!git clone -q https://github.com/BUStools/bustools.git
!cd bustools && mkdir build && cd build && cmake .. && make

# Define paths to kallisto and bustools binaries
kallisto = "/content/kallisto/build/src/kallisto"
bustools = "/content/bustools/build/src/bustools"

  Compatibility with CMake < 3.5 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value or use a ...<max> suffix to tell
  CMake that the project does not need compatibility with older versions.

[0m
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Performing Test COMPILER_SUPPORTS_CXX17
-- Performing Test COMPILER_SUPPORTS_CXX17 - Success
[0mshared build[0m
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE  
-- Found ZLIB: /us

Build reference index:

In [5]:
# Download the customized transcripts to gene mapping
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_clustered_t2g.txt
# Download the RdRP amino acid sequences
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_rdrp_seqs.fa

virus_fasta = "palmdb_rdrp_seqs.fa"
virus_t2g = "palmdb_clustered_t2g.txt"

--2023-12-13 05:17:41--  https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_clustered_t2g.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4561689 (4.3M) [text/plain]
Saving to: ‘palmdb_clustered_t2g.txt’


2023-12-13 05:17:41 (46.6 MB/s) - ‘palmdb_clustered_t2g.txt’ saved [4561689/4561689]

--2023-12-13 05:17:41--  https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_rdrp_seqs.fa
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 35361991 (34M) [text/plain]
Saving to: ‘palmdb_rdrp

In [6]:
# Get host genomes and transcriptomes and concatenate them into a single file for host masking
!pip install -q gget
!gget ref -w cdna,dna -r 110 -d canis_lupus_familiaris
!gget ref -w cdna,dna -r 110 -d macaca_mulatta
canine_cdna = "Canis_lupus_familiaris.ROS_Cfam_1.0.cdna.all.fa.gz"
macaque_cdna = "Macaca_mulatta.Mmul_10.cdna.all.fa.gz"
canine_dna = "Canis_lupus_familiaris.ROS_Cfam_1.0.dna.toplevel.fa.gz"
macaque_dna = "Macaca_mulatta.Mmul_10.dna.toplevel.fa.gz"

canine_macaque_fasta = "combined.cdna_dna.all.fa.gz"
!cat $canine_cdna $macaque_cdna $canine_dna $macaque_dna > $canine_macaque_fasta

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.1/43.1 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[?25hWed Dec 13 05:18:07 2023 INFO Fetching reference information for canis_lupus_familiaris from Ensembl release: 110.
{
    "canis_lupus_familiaris": {
        "transcriptome_cdna": {
            "ftp": "http://ftp.ensembl.org/pub/release-110/fasta/canis_lupus_familiaris/cdna/Canis_lupus_familiaris.ROS_Cfam_1.0.cdna.all.fa.gz",
            "ensembl_release": 110,
            "release_date": "2023-04-22",
            "release_time": "16:19",
            "bytes": "26M"
        },
        "genome_dna": {
            "ftp": "http://ftp.ensembl.org/pub/release-110/fasta/canis_lupus_familiaris/dna/Canis_lupus_familiaris.ROS_Cfam_1.0.d

In [7]:
virus_index = "virus_index.idx"

# Generate virus reference index
!$kallisto index \
    --aa \
    -t $threads \
    --d-list=$canine_macaque_fasta \
    -i $virus_index \
    $virus_fasta


[index] --d-list-overhang was set to 3 (with --aa, the d-list overhang must be >= 3)
[build] loading fasta file palmdb_rdrp_seqs.fa
[build] k-mer length: 31
KmerStream::KmerStream(): Start computing k-mer cardinality estimations (1/2)
KmerStream::KmerStream(): Start computing k-mer cardinality estimations (1/2)
KmerStream::KmerStream(): Finished
CompactedDBG::build(): Estimated number of k-mers occurring at least once: 37641510
CompactedDBG::build(): Estimated number of minimizer occurring at least once: 7877811
CompactedDBG::filter(): Processed 87630084 k-mers in 296561 reads
CompactedDBG::filter(): Found 37508894 unique k-mers
CompactedDBG::filter(): Number of blocks in Bloom filter is 257317
CompactedDBG::construct(): Extract approximate unitigs (1/2)
CompactedDBG::construct(): Extract approximate unitigs (2/2)
CompactedDBG::construct(): Closed all input files

CompactedDBG::construct(): Splitting unitigs (1/2)

CompactedDBG::construct(): Splitting unitigs (2/2)
CompactedDBG::const

Align using `-n` flag to keep track of aligned reads:

In [8]:
outfolder = "zebov_subset_alignment"
!mkdir $outfolder

!$kallisto bus \
        -i $virus_index \
        -o $outfolder/kallisto \
        --aa \
        -n \
        -x bulk \
        -t $threads \
        $test_fastq

!$bustools sort --flags -o $outfolder/kallisto/output_sorted.bus $outfolder/kallisto/output.bus

!$bustools count \
    --genecounts \
    --cm -m \
    -o $outfolder/kallisto/bustools_count/ \
    -g $virus_t2g \
    -e $outfolder/kallisto/matrix.ec \
    -t $outfolder/kallisto/transcripts.txt \
    $outfolder/kallisto/output_sorted.bus


[index] k-mer length: 31
[index] number of targets: 296,561
[index] number of k-mers: 37,541,757
[index] number of D-list k-mers: 90,815
[quant] running in single-end mode
[quant] will process file 1: SRR12698539_2_short.fastq
[progress] 99M reads processed (0.0% mapped)              done
[quant] processed 100,000,000 reads, 16,089 reads pseudoaligned

 all fits in buffer
Read in 16089 BUS records
reading time 0.000453s
sorting time 0.000893s
writing time 0.002018s


Convert the BUS file to a text file to get the indices of the aligned reads:

In [9]:
# To use this, add --flags to bustools sort
!$bustools text \
    -f \
    -o $outfolder/kallisto/output_sorted.bus.txt \
    $outfolder/kallisto/output_sorted.bus

Read in 16089 BUS records


### Extract sequences that aligned to EBOV

In [10]:
# Install Biopython, which provides the `Bio.SeqIO` fastq parser imported in the next cell
!pip install -q biopython

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/3.1 MB[0m [31m2.8 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.9/3.1 MB[0m [31m13.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.1/3.1 MB[0m [31m34.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25h

**TODO:** The function below should be updated to use `bustools capture` to extract the aligned virus reads, as was done [here](https://github.com/pachterlab/LSCHWCP_2023/blob/main/Notebooks/Figure_5/Figure_5b/pull_out_reads_BLAST_comparisons.ipynb).

In [11]:
from Bio import SeqIO
import pandas as pd
from tqdm import tqdm
# Progress line shown by tqdm while the fastq is scanned ({n_fmt} = records seen so far)
TQDM_BAR_FORMAT = "Parsing fastq for reads: {n_fmt} reads processed."

def extract_kb_reads(kallisto_out, fastq, gene_id):
    """
    Create new fasta file including only reads that aligned to a specific
    gene/transcript ID after running kallisto.

    Args:
    - kallisto_out    Folder containing kallisto output (transcripts.txt,
                      matrix.ec and output_sorted.bus.txt are read from it).
    - fastq           Path to original (uncompressed) fastq file.
    - gene_id         ID of gene/transcript of interest.

    Writes the matching reads to "<fastq without .fastq suffix>_<gene_id>.fa"
    and prints how many reads were saved. Returns None.

    Raises:
    - ValueError      If gene_id is not listed in transcripts.txt.
    - IndexError      If no equivalence class consists of exactly this target.

    Note:
    Only works if `kallisto bus` was run with argument `-n`,
    `--flags` was included when running `bustools sort`,
    and .bus output was converted to .txt using `bustools text`.
    Only reads assigned to the equivalence class that contains the target
    alone are extracted; reads in multi-target equivalence classes are skipped.
    """

    # Get 0-indexed line number of target ID from transcripts.txt
    with open(f"{kallisto_out}/transcripts.txt") as f:
        transcripts = f.read().splitlines()
    try:
        gene_id_idx = transcripts.index(gene_id)
    except ValueError:
        raise ValueError(
            f"ID {gene_id} not found in {kallisto_out}/transcripts.txt."
        ) from None

    # Get equivalence class that matches to 0-indexed line number of target ID.
    # Column 1 holds comma-separated target indices, so it is forced to str:
    # with dtype inference, rows holding a single index can be parsed as
    # integers and would then never compare equal to the string below.
    ec_df = pd.read_csv(
        f"{kallisto_out}/matrix.ec", sep="\t", header=None, dtype={1: str}
    )
    ec_matches = ec_df.loc[ec_df[1] == str(gene_id_idx), 0]
    if ec_matches.empty:
        raise IndexError(
            f"No equivalence class in {kallisto_out}/matrix.ec contains only ID {gene_id}."
        )
    gene_id_ec = int(ec_matches.iloc[0])

    # Get bus output (converted to txt)
    bus_df = pd.read_csv(f"{kallisto_out}/output_sorted.bus.txt", sep="\t", header=None)
    # Only keep reads that aligned to target ID (column 2 = equivalence class)
    bus_df_target = bus_df[bus_df[2] == gene_id_ec]

    # Get numbers of reads that aligned (adjust zero-indexed bus file to one-indexed fastqs)
    reads_aln = bus_df_target[4].values + 1

    # A set of plain strings gives constant-time membership tests in the loop
    # below (instead of scanning an array once per fastq record)
    keep_set = {str(read_number) for read_number in reads_aln.tolist()}

    # Loop over original fastq and write reads with index in keep_set to new fasta
    output_fasta = fastq.split(".fastq")[0] + f"_{gene_id}.fa"

    n_written = 0
    with open(fastq) as handle, open(output_fasta, "w") as out:
        records = SeqIO.parse(handle, "fastq")

        with tqdm(bar_format=TQDM_BAR_FORMAT) as pbar:
            for record in records:
                # NOTE(review): assumes read IDs end in ".<1-based read number>"
                # and that this number equals the read's position in the fastq
                # - confirm for inputs with a different ID scheme
                if record.id.split(".")[-1] in keep_set:
                    out.write(">" + record.id + "\n" + str(record.seq) + "\n")
                    n_written += 1
                pbar.update(n=1)

    print(f"{n_written} reads saved that matched ID {gene_id}.")

In [None]:
%%time
extract_kb_reads(f"{outfolder}/kallisto", test_fastq, "u10")

### Align with bowtie2

In [None]:
# Install bowtie2
!wget https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.2.5/bowtie2-2.2.5-linux-x86_64.zip
!unzip bowtie2-2.2.5-linux-x86_64.zip
bowtie2_build = "bowtie2-2.2.5/bowtie2-build"
bowtie2 = "bowtie2-2.2.5/bowtie2"

Generate Bowtie2 genome index:

In [None]:
# Download ZEBOV genome (assembly GCA_000848505.1);
# this fasta is the input to `bowtie2-build` below
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/ebola_ref/GCA_000848505.1_ViralProj14703_genomic.fna

In [None]:
b_index = "b_index"
!mkdir -p $b_index

In [None]:
!$bowtie2_build \
    GCA_000848505.1_ViralProj14703_genomic.fna \
    $b_index/ebov

Align extracted reads to ZEBOV genome:

In [None]:
!$bowtie2 \
    -x $b_index/ebov \
    -f -p $threads \
    -U SRR12698503_2_u10.fa \
    -S $outfolder/kallisto/SRR12698503_2_u10_EBOV_aligned.sam

### Use SAMtools to convert the SAM files to sorted BAM files

In [None]:
# Install SAMtools
!wget https://github.com/samtools/samtools/releases/download/1.6/samtools-1.6.tar.bz2
!tar -vxjf samtools-1.6.tar.bz2
!cd samtools-1.6; make
samtools = "samtools-1.6/samtools"

In [None]:
!$samtools view \
    -bS -F4 $outfolder/kallisto/SRR12698503_2_u10_EBOV_aligned.sam \
    > $outfolder/kallisto/SRR12698503_2_u10_EBOV_aligned.bam

In [None]:
!$samtools sort \
    $outfolder/kallisto/SRR12698503_2_u10_EBOV_aligned.bam \
    -o $outfolder/kallisto/SRR12698503_2_u10_EBOV_sorted.bam

In [None]:
!$samtools index \
    $outfolder/kallisto/SRR12698503_2_u10_EBOV_sorted.bam