In [18]:
# Accession of the dataset processed in this notebook
# (the PRJNA prefix looks like an NCBI BioProject ID — TODO confirm).
bp = "PRJNA665227"
# Directory that is globbed for *.fastq files below and that also receives batch.txt.
fastq_folder = f"/home/laura/projects/virus-watch-data/{bp}/bwa/raw"

In [19]:
# Virus reference files:
#   virus_fasta - FASTA passed to `kallisto index`
#   virus_index - index written by `kallisto index` and read by `kallisto bus -i`
#   virus_t2g   - mapping file passed to `bustools count -g`
virus_fasta = "/home/laura/projects/virus-watch-data/virus_ref/uniques_noduplicates.fa"
virus_index = "/home/laura/projects/virus-watch-data/virus_ref/kallisto_aa/noduplicates.idx"
virus_t2g = "/home/laura/projects/virus-watch-data/virus_ref/nodup_clu_t2g.txt"

# out_folder receives the `kallisto bus` output and every bustools intermediate/result.
out_folder = f"/home/laura/projects/virus-watch-data/{bp}/full_index_bwa"
# Only bustools_whitelist.txt is read from host_out_folder (by `bustools correct -w`);
# presumably it was produced by an earlier host-alignment run — TODO confirm.
host_out_folder = f"/home/laura/projects/virus-watch-data/{bp}/full_index_stringent_dlist_flank1/host_canine"

# Paths to the kallisto and bustools executables invoked via `!` in the cells below.
kallisto = "/home/laura/projects/kallisto/build/src/kallisto"
bustools = "/home/laura/projects/bustools/build/src/bustools"

Create new batch file with paths to files where host reads were removed based on bwa alignment:

In [20]:
# Tab-separated batch file (sample, mate-1 path, mate-2 path) written below and
# handed to `kallisto bus` through its -B option.
sample_batch_file = f"{fastq_folder}/batch.txt"

In [21]:
import os
import glob

In [22]:
# Bare file names (directory part stripped) of every *.fastq file found in
# fastq_folder, in ascending lexicographic order.
fastqs = sorted(
    path.rsplit("/", 1)[-1]
    for path in glob.glob(f"{fastq_folder}/*.fastq")
)

In [23]:
# Number of FASTQ files found; the batch file written below assumes a _1/_2 pair
# per sample, so this should be twice the number of unique samples.
len(fastqs)

212

In [24]:
# Sample ID = the part of each FASTQ file name that precedes the first underscore
# ("<sample>_1.fastq" -> "<sample>"). Duplicates are still present at this point,
# one entry per FASTQ file.
samples = [fastq_name.partition("_")[0] for fastq_name in fastqs]

In [25]:
# Collapse to one entry per sample ID (each ID occurs once per FASTQ file).
# sorted() instead of list(): the iteration order of a set of strings is arbitrary
# and can change between interpreter sessions (string hash randomisation), which
# would shuffle the rows of the batch file on every kernel restart. Sorting makes
# the batch file, and the order in which samples are handed to kallisto,
# reproducible.
samples = sorted(set(samples))
len(samples)

106

In [26]:
# Write one tab-separated row per sample: <sample>, <path to _1.fastq>, <path to _2.fastq>.
# Both mate files are assumed to exist in fastq_folder; nothing is checked here.
with open(sample_batch_file, "w") as handle:
    for sample in samples:
        mate_paths = [f"{fastq_folder}/{sample}_{mate}.fastq" for mate in (1, 2)]
        handle.write("\t".join([sample, *mate_paths]) + "\n")

Generate virus reference index without dlist:

In [6]:
# Build the kallisto index from virus_fasta and write it to virus_index
# (the file later loaded by `kallisto bus -i`). The command is wrapped in
# `/usr/bin/time -v` to report resource usage.
# --aa: presumably tells kallisto the reference is amino-acid sequence — TODO confirm.
# -t 30: presumably the number of threads — TODO confirm.
!/usr/bin/time -v $kallisto index \
    -t 30 \
    --aa \
    -i $virus_index \
    $virus_fasta


[build] loading fasta file /home/laura/projects/virus-watch-data/virus_ref/uniques_noduplicates.fa
[build] k-mer length: 31
KmerStream::KmerStream(): Start computing k-mer cardinality estimations (1/2)
KmerStream::KmerStream(): Start computing k-mer cardinality estimations (1/2)
KmerStream::KmerStream(): Finished
CompactedDBG::build(): Estimated number of k-mers occurring at least once: 37641510
CompactedDBG::build(): Estimated number of minimizer occurring at least once: 7877811
CompactedDBG::filter(): Processed 87630084 k-mers in 296561 reads
CompactedDBG::filter(): Found 37508917 unique k-mers
CompactedDBG::filter(): Number of blocks in Bloom filter is 257317
CompactedDBG::construct(): Extract approximate unitigs (1/2)
CompactedDBG::construct(): Extract approximate unitigs (2/2)
CompactedDBG::construct(): Closed all input files

CompactedDBG::construct(): Splitting unitigs (1/2)

CompactedDBG::construct(): Splitting unitigs (2/2)
CompactedDBG::construct(): Before split: 2040432 uni

Align files and correct barcodes based on host whitelist:

In [27]:
# Pseudo-align every sample listed in sample_batch_file (-B) against virus_index
# and write the results to out_folder (output.bus, matrix.ec and transcripts.txt
# are read from there by the bustools cell below).
# -x 0,0,12:0,12,20:1,0,0 is a custom technology string; presumably
#   barcode = file 0, bases 0-12 : UMI = file 0, bases 12-20 : sequence = all of file 1
#   — TODO confirm against the kallisto manual.
# --batch-barcodes: NOTE(review) presumably records which batch-file sample each
#   read came from alongside its cell barcode — confirm.
!/usr/bin/time -v $kallisto bus \
        -i $virus_index \
        -o $out_folder \
        --aa \
        -t 30 \
        -B $sample_batch_file \
        --batch-barcodes \
        -x 0,0,12:0,12,20:1,0,0


[bus] will try running read files supplied in batch file
[bus] Note: Strand option was not specified; setting it to --unstranded for specified technology
[index] k-mer length: 31
[index] number of targets: 296,561
[index] number of k-mers: 37,541,756
[quant] running in single-end mode
[quant] will process file 1: /home/laura/projects/virus-watch-data/PRJNA665227/bwa/raw/SRR12698573_1.fastq
[quant] will process file 2: /home/laura/projects/virus-watch-data/PRJNA665227/bwa/raw/SRR12698573_2.fastq
[quant] will process file 1: /home/laura/projects/virus-watch-data/PRJNA665227/bwa/raw/SRR12698501_1.fastq
[quant] will process file 2: /home/laura/projects/virus-watch-data/PRJNA665227/bwa/raw/SRR12698501_2.fastq
[quant] will process file 1: /home/laura/projects/virus-watch-data/PRJNA665227/bwa/raw/SRR12698523_1.fastq
[quant] will process file 2: /home/laura/projects/virus-watch-data/PRJNA665227/bwa/raw/SRR12698523_2.fastq
[quant] will process file 1: /home/laura/projects/virus-watch-data/PRJN

In [28]:
%%time
# Step 1: sort the raw BUS file written by `kallisto bus` (-m 4G, -t 20).
!$bustools sort \
    -m 4G \
    -t 20 \
    -o $out_folder/output_sorted.bus \
    $out_folder/output.bus

# Step 2: correct barcodes against the whitelist stored in host_out_folder
# (the host run's on-list, not one derived from the virus alignment).
!$bustools correct \
    -w $host_out_folder/bustools_whitelist.txt \
    -o $out_folder/output_sorted_corrected.bus \
    $out_folder/output_sorted.bus

# Step 3: sort again after correction (-t 30 here versus -t 20 in step 1).
!$bustools sort \
    -m 4G \
    -t 30 \
    -o $out_folder/output_sorted_corrected_sorted.bus \
    $out_folder/output_sorted_corrected.bus

# Step 4: build the count matrix under out_folder/bustools_count/, using virus_t2g
# (-g) together with the matrix.ec (-e) and transcripts.txt (-t) files from out_folder.
# --genecounts: presumably aggregates counts per t2g group rather than per
# equivalence class — TODO confirm.
!$bustools count \
    --genecounts \
    -o $out_folder/bustools_count/ \
    -g $virus_t2g \
    -e $out_folder/matrix.ec \
    -t $out_folder/transcripts.txt \
    $out_folder/output_sorted_corrected_sorted.bus

partition time: 3.74s
partition time: 2.21s
Read in 234424610 BUS records
reading time 4.44s
sorting time 49.3s
writing time 0s
Found 775516 barcodes in the on-list
Processed 65542131 BUS records
In on-list = 13464785
Corrected    = 15472040
Uncorrected  = 36605306
partition time: 0.48s
 all fits in buffer
Read in 28936825 BUS records
reading time 0.41s
sorting time 5.57s
writing time 0.85s
CPU times: user 1.85 s, sys: 378 ms, total: 2.23 s
Wall time: 1min 53s
