# Align to combined macaque and canine reference

In [1]:
bp = "PRJNA665227"
fastq_folder = f"/home/laura/projects/virus-watch-data/{bp}/raw"
sample_batch_file = f"{fastq_folder}/batch.txt"

Generate batch file:

In [2]:
import os
import glob

In [3]:
fastqs = []
for filename in glob.glob(f"{fastq_folder}/*.fastq.gz"):
    fastqs.append(filename.split("/")[-1])
    
fastqs.sort()

In [4]:
len(fastqs)

212

In [5]:
# samples = []
# for fastq in fastqs:
#     samples.append(fastq.split("_")[0])

In [6]:
# samples = list(set(samples))
# len(samples)

In [7]:
# with open(sample_batch_file, "w") as batch_file:
#     for sample in samples:
#         fastq1 = sample + "_1.fastq.gz"
#         fastq2 = sample + "_2.fastq.gz"
#         batch_file.write(sample + "\t" + fastq_folder + "/" + fastq1 + "\t" + fastq_folder + "/" + fastq2 + "\n")

### Generate combined macaque / canine reference

In [6]:
host_fasta = "/home/laura/projects/virus-watch-data/rhesus_ref/Macaca_mulatta.Mmul_10.dna.toplevel.fa.gz"
host_gtf = "/home/laura/projects/virus-watch-data/rhesus_ref/Macaca_mulatta.Mmul_10.109.gtf.gz"

canine_fasta = "/home/laura/projects/virus-watch-data/canine_ref/Canis_lupus_familiaris.ROS_Cfam_1.0.dna.toplevel.fa.gz"
canine_gtf = "/home/laura/projects/virus-watch-data/canine_ref/Canis_lupus_familiaris.ROS_Cfam_1.0.109.gtf.gz"

canine_macaque_fasta = "/home/laura/projects/virus-watch-data/canine_macaque_ref/combined_genomes.dna.toplevel.fa.gz"

index = "/home/laura/projects/virus-watch-data/canine_macaque_ref/index.idx"
t2g = "/home/laura/projects/virus-watch-data/canine_macaque_ref/t2g.txt"
fa = "/home/laura/projects/virus-watch-data/canine_macaque_ref/transcriptome.fa"

kallisto = "/home/laura/projects/kallisto/build/src/kallisto"
bustools = "/home/laura/projects/bustools/build/src/bustools"

In [4]:
!kb ref \
    --overwrite --verbose \
    -t 30 \
    -i $index \
    -g $t2g \
    -f1 $fa \
    --kallisto $kallisto \
    --bustools $bustools \
    $canine_fasta,$host_fasta $canine_gtf,$host_gtf

[2023-07-05 13:06:12,688]   DEBUG [main] Printing verbose output
[2023-07-05 13:06:14,920]   DEBUG [main] kallisto binary located at /home/laura/projects/kallisto/build/src/kallisto
[2023-07-05 13:06:14,920]   DEBUG [main] bustools binary located at /home/laura/projects/bustools/build/src/bustools
[2023-07-05 13:06:14,920]   DEBUG [main] Creating `tmp` directory
[2023-07-05 13:06:14,931]   DEBUG [main] Namespace(list=False, command='ref', tmp=None, keep_tmp=False, verbose=True, i='/home/laura/projects/virus-watch-data/canine_macaque_ref/index.idx', g='/home/laura/projects/virus-watch-data/canine_macaque_ref/t2g.txt', f1='/home/laura/projects/virus-watch-data/canine_macaque_ref/transcriptome.fa', include_attribute=None, exclude_attribute=None, f2=None, c1=None, c2=None, d=None, k=None, t=30, d_list=None, aa=False, workflow='standard', distinguish=False, make_unique=False, overwrite=True, kallisto='/home/laura/projects/kallisto/build/src/kallisto', bustools='/home/laura/projects/bustools

### Align to combined macaque and canine genomes

In [3]:
host_out_folder = f"{fastq_folder}/../full_index/host_canine"

In [8]:
%%time
!/usr/bin/time -v $kallisto bus \
        -i $index \
        -o $host_out_folder \
        -t 30 \
        -B $sample_batch_file \
        --batch-barcodes \
        -x 0,0,12:0,12,20:1,0,0


[bus] will try running read files supplied in batch file
[bus] Note: Strand option was not specified; setting it to --unstranded for specified technology
[index] k-mer length: 31
[index] number of targets: 119,563
[index] number of k-mers: 151,974,779
[index] number of distinguishing flanking k-mers: 7,948,912
[quant] running in single-end mode
[quant] will process file 1: /home/laura/projects/virus-watch-data/PRJNA665227/raw/SRR12698516_1.fastq.gz
[quant] will process file 2: /home/laura/projects/virus-watch-data/PRJNA665227/raw/SRR12698516_2.fastq.gz
[quant] will process file 1: /home/laura/projects/virus-watch-data/PRJNA665227/raw/SRR12698524_1.fastq.gz
[quant] will process file 2: /home/laura/projects/virus-watch-data/PRJNA665227/raw/SRR12698524_2.fastq.gz
[quant] will process file 1: /home/laura/projects/virus-watch-data/PRJNA665227/raw/SRR12698536_1.fastq.gz
[quant] will process file 2: /home/laura/projects/virus-watch-data/PRJNA665227/raw/SRR12698536_2.fastq.gz
[quant] will pro

In [9]:
%%time
!$bustools sort \
    -m 4G \
    -t 30 \
    -o $host_out_folder/output_sorted.bus \
    $host_out_folder/output.bus

partition time: 3.62s
partition time: 3.2s
partition time: 2.94s
partition time: 2.84s
partition time: 2.67s
partition time: 2.86s
partition time: 2.78s
partition time: 2.77s
partition time: 2.79s
partition time: 2.76s
partition time: 2.78s
partition time: 2.9s
partition time: 2.91s
partition time: 2.95s
partition time: 3.15s
partition time: 3.24s
partition time: 3.33s
partition time: 3.59s
partition time: 4.04s
partition time: 3.01s
partition time: 2.81s
partition time: 2.83s
partition time: 2.72s
partition time: 2.8s
partition time: 2.84s
partition time: 2.72s
partition time: 2.82s
partition time: 2.81s
partition time: 2.88s
partition time: 3.54s
partition time: 3.74s
partition time: 3.06s
partition time: 3.14s
partition time: 4.51s
partition time: 3.84s
partition time: 3.48s
partition time: 2.98s
partition time: 2.84s
partition time: 2.71s
partition time: 2.77s
partition time: 2.83s
partition time: 2.78s
partition time: 2.83s
partition time: 2.73s
partition time: 2.79s
partition tim

In [10]:
%%time
# Generate onlist from data since SeqWell does not have a whitelist
# threshold: Minimum number of times a barcode must appear to be included in onlist
!$bustools whitelist \
    --threshold 1000 \
    -o $host_out_folder/bustools_whitelist.txt \
    $host_out_folder/output_sorted.bus

Read in 1917532843 BUS records, wrote 801033 barcodes to whitelist with threshold 1000
CPU times: user 252 ms, sys: 45.9 ms, total: 298 ms
Wall time: 14.6 s


In [11]:
%%time
!$bustools correct \
    -w $host_out_folder/bustools_whitelist.txt \
    -o $host_out_folder/output_sorted_corrected.bus \
    $host_out_folder/output_sorted.bus

Found 775516 barcodes in the on-list
Processed 1917532843 BUS records
In on-list = 1236574388
Corrected    = 181799192
Uncorrected  = 499159263
CPU times: user 9.77 s, sys: 1.95 s, total: 11.7 s
Wall time: 9min 59s


In [12]:
%%time
!$bustools sort \
    -m 4G \
    -t 30 \
    -o $host_out_folder/output_sorted_corrected_sorted.bus \
    $host_out_folder/output_sorted_corrected.bus

partition time: 2.34s
partition time: 2.26s
partition time: 2.17s
partition time: 2.17s
partition time: 2.08s
partition time: 2.1s
partition time: 2.03s
partition time: 2.08s
partition time: 2.11s
partition time: 2.13s
partition time: 1.16s
Read in 1418373580 BUS records
reading time 12.75s
sorting time 291.5s
writing time 0s
CPU times: user 5.1 s, sys: 1.19 s, total: 6.29 s
Wall time: 4min 54s


In [13]:
%%time
!$bustools count \
    --genecounts \
    -o $host_out_folder/bustools_count/ \
    -g $t2g \
    -e $host_out_folder/matrix.ec \
    -t $host_out_folder/transcripts.txt \
    $host_out_folder/output_sorted_corrected_sorted.bus

CPU times: user 4.67 s, sys: 1.01 s, total: 5.68 s
Wall time: 4min 47s


### Align to PalmDB

Concatenate macaque and canine genome files for d-list:

In [20]:
!cat $canine_fasta $host_fasta > $canine_macaque_fasta

Generate virus index:

In [9]:
virus_fasta = "/home/laura/projects/virus-watch-data/virus_ref/uniques_noduplicates.fa"
virus_index = "/home/laura/projects/virus-watch-data/virus_ref/kallisto_aa/noduplicates_rhesus_canine.idx"
virus_t2g = "/home/laura/projects/virus-watch-data/virus_ref/nodup_clu_t2g.txt"

In [10]:
# Generate virus reference index (with default d-list-overhang = 3)
!/usr/bin/time -v $kallisto index \
    -t 30 \
    --aa \
    --d-list=$canine_macaque_fasta \
    -i $virus_index \
    $virus_fasta


[build] loading fasta file /home/laura/projects/virus-watch-data/virus_ref/uniques_noduplicates.fa
[build] k-mer length: 31
KmerStream::KmerStream(): Start computing k-mer cardinality estimations (1/2)
KmerStream::KmerStream(): Start computing k-mer cardinality estimations (1/2)
KmerStream::KmerStream(): Finished
CompactedDBG::build(): Estimated number of k-mers occurring at least once: 37641510
CompactedDBG::build(): Estimated number of minimizer occurring at least once: 7877811
CompactedDBG::filter(): Processed 87630084 k-mers in 296561 reads
CompactedDBG::filter(): Found 37509161 unique k-mers
CompactedDBG::filter(): Number of blocks in Bloom filter is 257317
CompactedDBG::construct(): Extract approximate unitigs (1/2)
CompactedDBG::construct(): Extract approximate unitigs (2/2)
CompactedDBG::construct(): Closed all input files

CompactedDBG::construct(): Splitting unitigs (1/2)

CompactedDBG::construct(): Splitting unitigs (2/2)
CompactedDBG::construct(): Before split: 2040617 uni

Align to PalmDB and correct barcodes using host onlist:

In [11]:
out_folder = f"{fastq_folder}/../full_index/virus"

In [12]:
sample_batch_file

'/home/laura/projects/virus-watch-data/PRJNA665227/raw/batch.txt'

In [15]:
%%time
!/usr/bin/time -v $kallisto bus \
        -i $virus_index \
        -o $out_folder \
        --aa \
        -t 30 \
        -B $sample_batch_file \
        --batch-barcodes \
        -x 0,0,12:0,12,20:1,0,0


[bus] will try running read files supplied in batch file
[bus] Note: Strand option was not specified; setting it to --unstranded for specified technology
[index] k-mer length: 31
[index] number of targets: 296,561
[index] number of k-mers: 37,632,058
[index] number of distinguishing flanking k-mers: 90,302
[quant] running in single-end mode
[quant] will process file 1: /home/laura/projects/virus-watch-data/PRJNA665227/raw/SRR12698516_1.fastq.gz
[quant] will process file 2: /home/laura/projects/virus-watch-data/PRJNA665227/raw/SRR12698516_2.fastq.gz
[quant] will process file 1: /home/laura/projects/virus-watch-data/PRJNA665227/raw/SRR12698524_1.fastq.gz
[quant] will process file 2: /home/laura/projects/virus-watch-data/PRJNA665227/raw/SRR12698524_2.fastq.gz
[quant] will process file 1: /home/laura/projects/virus-watch-data/PRJNA665227/raw/SRR12698536_1.fastq.gz
[quant] will process file 2: /home/laura/projects/virus-watch-data/PRJNA665227/raw/SRR12698536_2.fastq.gz
[quant] will process

In [16]:
%%time
!$bustools sort \
    -m 4G \
    -t 20 \
    -o $out_folder/output_sorted.bus \
    $out_folder/output.bus

!$bustools correct \
    -w $host_out_folder/bustools_whitelist.txt \
    -o $out_folder/output_sorted_corrected.bus \
    $out_folder/output_sorted.bus

!$bustools sort \
    -m 4G \
    -t 30 \
    -o $out_folder/output_sorted_corrected_sorted.bus \
    $out_folder/output_sorted_corrected.bus

!$bustools count \
    --genecounts \
    -o $out_folder/bustools_count/ \
    -g $virus_t2g \
    -e $out_folder/matrix.ec \
    -t $out_folder/transcripts.txt \
    $out_folder/output_sorted_corrected_sorted.bus

^C
/bin/bash: correct: command not found
Error: File not found, /home/laura/projects/virus-watch-data/PRJNA665227/raw/../full_index/virus/output_sorted_corrected.bus
Usage: bustools sort [options] bus-files

Options: 
Default behavior (with no flag) is to sort by barcode, UMI, ec, then flag
-t, --threads         Number of threads to use
-m, --memory          Maximum memory used
-T, --temp            Location and prefix for temporary files 
                      required if using -p, otherwise defaults to output
-o, --output          File for sorted output
-p, --pipe            Write to standard output
    --umi             Sort by UMI, barcode, then ec
    --count           Sort by multiplicity, barcode, UMI, then ec
    --flags           Sort by flag, ec, barcode, then UMI
    --flags-bc        Sort by flag, barcode, UMI, then ec

Error: File not found /home/laura/projects/virus-watch-data/PRJNA665227/raw/../full_index/virus/matrix.ec
Error: File not found /home/laura/projects/virus-w