Align the reads to the complete Ebola genome to see how many cells were missed when aligning to RdRP only.

In [41]:
# Project accession; it is also the name of the data folder
bp = "PRJNA665227"
# Folder holding the raw FASTQ files of this project
fastq_folder = "/home/laura/projects/virus-watch-data/" + bp + "/raw"
# Alignment results go to an "ebola" folder that sits next to the raw data
out_folder = fastq_folder + "/../ebola"

### Build Ebola genome fasta and gtf
We will split the complete genome into 500 bp pieces and assign each piece a numeric ID, which is stored in the GTF file.

Ebola genome ViralProj14703 (linked to NC_002549.1) downloaded from https://www.ncbi.nlm.nih.gov/data-hub/genome/?taxon=186538

Raw file:  
/home/laura/projects/kraken2-2.1.2/ebov_fna/ncbi_dataset/data/GCA_000848505.1/GCA_000848505.1_ViralProj14703_genomic.fna

In [42]:
ebov_fna = "/home/laura/projects/kraken2-2.1.2/ebov_fna/ncbi_dataset/data/GCA_000848505.1/GCA_000848505.1_ViralProj14703_genomic.fna"

!grep -o '>' $ebov_fna  | wc -l

1


In [43]:
from Bio import SeqIO
import textwrap

In [44]:
# Read every record of the raw genome FASTA into a list
records = [*SeqIO.parse(ebov_fna, "fasta")]
# Show the ID and the sequence length of the first record
print(records[0].id)
print("genome length: ", len(records[0].seq))

AF086833.2
genome length:  18959


In [45]:
# Output locations of the chunked Ebola reference: FASTA and the matching GTF
ebola_fasta = (
    "/home/laura/projects/virus-watch-data/ebola_ref/"
    "ebov_GCA_000848505.fasta"
)
ebola_gtf = (
    "/home/laura/projects/virus-watch-data/ebola_ref/"
    "ebov_GCA_000848505.gtf"
)

In [46]:
# Split genome into 500 bp pieces (the last piece holds whatever remains).
# Plain slicing is used instead of textwrap.wrap: textwrap is meant for prose and
# treats whitespace and hyphens as break points, which could yield uneven pieces.
chunk_size = 500
genome_seq = str(records[0].seq)
sequences = [
    genome_seq[offset:offset + chunk_size]
    for offset in range(0, len(genome_seq), chunk_size)
]

Create new fasta and gtf files:

In [47]:
# Write the chunked reference: every genome piece becomes its own FASTA record
# (named by its running number) plus one gene/transcript/exon/CDS entry in the GTF.
with open(ebola_gtf, "w") as gtf, open(ebola_fasta, "w") as dna:
    genome_name = "GCA_000848505"
    genome_date = "Oct_2000"

    # Add header lines to GTF
    gtf.write(
        f"#!genome-build {genome_name}.1\n#!genome-version {genome_name}\n#!genome-date {genome_date}\n#!genome-build-accession {genome_name}\n#!genebuild-last-updated {genome_date}\n")

    # Values shared by all pieces
    source = genome_name
    features = ["gene", "transcript", "exon", "CDS"]
    frames = [".", ".", ".", "0"]

    # Each piece is its own reference sequence in the FASTA, so its annotation
    # always starts at position 1 of that sequence.
    start = 1
    for i, seq in enumerate(sequences):
        # GTF coordinates are 1-based and inclusive: a piece of length n spans 1..n
        end = start + len(seq) - 1
        gene_id = i

        # The nine GTF columns must be separated by tabs
        for feature, frame in zip(features, frames):
            if feature == "gene":
                gtf.write(
                    f'{gene_id}\t{source}\t{feature}\t{start}\t{end}\t.\t+\t{frame}\tgene_id "{gene_id}"; gene_version "1"; gene_name "{gene_id}"; gene_source "palmdb"; gene_biotype "protein_coding";\n'
                    )
            elif feature == "transcript":
                gtf.write(
                    f'{gene_id}\t{source}\t{feature}\t{start}\t{end}\t.\t+\t{frame}\tgene_id "{gene_id}"; gene_version "1"; transcript_id "{gene_id}T"; transcript_version "1"; gene_name "{gene_id}"; gene_source "palmdb"; gene_biotype "protein_coding"; transcript_name "{gene_id}"; transcript_source "palmdb"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS{gene_id}"; tag "basic";\n'
                )
            elif feature == "exon":
                gtf.write(
                    f'{gene_id}\t{source}\t{feature}\t{start}\t{end}\t.\t+\t{frame}\tgene_id "{gene_id}"; gene_version "1"; transcript_id "{gene_id}T"; transcript_version "1"; exon_number "1"; gene_name "{gene_id}"; gene_source "palmdb"; gene_biotype "protein_coding"; transcript_name "{gene_id}"; transcript_source "palmdb"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS{gene_id}"; exon_id "{gene_id}E"; exon_version "1"; tag "basic";\n'
                )
            elif feature == "CDS":
                gtf.write(
                    f'{gene_id}\t{source}\t{feature}\t{start}\t{end}\t.\t+\t{frame}\tgene_id "{gene_id}"; gene_version "1"; transcript_id "{gene_id}T"; transcript_version "1"; exon_number "1"; gene_name "{gene_id}"; gene_source "palmdb"; gene_biotype "protein_coding"; transcript_name "{gene_id}"; transcript_source "palmdb"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS{gene_id}"; protein_id "{gene_id}P"; protein_version "1"; tag "basic";\n'
                    )

        # Build dna/genome file: one record per piece, named by the same ID as in the GTF
        dna.write(f">{gene_id} dna:chromosome chromosome:{genome_name}:{gene_id}:{start}:{end}:1 REF\n")
        dna.write(f"{seq}\n")

### Generate Ebola reference index

In [48]:
# Files passed to `kb ref` below via -i (index), -g (t2g) and -f1 (FASTA)
ebola_index = (
    "/home/laura/projects/virus-watch-data/ebola_ref/"
    "ebov_GCA_000848505.idx"
)
ebola_t2g = (
    "/home/laura/projects/virus-watch-data/ebola_ref/"
    "ebov_GCA_000848505_t2g.txt"
)
ebola_f1 = (
    "/home/laura/projects/virus-watch-data/ebola_ref/"
    "ebov_GCA_000848505_f1.fa"
)

In [49]:
!/usr/bin/time -v kb ref \
    -i $ebola_index \
    -g $ebola_t2g \
    -f1 $ebola_f1 \
    -t 50 \
    --kallisto "/home/laura/bin/kallisto" \
    --bustools "/home/laura/anaconda3/bin/bustools" \
    $ebola_fasta $ebola_gtf

[2023-04-15 13:46:12,161]    INFO [ref] Preparing /home/laura/projects/virus-watch-data/ebola_ref/ebov_GCA_000848505.fasta, /home/laura/projects/virus-watch-data/ebola_ref/ebov_GCA_000848505.gtf
[2023-04-15 13:46:12,173]    INFO [ref] Splitting genome /home/laura/projects/virus-watch-data/ebola_ref/ebov_GCA_000848505.fasta into cDNA at /home/laura/projects/virus-watch/seqwell/ebola_PRJNA665227/tmp/tmpo8m8kn5c
[2023-04-15 13:46:12,233]    INFO [ref] Concatenating 1 cDNAs to /home/laura/projects/virus-watch-data/ebola_ref/ebov_GCA_000848505_f1.fa
[2023-04-15 13:46:12,235]    INFO [ref] Creating transcript-to-gene mapping at /home/laura/projects/virus-watch-data/ebola_ref/ebov_GCA_000848505_t2g.txt
[2023-04-15 13:46:12,237]    INFO [ref] Indexing /home/laura/projects/virus-watch-data/ebola_ref/ebov_GCA_000848505_f1.fa to /home/laura/projects/virus-watch-data/ebola_ref/ebov_GCA_000848505.idx
	Command being timed: "kb ref -i /home/laura/projects/virus-watch-data/ebola_ref/ebov_GCA_000848505

### Align fastqs to Ebola index

In [50]:
import os
import glob

In [51]:
# File names (without directory) of all gzipped FASTQ files in the raw-data folder, sorted
fastqs = sorted(
    path.rsplit("/", 1)[-1]
    for path in glob.glob(f"{fastq_folder}/*.fastq.gz")
)
fastqs

['SRR12698499_1.fastq.gz',
 'SRR12698499_2.fastq.gz',
 'SRR12698500_1.fastq.gz',
 'SRR12698500_2.fastq.gz',
 'SRR12698501_1.fastq.gz',
 'SRR12698501_2.fastq.gz',
 'SRR12698502_1.fastq.gz',
 'SRR12698502_2.fastq.gz',
 'SRR12698503_1.fastq.gz',
 'SRR12698503_2.fastq.gz',
 'SRR12698504_1.fastq.gz',
 'SRR12698504_2.fastq.gz',
 'SRR12698505_1.fastq.gz',
 'SRR12698505_2.fastq.gz',
 'SRR12698506_1.fastq.gz',
 'SRR12698506_2.fastq.gz',
 'SRR12698507_1.fastq.gz',
 'SRR12698507_2.fastq.gz',
 'SRR12698508_1.fastq.gz',
 'SRR12698508_2.fastq.gz',
 'SRR12698509_1.fastq.gz',
 'SRR12698509_2.fastq.gz',
 'SRR12698510_1.fastq.gz',
 'SRR12698510_2.fastq.gz',
 'SRR12698511_1.fastq.gz',
 'SRR12698511_2.fastq.gz',
 'SRR12698512_1.fastq.gz',
 'SRR12698512_2.fastq.gz',
 'SRR12698513_1.fastq.gz',
 'SRR12698513_2.fastq.gz',
 'SRR12698514_1.fastq.gz',
 'SRR12698514_2.fastq.gz',
 'SRR12698515_1.fastq.gz',
 'SRR12698515_2.fastq.gz',
 'SRR12698516_1.fastq.gz',
 'SRR12698516_2.fastq.gz',
 'SRR12698517_1.fastq.gz',
 

In [52]:
# Total number of FASTQ files found in the raw-data folder
len(fastqs)

212

Loop over files and align one at a time:

In [53]:
# The sample ID is the part of each file name before the first underscore
samples = [fastq_name.split("_")[0] for fastq_name in fastqs]

In [54]:
# Keep each sample ID once (every sample appears once per FASTQ file).
# Sorting keeps the processing order identical across kernel sessions; iterating
# a bare set of strings can give a different order every time Python is started.
samples = sorted(set(samples))
len(samples)

106

Align to Ebola index:

In [55]:
%%time

for sample in samples:
    fastq1 = sample + "_1.fastq.gz"
    fastq2 = sample + "_2.fastq.gz"
    
    !mkdir -p $out_folder/$sample
    
    !/usr/bin/time -v kallisto bus \
            -i $ebola_index \
            -o $out_folder/$sample/ \
            -t 50 \
            -x 0,0,12:0,12,20:1,0,0 \
            $fastq_folder/$fastq1 $fastq_folder/$fastq2 \
            &> $out_folder/$sample/kb_out.txt 
        
    !bustools sort -o $out_folder/$sample/output_sorted.bus $out_folder/$sample/output.bus

    !bustools count \
        --genecounts \
        --cm -m \
        -o $out_folder/$sample/bustools_count/ \
        -g $ebola_t2g \
        -e $out_folder/$sample/matrix.ec \
        -t $out_folder/$sample/transcripts.txt \
        $out_folder/$sample/output_sorted.bus

Read in 6 BUS records
Read in 2613 BUS records
Read in 6623 BUS records
Read in 10 BUS records
Read in 442 BUS records
Read in 840 BUS records
Read in 4751 BUS records
Read in 31224 BUS records
Read in 11752 BUS records
Read in 13348 BUS records
Read in 1123 BUS records
Read in 37 BUS records
Read in 47 BUS records
Read in 2210 BUS records
Read in 2207 BUS records
Read in 808413 BUS records
Read in 7 BUS records
Read in 5688 BUS records
Read in 18373 BUS records
Read in 644805 BUS records
Read in 16718 BUS records
Read in 756 BUS records
Read in 3659 BUS records
Read in 2502 BUS records
Read in 6553 BUS records
Read in 37 BUS records
Read in 9 BUS records
Read in 118 BUS records
Read in 7 BUS records
Read in 6 BUS records
Read in 8901 BUS records
Read in 47402 BUS records
Read in 5924 BUS records
Read in 2313 BUS records
Read in 10344 BUS records
Read in 45 BUS records
Read in 1161806 BUS records
Read in 10874 BUS records
Read in 3577 BUS records
Read in 543 BUS records
Read in 62 BUS 