<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/Supp_Fig_9/Supp_Fig_9c/1_align_blank_reagent_libraries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Contaminating virus-like sequences in laboratory reagents - Generate the count matrix
Sequencing data was generated from 'blank' sequencing libraries containing only sterile water and reagents (data source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8625350/).

In [2]:
# Number of threads to use during alignments
# (passed as `-t` to the `kb ref` and `kb count` commands further down)
threads = 2

In [3]:
!pip install -q ffq gget kb_python

In [4]:
import glob
import json

import anndata

# Download raw sequencing data

In [5]:
# Get download info for SRRs containing samples of reagents only
out = "bkg_viruses_data.json"
!ffq SRR14737466 SRR14737469 SRR14737470 SRR14737471 --ftp -o $out

# Open ffq results
f = open(out)
data = json.load(f)
f.close()

print(len(data))
data[0]

[2024-05-04 21:19:36,224]    INFO Parsing run SRR14737466
[2024-05-04 21:19:39,232]    INFO Parsing run SRR14737469
[2024-05-04 21:19:41,499]    INFO Parsing run SRR14737470
[2024-05-04 21:19:43,603]    INFO Parsing run SRR14737471
8


{'accession': 'SRR14737466',
 'filename': 'SRR14737466_1.fastq.gz',
 'filetype': 'fastq',
 'filesize': 445000738,
 'filenumber': 1,
 'md5': '05052583388046a53e52a065fe31733f',
 'urltype': 'ftp',
 'url': 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR147/066/SRR14737466/SRR14737466_1.fastq.gz'}

In [None]:
datafolder = "raw_data"
!mkdir $datafolder

In [7]:
# Download data
for dataset in data:
    url = dataset["url"]
    !cd $datafolder && curl -O $url

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  424M  100  424M    0     0  27.5M      0  0:00:15  0:00:15 --:--:-- 32.8M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  454M  100  454M    0     0  27.1M      0  0:00:16  0:00:16 --:--:-- 31.8M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1299M  100 1299M    0     0  31.2M      0  0:00:41  0:00:41 --:--:-- 32.9M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1326M  100 1326M    0     0  30.8M      0  0:00:43  0:00:43 --:--:-- 32.3M
  % Total    % Received % Xferd  Average Speed   Tim

# Download optimized PalmDB reference files

In [8]:
# Download the ID to taxonomy mapping
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/ID_to_taxonomy_mapping.csv
# Download the customized transcripts to gene mapping
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_clustered_t2g.txt
# Download the RdRP amino acid sequences
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_rdrp_seqs.fa

--2024-05-04 21:24:20--  https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/ID_to_taxonomy_mapping.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19705497 (19M) [text/plain]
Saving to: ‘ID_to_taxonomy_mapping.csv.1’


2024-05-04 21:24:20 (141 MB/s) - ‘ID_to_taxonomy_mapping.csv.1’ saved [19705497/19705497]

--2024-05-04 21:24:20--  https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_clustered_t2g.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4561689 (4.3M) [text/plain]
Saving 

# Generate PalmDB/virus amino acid (--aa) reference index

In [9]:
%%time
# Build the reference index from the RdRP amino acid sequences.
#   --aa      : the input FASTA contains amino acid sequences
#   -t        : number of threads (set at the top of the notebook)
#   -i        : path of the index file to create
# kb ref skips indexing when the index file already exists unless --overwrite is passed.
!kb ref \
  --workflow custom \
  --aa \
  -t $threads \
  -i virus_index.idx \
  palmdb_rdrp_seqs.fa

[2024-05-04 21:24:27,734]    INFO [ref_custom] Skipping kallisto index because virus_index.idx already exists. Use the --overwrite flag to overwrite.
CPU times: user 59.3 ms, sys: 10.8 ms, total: 70.1 ms
Wall time: 7.45 s


# Generate a count matrix for each library using kallisto translated search (--aa)

In [None]:
%%time
out_folder = "aligned"
!mkdir $out_folder

for filename in glob.glob(f"{datafolder}/*.fastq.gz"):
    sample = filename.split("/")[-1].split(".")[0]

    !kb count \
        --aa \
        -t $threads \
        -i virus_index.idx \
        -g palmdb_clustered_t2g.txt \
        --h5ad \
        -x bulk \
        --parity single \
        -o $out_folder/$sample \
        $filename

mkdir: cannot create directory ‘aligned’: File exists
[2024-05-04 21:24:36,375]    INFO [count] Using index virus_index.idx to generate BUS file to aligned/SRR14737470_1 from
[2024-05-04 21:24:36,375]    INFO [count]         raw_data/SRR14737470_1.fastq.gz
[2024-05-04 23:16:41,127]    INFO [count] Sorting BUS file aligned/SRR14737470_1/output.bus to aligned/SRR14737470_1/tmp/output.s.bus
[2024-05-04 23:16:47,654]    INFO [count] Inspecting BUS file aligned/SRR14737470_1/tmp/output.s.bus
[2024-05-04 23:16:48,759]    INFO [count] Generating count matrix aligned/SRR14737470_1/counts_unfiltered/cells_x_genes from BUS file aligned/SRR14737470_1/tmp/output.s.bus
[2024-05-04 23:16:51,091]    INFO [count] Writing gene names to file aligned/SRR14737470_1/counts_unfiltered/cells_x_genes.genes.names.txt
[2024-05-04 23:16:51,842]    INFO [count] Reading matrix aligned/SRR14737470_1/counts_unfiltered/cells_x_genes.mtx
[2024-05-04 23:16:52,286]    INFO [count] Writing matrix to h5ad aligned/SRR14737

# Add metadata to count matrices and combine them into a single count matrix

In [None]:
# Library metadata for each blank negative control sample, keyed by SRR accession
# (experimental conditions as described in the original publication linked above)
tech_lookup = {
    "SRR14737471": {
        "Sequencing Platform": "Illumina Novaseq 6000 150 cycle kit (2 × 75 nt reads)",
        "RNA Extraction": "RNeasy Plus Universal Kits (Qiagen, Hilden, Germany)",
        "Library Preparation": "Trio RNA-seq + UDI (NuGEN)",
    },
    "SRR14737470": {
        "Sequencing Platform": "Illumina Novaseq 6000 150 cycle kit (2 × 75 nt reads)",
        "RNA Extraction": "RNeasy Plus Universal Kits (Qiagen, Hilden, Germany)",
        "Library Preparation": "Trio RNA-seq + UDI (NuGEN)",
    },
    "SRR14737466": {
        "Sequencing Platform": "Illumina MiSeq 150 cycle kit (2 × 75 nt reads)",
        "RNA Extraction": "Total RNA purification Kit (Norgen BioTek, Thorold, ON, Canada)",
        "Library Preparation": "SMARTer Stranded Total RNA-Seq Kit v2-Pico Input Mammalian (Clontech)",
    },
    "SRR14737469": {
        "Sequencing Platform": "Illumina NextSeq 500, mid-output 150 cycle kit (2 × 75 nt reads)",
        "RNA Extraction": "Total RNA Purification Kit (Norgen Biotek, Thorold, ON, Canada)",
        "Library Preparation": "SMARTer Stranded Total RNA-Seq Kit v2-Pico Input Mammalian (Clontech)",
    },
}

In [None]:
# Load the count matrix of every aligned library and annotate it with sample metadata
adatas = []
for filepath in sorted(glob.glob(f"{out_folder}/*")):
    adata_temp = anndata.read_h5ad(filepath + "/counts_unfiltered/adata.h5ad")

    # The output folder is named after the read file (e.g. "SRR14737470_1");
    # the part before the first "_" is the SRR accession used as key in tech_lookup
    read_name = filepath.split("/")[-1]
    srr = read_name.split("_")[0]

    # Add metadata to obs
    adata_temp.obs["Paired Read"] = read_name
    adata_temp.obs["SRR"] = srr
    # Each obs column is filled from the tech_lookup entry of the same name,
    # so "RNA Extraction" holds the extraction kit and not the sequencing platform
    for field in ("Sequencing Platform", "RNA Extraction", "Library Preparation"):
        adata_temp.obs[field] = tech_lookup[srr][field]

    adatas.append(adata_temp)

In [None]:
# Merge the per-library anndata objects into one object
adata = anndata.concat(adatas, join="outer")

# Short platform label: the first three space-separated words of the full
# platform description, with commas removed
platform_short = []
for platform in adata.obs["Sequencing Platform"].values:
    leading_words = platform.split(" ")[:3]
    platform_short.append(" ".join(leading_words).replace(",", ""))
adata.obs["Sequencing Platform (short)"] = platform_short

# Row sums of the count matrix, i.e. total counts per library
adata.obs["Total Reads"] = adata.X.sum(axis=1)

# Use the read file name as the observation index
adata.obs = adata.obs.set_index("Paired Read", drop=True)
adata

In [None]:
# Keep only the virus IDs (columns) with at least one count summed over all libraries
has_counts = adata.X.sum(axis=0) > 0
adata = adata[:, has_counts]
adata

In [None]:
# Write the combined, filtered anndata object to disk
output_h5ad = "viral_sequences_in_laboratory_reagents.h5ad"
adata.write(output_h5ad)