<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/1_align_blank_reagent_libraries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Contaminating virus-like sequences in laboratory reagents - Generate the count matrix
### NOTE: To make this notebook run faster, choose a runtime type with more cores, e.g. "TPU v2".
Sequencing data was generated from 'blank' sequencing libraries containing only sterile water and reagents (data source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8625350/).

In [12]:
!pip install -q anndata

In [13]:
import multiprocessing
import glob
import json
import anndata

In [2]:
# Get the number of cores
cores = multiprocessing.cpu_count()
print(f"Number of available cores: {cores}")

# Set number of threads to use during alignments equal to available number of cores
threads = cores

Number of available cores: 96


In [3]:
!pip install -q ffq gget kb_python

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.1/43.1 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.3/117.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 MB[0m [31m18.5 MB/s

# Download raw sequencing data

In [4]:
# Get download info for SRRs containing samples of reagents only
out = "bkg_viruses_data.json"
!ffq SRR14737466 SRR14737469 SRR14737470 SRR14737471 --ftp -o $out

# Open ffq results
f = open(out)
data = json.load(f)
f.close()

print(len(data))
data[0]

[2024-05-06 20:39:32,932]    INFO Parsing run SRR14737466
[2024-05-06 20:39:34,922]    INFO Parsing run SRR14737469
[2024-05-06 20:39:36,634]    INFO Parsing run SRR14737470
[2024-05-06 20:39:38,313]    INFO Parsing run SRR14737471
8


{'accession': 'SRR14737466',
 'filename': 'SRR14737466_1.fastq.gz',
 'filetype': 'fastq',
 'filesize': 445000738,
 'filenumber': 1,
 'md5': '05052583388046a53e52a065fe31733f',
 'urltype': 'ftp',
 'url': 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR147/066/SRR14737466/SRR14737466_1.fastq.gz'}

In [5]:
datafolder = "raw_data"
!mkdir $datafolder

In [6]:
# Download data
for dataset in data:
    url = dataset["url"]
    !cd $datafolder && curl -O $url

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  424M  100  424M    0     0  25.8M      0  0:00:16  0:00:16 --:--:-- 30.3M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  454M  100  454M    0     0  26.8M      0  0:00:16  0:00:16 --:--:-- 30.6M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1299M  100 1299M    0     0  29.3M      0  0:00:44  0:00:44 --:--:-- 31.0M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1326M  100 1326M    0     0  29.2M      0  0:00:45  0:00:45 --:--:-- 30.5M
  % Total    % Received % Xferd  Average Speed   Tim

# Download optimized PalmDB reference files

In [7]:
# Download the ID to taxonomy mapping
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/ID_to_taxonomy_mapping.csv
# Download the customized transcripts to gene mapping
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_clustered_t2g.txt
# Download the RdRP amino acid sequences
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_rdrp_seqs.fa

--2024-05-06 20:44:48--  https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/ID_to_taxonomy_mapping.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19705497 (19M) [text/plain]
Saving to: ‘ID_to_taxonomy_mapping.csv’


2024-05-06 20:44:49 (149 MB/s) - ‘ID_to_taxonomy_mapping.csv’ saved [19705497/19705497]

--2024-05-06 20:44:49--  https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_clustered_t2g.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4561689 (4.3M) [text/plain]
Saving to: 

# Generate PalmDB/virus amino acid (--aa) reference index

In [8]:
%%time
!kb ref \
  --workflow custom \
  --aa \
  -t $threads \
  -i virus_index.idx \
  palmdb_rdrp_seqs.fa

[2024-05-06 20:45:09,813]    INFO [ref_custom] Indexing palmdb_rdrp_seqs.fa to virus_index.idx
[2024-05-06 20:46:56,217]    INFO [ref_custom] Finished creating custom index
CPU times: user 829 ms, sys: 179 ms, total: 1.01 s
Wall time: 2min 5s


# Generate a count matrix for each library using kallisto translated search (--aa)

In [9]:
%%time
out_folder = "aligned"
!mkdir $out_folder

for filename in glob.glob(f"{datafolder}/*.fastq.gz"):
    sample = filename.split("/")[-1].split(".")[0]

    !kb count \
        --aa \
        -t $threads \
        -i virus_index.idx \
        -g palmdb_clustered_t2g.txt \
        --h5ad \
        -x bulk \
        --parity single \
        -o $out_folder/$sample \
        $filename

[2024-05-06 20:47:08,891]    INFO [count] Using index virus_index.idx to generate BUS file to aligned/SRR14737466_1 from
[2024-05-06 20:47:08,892]    INFO [count]         raw_data/SRR14737466_1.fastq.gz
[2024-05-06 20:47:42,313]    INFO [count] Sorting BUS file aligned/SRR14737466_1/output.bus to aligned/SRR14737466_1/tmp/output.s.bus
[2024-05-06 20:47:44,725]    INFO [count] Inspecting BUS file aligned/SRR14737466_1/tmp/output.s.bus
[2024-05-06 20:47:45,833]    INFO [count] Generating count matrix aligned/SRR14737466_1/counts_unfiltered/cells_x_genes from BUS file aligned/SRR14737466_1/tmp/output.s.bus
[2024-05-06 20:47:47,343]    INFO [count] Writing gene names to file aligned/SRR14737466_1/counts_unfiltered/cells_x_genes.genes.names.txt
[2024-05-06 20:47:47,636]    INFO [count] Reading matrix aligned/SRR14737466_1/counts_unfiltered/cells_x_genes.mtx
[2024-05-06 20:47:47,766]    INFO [count] Writing matrix to h5ad aligned/SRR14737466_1/counts_unfiltered/adata.h5ad
[2024-05-06 20:47:5

# Add metadata to count matrices and combine them into a single count matrix

In [10]:
# Experimental conditions of each blank negative control sample as described in the original publication (linked above)
tech_lookup = {}
tech_lookup["SRR14737471"] = {
    "Sequencing Platform" : "Illumina Novaseq 6000 150 cycle kit (2 × 75 nt reads)",
    "RNA Extraction" : "RNeasy Plus Universal Kits (Qiagen, Hilden, Germany)",
    "Library Preparation" : "Trio RNA-seq + UDI (NuGEN)"
}
tech_lookup["SRR14737470"] = {
    "Sequencing Platform" : "Illumina Novaseq 6000 150 cycle kit (2 × 75 nt reads)",
    "RNA Extraction" : "RNeasy Plus Universal Kits (Qiagen, Hilden, Germany)",
    "Library Preparation" : "Trio RNA-seq + UDI (NuGEN)"
}
tech_lookup["SRR14737466"] = {
    "Sequencing Platform" : "Illumina MiSeq 150 cycle kit (2 × 75 nt reads)",
    "RNA Extraction" : "Total RNA purification Kit (Norgen BioTek, Thorold, ON, Canada)",
    "Library Preparation" : "SMARTer Stranded Total RNA-Seq Kit v2-Pico Input Mammalian (Clontech)"
}
tech_lookup["SRR14737469"] = {
    "Sequencing Platform" : "Illumina NextSeq 500, mid-output 150 cycle kit (2 × 75 nt reads)",
    "RNA Extraction" : "Total RNA Purification Kit (Norgen Biotek, Thorold, ON, Canada)",
    "Library Preparation" : "SMARTer Stranded Total RNA-Seq Kit v2-Pico Input Mammalian (Clontech)"
}

In [14]:
adatas = []
for filepath in sorted(glob.glob(f"{out_folder}/*")):
    adata_temp = anndata.read_h5ad(filepath + "/counts_unfiltered/adata.h5ad")

    # Add metadata to obs
    adata_temp.obs["Paired Read"] = filepath.split("/")[-1]
    srr = filepath.split("/")[-1].split("_")[0]
    adata_temp.obs["SRR"] = srr
    adata_temp.obs["Sequencing Platform"] = tech_lookup[srr]["Sequencing Platform"]
    adata_temp.obs["RNA Extraction"] = tech_lookup[srr]["Sequencing Platform"]
    adata_temp.obs["Library Preparation"] = tech_lookup[srr]["Library Preparation"]

    adatas.append(adata_temp)

In [15]:
# Concatenate anndata objects into a single object
adata = anndata.concat(adatas, join="outer")
adata.obs["Sequencing Platform (short)"] = [" ".join(i.split(" ")[:3]).replace(",", "") for i in adata.obs["Sequencing Platform"].values]
adata.obs["Total Reads"] = adata.X.sum(axis=1)
adata.obs = adata.obs.set_index("Paired Read", drop=True)
adata

  utils.warn_names_duplicates("obs")


AnnData object with n_obs × n_vars = 8 × 99228
    obs: 'SRR', 'Sequencing Platform', 'RNA Extraction', 'Library Preparation', 'Sequencing Platform (short)', 'Total Reads'

In [16]:
# Remove virus IDs that don't have at least one count
adata = adata[:, (adata.X).sum(axis=0) > 0]
adata

View of AnnData object with n_obs × n_vars = 8 × 2214
    obs: 'SRR', 'Sequencing Platform', 'RNA Extraction', 'Library Preparation', 'Sequencing Platform (short)', 'Total Reads'

In [17]:
# Save anndata object
adata.write("viral_sequences_in_laboratory_reagents.h5ad")

  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
