<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/Figure_3/Figure_3a/1_human_SARSCoV_validation_bulk_autopsy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Validation using lung autopsy samples from COVID-19 patients
Data from https://www.nature.com/articles/s41467-020-20139-7

In [1]:
# Number of threads passed via -t to `kb ref` (index building) and `kallisto bus` (alignment)
threads = 2

## Install software

In [2]:
# Install ffq (FTP link lookup), gget (reference download), kb_python (kb ref / matrix import) and anndata
!pip install -q ffq gget kb_python anndata

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.1/43.1 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m75.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.2/119.2 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.2/25.2 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m85.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.9/21.9 MB[0m [31m54.9 MB/s

In [3]:
# Install kallisto from source
!git clone -q https://github.com/pachterlab/kallisto.git
!cd kallisto && mkdir build && cd build && cmake .. && make

# Install bustools from source
!git clone -q https://github.com/BUStools/bustools.git
!cd bustools && mkdir build && cd build && cmake .. && make

# Define paths to kallisto and bustools binaries
kallisto = "/content/kallisto/build/src/kallisto"
bustools = "/content/bustools/build/src/bustools"

  Compatibility with CMake < 3.5 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value or use a ...<max> suffix to tell
  CMake that the project does not need compatibility with older versions.

[0m
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Performing Test COMPILER_SUPPORTS_CXX17
-- Performing Test COMPILER_SUPPORTS_CXX17 - Success
[0mshared build[0m
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE  
-- Found ZLIB: /us

## Download data

In [4]:
import json
import glob

# Get ftp download links for raw data with ffq and store results in json file
!ffq GSE150316 \
    --ftp \
    -o ffq.json

# Load ffq output
f = open("ffq.json")
data_json = json.load(f)
f.close()

# Download raw data using FTP links fetched by ffq
for dataset in data_json:
    url = dataset["url"]
    !curl -O $url

[2023-12-13 05:03:19,824]    INFO Parsing GEO GSE150316
[2023-12-13 05:03:20,012]    INFO Finding supplementary files for GEO GSE150316
[2023-12-13 05:03:22,347]    INFO Parsing GSM GSM4546576
[2023-12-13 05:03:22,671]    INFO Finding supplementary files for GSM GSM4546576
[2023-12-13 05:03:23,793]    INFO No supplementary files found for GSM4546576
[2023-12-13 05:03:25,219]    INFO Getting sample for GSM4546576
[2023-12-13 05:03:26,599]    INFO Parsing sample SRS6645124
[2023-12-13 05:03:28,082]    INFO Getting Experiment for SRS6645124
[2023-12-13 05:03:28,083]    INFO Parsing Experiment SRX8325539
[2023-12-13 05:03:28,090]    INFO Parsing run SRR11772358
[2023-12-13 05:03:30,203]    INFO Parsing GSM GSM4546577
[2023-12-13 05:03:30,659]    INFO Finding supplementary files for GSM GSM4546577
[2023-12-13 05:03:31,698]    INFO No supplementary files found for GSM4546577
[2023-12-13 05:03:32,158]    INFO Getting sample for GSM4546577
[2023-12-13 05:03:32,803]    INFO Parsing sample SRS66

## Download optimized PalmDB reference files

In [5]:
# Download the ID to taxonomy mapping (read later into tax_df to look up virus IDs by species)
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/ID_to_taxonomy_mapping.csv
# Download the customized transcripts to gene mapping (passed to `bustools count -g`)
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_clustered_t2g.txt
# Download the RdRP amino acid sequences (the FASTA file indexed by `kb ref`)
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_rdrp_seqs.fa

--2023-12-13 05:35:54--  https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/ID_to_taxonomy_mapping.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19705497 (19M) [text/plain]
Saving to: ‘ID_to_taxonomy_mapping.csv’


2023-12-13 05:35:55 (140 MB/s) - ‘ID_to_taxonomy_mapping.csv’ saved [19705497/19705497]

--2023-12-13 05:35:55--  https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_clustered_t2g.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4561689 (4.3M) [text/plain]
Saving to: 

## Build virus reference index from PalmDB amino acid sequences and mask host (here, human) sequences
You can find the kb manual and tutorials [here](https://www.kallistobus.tools/).

The --aa argument tells kb that this is an amino acid reference.

The --d-list argument is the path to the host transcriptome. These sequences will be masked in the index. Here, we are using gget to fetch the human genome and transcriptome (release 110).

We are using --workflow custom here since we do not have a .gtf file for the PalmDB fasta file.

Building the index will take some time, since the human genome is quite large (estimated ~20 min; the run recorded below took about 2 h using 2 threads).

In [6]:
# Download the human cDNA (transcriptome) and DNA (genome) FASTA files from Ensembl release 110 with gget
!gget ref -r 110 -w cdna,dna -d human

# Concatenate human genome and transcriptome into one file (passed to `kb ref --d-list`)
!cat Homo_sapiens.GRCh38.cdna.all.fa.gz Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz > Homo_sapiens.GRCh38.cdna_dna.fa.gz

Wed Dec 13 05:36:03 2023 INFO Fetching reference information for homo_sapiens from Ensembl release: 110.
{
    "homo_sapiens": {
        "transcriptome_cdna": {
            "ftp": "http://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz",
            "ensembl_release": 110,
            "release_date": "2023-04-22",
            "release_time": "04:25",
            "bytes": "75M"
        },
        "genome_dna": {
            "ftp": "http://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz",
            "ensembl_release": 110,
            "release_date": "2023-04-21",
            "release_time": "17:28",
            "bytes": "841M"
        }
    }
}
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 75.2M  100 75.2M    0     0   770k      0  0:01:39  0:01:39 --:--:--  780k
  % Total    

In [7]:
%%time
# Build index.idx from the PalmDB RdRP amino acid sequences; the concatenated human
# genome + transcriptome file is supplied via --d-list
!kb ref \
  --workflow custom \
  --aa \
  --d-list Homo_sapiens.GRCh38.cdna_dna.fa.gz \
  -t $threads \
  -i index.idx \
  palmdb_rdrp_seqs.fa

[2023-12-13 05:56:52,934]    INFO [ref_custom] Indexing palmdb_rdrp_seqs.fa to index.idx
[2023-12-13 08:03:58,517]    INFO [ref_custom] Finished creating custom index
CPU times: user 32 s, sys: 4.6 s, total: 36.6 s
Wall time: 2h 7min 19s


## Align data using kallisto translated search

In [8]:
# Base names of all gzipped FASTQ files in the working directory, sorted alphabetically
fastqs = sorted(path.split("/")[-1] for path in glob.glob("*.fastq.gz"))
fastqs

['SRR11772358_1.fastq.gz',
 'SRR11772358_2.fastq.gz',
 'SRR11772359_1.fastq.gz',
 'SRR11772359_2.fastq.gz',
 'SRR11772360_1.fastq.gz',
 'SRR11772360_2.fastq.gz',
 'SRR11772361_1.fastq.gz',
 'SRR11772361_2.fastq.gz',
 'SRR11772362_1.fastq.gz',
 'SRR11772362_2.fastq.gz',
 'SRR11772363_1.fastq.gz',
 'SRR11772363_2.fastq.gz',
 'SRR11772364_1.fastq.gz',
 'SRR11772364_2.fastq.gz',
 'SRR11772365_1.fastq.gz',
 'SRR11772365_2.fastq.gz',
 'SRR11772366_1.fastq.gz',
 'SRR11772366_2.fastq.gz',
 'SRR11772367_1.fastq.gz',
 'SRR11772367_2.fastq.gz',
 'SRR11772368_1.fastq.gz',
 'SRR11772368_2.fastq.gz',
 'SRR11772369_1.fastq.gz',
 'SRR11772369_2.fastq.gz',
 'SRR11772370_1.fastq.gz',
 'SRR11772370_2.fastq.gz',
 'SRR11772371_1.fastq.gz',
 'SRR11772371_2.fastq.gz',
 'SRR11772372_1.fastq.gz',
 'SRR11772372_2.fastq.gz',
 'SRR11772373_1.fastq.gz',
 'SRR11772373_2.fastq.gz',
 'SRR11772374_1.fastq.gz',
 'SRR11772374_2.fastq.gz',
 'SRR11772375_1.fastq.gz',
 'SRR11772375_2.fastq.gz',
 'SRR11772376_1.fastq.gz',
 

In [None]:
%%time
for fastq in fastqs:
    sample = fastq.split(".fastq.gz")[0]

    !mkdir -p $sample

    !$kallisto bus \
            --aa \
            -i index.idx \
            -o $sample \
            -t $threads \
            -x bulk \
            $fastq

    !$bustools sort -o $sample/output_sorted.bus $sample/output.bus

    !$bustools count \
        --genecounts \
        --cm \
        -o $sample/bustools_count \
        -g palmdb_clustered_t2g.txt \
        -e $sample/matrix.ec \
        -t $sample/transcripts.txt \
        $sample/output_sorted.bus


[index] k-mer length: 31
[index] number of targets: 296,561
[index] number of k-mers: 37,541,757
[index] number of D-list k-mers: 48,235
[quant] running in single-end mode
[quant] will process file 1: SRR11772358_1.fastq.gz
[progress] 25M reads processed (0.0% mapped)              done
[quant] processed 25,767,655 reads, 5,608 reads pseudoaligned

 all fits in buffer
Read in 5608 BUS records
reading time 6.6e-05s
sorting time 0.000475s
writing time 0.000141s

[index] k-mer length: 31
[index] number of targets: 296,561
[index] number of k-mers: 37,541,757
[index] number of D-list k-mers: 48,235
[quant] running in single-end mode
[quant] will process file 1: SRR11772358_2.fastq.gz
[progress] 25M reads processed (0.0% mapped)              done
[quant] processed 25,767,655 reads, 5,976 reads pseudoaligned

 all fits in buffer
Read in 5976 BUS records
reading time 7.7e-05s
sorting time 0.00061s
writing time 0.000226s

[index] k-mer length: 31
[index] number of targets: 296,561
[index] numb

## Plot virus counts

Create adata objects from count matrices:

In [None]:
import kb_python.utils as kb_utils
import anndata
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib.colors
%config InlineBackend.figure_format='retina'

def nd(arr):
    """
    Flatten an array-like (e.g. a numpy matrix) into a 1-D numpy ndarray.
    """
    as_array = np.asarray(arr)
    return np.reshape(as_array, (-1,))

In [None]:
# Build one AnnData object per FASTQ file from its bustools count output
adatas = []
for fastq in fastqs:
    # Derive the sample name (= output directory) from the FASTQ file name
    sample = fastq.split(".fastq.gz")[0]

    # Filepath to counts
    X = f"{sample}/bustools_count/output.mtx"
    # Filepath to gene metadata
    var_path = f"{sample}/bustools_count/output.genes.txt"
    # Filepath to barcode metadata
    obs_path = f"{sample}/bustools_count/output.barcodes.txt"

    # Create AnnData object (arguments passed as: matrix, barcodes file, genes file)
    adata = kb_utils.import_matrix_as_anndata(X, obs_path, var_path)

    # Add sample name
    adata.obs["sample"] = sample

    # Append to adata list
    adatas.append(adata)

In [None]:
# Concatenate the per-file AnnData objects collected in adatas into a single object
adata = anndata.concat(adatas)
adata

In [None]:
# Use the sample name as the obs index. set_index moves the "sample" column into
# the index, so re-running this cell without rebuilding adata raises a KeyError.
adata.obs = adata.obs.set_index("sample")
adata.obs

In [None]:
# Load library metadata (run table; its "Run", "case" and "sample_case" columns are used below)
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/Figure_3/Figure_3a/human_SARSCoV_validation_bulk_autopsy_SraRunTable.txt
metadata = pd.read_csv("human_SARSCoV_validation_bulk_autopsy_SraRunTable.txt", sep=",")

In [None]:
# Add case ID and read mate (R1/R2) to adata.obs.
# Index the metadata by run accession once so each lookup is a direct access
# instead of re-filtering the whole table; keeping the first row per run
# matches taking the first match of a filter.
run_metadata = metadata.drop_duplicates(subset="Run").set_index("Run")

cases = []
reads = []
for srr in adata.obs.index:
    # Sample names have the form "<run accession>_<mate>"
    srr_temp = srr.split("_")[0]

    # Get case ID; fall back to "sample_case" when "case" is missing
    case_id = run_metadata.at[srr_temp, "case"]
    if pd.isna(case_id):
        case_id = run_metadata.at[srr_temp, "sample_case"]
    cases.append(str(case_id))

    # Test the suffix so that a "_1" elsewhere in the name cannot be mistaken for the mate
    reads.append("R1" if srr.endswith("_1") else "R2")

adata.obs["case"] = cases
adata.obs["read"] = reads

Load RNAish % from paper:

In [None]:
# Download the table with the RNA-ISH viral load values
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/Figure_3/Figure_3a/human_SARSCoV_validation_bulk_autopsy_rnaish.csv

In [None]:
# The file is parsed as tab-separated despite its .csv extension
rna_ish = pd.read_csv("human_SARSCoV_validation_bulk_autopsy_rnaish.csv", sep="\t")
# Cast case IDs to strings so they can be compared with the string case IDs in adata.obs["case"]
rna_ish["case_clean"] = rna_ish["case_clean"].values.astype(str)
rna_ish

Load virus ID to sOTU mapping:

In [None]:
# Load the ID-to-taxonomy table and display the rows whose species name
# contains "Severe acute respiratory syndrome"
tax_df = pd.read_csv("ID_to_taxonomy_mapping.csv")
tax_df[tax_df["species"].str.contains("Severe acute respiratory syndrome")]

Plot:

In [None]:
# Case IDs in the order in which they appear in the RNA-ISH table
case_list = rna_ish["case_clean"].values

In [None]:
# Scatter plot of kallisto SARS-CoV raw counts (mean of the R1 and R2 alignments,
# with error bars spanning the two values) against the RNA-ISH viral load per case.
fig, ax = plt.subplots(figsize=(6, 7))
fontsize = 16

# rep_IDs of all entries annotated as SARS-related coronavirus.
# regex=False matches the species name literally; na=False treats rows without a
# species annotation as non-matches instead of producing an unusable NaN mask.
is_sars_cov = tax_df["species"].str.contains(
    "Severe acute respiratory syndrome-related coronavirus", regex=False, na=False
)
target_ids = tax_df[is_sars_cov]["rep_ID"].values

# The column selection is the same for every case, so build it once
target_mask = adata.var.index.isin(target_ids)

kb_counts = []
kb_errs = []
rnaish_counts = []
# Not including Control since there are no RNAish% values for control
# (this relies on the control being the first entry of case_list)
for case in case_list[1:]:
    is_case = adata.obs["case"] == case
    kb_count_r1 = adata[is_case & (adata.obs["read"] == "R1"), target_mask].X
    kb_count_r2 = adata[is_case & (adata.obs["read"] == "R2"), target_mask].X

    # Total SARS-CoV counts obtained from each mate, and their mean
    kb_c = [kb_count_r1.sum(), kb_count_r2.sum()]
    count_mean = np.mean(kb_c)
    kb_counts.append(count_mean)

    # Asymmetric error bar in (2, 1) layout: [[distance to min], [distance to max]]
    kb_err = [
        [count_mean - np.min(kb_c)],
        [np.max(kb_c) - count_mean],
    ]
    kb_errs.append(kb_err)

    rnaish_count = rna_ish[rna_ish["case_clean"] == case]["Viral load% by RNA ISH"].values[0]
    rnaish_counts.append(rnaish_count)

    ax.errorbar(count_mean, rnaish_count, xerr=kb_err, c="black", ls="none", elinewidth=1)
    ax.scatter(count_mean, rnaish_count, c="#003049", edgecolors="black", s=250, zorder=2)

ax.set_ylabel("RNA-ISH (SARS-CoV %)", fontsize=fontsize)
ax.set_xlabel(
    "kallisto\n(raw counts for SARS-CoV)",
    fontsize=fontsize,
)

# Annotate with the number of cases actually plotted (position is in data coordinates)
ax.text(5150, 1.5, f"n={len(kb_counts)}", fontsize=fontsize)

ax.tick_params(axis="both", labelsize=fontsize)
ax.set_title("Lung autopsy samples\nfrom COVID-19 patients", fontsize=fontsize+2)

ax.grid(True, which="both", color="lightgray", ls="--", lw=1)
ax.set_axisbelow(True)

# Save before showing so the written file contains the finished figure
plt.savefig("bulk_validation_PRJNA631753.png", dpi=300, bbox_inches="tight")

# plt.show() renders with the inline backend; Figure.show() warns on non-GUI backends
plt.show()