<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/Figure_3%20/Figure_3a/1_human_SARSCoV_validation_bulk_autopsy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Validation using lung autopsy samples from COVID-19 patients
Data from https://www.nature.com/articles/s41467-020-20139-7

In [None]:
# Number of threads passed via -t to `kb ref` and `kallisto bus` in the cells below
threads = 20 # Change to 2 if not using TPU runtime

## Install software

In [None]:
# Install the tools used in this notebook: ffq (fetches the FTP links of the raw data),
# gget (fetches the human reference) and kb_python (provides the `kb ref` command)
!pip install -q ffq gget kb_python

## Download data

In [None]:
import json
import glob

# Get ftp download links for raw data with ffq and store results in json file
!ffq GSE150316 \
    --ftp \
    -o ffq.json

# Load ffq output
f = open("ffq.json")
data_json = json.load(f)
f.close()

# Download raw data using FTP links fetched by ffq
for dataset in data_json:
    url = dataset["url"]
    !curl -O $url

[2023-12-08 23:49:05,833]    INFO Parsing GSM GSM4698570
[2023-12-08 23:49:06,158]    INFO Finding supplementary files for GSM GSM4698570
[2023-12-08 23:49:07,315]    INFO No supplementary files found for GSM4698570


## Download optimized PalmDB reference files

In [None]:
# Download the ID to taxonomy mapping
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/ID_to_taxonomy_mapping.csv
# Download the customized transcripts to gene mapping
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_clustered_t2g.txt
# Download the RdRP amino acid sequences
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_rdrp_seqs.fa

## Build virus reference index from PalmDB amino acid sequences and mask host (here, human) sequences
You can find the kb manual and tutorials [here](https://www.kallistobus.tools/).

The --aa argument tells kb that this is an amino acid reference.

The --d-list argument is the path to the host sequences; these sequences will be masked in the index. Here, we are using gget to fetch the human genome and transcriptome (release 110) and concatenate them into a single file.

We are using --workflow custom here since we do not have a .gtf file for the PalmDB fasta file.

Building the index will take some time (~20 min), since the human genome is quite large.

In [None]:
# Fetch the human transcriptome (cdna) and genome (dna) for release 110;
# the resulting .fa.gz files are expected in the working directory (used by `cat` below)
!gget ref -r 110 -w cdna,dna -d human

# Concatenate human genome and transcriptome into one file
# (this combined file is what `kb ref` receives via --d-list)
!cat Homo_sapiens.GRCh38.cdna.all.fa.gz Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz > Homo_sapiens.GRCh38.cdna_dna.fa.gz

In [None]:
%%time
# Build the amino acid index (index.idx) from the PalmDB RdRP sequences.
# --aa marks the reference as amino acid sequences, --workflow custom is used because
# there is no .gtf for the PalmDB fasta, and the human sequences given to --d-list are masked.
!kb ref \
  --workflow custom \
  --aa \
  --d-list Homo_sapiens.GRCh38.cdna_dna.fa.gz \
  -t $threads \
  -i index.idx \
  palmdb_rdrp_seqs.fa

## Align data using kallisto translated search

In [None]:
# Names (without any directory part) of all gzipped FASTQ files in the
# working directory, in sorted order
fastqs = sorted(path.split("/")[-1] for path in glob.glob("*.fastq.gz"))
fastqs

In [None]:
%%time
for fastq in fastqs:
    sample = fastq.split(".fastq.gz")[0]

    !mkdir -p $sample

    !$kallisto bus \
            --aa \
            -i index.idx \
            -o $sample \
            -t $threads \
            -x bulk \
            $fastq

    !$bustools sort -o $sample/output_sorted.bus $sample/output.bus

    !$bustools count \
        --genecounts \
        --cm \
        -o $sample/bustools_count \
        -g palmdb_clustered_t2g.txt \
        -e $sample/matrix.ec \
        -t $sample/transcripts.txt \
        $sample/output_sorted.bus

## Plot virus counts

Create adata objects from count matrices:

In [None]:
# kb_python is already installed in the "Install software" cell; this repeat is
# presumably kept so the plotting section can run in a fresh runtime -- TODO confirm
!pip install -q kb_python

import kb_python.utils as kb_utils
import anndata
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib.colors
%config InlineBackend.figure_format='retina'

def nd(arr):
    """
    Flatten an array-like input (e.g. a numpy matrix) into a 1-D numpy array.
    """
    as_array = np.asarray(arr)
    return as_array.ravel()

In [None]:
def _load_sample_adata(fastq):
    """
    Load the count matrix of one FASTQ file as an AnnData object and label
    every row with the sample name.
    """
    # Sample name = FASTQ file name without the ".fastq.gz" extension
    sample_name = fastq.split(".fastq.gz")[0]

    # Count matrix
    matrix_path = f"{sample_name}/bustools_count/output.mtx"
    # Barcode list -> rows (obs) of the AnnData object
    barcodes_path = f"{sample_name}/bustools_count/output.barcodes.txt"
    # Gene list -> columns (var) of the AnnData object
    genes_path = f"{sample_name}/bustools_count/output.genes.txt"

    sample_adata = kb_utils.import_matrix_as_anndata(matrix_path, barcodes_path, genes_path)

    # Remember which sample each row came from
    sample_adata.obs["sample"] = sample_name
    return sample_adata


# One AnnData object per FASTQ file, in the same order as `fastqs`
adatas = [_load_sample_adata(fastq) for fastq in fastqs]

In [None]:
# Concatenate AnnData objects
adata = anndata.concat(adatas)
adata

In [None]:
# Set sample as index and drop unnecessary columns
adata.obs = adata.obs.set_index("sample")
adata.obs

In [None]:
# Load library metadata
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/Figure_3/Figure_3a/human_SARSCoV_validation_bulk_autopsy_SraRunTable.txt
metadata = pd.read_csv("human_SARSCoV_validation_bulk_autopsy_SraRunTable.txt", sep=",")

In [None]:
# Add case ID from metadata to adata
cases = []
reads = []
for srr in adata.obs.index:
    srr_temp = srr.split("_")[0]

    # Get case ID
    if str(metadata[metadata["Run"] == srr_temp]["case"].values[0]) != "nan":
        cases.append(str(metadata[metadata["Run"] == srr_temp]["case"].values[0]))
    else:
        cases.append(str(metadata[metadata["Run"] == srr_temp]["sample_case"].values[0]))

    if "_1" in srr:
        reads.append("R1")
    else:
        reads.append("R2")

adata.obs["case"] = cases
adata.obs["read"] = reads

Load RNAish % from paper:

In [None]:
# Download the table of RNA-ISH viral load percentages (read into `rna_ish` in the next cell)
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/Figure_3/Figure_3a/human_SARSCoV_validation_bulk_autopsy_rnaish.csv

In [None]:
# Read the RNA-ISH table; the file is parsed as tab-separated even though it is named .csv
rna_ish = pd.read_csv("human_SARSCoV_validation_bulk_autopsy_rnaish.csv", sep="\t")
# Cast the case IDs to strings so they can be compared with the string case IDs in adata.obs["case"]
rna_ish["case_clean"] = rna_ish["case_clean"].values.astype(str)
rna_ish

Load virus ID to sOTU mapping:

In [None]:
# Load the ID to taxonomy mapping and display the rows whose species name mentions
# "Severe acute respiratory syndrome"; the plotting cell selects virus IDs from the
# "rep_ID" column of this table
tax_df = pd.read_csv("ID_to_taxonomy_mapping.csv")
tax_df[tax_df["species"].str.contains("Severe acute respiratory syndrome")]

Plot:

In [None]:
# Case IDs in the order they appear in the RNA-ISH table; the plotting cell skips
# the first entry, which it treats as the Control
case_list = rna_ish["case_clean"].values

In [None]:
fig, ax = plt.subplots(figsize=(6, 7))
fontsize = 16

# IDs of all reference sequences annotated as SARS-related coronavirus
is_sars_species = tax_df["species"].str.contains("Severe acute respiratory syndrome-related coronavirus")
target_ids = tax_df[is_sars_species]["rep_ID"].values

# Column mask selecting those IDs in the count matrix (identical for every case)
is_target = adata.var.index.isin(target_ids)

kb_counts = []
kb_errs = []
rnaish_counts = []
# Not including Control since there are no RNAish% values for control
for case in case_list[1:]:
    is_case = adata.obs["case"] == case

    # Total SARS-CoV counts of this case, separately for the R1 and the R2 files
    kb_c = [
        adata[is_case & (adata.obs["read"] == read), is_target].X.sum()
        for read in ("R1", "R2")
    ]
    count_mean = np.mean(kb_c)
    kb_counts.append(count_mean)

    # Asymmetric error bar reaching from the smaller to the larger of the two totals
    kb_err = [
        [count_mean - np.min(kb_c)],
        [np.max(kb_c) - count_mean],
    ]
    kb_errs.append(kb_err)

    # RNA-ISH viral load reported for the same case
    rnaish_count = rna_ish.loc[rna_ish["case_clean"] == case, "Viral load% by RNA ISH"].values[0]
    rnaish_counts.append(rnaish_count)

    # One point per case: mean count on x, RNA-ISH % on y, error bar along x
    ax.errorbar(count_mean, rnaish_count, xerr=kb_err, c="black", ls="none", elinewidth=1)
    ax.scatter(count_mean, rnaish_count, c="#003049", edgecolors="black", s=250, zorder=2)

# ax.set_yscale("symlog")
# ax.set_xscale("symlog")
ax.set_xlabel(
    "kallisto\n(raw counts for SARS-CoV)",
    fontsize=fontsize,
)
ax.set_ylabel("RNA-ISH (SARS-CoV %)", fontsize=fontsize)

# Sample size label, placed at fixed data coordinates
ax.text(5150, 1.5, "n=23", fontsize=fontsize)

# Add diagonal
# ax.plot([0, 1], [0, 1], transform=ax.transAxes, c="black", ls="-", lw=1, zorder=1)

ax.set_title("Lung autopsy samples\nfrom COVID-19 patients", fontsize=fontsize+2)
ax.tick_params(axis="both", labelsize=fontsize)

# plt.tight_layout()

# Dashed light-gray grid drawn behind the data points
ax.set_axisbelow(True)
ax.grid(True, which="both", color="lightgray", ls="--", lw=1)

plt.savefig("bulk_benchmark_PRJNA631753.png", dpi=300, bbox_inches="tight")

fig.show()