<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/Supp_Fig_4/Supp_Fig_4c/show_primer_bias_splitcode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Align SPLIT-Seq data from lung samples from mice infected with SARS-CoV-2
Reference: https://doi.org/10.1038/s41586-022-05344-2

### Install software

In [None]:
!pip install -q kb_python anndata
import numpy as np
from scipy import stats
import anndata
import pandas as pd
import scanpy as sc
import json
import os
import glob
import matplotlib.pyplot as plt
import matplotlib as mpl
%config InlineBackend.figure_format='retina'

def nd(arr):
    """
    Flatten any array-like (including numpy matrices) into a 1-D ndarray.
    """
    flat = np.asarray(arr).ravel()
    return flat

In [None]:
# Install kallisto from source
!git clone -q https://github.com/pachterlab/kallisto.git
!cd kallisto && mkdir build && cd build && cmake .. && make

# Define paths to kallisto and bustools binaries
kallisto = "/content/kallisto/build/src/kallisto"

In [None]:
# Download the customized transcripts to gene mapping
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_clustered_t2g.txt
# Download the RdRP amino acid sequences
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/palmdb_rdrp_seqs.fa

virus_fasta = "palmdb_rdrp_seqs.fa"
virus_t2g = "palmdb_clustered_t2g.txt"

In [None]:
# Number of threads to use in alignment
# (passed via -t to both the kallisto index and the kb count commands below)
threads = 2

### Download raw data

In [None]:
!pip install -q ffq
import json

out = "data.json"

!ffq GSE199498 --ftp -o $out

f = open(out)
data = json.load(f)
f.close()

print(len(data))

for dataset in data:
    url = dataset["url"]
    !curl -O $url

### Align data to PalmDB using kallisto translated search

Generate virus index with masked host (here, mouse) genome and transcriptome sequences:

In [None]:
# Get host genomes and concatenate them into a single file
!pip install -q gget
!gget ref -w cdna,dna -r 110 -d mouse
host_cdna = "Mus_musculus.GRCm39.cdna.all.fa.gz"
host_dna = "Mus_musculus.GRCm39.dna.primary_assembly.fa.gz"

host_combined = "combined.cdna_dna.all.fa.gz"
!cat $host_cdna $host_dna > $host_combined

In [None]:
# Generate virus reference index
virus_index = "virus_index.idx"

!/usr/bin/time -v kallisto index \
    --aa \
    -t $threads \
    --d-list $host_combined \
    -i $virus_index \
    $virus_fasta

Get fastq files:

In [None]:
import os
import glob

In [None]:
# Sorted base names of all gzipped FASTQ files in the working directory
fastqs = sorted(path.split("/")[-1] for path in glob.glob("*.fastq.gz"))
fastqs

In [None]:
# Number of FASTQ files found
len(fastqs)

In [None]:
# Sample name = part of each FASTQ file name before the first underscore
samples = [fastq_name.split("_")[0] for fastq_name in fastqs]

In [None]:
# Deduplicate the sample names (one entry per sample instead of one per FASTQ file).
# Sorting makes the processing order reproducible between runs; a plain
# list(set(...)) has no guaranteed order.
samples = sorted(set(samples))
len(samples)

Align data to PalmDB:  
The SPLIT-Seq barcode onlist files (r1_RT_replace.txt and r1r2r3.txt) were provided by Delaney Sullivan (07/15/2023).

In [None]:
# Download SPLIT-Seq barcode onlist files
# (passed to kb count below: r1_RT_replace.txt via -r, r1r2r3.txt via -w)
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/Supp_Fig_4/Supp_Fig_4c/r1_RT_replace.txt
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/Supp_Fig_4/Supp_Fig_4c/r1r2r3.txt

In [None]:
%%time
# Run kb count once per sample, writing each sample's results into its own
# sub-folder of out_folder. Relies on samples, threads, virus_index and
# virus_t2g defined in earlier cells.
out_folder = "palmdb"
for sample in samples:
    # Paired reads are expected to be named <sample>_1.fastq.gz / <sample>_2.fastq.gz
    fastq1 = sample + "_1.fastq.gz"
    fastq2 = sample + "_2.fastq.gz"

    !mkdir -p $out_folder/$sample

    # Uses the index and t2g generated/downloaded above together with the
    # SPLIT-Seq barcode files; --h5ad is presumably what produces the
    # counts_unfiltered/adata.h5ad file loaded in the QC section below.
    !kb count \
        --aa \
        --h5ad \
        -t $threads \
        -i $virus_index \
        -g $virus_t2g \
        -x SPLIT-Seq \
        -r r1_RT_replace.txt \
        -w r1r2r3.txt \
        -o $out_folder/$sample/ \
        $fastq1 $fastq2

## Data QC

In [None]:
# Collect the unique, sorted run accessions
# (file name prefix before the first underscore of every FASTQ file)
srrs = sorted({path.split("/")[-1].split("_")[0] for path in glob.glob("*.fastq.gz")})

print(len(srrs))
srrs

Add the primer type and well to each barcode by matching the last 8 bases of the barcode against the barcode-to-well mapping:

In [None]:
# Download barcode to well mapping
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/Supp_Fig_4/Supp_Fig_4c/splitseq_bc2well.csv
# The merge in the next cell joins on this table's "sequence" column;
# presumably it also provides the "well" and "type" columns used further below.
bc2well = pd.read_csv("splitseq_bc2well.csv")
bc2well

In [None]:
# Load the count matrix of every run and annotate its barcodes with the
# well/primer information from bc2well.
palmdb_adatas = []
for srr in srrs:
    # Load data (read_h5ad is the supported reader; anndata.read is a deprecated alias)
    adata = anndata.read_h5ad(f"{out_folder}/{srr}/counts_unfiltered/adata.h5ad")

    # Add sample name
    adata.obs["srr"] = srr

    # Create new column containing identifying barcode (last 8 bases)
    adata.obs["iden_bc"] = adata.obs.index.str[-8:]

    # Create copy of barcode column (merge discards the index, so keep it as a column)
    adata.obs["barcode"] = adata.obs.index

    # Merge well and primer information, then restore the barcode index
    adata.obs = adata.obs.merge(bc2well, left_on="iden_bc", right_on="sequence", how="left").set_index("barcode", drop=False)

    # Append to adata list
    palmdb_adatas.append(adata)

In [None]:
## Concatenate datasets
# Concatenate all objects in a single call instead of pairwise in a loop:
# this avoids re-copying the growing object on every iteration and gives each
# dataset one batch suffix on its barcode index rather than nested suffixes.
# The unmodified barcodes remain available in the "barcode" column.
# NOTE(review): AnnData.concatenate is deprecated in favour of anndata.concat;
# consider migrating once the differing var/obs merge semantics are confirmed.
palmdb_adata = palmdb_adatas[0].concatenate(
    *palmdb_adatas[1:],
    join='outer',
    batch_key='sample_index',
    batch_categories=None,
    uns_merge=None,
    fill_value=np.nan,
)

palmdb_adata

In [None]:
# Remove columns that are no longer needed after merging and concatenation
palmdb_adata.obs = palmdb_adata.obs.drop(columns=["sequence", "sample_index"])
palmdb_adata.obs

Add metadata:

In [None]:
# Load first set of metadata provided by authors
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/Supp_Fig_4/Supp_Fig_4c/GSM5974202_sample_well_IDs.tsv
meta1 = pd.read_csv("GSM5974202_sample_well_IDs.tsv", sep='\t')
meta1["well_ID_list"] = meta1["well_ID"]
meta1["well_ID"] = meta1["well_ID_list"].str.split("-")
meta1 = meta1.explode("well_ID")
meta1.head()

In [None]:
# Copy current index
# (kept as a regular column so it can be restored with set_index("barcode_idx")
# after the metadata merges below)
palmdb_adata.obs["barcode_idx"] = palmdb_adata.obs.index.values

In [None]:
# Attach the per-well sample information to every barcode (left join keeps all barcodes)
obs_with_wells = palmdb_adata.obs.merge(meta1, how="left", left_on="well", right_on="well_ID")
palmdb_adata.obs = obs_with_wells.drop(columns="well_ID_list")
palmdb_adata.obs

In [None]:
# Load second set of metadata provided by authors
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/Supp_Fig_4/Supp_Fig_4c/GSM5974202_README.txt
# Read as a tab-separated table; its "Sample_name" column is the join key used below
meta2 = pd.read_csv("GSM5974202_README.txt", sep='\t')
meta2.head()

In [None]:
# Join the second metadata table on the sample name, then restore the
# barcode index that was saved in the barcode_idx column
obs_with_meta = palmdb_adata.obs.merge(meta2, how="left", left_on="sample", right_on="Sample_name")
palmdb_adata.obs = obs_with_meta.set_index("barcode_idx")
palmdb_adata.obs

In [None]:
# Note: More than half of the wells are expected to be empty, since they are not
# included in the look-up table provided by the authors.
# Display only the rows that have no missing values (the result is not stored).
palmdb_adata.obs.dropna()

## Plot fractions of virus counts detected per primer type

In [None]:
# Download virus ID to sOTU mapping
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/PalmDB/ID_to_taxonomy_mapping.csv
# The cells below select virus IDs from this table by its "virus_type" and "species" columns
id2tax_df = pd.read_csv("ID_to_taxonomy_mapping.csv")
id2tax_df.head()

In [None]:
# Minimum count
# Only matrix entries strictly greater than this value are included in the sums below
threshold_per_cell = 0

In [None]:
# Get the fraction of virus counts per virus strandedness that was captured by
# each primer type (sums of counts, not numbers of positive cells)
vtypes = ['+ssRNA', '-ssRNA', 'dsRNA', 'dsDNA', '+ssRNA or dsRNA']
vlabels = ['+ssRNA', '-ssRNA', 'dsRNA', 'dsDNA', '+ssRNA\nor dsRNA']

# Row masks do not depend on the virus type, so build them once:
# barcodes from infected samples, split by primer type
is_infected = palmdb_adata.obs["condition"] == "infected"
polyT_rows = (palmdb_adata.obs["type"] == "T") & is_infected
random_rows = (palmdb_adata.obs["type"] == "R") & is_infected

t_counts = []
r_counts = []
for vt in vtypes:
    # NOTE(review): IDs are taken from the "Label" column here, while the
    # SARS-CoV-2 cell below uses "rep_ID" — confirm both match var.index.
    target_ids = id2tax_df[id2tax_df["virus_type"] == vt]["Label"].values
    target_cols = palmdb_adata.var.index.isin(target_ids)

    # Get counts for poly T primer
    tc = nd(palmdb_adata[polyT_rows, target_cols].X.todense())
    tc = tc[tc > threshold_per_cell]

    # Get counts for random primers
    rc = nd(palmdb_adata[random_rows, target_cols].X.todense())
    rc = rc[rc > threshold_per_cell]

    # Sum each group once; if no counts were found at all the fraction is
    # undefined, so store NaN explicitly instead of dividing by zero
    t_sum, r_sum = np.sum(tc), np.sum(rc)
    total = t_sum + r_sum
    t_counts.append(t_sum / total if total != 0 else np.nan)
    r_counts.append(r_sum / total if total != 0 else np.nan)

In [None]:
# Get the fraction of SARS-CoV-2 counts that was captured by each primer type
# (sums of counts, not numbers of positive cells)
species = [
    "Severe acute respiratory syndrome-related coronavirus",
]

labels = [
    "Severe acute respiratory\nsyndrome-related\ncoronavirus",
]

# Row masks do not depend on the species, so build them once:
# barcodes from infected samples, split by primer type
is_infected = palmdb_adata.obs["condition"] == "infected"
polyT_rows = (palmdb_adata.obs["type"] == "T") & is_infected
random_rows = (palmdb_adata.obs["type"] == "R") & is_infected

t_counts_sars = []
r_counts_sars = []
total_count = []
for sp in species:
    # NOTE(review): IDs are taken from the "rep_ID" column here, while the
    # strandedness cell above uses "Label" — confirm both match var.index.
    target_ids = id2tax_df[id2tax_df["species"] == sp]["rep_ID"].values
    target_cols = palmdb_adata.var.index.isin(target_ids)

    # Get counts for poly T primer
    tc = nd(palmdb_adata[polyT_rows, target_cols].X.todense())
    tc = tc[tc > threshold_per_cell]

    # Get counts for random primers
    rc = nd(palmdb_adata[random_rows, target_cols].X.todense())
    rc = rc[rc > threshold_per_cell]

    # Sum each group once; if no counts were found at all the fraction is
    # undefined, so store NaN explicitly instead of dividing by zero
    t_sum, r_sum = np.sum(tc), np.sum(rc)
    total = t_sum + r_sum
    t_counts_sars.append(t_sum / total if total != 0 else np.nan)
    r_counts_sars.append(r_sum / total if total != 0 else np.nan)

    total_count.append(total)

In [None]:
# Two panels sharing the y axis: a narrow one for SARS-CoV and a wide one for
# the virus strandedness groups
fig, axs = plt.subplots(figsize=(7, 7), ncols=2, sharey=True, gridspec_kw={'width_ratios': [1, 5], 'wspace': 0.05})

fontsize = 14
width = 0.4
colors = ["#003049", "#98c1d9"]

# Plot SARS-CoV2
ax = axs[0]
ax.bar(0-width/2, r_counts_sars[0], width=width, color=colors[0])
ax.bar(0+width/2, t_counts_sars[0], width=width, color=colors[1])
ax.set_xticks([0], ["SARS-CoV"], fontsize=fontsize)

ax.set_ylabel("Fraction of kallisto counts", fontsize=fontsize)
ax.set_xmargin(0.1)

# Plot general strandedness
ax = axs[1]
x = np.arange(len(vtypes))
ax.bar(x-width/2, r_counts, width=width, color=colors[0], label="Random\nhexamer")
ax.bar(x+width/2, t_counts, width=width, color=colors[1], label="Poly(T)")

ax.legend(title="Primer type", fontsize=fontsize, title_fontsize=fontsize)
ax.set_xticks(np.arange(len(vlabels)), vlabels, fontsize=fontsize)
ax.set_xmargin(0.02)

for ax in axs:
    ax.tick_params(axis='y', labelsize=fontsize)

fig.suptitle("Lung samples from mice infected\nwith SARS-CoV-2 (SPLiT-seq)", fontsize=fontsize, y=0.95)

# savefig does not create missing directories, so make sure the target exists
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/4_random_vs_polyT_final.png", dpi=300, bbox_inches="tight")

# plt.show() renders through the notebook's inline backend
plt.show()