<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/Supp_Fig_3/Supp_Fig_3d/2_macaque_celltype_cluster_fraction_plots.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Plot macaque marker gene expression per cell type and fractions of cells occupied by different cell types in each Leiden cluster

In [None]:
!pip install -q anndata gget scanpy

In [None]:
import anndata
import numpy as np
import pandas as pd
import json
import os
import gget
import glob
import matplotlib.pyplot as plt
%config InlineBackend.figure_format='retina'
import scanpy as sc

def nd(arr):
    """
    Flatten any array-like input (e.g. a numpy matrix) into a 1-D numpy ndarray.
    """
    as_array = np.asarray(arr)
    return np.reshape(as_array, (-1,))

___
# Load data

The count matrix and cell type assignments were generated as shown [here](https://github.com/pachterlab/LSCHWCP_2023/blob/main/Notebooks/Supp_Fig_3/Supp_Fig_3d/1_macaque_celltype_assignment.ipynb).

In [None]:
# Load clean macaque count matrix with cell type assignments from Caltech Data
!wget https://data.caltech.edu/records/sh33z-hrx98/files/macaque_QC_norm_leiden_celltypes.h5ad?download=1
!mv macaque_QC_norm_leiden_celltypes.h5ad?download=1 macaque_QC_norm_leiden_celltypes.h5ad

In [None]:
# Read the downloaded AnnData object. `anndata.read` is a deprecated alias;
# `read_h5ad` is the supported reader for .h5ad files.
host_adata = anndata.read_h5ad("macaque_QC_norm_leiden_celltypes.h5ad")
host_adata

Get marker genes:

In [None]:
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/Supp_Fig_3/Supp_Fig_3d/marker_genes.csv
marker_df = pd.read_csv("marker_genes.csv")

In [None]:
# Get Ensembl IDs from gene symbols using gget
search_results = gget.search(marker_df["gene_symbol"].values, species="macaca_mulatta", release=109)

# Filter gget results for exact matches and protein-coding biotypes.
# Both conditions are combined into one mask over `search_results`; chaining two
# separate [] selections would apply a mask built on the unfiltered frame to the
# already-filtered one (pandas warns about the implicit reindexing).
is_marker_gene = search_results["gene_name"].isin(marker_df["gene_symbol"].values)
is_protein_coding = search_results["biotype"] == "protein_coding"
search_results_filtered = search_results[is_marker_gene & is_protein_coding]

In [None]:
# For every marker gene, collect all Ensembl IDs that gget returned for its symbol.
# Entries are an array of IDs, a one-element list (symbol was already an Ensembl ID),
# or NaN when nothing was found.
ensembl_id_list = []
for gene_symbol in marker_df["gene_symbol"].values:
    ensembl_ids = search_results_filtered.loc[
        search_results_filtered["gene_name"] == gene_symbol, "ensembl_id"
    ].values

    if len(ensembl_ids) > 0:
        ensembl_id_list.append(ensembl_ids)
    elif gene_symbol.startswith("ENS"):
        # The "symbol" already is an Ensembl ID; use it as is
        ensembl_id_list.append([gene_symbol])
    else:
        print(f"An Ensembl ID for gene {gene_symbol} could not be found.")
        # np.nan (lowercase): the np.NaN alias no longer exists in NumPy >= 2.0
        ensembl_id_list.append(np.nan)

marker_df["ensembl_id_list"] = ensembl_id_list

# Only keep first ID; genes without any ID (NaN placeholder) stay NaN
ensembl_id_list_final = [
    np.nan if isinstance(id_list, float) else id_list[0]
    for id_list in ensembl_id_list
]

marker_df["ensembl_id"] = ensembl_id_list_final

marker_df

___
# Create fraction plots

In [None]:
# Sort celltype cluster labels (categorical dtype): grouped by cell type in the order
# given here, numbered 1..n within each cell type ("B cells 1", ..., "Undefined 3")
clusters_per_celltype = [
    ("B cells", 7),
    ("T cells", 5),
    ("Natural killer", 1),
    ("Monocytes", 2),
    ("Immature neutrophils", 1),
    ("Undefined", 3),
]
celltype_order = [
    f"{celltype_name} {cluster_number}"
    for celltype_name, n_clusters in clusters_per_celltype
    for cluster_number in range(1, n_clusters + 1)
]

cluster_labels = host_adata.obs["celltype_clusters"]
host_adata.obs["celltype_clusters"] = cluster_labels.cat.reorder_categories(celltype_order)

In [None]:
fontsize = 12

celltypes = [
    "B cells",
    "Plasmablast",
    "Dendritic cells",
    "T cells",
    "Natural killer cells",
    "Monocytes",
    "Immature neutrophils",
    "Panleukocyte",
    "Cytokines",
    "Proliferation"
]

# Grab markers (Ensembl IDs) and their tick labels in order of celltypes, and record
# which span of the x-axis each celltype's genes occupy as (first, last) positions.
# Rows are selected with a boolean mask and spans are derived from the number of
# selected genes, so this does not rely on marker_df having a default 0..n-1 index
# or on each celltype's rows being contiguous.
markers = []
x_labels = []
var_groups = []
start = 0
for celltype in celltypes:
    is_celltype = marker_df["celltype"] == celltype
    n_genes = int(is_celltype.sum())
    if n_genes == 0:
        raise ValueError(f"No marker genes found for cell type '{celltype}'.")

    markers += marker_df.loc[is_celltype, "ensembl_id"].tolist()
    x_labels += marker_df.loc[is_celltype, "gene_name"].tolist()

    var_groups.append((start, start + n_genes - 1))
    start += n_genes

# Empty placeholders: the group brackets are drawn by scanpy, the rotated group
# labels are added manually below
temp_var_labels = [""] * len(celltypes)

ax = sc.pl.dotplot(
    host_adata[host_adata.obs["celltype_clusters"] != "Undefined 1", :], # !!! Removing Undefined 1 because it only has 12 cells
    markers,
    groupby="celltype_clusters",
    use_raw=True,
    dendrogram=False,
    var_group_positions=var_groups,
    var_group_labels=temp_var_labels,
    var_group_rotation=45,
    swap_axes=False,
    standard_scale="var",
    show=False,
)

## Rotate var labels
# Place each label at the centre of its gene group. The 0.45 offset is inferred from the
# previously hard-coded positions (1.45, 3.45, 4.95, ...), which were read from
# ax["gene_group_ax"].properties()["children"] -- TODO confirm the output is unchanged.
var_positions = [(first + last) / 2 + 0.45 for first, last in var_groups]
for position, label in zip(var_positions, celltypes):
    ax["gene_group_ax"].text(
        position, 1.1, label, fontsize=fontsize, rotation=45, ha="left"
    )

ax["mainplot_ax"].set_xticklabels(x_labels, rotation=45, ha="right", fontsize=fontsize)
ax["mainplot_ax"].set_xlabel("Gene", fontsize=fontsize+2)
ax["mainplot_ax"].set_ylabel("Leiden cluster", fontsize=fontsize+2)
ax["mainplot_ax"].tick_params(axis="both", labelsize=fontsize)

plt.tight_layout()

# Save plot
plt.savefig(
    "host_celltype_cluster_marker-gene-expression.png",
    dpi=300,
    bbox_inches="tight",
)

In [None]:
# Plot number of cells per cluster
fontsize = 12
fig, ax = plt.subplots(figsize=(2.5, 7))

# Clusters to plot. "Undefined 1" is left out (the other Undefined clusters are kept).
# The list is reversed so that "B cells 1" ends up at the top of the horizontal bar chart.
cts = [
    "B cells 1",
    "B cells 2",
    "B cells 3",
    "B cells 4",
    "B cells 5",
    "B cells 6",
    "B cells 7",
    "T cells 1",
    "T cells 2",
    "T cells 3",
    "T cells 4",
    "T cells 5",
    "Natural killer 1",
    "Monocytes 1",
    "Monocytes 2",
    "Immature neutrophils 1",
    "Undefined 2",
    "Undefined 3",
][::-1]

# Number of cells assigned to each cluster
cluster_labels = host_adata.obs["celltype_clusters"]
counts = [int((cluster_labels == cluster).sum()) for cluster in cts]

ax.barh(cts, counts, color="#67000d")

ax.tick_params(axis="both", labelsize=fontsize)
ax.margins(0.03)

# Write the cell count next to the end of each bar
for y_position, n_cells in enumerate(counts):
    ax.text(n_cells + 100, y_position, f"{n_cells:,}", fontsize=fontsize - 2, va="center")
ax.set_xlim(left=0, right=40000)

ax.set_xlabel("Number of cells", fontsize=fontsize + 2)

# Save plot
plt.savefig("host_celltype_cluster_numcells.png", dpi=300, bbox_inches="tight")

fig.show()

Plot, for each cell type cluster, the fraction of its cells that comes from each sample (sequencing library or timepoint):

In [None]:
def _fraction_table(obs, groups, fractions, normalize_to_total=False):
    """
    Compute, for every group, the fraction of its cells contributed by each sample.

    Args:
    - obs                   Cell metadata data frame (adata.obs).
    - groups                Name of the obs column defining the groups (rows).
    - fractions             Name of the obs column defining the samples (columns).
    - normalize_to_total    If True, each count is first divided by the total number
                            of cells of its sample before fractions are computed.

    Returns a tuple (fraction_table, cellcounts):
    - fraction_table        Data frame indexed by group with one column per sample
                            (column names are strings); each row sums to 1.
    - cellcounts            Integer series with the number of cells per group,
                            aligned to fraction_table's index.
    """
    # observed=False keeps the behavior the notebook was written against
    # (all categories of a categorical column are reported)
    counts = (
        obs.groupby(groups, observed=False)[fractions]
        .value_counts()
        .unstack()
        .fillna(0)
    )
    # String column names allow samples to be looked up by name regardless of the
    # dtype of the obs column (and allow categorical columns to be handled at all)
    counts.columns = counts.columns.astype(str)

    if normalize_to_total:
        # Row count per sample; does not depend on any particular obs column existing
        cells_per_sample = obs.groupby(fractions, observed=False).size()
        cells_per_sample.index = cells_per_sample.index.astype(str)
        counts = counts.div(
            cells_per_sample.reindex(counts.columns), axis="columns"
        ).fillna(0)  # 0/0 for samples without any cells

    fraction_table = counts.div(counts.sum(axis=1), axis="index")

    # Aligned by group label rather than by position
    cellcounts = (
        obs.groupby(groups, observed=False)
        .size()
        .reindex(fraction_table.index)
        .fillna(0)
        .astype(int)
    )

    return fraction_table, cellcounts


def plot_fraction_per_sample(
    adata,
    groups="leiden",
    fractions="title",
    colors=None,
    frac_order=None,
    group_order=None,
    normalize_to_total=False,
    title=None,
    xticklabel_rotation=0,
    figsize=None
):
    """
    Plot a stacked bar chart showing, for each group, the fraction of cells from each sample.

    The total number of cells per group is written above each bar and the figure is saved
    to "host_{groups}_{fractions}_fractions.png" (or "..._norm_fractions.png" if
    normalize_to_total is True) in the working directory.

    Args:
    - adata                 Object with cell metadata in adata.obs.
    - groups                obs column defining the bars (x-axis).
    - fractions             obs column defining the stacked segments.
    - colors                Optional list of colors, one per sample, in plotting order.
    - frac_order            Optional order in which samples are stacked
                            (default: order of first appearance in adata.obs[fractions]).
    - group_order           Optional order of the groups on the x-axis; groups not listed
                            are not plotted (default: sorted by fraction values).
    - normalize_to_total    Normalize counts to the total number of cells of each sample
                            before computing fractions.
    - title                 Optional plot title.
    - xticklabel_rotation   Rotation of the x tick labels in degrees.
    - figsize               Optional figure size (default: (15, 5)).

    Raises ValueError if group_order contains a group that is not present in the data.
    """
    obs = adata.obs
    fraction_table, cellcount_series = _fraction_table(
        obs, groups, fractions, normalize_to_total
    )

    # Sort data
    if group_order is not None and len(group_order) > 0:
        missing_groups = [g for g in group_order if g not in fraction_table.index]
        if missing_groups:
            raise ValueError(
                f"group_order contains groups not found in adata.obs['{groups}']: {missing_groups}"
            )
        fraction_table = fraction_table.reindex(group_order)
    else:
        # Sort keys keep the precedence used previously: order of first appearance when
        # normalizing, column order otherwise
        if normalize_to_total:
            sort_keys = [str(s) for s in obs[fractions].dropna().unique()]
        else:
            sort_keys = list(fraction_table.columns)
        fraction_table = fraction_table.sort_values(sort_keys)

    clusters = fraction_table.index.values
    cellcounts = cellcount_series.reindex(fraction_table.index).values

    ## Plot
    fig, ax = plt.subplots(figsize=figsize if figsize else (15, 5))
    width = 0.75
    alpha = 1
    fontsize = 14

    if frac_order is not None and len(frac_order) > 0:
        samples = list(frac_order)
    else:
        samples = list(obs[fractions].dropna().unique())

    # Stack one bar segment per sample on top of the previous ones
    previous_samples = 0
    for c_idx, sample in enumerate(samples):
        sample_fractions = fraction_table[str(sample)].values
        ax.bar(
            clusters,
            sample_fractions,
            width,
            bottom=previous_samples,
            color=colors[c_idx] if colors else None,
            label=sample,
            alpha=alpha,
        )
        previous_samples = previous_samples + sample_fractions

    # Add total number of cells above each bar
    for index, value in enumerate(cellcounts):
        ax.text(x=index, y=1.01, s=str(int(value)), size=fontsize - 3, ha="center")

    # Fix the tick positions before setting labels (set_xticklabels on its own warns
    # that labels are being set without a fixed set of ticks)
    ax.set_xticks(list(range(len(clusters))))
    ax.set_xticklabels(
        clusters,
        rotation=xticklabel_rotation,
        ha="center" if xticklabel_rotation == 0 else "right",
    )

    ax.legend(bbox_to_anchor=(1.001, 1.025), loc="upper left", fontsize=fontsize)

    ax.set_title(title, y=1.0, pad=20, fontsize=fontsize + 2)
    ax.set_ylabel("Fraction of cells", fontsize=fontsize)
    if "leiden" in groups:
        ax.set_xlabel("Leiden cluster", fontsize=fontsize)
    else:
        ax.set_xlabel(groups.capitalize(), fontsize=fontsize)
    ax.tick_params(axis="both", labelsize=fontsize)

    ax.margins(x=0.01, y=0.06)
    ax.grid(False)

    plt.tight_layout()

    # Save plot
    suffix = "norm_fractions" if normalize_to_total else "fractions"
    plt.savefig(
        f"host_{groups}_{fractions}_{suffix}.png",
        dpi=300,
        bbox_inches="tight",
    )

    fig.show()

In [None]:
# Reversed cluster order, used as group order for the fraction plots below.
# It is derived from the category order of the categorical column (set when the labels
# were sorted above) instead of reversing the list in place, so re-running this cell
# always yields the same order rather than flipping it back.
celltype_order = list(host_adata.obs["celltype_clusters"].cat.categories)[::-1]

In [None]:
# Composition of each celltype cluster by the "srr" column (raw cell counts, not normalized)
plot_fraction_per_sample(
    host_adata,
    fractions="srr",
    groups="celltype_clusters",
    group_order=celltype_order,
    xticklabel_rotation=45,
    normalize_to_total=False,
    title=None,
)

The sequencing libraries did not separate into different clusters (suggesting the lack of a batch effect).

In [None]:
# Color of each timepoint; the order of the entries is the stacking order of the bars
dpi_palette = {
    "-30d": "#b8dd5a",
    "-4d": "#64aa37",
    "0d": "#3e8938",
    "4h": "#b3cde0",
    "24h": "#005c96",
    "3d": "gold",
    "4d": "orange",
    "5d": "#e06237",
    "6d": "#c73d32",
    "7d": "#b2172b",
    "8d": "#7d0d0f",
}

# Composition of each celltype cluster by timepoint (raw cell counts, not normalized)
plot_fraction_per_sample(
    host_adata,
    groups="celltype_clusters",
    fractions="dpi_clean",
    frac_order=list(dpi_palette.keys()),
    colors=list(dpi_palette.values()),
    group_order=celltype_order,
    xticklabel_rotation=45,
    normalize_to_total=False,
    title=None,
)

Same but normalized to total number of cells for each timepoint:

In [None]:
# Convert the merged timepoint labels to plain strings (overwrites the obs column in place).
# NOTE(review): presumably required because plot_fraction_per_sample looks samples up by
# their string names when normalize_to_total=True -- confirm. Also note that astype(str)
# turns missing values into the literal string "nan".
host_adata.obs["dpi_clean_merged"] = host_adata.obs["dpi_clean_merged"].astype(str)

In [None]:
# Preview of the normalization used in the next plots: number of cells per
# (cluster, timepoint) divided by the total number of cells of that timepoint
groups = "celltype_clusters"
fractions = "dpi_clean_merged"

cells_per_cluster_and_timepoint = host_adata.obs.groupby(groups)[fractions].value_counts()
cells_per_timepoint = host_adata.obs.groupby(fractions)["species"].count()

cells_per_cluster_and_timepoint / cells_per_timepoint

In [None]:
# Color of each (merged) timepoint; the order of the entries is the stacking order of the bars
dpi_merged_palette = {
    "-30d": "#b8dd5a",
    "-4d": "#64aa37",
    "0d": "#3e8938",
    "4h": "#b3cde0",
    "24h": "#005c96",
    "3d": "gold",
    "4d": "orange",
    "5d": "#e06237",
    "6d": "#b2172b",
    "7d/8d": "#7d0d0f",
}

# Composition of each celltype cluster by timepoint, normalized to the total number of
# cells of each timepoint.
# NOTE(review): celltype_order is in reversed order at this point (it is reversed in an
# earlier cell), so [:-4] drops "B cells 4" ... "B cells 1" -- not the Undefined /
# Immature neutrophils clusters. Confirm that this is the intended selection.
plot_fraction_per_sample(
    host_adata,
    groups="celltype_clusters",
    fractions="dpi_clean_merged",
    frac_order=list(dpi_merged_palette.keys()),
    colors=list(dpi_merged_palette.values()),
    group_order=celltype_order[:-4],
    normalize_to_total=True,
    xticklabel_rotation=45,
    figsize=(15, 7),
    title=None,
)

In [None]:
# Color of each (merged) timepoint; the order of the entries is the stacking order of the bars.
# The color list previously passed here had 11 entries for 10 timepoints, so its last
# entry ("#7d0d0f") was never used; the mapping below lists exactly the colors that were
# applied, which leaves the figure unchanged.
# NOTE(review): "6d" and "7d/8d" get different colors here ("#c73d32", "#b2172b") than in
# the normalized celltype_clusters plot above ("#b2172b", "#7d0d0f") -- confirm which
# palette is intended.
celltype_dpi_palette = {
    "-30d": "#b8dd5a",
    "-4d": "#64aa37",
    "0d": "#3e8938",
    "4h": "#b3cde0",
    "24h": "#005c96",
    "3d": "gold",
    "4d": "orange",
    "5d": "#e06237",
    "6d": "#c73d32",
    "7d/8d": "#b2172b",
}

# Composition of each cell type by timepoint, normalized to the total number of cells
# of each timepoint
plot_fraction_per_sample(
    host_adata,
    groups="celltype",
    fractions="dpi_clean_merged",
    colors=list(celltype_dpi_palette.values()),
    frac_order=list(celltype_dpi_palette.keys()),
    group_order=["B cells", "T cells", "Natural killer",  "Monocytes", "Immature neutrophils"],
    normalize_to_total=True,
    title=None,
    xticklabel_rotation=45,
    figsize=(6.5, 7)
)