In [1]:
import os
import csv
import scanpy as sc
import pandas as pd
import numpy as np
import harmonypy as hm
import anndata as ad
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib as mpl

import dask.dataframe as dd
import pyarrow
import loompy
import phate
import time
import mygene
import scanpy.external as sce 
import scipy.sparse
import seaborn as sns


from matplotlib.lines import Line2D
from matplotlib.colors import LinearSegmentedColormap
from anndata import AnnData

from scipy import sparse
from sklearn.manifold import Isomap

In [None]:
### Load the AnnData object from disk.

# The file name is relative, so switch into the data directory first.
input_h5ad = "adata.h5ad"
os.chdir("/folder/")
adata = ad.read_h5ad(input_h5ad)

In [None]:
### Transform Ensembl gene IDs to gene symbols.

mg = mygene.MyGeneInfo()

ensembl_ids = adata.var_names.tolist()

query_res = mg.querymany(ensembl_ids, scopes = 'ensembl.gene', fields = 'symbol', species = 'human')
df = pd.DataFrame(query_res)

# Genes without a symbol keep their Ensembl ID. The 'symbol' column is absent
# altogether when no query returned a hit, so guard against that case too.
if 'symbol' in df.columns:
    df['symbol'] = df['symbol'].fillna(df['query'])
else:
    df['symbol'] = df['query']

id2symbol = dict(zip(df['query'], df['symbol']))

# Fall back to the original ID for anything missing from the query result
# instead of raising a KeyError.
adata.var_names = [id2symbol.get(x, x) for x in ensembl_ids]

# Distinct Ensembl IDs can share one symbol; duplicated var_names make
# name-based gene lookups ambiguous, so disambiguate them.
adata.var_names_make_unique()

# Drop the genes that could not be mapped (their name is still an Ensembl ID).
adata_sub = adata[:, ~adata.var_names.str.startswith("ENSG")].copy()

In [None]:
### Merge AnnData objects from different datasets.

os.chdir("/folder/")
adata_1 =  ad.read_h5ad("adata_1.h5ad")
adata_2 =  ad.read_h5ad("adata_2.h5ad")


adatas_to_merge = [
    adata_1,
    adata_2
]

## Join strategy for the gene (var) axis. Pick ONE:
##   'inner' - keep only the genes present in every dataset.
##   'outer' - keep the union of genes across datasets.
## Running both concatenations in sequence would only waste time and memory,
## because the second result overwrites the first.

JOIN_MODE = 'outer'

adata_merged = ad.concat(
    adatas_to_merge,
    join = JOIN_MODE
)

In [None]:
### Process Adata Object. (Change parameters depending on the context: size of the dataset, informative PCs, etc. For example n_comps, n_neighbors, resolution.)

adata = adata_merged.copy()

# Start from the counts stored in the 'counts_RNA' layer.
adata.X = adata.layers['counts_RNA'].copy()
# Sanity check on the matrix about to be normalized. A bare expression in the
# middle of a cell is not displayed, so print it explicitly.
print("Max value in adata.X before normalization:", adata.X.max())

# Normalize -> log-transform -> scale, keeping a snapshot of each stage in a layer.
sc.pp.normalize_total(adata, target_sum = 1e4)
adata.layers["data"] = adata.X.copy()
sc.pp.log1p(adata)
adata.layers["log1p_normalized"] = adata.X.copy()
sc.pp.highly_variable_genes(adata, n_top_genes = 5000, flavor = 'seurat')
sc.pp.scale(adata, max_value = 10)
adata.layers["scale.data"] = adata.X.copy()

# sc.pp.pca and sc.tl.pca are the same function; one call is enough.
sc.pp.pca(adata, n_comps = 50)

# Sample IDs may repeat across merged datasets, so integrate on the combined
# sample + dataset label. When sample IDs are already unique across datasets
# this defines exactly the same batches as 'Sample_ID' alone.
adata.obs['Sample_ID_Dataset'] = adata.obs['Sample_ID'].astype(str) + "_" + adata.obs['Dataset'].astype(str)
sce.pp.harmony_integrate(adata, key = ['Sample_ID_Dataset'], max_iter_harmony = 100, nclust = 40, max_iter_kmeans = 20)

# Neighbors, embedding and clustering all use the Harmony-corrected PCs.
sc.pp.neighbors(adata, use_rep = 'X_pca_harmony', n_neighbors = 20)
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution = 0.1, key_added = "leiden_0.1")

In [None]:
### Extract the cells belonging to one cluster into their own AnnData object.

# Boolean mask over cells: True where the cluster label matches.
in_cluster = adata.obs["Cluster_Column"] == "Cluster_Value"
adata_subset = adata[in_cluster].copy()

In [None]:
### Count the number of cells in each cluster.

cluster_labels = adata.obs['Cluster_Column']
cell_counts = cluster_labels.value_counts()
print(cell_counts)

In [None]:
### Cross-tabulate cell counts between two cluster annotations.

# Rows come from the first annotation, columns from the second.
row_labels = adata.obs["Cluster_Column_1"]
col_labels = adata.obs["Cluster_Column_2"]
matrix = pd.crosstab(row_labels, col_labels)
matrix

In [None]:
### Restrict .obs to the metadata columns of interest.

# Grouped by theme: QC metrics, cell/sample identifiers, then sample annotations.
keep_cols = (
    ['nCount_RNA', 'nFeature_RNA', 'percent.mt']
    + ['Barcode', 'Sample_ID']
    + ['Dataset', 'Age', 'Age_Type', 'Sex', 'Organism', 'Region']
    + ['Subregion', 'Matter', 'Tissue', 'Enrichment', 'Enrichment_Cell']
    + ['Cell_ID', 'Status']
)

# Every other .obs column is dropped; a missing column raises a KeyError.
adata.obs = adata.obs.loc[:, keep_cols].copy()

In [None]:
### UMAP coloured by cluster, with labels drawn on the embedding.

sc.settings.set_figure_params(dpi = 200, figsize = (20, 20))

umap_scatter_kwargs = dict(
    basis = 'umap',
    color = 'Cluster_Column',
    size = 10,
    legend_loc = 'on data',
)
sc.pl.scatter(adata, **umap_scatter_kwargs)

In [None]:
### UMAP coloured by the expression of a single feature, saved as a PNG.

sc.settings.set_figure_params(dpi = 200, figsize = (30, 30))

# Colour ramp used for expression values, listed from lowest to highest.
colors = ["#D5E5F7", "#2488F0", "#7F3F98", "#E22929", "#A81B1B"]
custom_cmap = LinearSegmentedColormap.from_list("custom", colors, N = 256)

# show = False keeps the figure open so it can be restyled and saved below.
feature_plot_kwargs = dict(
    color = "Feature_Name",
    layer = "log1p_normalized",
    size = 20,
    cmap = custom_cmap,
    frameon = False,
    title = "",
    show = False,
)
sc.pl.umap(adata, **feature_plot_kwargs)

# Force a white background on both the axes and the figure before saving.
for canvas in (plt.gca(), plt.gcf()):
    canvas.set_facecolor("white")
plt.savefig("Feature.png", dpi = 300, bbox_inches = "tight", facecolor = "white")

plt.show()

In [None]:
### Dot plot of selected features per cluster.

features = ["Feature_Name"]

# standard_scale = "var" rescales each feature separately; the dot size range
# is bounded by dot_min / dot_max.
dotplot_kwargs = dict(
    var_names = features,
    groupby = "Cluster_Column",
    standard_scale = "var",
    dot_min = 0.1,
    dot_max = 1.0,
)
sc.pl.dotplot(adata, **dotplot_kwargs)

In [None]:
### Basic QC plots.

cols = ['nCount_RNA', 'nFeature_RNA', 'percent.mt']

# Make sure the QC metrics are numeric; values that cannot be parsed become NaN.
for col in cols:
    adata.obs[col] = pd.to_numeric(adata.obs[col], errors = 'coerce')


# Group by the same placeholder column name used by the other cells
# ('Cluster_Column'), so a single find-and-replace updates the whole notebook.
sc.pl.violin(
    adata,
    keys = ['percent.mt'],
    groupby = 'Cluster_Column',
    stripplot = False
)

In [None]:
### Remove low-quality clusters/cells from the AnnData object.

low_quality_clusters = ["Cluster_Value"]

# Keep every cell whose cluster label is NOT in the exclusion list.
keep_mask = ~adata.obs['Cluster_Column'].isin(low_quality_clusters)
adata = adata[keep_mask].copy()

In [None]:
### Relabel cells in one annotation based on their value in another.

# Cast both annotation columns to strings before assigning the new label.
for column in ("Cluster_Column_1", "Cluster_Column_2"):
    adata.obs[column] = adata.obs[column].astype("str")

# Cells selected through the first annotation receive the new label in the second.
rows_to_rename = adata.obs["Cluster_Column_1"].isin(["Cluster_Value"])
adata.obs.loc[rows_to_rename, "Cluster_Column_2"] = "Cluster_Value_New"

In [None]:
### Retrieve Top 100 PC Loadings.

genes = adata.var_names

# Loadings matrix from the PCA step: one row per gene, one column per PC.
loadings = pd.DataFrame(
    adata.varm["PCs"],
    index = genes,
    columns = [f"PC{i+1}" for i in range(adata.varm["PCs"].shape[1])]
)

# For every PC, rank genes by absolute loading and keep the 100 strongest.
top_loadings = {
    pc: loadings[pc].abs().sort_values(ascending = False).head(100).index.tolist()
    for pc in loadings.columns
}

top_df = pd.DataFrame(top_loadings)

# Use one variable for the file name so the log message always matches the
# file that was actually written.
output_file = "top100_PC_Loadings.xlsx"
top_df.to_excel(output_file)

print(f"Saved top PC loadings to {output_file}")

# Last expression of the cell, so the table is rendered in the notebook.
top_df

In [None]:
### Save the AnnData object to disk.

# The output name is relative, so switch into the target directory first.
output_h5ad = "adata_new.h5ad"
os.chdir("/folder/")
adata.write(output_h5ad)