In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import anndata as ad

plt.rcParams['figure.figsize'] = (15, 15)
from scipy.stats import median_abs_deviation

# feature selection
import anndata2ri
import logging
import rpy2.rinterface_lib.callbacks as rcb
import rpy2.robjects as ro

sc.settings.verbosity = 0
sc.settings.set_figure_params(
    dpi=80,
    facecolor="white",
    frameon=False,
)

# Raw data processing 
### Read count file and create anndata object

In [None]:
counts = pd.read_csv('/home/p/pohll/Desktop/MP/data/xie/GSE104154_d0_d21_sma_tm_Expr_raw.csv', index_col='id')

In [None]:
counts.head()

In [None]:
counts.shape

In [None]:
# Re-index the count table by gene symbol; set_index also removes the column.
counts = counts.set_index('symbol')

In [None]:
xie_csv = ad.AnnData(counts)

In [None]:
xie_csv.obs_names_make_unique()

In [None]:
xie_csv = xie_csv.T

In [None]:
xie_csv

10411 barcodes (cells, obs), 27998 genes (vars)

In [None]:
# Parse the annotation workbook once instead of once per sheet. With a list of
# sheet positions read_excel returns a dict keyed by those positions; sheets 1
# and 2 (0-based) become the control and bleomycin tables, indexed by barcode.
anno_sheets = pd.read_excel(
    '/home/p/pohll/Desktop/MP/data/xie/GSE104154_cell_type_annotation_d0_d21.xlsx',
    sheet_name=[1, 2],
    index_col='Barcode',
)
anno_ctrl, anno_bleo = anno_sheets[1], anno_sheets[2]

In [None]:
anno_ctrl["defined"].unique()

In [None]:
anno_xie = pd.concat([anno_ctrl, anno_bleo])

In [None]:
anno_xie.head()

In [None]:
# Turn the '.' in the annotation barcodes into '-' so they can be aligned with
# the '-'-separated barcodes of xie_csv. regex=False makes '.' a literal dot;
# under regex matching (the default before pandas 2.0) '.' matches any
# character and every character of the barcode would be replaced.
anno_xie.index = anno_xie.index.str.replace(pat='.', repl='-', regex=False)

In [None]:
# Assign the cell-type label column (a Series, aligned on barcode) instead of
# the whole annotation DataFrame; assigning a DataFrame to a single column only
# works when it has exactly one column. Barcodes without an annotation get NaN.
xie_csv.obs['annotation'] = anno_xie['defined']

In [None]:
xie_csv

In [None]:
xie_csv.obs.index

### show highest expressed genes 

In [None]:
sc.pl.highest_expr_genes(xie_csv, n_top=20)

MALAT1 is the most highly expressed gene. It is frequently detected in poly-A-captured scRNA-seq data, independent of protocol. Its expression has been shown to correlate inversely with cell health: dead or dying cells in particular show higher MALAT1 expression.

In [None]:
xie_csv.var_names_make_unique()
xie_csv

# Quality Control 

In [None]:
xie_csv.var_names.str.startswith("MT-").sum()

In [None]:
xie_csv.var_names.str.startswith("mt-").sum()

In [None]:
xie_csv.var_names

In [None]:
# Flag QC gene families in .var. Matching runs on upper-cased symbols so that
# all-caps (MT-, RPS, HB), lower-case (mt-, rps) and title-case (Rps, Hbb)
# spellings are all caught; the original patterns missed title-case symbols.
symbols_upper = xie_csv.var_names.str.upper()
# mitochondrial genes
xie_csv.var["mt"] = symbols_upper.str.startswith("MT-")
# ribosomal genes
xie_csv.var["ribo"] = symbols_upper.str.startswith(("RPS", "RPL"))
# hemoglobin genes.
xie_csv.var["hb"] = symbols_upper.str.contains("^HB[^(P)]")

In [None]:
xie_csv.obs.index

In [None]:
# Inspect the gene (var) index on the full object: `subset_xie` is only
# created further down, so referencing it here fails on a fresh kernel.
xie_csv.var.index

In [None]:
sc.pp.calculate_qc_metrics(
    xie_csv, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True
)
xie_csv

In [None]:
xie_csv

In [None]:
xie_csv.obs['sample']=xie_csv.obs.index.str.split('-').str[1]

In [None]:
subset_xie = xie_csv[xie_csv.obs.index.str.endswith("-1")]

In [None]:
subset_xie.obs

### QC covariates: 
- n_genes_by_counts in .obs is the number of genes with positive counts in a cell,
- total_counts is the total number of counts for a cell, this might also be known as library size, and
- pct_counts_mt is the proportion of total counts for a cell which are mitochondrial.

In [None]:
plt.rcParams['figure.figsize'] = (7, 7)
samples = ["-1","-2", "-3","-4","-5","-6"]
# Overlay one total-count histogram per sample suffix on a shared set of axes.
# Each histogram gets a label so the legend can tell the samples apart
# (legend=True on histplot has no effect without a hue variable).
fig, ax = plt.subplots()
for x in samples:
    subset_xie = xie_csv[xie_csv.obs.index.str.endswith(x)]
    sns.histplot(subset_xie.obs["total_counts"], bins=100, kde=False, label=x, ax=ax)
ax.set(xlim=(0, 30000), ylim=(0, 350))
ax.legend(title="sample suffix");



In [None]:
sns.histplot(
    xie_csv.obs, x="total_counts", hue="sample",bins=100, kde=False, legend = True
)

In [None]:
# For every sample suffix: histogram of total counts, violin plot of the
# mitochondrial percentage, and a total_counts vs n_genes_by_counts scatter
# coloured by pct_counts_mt. The subset is selected by barcode suffix.
for x in samples: 
    subset_xie = xie_csv[xie_csv.obs.index.str.endswith(x)]
    p1 = sns.histplot(subset_xie.obs["total_counts"], bins=100, kde=False).set(xlim=(0,30000),ylim=(0,350))
    # sc.pl.violin(adata, 'total_counts')
    p2 = sc.pl.violin(subset_xie, "pct_counts_mt",title='Sample '+ x)
    p3 = sc.pl.scatter(subset_xie, "total_counts", "n_genes_by_counts", color="pct_counts_mt", title='Sample '+ x)
   
    

    

In [None]:
# plot the three QC covariates n_genes_by_counts, total_counts and pct_counts_mt per sample 
# to assess how well the respective cells were captured.
p1 = sns.displot(xie_csv.obs["total_counts"], bins=100, kde=False)
# sc.pl.violin(adata, 'total_counts')
p2 = sc.pl.violin(xie_csv, "pct_counts_mt")
p3 = sc.pl.scatter(xie_csv, "total_counts", "n_genes_by_counts", color="pct_counts_mt")

In [None]:
sns.histplot(
    xie_csv.obs, x="total_counts", bins=100, kde=False, legend = True
).set(xlim=(10000,20000), ylim=(0,50))

### comparison to Xie et al. 
Xie et al. paper: "Cells with percentage of reads mapped on mitochondrial genes > 15% or total number of genes expressed < 300 were removed from further analysis". The 300-gene minimum is adopted here ✅. The > 15% mitochondrial cut-off corresponds to 30 observations — maybe use 10% instead? Are these cells already dead?

filter by:  min_genes=300, max_genes=4000, max_counts=18000
"Cells with a relatively high fraction of mitochondrial counts might for example be involved in respiratory processes and should not be filtered out."- SCBP

In [None]:
# histogram: range between 2000 and 6000; for the upper one: look at the edge regions more closely,
# outliers with high counts and a low number of genes: possibly look at this population more closely

In [None]:

#filtered_df = xie_csv.obs[(xie_csv.obs['total_counts'] > 100) & (xie_csv.obs['n_genes_by_counts'] < 300)]

# Extract the elements of the "annotation" column from the filtered DataFrame
#annotations = filtered_df['annotation']

In [None]:
np.array(xie_csv.obs["pct_counts_mt"] > 15).sum()

In [None]:
np.array(xie_csv.obs["pct_counts_mt"] > 10).sum()

In [None]:
print(f"Total number of cells: {xie_csv.n_obs}")

In [None]:
# Apply the cell-level QC thresholds one call at a time: at least 300 and at
# most 4000 detected genes, and at most 15000 total counts.
# NOTE(review): the markdown above lists max_counts=18000 — confirm which
# cut-off is intended.
sc.pp.filter_cells(xie_csv, min_genes=300)
sc.pp.filter_cells(xie_csv, max_genes=4000)
sc.pp.filter_cells(xie_csv, max_counts=15000)

In [None]:
np.array(xie_csv.obs["pct_counts_mt"] > 10).sum()

In [None]:
xie_csv.obs["mt_outlier"] = xie_csv.obs["pct_counts_mt"] > 10

In [None]:
xie_csv = xie_csv[(~xie_csv.obs.mt_outlier)].copy()

In [None]:
np.array(xie_csv.obs["pct_counts_mt"] > 10).sum()

### data after filtering low quality reads

In [None]:
print(f"Number of cells after filtering of low quality cells: {xie_csv.n_obs}")

p1 = sc.pl.scatter(xie_csv, "total_counts", "n_genes_by_counts", color="pct_counts_mt")



In [None]:
# Bind `xie`, the name used by all following sections, to the QC-filtered
# object. This hand-over previously went through a commented-out write/read of
# xie_leonie.h5ad, which left `xie` undefined on a fresh kernel.
xie = xie_csv.copy()

## doublets analysis

In [None]:
%load_ext rpy2.ipython

In [None]:
%%R
library(Seurat)
library(scater)
library(scDblFinder)
library(BiocParallel)

In [None]:
xie_mat = xie.X.T #input for scDblFinder

- `sce$scDblFinder.score` : the final doublet score (the higher the more likely that the cell is a doublet)
- `sce$scDblFinder.ratio` : the ratio of artificial doublets in the cell’s neighborhood
- `sce$scDblFinder.class` : the classification (doublet or singlet)

In [None]:
%%R -i xie_mat -o doublet_score -o doublet_class

# Seed R's random number generator before running the doublet detection.
set.seed(123)
# Wrap the matrix as the `counts` assay of a SingleCellExperiment and run
# scDblFinder on it.
sce = scDblFinder(
    SingleCellExperiment(
        list(counts=xie_mat),
    ) 
)
# Per-cell score and class, exported back to Python through the -o flags.
doublet_score = sce$scDblFinder.score
doublet_class = sce$scDblFinder.class

In [None]:
xie.obs["scDblFinder_score"] = doublet_score
xie.obs["scDblFinder_class"] = doublet_class
xie.obs.scDblFinder_class.value_counts()

9293 singlets, 933 doublets, leave doublets in for now 

In [None]:
xie

# Normalization 

In [None]:
# Normalise counts per cell; inplace=False returns the result as a dict
# (read via its "X" entry below) instead of overwriting xie.X.
scales_counts = sc.pp.normalize_total(xie, target_sum=None, inplace=False)
# log1p transform of the normalised matrix, stored as its own layer
xie.layers["log1p_norm"] = sc.pp.log1p(scales_counts["X"], copy=True)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
p1 = sns.histplot(xie.obs["total_counts"], bins=100, kde=False, ax=axes[0])
axes[0].set_title("Total counts")
p2 = sns.histplot(xie.layers["log1p_norm"].sum(1), bins=100, kde=False, ax=axes[1])
axes[1].set_title("Shifted logarithm")
plt.show()

In [None]:
# Reload the saved object from disk; this replaces the in-memory `xie`.
# NOTE(review): anything added to `xie` earlier in this session that is not in
# the file (e.g. the "log1p_norm" layer) is lost here — confirm this is intended.
xie = sc.read('/home/p/pohll/Desktop/MP/data/xie_leonie.h5ad')

In [None]:
xie

# Feature Selection 
exclude uninformative genes
- use deviance for feature selection which works on raw counts. 
- can be computed in closed form and quantifies whether genes show a constant expression profile across cells as these are not informative
- Genes with constant expression: multinomial null model
- Highly informative genes across cells: high deviance value, do not fit null model 
- According to the deviance values, the method then ranks all genes and obtains only highly deviant genes.

## 1. Using scry --> probably not

In [None]:
sc.settings.verbosity = 0
sc.settings.set_figure_params(
    dpi=80,
    facecolor="white",
    frameon=False,
)

rcb.logger.setLevel(logging.ERROR)
ro.pandas2ri.activate()
anndata2ri.activate()

In [None]:
load_ext rpy2.ipython

In [None]:
%%R
library(scry)

save the AnnData object in our R environment

In [None]:
ro.globalenv["xie"] = xie

In [None]:
#%%R -i xie
#xie

In [None]:
%%R
sce = devianceFeatureSelection(xie, assay="X")

In [None]:
binomial_deviance = ro.r("rowData(sce)$binomial_deviance").T

Sort the vector and select the top 4,000 highly deviant genes, saving them as an additional column in .var named ‘highly_deviant’. 
Also save the computed binomial deviance in case we want to sub-select a different number of highly variable genes afterwards.

In [None]:
# argsort is ascending, so the last 4000 positions are the genes with the
# largest binomial deviance.
# NOTE(review): NaN values sort to the end with argsort and would be selected
# here — confirm the deviance vector contains none.
idx = binomial_deviance.argsort()[-4000:]
# Boolean gene mask: True for the selected top-deviance genes.
mask = np.zeros(xie.var_names.shape, dtype=bool)
mask[idx] = True

xie.var["highly_deviant"] = mask
xie.var["binomial_deviance"] = binomial_deviance

In [None]:
xie.var["highly_deviant"].value_counts()

In [None]:
# setting highly variable as highly deviant to use scanpy 'use_highly_variable' argument in sc.pp.pca
xie.var["highly_variable"] = xie.var["highly_deviant"]

In [None]:
#xie.X = xie.layers["log1p_norm"]

## 2. Using sc.pp.highly_variable_genes

In [None]:
# Depth-normalise before the log transform: X is still treated as raw counts at
# this point (the deviance step runs on X as raw counts), and the
# highly-variable-gene selection that follows expects log-normalised data.
sc.pp.normalize_total(xie)
sc.pp.log1p(xie)

In [None]:
sc.pp.highly_variable_genes(xie, min_mean=0.0125, max_mean=3, min_disp=0.5)

In [None]:
sc.pl.highly_variable_genes(xie)

In [None]:
xie

# Dim Reduction + UMAP

## 1. PCA 

In [None]:
sc.tl.pca(xie, svd_solver='arpack', use_highly_variable=True )

In [None]:
sc.pl.pca(xie, color='total_counts')

In [None]:
sc.pl.pca_variance_ratio(xie, log=True) # later: call neighbors/UMAP with different numbers of PCs

## 2. UMAP

In [None]:
sc.pp.neighbors(xie)
sc.tl.umap(xie)

In [None]:
sc.pl.umap(xie, color="total_counts")

In [None]:
sc.pl.umap(
    xie,
    color=["total_counts", "pct_counts_mt", "scDblFinder_score", "scDblFinder_class"],
)
# TODO: take another look at doublets + high counts --> filter them out

In [None]:
xie_filtered_doublet = xie.copy()

In [None]:
# Keep cells that are not called doublets, have a doublet score <= 0.9 and at
# most 12000 total counts. Depending on how rpy2 converted the R factor, the
# class column may hold numeric codes (2 is treated as doublet, as before) or
# the labels themselves, so compare on the string form and accept either.
obs_dbl = xie_filtered_doublet.obs
is_doublet = obs_dbl['scDblFinder_class'].astype(str).isin(['2', '2.0', 'doublet'])
mask = (~is_doublet) & (obs_dbl['scDblFinder_score'] <= 0.9) & (obs_dbl['total_counts'] <= 12000)


In [None]:
# Subset the AnnData object with the boolean mask; .obs is sliced along with
# it, so no separate obs reassignment is needed. .copy() turns the resulting
# view into an independent object before later in-place steps modify it.
xie_filtered_doublet = xie_filtered_doublet[mask].copy()
# Kept for inspection: the obs table of the retained cells.
filtered_obs = xie_filtered_doublet.obs

In [None]:
xie_filtered_doublet.obs.shape

In [None]:
xie_filtered_doublet

In [None]:
xie.obs.shape

In [None]:
# `xie` was already log-transformed before it was copied into
# xie_filtered_doublet. scanpy records the transform under .uns["log1p"], so
# only transform when that marker is absent to avoid taking the logarithm twice.
if "log1p" not in xie_filtered_doublet.uns:
    sc.pp.log1p(xie_filtered_doublet)

In [None]:
sc.tl.pca(xie_filtered_doublet, svd_solver='arpack', use_highly_variable=True )

In [None]:
sc.pp.neighbors(xie_filtered_doublet)
sc.tl.umap(xie_filtered_doublet)

In [None]:
sc.pl.umap(
    xie_filtered_doublet,
    color=["total_counts", "pct_counts_mt", "scDblFinder_score", "scDblFinder_class"],
)


In [None]:
xie_filtered_doublet.write('/home/p/pohll/Desktop/MP/data/xie_leonie_filtered.h5ad')

In [None]:
#xie.write('/home/p/pohll/Desktop/MP/data/xie_leonie_dim_reduction.h5ad')
# Load the dimension-reduced object from disk, replacing the in-memory `xie`.
# NOTE(review): the matching write above is commented out, so this relies on a
# file produced in an earlier session.
xie = sc.read('/home/p/pohll/Desktop/MP/data/xie_leonie_dim_reduction.h5ad')

# Clustering using Leiden Algorithm
-->  identification of cellular structure in the dataset.

In [None]:
sc.tl.leiden(xie)

In [None]:
# Leiden clustering at several resolutions, each stored under its own key.
leiden_resolutions = {
    "leiden_res0_1": 0.1,
    "leiden_res0_25": 0.25,
    "leiden_res0_5": 0.5,
    "leiden_res1": 1.0,
}
for obs_key, res in leiden_resolutions.items():
    sc.tl.leiden(xie, key_added=obs_key, resolution=res)

In [None]:
sc.pl.umap(
    xie,
    color=["leiden_res0_1","leiden_res0_25", "leiden_res0_5", "leiden_res1"],
    legend_loc="on data",
)

In [None]:
#Define a nice colour map for gene expression
from matplotlib import colors
# 128 colours sampled evenly across the Reds colormap
colors2 = plt.cm.Reds(np.linspace(0, 1, 128))
# a single colour from the reversed Greys colormap at position 0.7
# (np.linspace(0.7, 0.8, 1) yields only its start value)
colors3 = plt.cm.Greys_r(np.linspace(0.7,0.8,1))
# grey entry first, followed by the reds
colorsComb = np.vstack([colors3, colors2])
mymap = colors.LinearSegmentedColormap.from_list('my_colormap', colorsComb)
def plot_markers(adata):
    """Plot UMAPs of marker-gene expression, one panel group per cell type.

    For each group the label is printed and ``sc.pl.umap`` is called on
    ``adata`` with the group's gene(s), the module-level ``mymap`` colormap
    and the group's point size.
    """
    # (printed label, gene or list of genes, point size), in plotting order
    marker_panels = (
        ('Macrophages', ['Apoe', 'Mrc1', 'Marco', 'Mertk'], 20),
        ('Monocytes', ['Cd14', 'Vcan'], 20),
        ('T-Cells', ['Cd3e', 'Cd4','Cd8a', 'Cd3d'], 40),
        ('B-Cells', ['Cd19', 'Cd79a'], 40),
        ('AT2', ['Muc1', 'Sftpc', 'Sftpd', 'Lcn2'], 30),
        ('AT1', ['Vegfa'], 30),
        ('Krt8', 'Krt8', 30),
        ('Endothelial', 'Pecam1', 30),
        ('Fibroblasts', 'Col1a2', 30),
        ('Myofibroblasts', ['Col3a1', 'Cthrc1', 'Postn', 'Spp1', 'Tnc', 'S100a6', 'Ccl2'], 30),
    )
    for label, genes, point_size in marker_panels:
        print(label)
        sc.pl.umap(adata, color=genes, cmap=mymap, size=point_size)
    
def plot_marker_genes(adata):
    """Thin wrapper that delegates to ``plot_markers`` for ``adata``."""
    return plot_markers(adata)

In [None]:
plot_markers(xie)

In [None]:
xie.obs['sample']=xie.obs.index.str.split('-').str[1]

In [None]:
xie.obs['cond']='control'

In [None]:
xie.obs.loc[xie.obs['sample'].isin(['4', '5', '6']), 'cond']='bleo'

# Analyze Batch effects

In [None]:
plt.rcParams['figure.figsize'] = (10, 10)
sc.pl.umap(xie, color=['sample', 'cond'], size=30)

- fewer control cells than bleo cells, which makes sense since there are 1943 (normal) + 3386 (fibrotic) cells in total. 
- batch effect: each batch only "represents" one condition - bleo or normal 

In [None]:
plt.rcParams['figure.figsize'] = (10, 10)
sc.pl.umap(xie, color=['annotation', 'sample'], size=30)

-  cells from same label are generally near each other
- but shift between batches. 
+ todo: inspecting the distribution of marker genes (SCBP)

In [None]:
#xie.write('/home/p/pohll/Desktop/MP/data/xie_leonie_clustering.h5ad')
# Load the clustered object from disk, replacing the in-memory `xie`.
# NOTE(review): the matching write above is commented out, so this relies on a
# file produced in an earlier session.
xie = sc.read('/home/p/pohll/Desktop/MP/data/xie_leonie_clustering.h5ad')

In [None]:
# daniel's xie adata:
xie_processed = sc.read('/home/p/pohll/Desktop/MP/data/xie/processed.h5ad') 
# use (backed="r") if h5ad very big 