In [None]:
import sys
# adding notebooks to the system path
sys.path.insert(0, '/home/southark/notebooks')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
pd.options.display.float_format = '{:.4f}'.format
import matplotlib
from tqdm import tqdm

matplotlib.rcParams['pdf.fonttype'] = 42
%load_ext autoreload
%autoreload 2
%matplotlib inline

from perturbseq import *

import scanpy as sc
import bbknn

In [None]:
# Base directory that the gene-program CSV below is read from.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
data_path = '/data/norman/southark/tfs_standardized/'

# Load gene programs

In [None]:
# comps: definitions of the sparse gene expression programs (a value is positive
# if the gene is included in the program). The first CSV column becomes the row
# index (one row per program); the remaining columns are gene symbols.
comps = pd.read_csv(f'{data_path}20240331_fibroblast_bulk_comps.csv', index_col=0)

In [None]:
#Return strong program genes for each program in dictionary form
def df_to_dict(df, threshold=0.05):
    """Collect, per row, the column labels whose value is strictly above ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows are programs, columns are genes, values are loadings.
    threshold : float
        A column is kept for a row only when its value is ``> threshold``.

    Returns
    -------
    dict
        ``{row_label: [column_label, ...]}`` in row order, with columns in their
        original order. Rows without any qualifying column are left out.
    """
    selected = {}
    for label, values in df.iterrows():
        members = [name for name, score in zip(values.index, values) if score > threshold]
        if not members:
            # Nothing passed the cut-off for this row; do not create an empty entry.
            continue
        selected[label] = members
    return selected

# Program -> list of genes whose value in `comps` exceeds the default threshold
# of df_to_dict (0.05). Programs with no such gene are absent from the dict.
gene_prog_dict = df_to_dict(comps)

In [None]:
# Old gene symbol -> replacement symbol, applied to the column labels of `comps`.
update_names = {
    "SARS": "SARS1",
    "H2AFV": "H2AZ2",
    "GARS": "GARS1",
    "TARS": "TARS1",
    "NARS": "NARS1",
    "EPRS": "EPRS1",
    "WARS": "WARS1",
    "H3F3A": "H3-3A",
    "H2AFZ": "H2AZ1",
    "HIST1H2AC": "H2AC6",
    "H2AFX": "H2AX",
    "IARS": "IARS1",
    "HIST1H4C": "H4C3",
    "H2AFJ": "H2AJ",
}

# `rename` returns a new DataFrame, so `comps` itself keeps its original labels.
filtered_comps = comps.rename(columns=update_names)

# Example: load an in vivo dataset

Ilya Korsunsky, Kevin Wei, Mathilde Pohin, Edy Y. Kim, Francesca Barone, Triin Major, Emily Taylor, Rahul Ravindran, Samuel Kemble, Gerald F.M. Watts, A. Helena Jonsson, Yunju Jeong, Humra Athar, Dylan Windell, Joyce B. Kang, Matthias Friedrich, Jason Turner, Saba Nayar, Benjamin A. Fisher, Karim Raza, Jennifer L. Marshall, Adam P. Croft, Tomoyoshi Tamura, Lynette M. Sholl, Marina Vivero, Ivan O. Rosas, Simon J. Bowman, Mark Coles, Andreas P. Frei, Kara Lassen, Andrew Filer, Fiona Powrie, Christopher D. Buckley, Michael B. Brenner, Soumya Raychaudhuri,

## Cross-tissue, single-cell stromal atlas identifies shared pathological fibroblast phenotypes in four chronic inflammatory diseases

Med,
Volume 3, Issue 7,
2022,
Pages 481-518.e14,
ISSN 2666-6340,
https://doi.org/10.1016/j.medj.2022.05.002.
(https://www.sciencedirect.com/science/article/pii/S2666634022001842)

### Abstract: 

**Background**

Pro-inflammatory fibroblasts are critical for pathogenesis in rheumatoid arthritis, inflammatory bowel disease, interstitial lung disease, and Sjögren’s syndrome and represent a novel therapeutic target for chronic inflammatory disease. However, the heterogeneity of fibroblast phenotypes, exacerbated by the lack of a common cross-tissue taxonomy, has limited our understanding of which pathways are shared by multiple diseases.

**Methods**

We profiled fibroblasts derived from inflamed and non-inflamed synovium, intestine, lungs, and salivary glands from affected individuals with single-cell RNA sequencing. We integrated all fibroblasts into a multi-tissue atlas to characterize shared and tissue-specific phenotypes.

**Findings**

Two shared clusters, CXCL10+CCL19+ immune-interacting and SPARC+COL3A1+ vascular-interacting fibroblasts, were expanded in all inflamed tissues and mapped to dermal analogs in a public atopic dermatitis atlas. We confirmed these human pro-inflammatory fibroblasts in animal models of lung, joint, and intestinal inflammation.

**Conclusions**

This work represents a thorough investigation into fibroblasts across organ systems, individual donors, and disease states that reveals shared pathogenic activation states across four chronic inflammatory diseases.

**Funding**

Grant from F. Hoffmann-La Roche (Roche) AG.
**Keywords: fibroblasts; atlas; scRNA-seq; inflammation; integration; Sjögren's syndrome; interstitial lung disease; ulcerative colitis; rheumatoid arthritis; stromal**


In [None]:
import scanpy as sc  # NOTE(review): already imported in the first cell; harmless duplicate

# Load the human single-cell data from the Med 2022 fibroblast atlas paper.
# NOTE(review): absolute, machine-specific path.
adata = sc.read("/data/norman/angel/fibro_datasets/fibroblast_atlas_med_2022.h5ad")
#adata.obsm["umap"] = adata.obsm["UMAP.OLD"]

# obsm['UMAP.OLD'] is indexed by column name here (DataFrame-like), so copy its two
# coordinate columns into a plain NumPy array under 'X_umap' for the UMAP plots below.
adata.obsm['X_umap'] = adata.obsm['UMAP.OLD'][['umapold_1', 'umapold_2']].to_numpy()

# Optional quick check that the embedding can be plotted:
#sc.pl.embedding(adata, 'X_umap', color=['SPARC'])

In [None]:
# # mitochondrial genes, "MT-" for human, "Mt-" for mouse
# adata.var["mt"] = adata.var_names.str.startswith("MT-")
# # ribosomal genes
# adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
# # hemoglobin genes
# adata.var["hb"] = adata.var_names.str.contains("^HB[^(P)]")

In [None]:
# #qc genes already filtered?
# sc.pp.calculate_qc_metrics(
#     adata, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True
# )

In [None]:
# sc.pl.violin(
#     adata,
#     ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
#     jitter=0.4,
#     multi_panel=True,
# )

In [None]:
# Sanity check: list the embeddings stored on the object
print(adata.obsm.keys())  # Should contain 'X_umap' (set in the loading cell)

# If the UMAP array is present, show its shape (expected: n_cells x 2, since two
# coordinate columns were copied)
if 'X_umap' in adata.obsm:
    print(adata.obsm['X_umap'].shape)

# Apply gene set scores to the single cells

In [None]:
# Restrict every program's gene list to genes that are present in the dataset.
# NOTE(review): gene_prog_dict was built from `comps` (un-renamed symbols), while the
# scoring cell uses `filtered_comps` (renamed symbols) — confirm this is intended.
available_genes = set(adata.var.index)
filtered_gene_prog_dict = {}

for prog_id, prog_genes in gene_prog_dict.items():
    overlap = set(prog_genes) & available_genes
    if len(overlap) == 0:
        # Report programs that end up with no measurable gene and leave them out.
        print(f'for prog {prog_id} there are no genes left')
        continue
    filtered_gene_prog_dict[prog_id] = list(overlap)

In [None]:
from _weighted_score_genes import *

# Single-program example kept for reference:
# prog31 = comps.loc[32][comps.loc[32] > 0]
# weighted_score = weighted_score_genes(adata, gene_list=prog31.index, weights = prog31.values , score_name="prog_KLF4_weighted")

# Score every retained program; each call is given the score name "prog_<key>".
# NOTE(review): the cut-off here is `>= 0.05`, whereas df_to_dict uses `> 0.05`, and the
# gene list is not restricted to genes present in adata — confirm weighted_score_genes
# tolerates missing genes.
for key in filtered_gene_prog_dict:
    print(f'key:{key}')

    loadings = filtered_comps.loc[key]
    prog = loadings[loadings >= 0.05]
    weighted_score_genes(
        adata,
        gene_list=prog.index,
        weights=prog.values,
        ctrl_size=100,
        score_name=f"prog_{key}",
    )


In [None]:
# Score names ("prog_<key>") used by the scoring loop, one per retained program.
programs_to_plot = [f"prog_{prog_id}" for prog_id in filtered_gene_prog_dict]

In [None]:
# Persist the object, now carrying the program scores, to disk.
# NOTE(review): absolute, machine-specific output path; an existing file is presumably overwritten.
adata.write('/data/norman/southark/external_datasets/fibroblast_atlas_med_2022/250123_med_2022_normalized_expr_scored.h5ad')

In [None]:
# Plot the UMAP stored under 'X_umap' (copied from obsm['UMAP.OLD']) for all cells,
# coloured by the integrated cell-type annotation
sc.pl.umap(adata, color=['cell_type_integrated'])

In [None]:
# Clusters kept for the focused plots that follow
clusters_to_keep = [
    'CD34+MFAP5+ C9',
    'CXCL10+CCL19+ C11',
    'SPARC+COL3A1+ C4',
    'MYH11+ C13',
    'PTGS2+SEMA4A+ C8',
    'FBLN1+ C5',
]
in_kept_cluster = adata.obs['cell_type_integrated'].isin(clusters_to_keep)

# .copy() turns the view into an independent AnnData object
adata_filtered = adata[in_kept_cluster].copy()

In [None]:
# Same UMAP as above, restricted to the clusters kept in adata_filtered
sc.pl.umap(adata_filtered, color=['cell_type_integrated'])

In [None]:
from _umap_plots import UMAPPlotter

# Initialize plotter (project-local helper; argument meanings below are taken from
# their names only — TODO confirm against _umap_plots)
plotter = UMAPPlotter(dataset_name="med-2022",
                      base_output_dir="figures",
                      default_continuous_cmap='viridis',
                      categorical_width_ratio= 1.33,
                      continuous_width_ratio = 1,
                     )

# Plot clusters with exact column name
plotter.plot_clusters(adata_filtered, 
                     cluster_key="cell_type_integrated",
                     save=True,
                     show=True)
# Plot program scores. The five calls differ only in program_key and in an
# optional per-figure width_ratio override.
# NOTE(review): candidates for a loop over (program_key, width_ratio) pairs.
plotter.plot_program(adata_filtered,
                     program_key="prog_32",
                     save=True,
                     show=True) 

# prog_31: width_ratio overridden to 0.95
plotter.plot_program(adata_filtered,
                     program_key="prog_31",
                     width_ratio = 0.95,
                     save=True,
                     show=True) 

plotter.plot_program(adata_filtered,
                     program_key="prog_27",
                     save=True,
                     show=True) 

# prog_19 and prog_4: width_ratio overridden to 1.05
plotter.plot_program(adata_filtered,
                     program_key="prog_19",
                     width_ratio = 1.05,
                     save=True,
                     show=True)

plotter.plot_program(adata_filtered,
                     program_key="prog_4",
                     width_ratio = 1.05,
                     save=True,
                     show=True) 

In [None]:
# Marker genes per fibroblast state. The dict keys are passed to sc.pl.matrixplot
# together with the genes, so a misspelled key ends up in the saved figures
# (fixed: 'interferon-repsonse' -> 'interferon-response').
marker_genes = {'myofibroblast' : ['ACTA2', 'TAGLN', 'CNN1'],
                'inflammatory': ['SPARC', 'COL3A1', 'COL1A1'],
                'universal': ['IGFBP6','S100A10', 'DCN'],#top 3 de in marginal
                'interferon-response':['HLA-A', 'HLA-B','ISG15'],
               }

In [None]:
# Marker-gene matrix plot for the filtered clusters, grouped by integrated cell type.
# standard_scale="var" asks scanpy to rescale each gene (variable) across groups;
# `save` writes the figure using the given file-name suffix.
sc.pl.matrixplot(adata_filtered, marker_genes, groupby="cell_type_integrated", standard_scale="var",
                 figsize = (4.5,2),
                save = 'med_2022_markerplot.pdf'
                )

In [None]:
# Same marker-gene matrix plot, but for every cluster in the full atlas (taller figure)
sc.pl.matrixplot(adata, marker_genes, groupby="cell_type_integrated", standard_scale="var",
                 figsize = (4.5,5),
                save = 'med_2022_full_markerplot.pdf'
                )

In [None]:
def calculate_cluster_ratio_score(
    adata,
    numerator_genes=['PLAGL1'],
    denominator_genes=['KLF2', 'KLF4'],
    cluster_key='leiden',
    score_name='cluster_ratio_score',
    target_sum=1e4,
    use_raw=True
):
    """
    Calculate ratio score based on relative expression levels within clusters.

    For every cluster the expression of the numerator genes and of the
    denominator genes is averaged in linear space (over all cells and genes of
    the respective set), log1p-transformed, and the two values are divided.
    The per-cluster ratio is also written to ``adata.obs[score_name]``.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix.
    numerator_genes : list
        List of genes for numerator.
    denominator_genes : list
        List of genes for denominator.
    cluster_key : str
        Name of the clustering annotation in adata.obs.
    score_name : str
        Name for the resulting score in adata.obs.
    target_sum : float
        Target sum for normalization if using raw data.
    use_raw : bool
        Whether to use raw data (if available) and normalize it, or use
        the normalized data in adata.X. When ``adata.raw`` is missing the
        function falls back to ``adata.X`` and treats it as log1p-normalized,
        exactly as with ``use_raw=False``.

    Returns
    -------
    pandas.DataFrame
        One row per cluster, sorted by ``ratio_score`` (descending), with the
        columns ``numerator_mean``, ``denominator_mean``, ``ratio_score`` and
        one ``<gene>_mean`` column per gene. ``ratio_score`` is ``inf`` when
        the denominator mean is not positive. Per-gene means are mean
        normalized counts in raw mode and log1p(mean(expm1(x))) otherwise.
    """
    # Local import: the notebook never imports scipy explicitly, so relying on a
    # global `scipy` name would raise NameError on a fresh kernel.
    from scipy import sparse

    numerator_genes = list(numerator_genes)
    denominator_genes = list(denominator_genes)

    # Decide once which data source is really used, so that the later
    # transforms match it (previously a missing .raw silently mixed both modes).
    from_raw = bool(use_raw) and adata.raw is not None

    def _to_dense(matrix):
        """Return `matrix` as a 2-D float ndarray (cells x genes)."""
        if sparse.issparse(matrix):
            matrix = matrix.toarray()
        dense = np.asarray(matrix, dtype=float)
        return dense.reshape(dense.shape[0], -1)

    if from_raw:
        # Row sums work for sparse and dense matrices alike and avoid
        # densifying the complete raw matrix.
        totals = np.asarray(adata.raw.X.sum(axis=1), dtype=float).reshape(-1, 1)
        # Cells without any counts would divide by zero; their counts are all
        # zero anyway, so any positive divisor yields the correct 0.
        totals = np.where(totals > 0, totals, 1.0)

    def get_expression(genes):
        """Linear-scale expression (cells x genes) for `genes`."""
        if from_raw:
            counts = _to_dense(adata.raw[:, genes].X)
            return (counts / totals) * target_sum
        # adata.X is assumed to be log1p-normalized; undo the log.
        return np.expm1(_to_dense(adata[:, genes].X))

    num_expr = get_expression(numerator_genes)
    denom_expr = get_expression(denominator_genes)

    labels = adata.obs[cluster_key]
    cluster_stats = {}

    # NaN labels never compare equal to themselves and would give empty slices.
    for cluster in labels.dropna().unique():
        mask = np.asarray(labels == cluster, dtype=bool)

        num_expr_cluster = num_expr[mask]
        denom_expr_cluster = denom_expr[mask]

        # Mean in linear space, then back to log scale.
        num_mean = np.log1p(np.mean(num_expr_cluster))
        denom_mean = np.log1p(np.mean(denom_expr_cluster))

        stats = {
            'numerator_mean': num_mean,
            'denominator_mean': denom_mean,
            'ratio_score': num_mean / denom_mean if denom_mean > 0 else np.inf
        }

        # Per-gene means, numerator genes first. The linear mean is taken per
        # gene before the log transform (the former code averaged log values
        # and then applied a transform that cancelled itself out).
        gene_means = list(zip(numerator_genes, np.mean(num_expr_cluster, axis=0)))
        gene_means += list(zip(denominator_genes, np.mean(denom_expr_cluster, axis=0)))
        for gene, linear_mean in gene_means:
            stats[f'{gene}_mean'] = linear_mean if from_raw else np.log1p(linear_mean)

        cluster_stats[cluster] = stats

    stats_df = pd.DataFrame.from_dict(cluster_stats, orient='index')
    stats_df = stats_df.sort_values('ratio_score', ascending=False)

    # Add score to original adata object
    adata.obs[score_name] = adata.obs[cluster_key].map(stats_df['ratio_score'])
    adata.obs[score_name] = adata.obs[score_name].astype('float64')

    return stats_df


# Calculate cluster-level ratios: PLAGL1 relative to KLF2/KLF4 per integrated cell type
numerator_genes=['PLAGL1']
denominator_genes=['KLF2','KLF4']
cluster_key='cell_type_integrated'
score_name='PLAGL1_cluster_ratio'

stats = calculate_cluster_ratio_score(adata,
                                    numerator_genes,
                                    denominator_genes,
                                    cluster_key,
                                    score_name)


# Print the statistics to see values for each cluster (printed once; the
# duplicated print block was removed)
print("\nCluster Statistics:")
print(stats)

#Scale and save to new layer
adata.layers['zscore'] = sc.pp.scale(adata, copy=True).X

# Plot
genes_to_plot = numerator_genes + denominator_genes #+ [score_name]

print(genes_to_plot)

# Both dot plots compare the same two clusters, so subset once.
clusters_to_compare = ['SPARC+COL3A1+ C4', 'CD34+MFAP5+ C9']
adata_compare = adata[adata.obs[cluster_key].isin(clusters_to_compare),:]

# Z-scored expression of the individual genes
sc.pl.dotplot(adata_compare,
              var_names=genes_to_plot,
              layer = 'zscore',
              #cmap = 'RdBu_r',
              vcenter = 0,
              #var_group_positions=[[0], [1,2], [3]],
              groupby=cluster_key,
             save = 'med2022_plag_klf_expr_dotplot.pdf')

# Cluster-level ratio score; refer to it through score_name so the plot follows
# the column that calculate_cluster_ratio_score actually wrote.
# NOTE(review): score_name is an obs column rather than a gene — confirm the
# installed scanpy accepts obs keys in var_names, and that layer='zscore' is
# meant to apply here.
sc.pl.dotplot(adata_compare,
              var_names=score_name,
              layer = 'zscore',
              cmap = 'viridis',
              vcenter = 1,
              #var_group_positions=[[0], [1,2], [3]],
              groupby=cluster_key,
             save = 'med2022_plag_klf_ratio_dotplot.pdf')