In [None]:
import sys
# adding notebooks to the system path
sys.path.insert(0, '/home/southark/notebooks')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
pd.options.display.float_format = '{:.4f}'.format
import matplotlib
from tqdm import tqdm

matplotlib.rcParams['pdf.fonttype'] = 42
%load_ext autoreload
%autoreload 2
%matplotlib inline

from perturbseq import *

import scanpy as sc

In [None]:
# Base directory (trailing slash included) that the gene-program CSV is read from.
data_path = "/data/norman/southark/tfs_standardized/"

# Load gene programs

In [None]:
# comps: definitions of the sparse gene expression programs
# (an entry is positive when the gene is included in the program).
# The first CSV column becomes the row index.
comps = pd.read_csv(
    filepath_or_buffer=f'{data_path}20240331_fibroblast_bulk_comps.csv',
    index_col=0,
)

In [None]:
def df_to_dict(df, threshold=0.05):
    """Collect, for every row of `df`, the columns whose value exceeds `threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to scan row by row.
    threshold : float
        A column is kept for a row only when its value is strictly greater
        than this number.

    Returns
    -------
    dict
        Maps each row label to the list of selected column labels, in column
        order. Rows without any selected column are left out.
    """
    # Lazily pair each row label with the columns that pass the cut-off ...
    selected_per_row = (
        (row_label, [name for name, value in row_values.items() if value > threshold])
        for row_label, row_values in df.iterrows()
    )
    # ... and keep only the rows where at least one column survived.
    return {row_label: names for row_label, names in selected_per_row if names}

# Program label -> genes whose entry in `comps` is above the 0.05 cut-off
gene_prog_dict = df_to_dict(comps, threshold=0.05)

# Load an in vivo dataset 

## Cross-tissue human fibroblast atlas reveals myofibroblast subtypes with distinct roles in immune modulation
Gao, Yang et al.
Cancer Cell, Volume 42, Issue 10, 1764 - 1783.e10

In [None]:
import scanpy as sc
import pandas as pd
from scipy import io

# Expression matrix stored in Matrix Market format
mtx = io.mmread(
    '/data/norman/southark/external_datasets/cancer_cell_fibro_atlas/expression_matrix.mtx'
)

# Gene and barcode labels: first column of each header-less file
genes = pd.read_csv(
    '/data/norman/southark/external_datasets/cancer_cell_fibro_atlas/genes.tsv',
    header=None,
)[0]
barcodes = pd.read_csv(
    '/data/norman/southark/external_datasets/cancer_cell_fibro_atlas/barcodes.tsv',
    header=None,
)[0]

# Per-cell metadata table
metadata = pd.read_csv(
    '/data/norman/southark/external_datasets/cancer_cell_fibro_atlas/metadata.csv'
)

# Report the size of everything that was read
print("Original matrix shape:", mtx.shape)
print("Number of genes:", len(genes))
print("Number of barcodes:", len(barcodes))
print("Number of metadata rows:", len(metadata))

# Flip to (cells x genes) and switch to CSR storage
mtx = mtx.transpose().tocsr()

print("\nTransposed matrix shape:", mtx.shape)

# Assemble the AnnData object: metadata rows become obs, the gene labels index var.
# NOTE(review): metadata rows are assumed to be in the same order as barcodes.tsv;
# nothing here checks that - confirm against the source files.
gene_table = pd.DataFrame(index=genes)
adata = sc.AnnData(X=mtx, obs=metadata, var=gene_table)

# Label the cells with the barcodes
adata.obs_names = barcodes

# Basic verification
print(f"\nFinal AnnData object shape: {adata.shape}")
print(f"Number of cells: {adata.n_obs}")
print(f"Number of genes: {adata.n_vars}")
print("\nMetadata columns:", adata.obs.columns.tolist())


In [None]:
# # mitochondrial genes, "MT-" for human, "Mt-" for mouse
# adata.var["mt"] = adata.var_names.str.startswith("MT-")
# # ribosomal genes
# adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
# # hemoglobin genes
# adata.var["hb"] = adata.var_names.str.contains("^HB[^(P)]")

In [None]:
# #qc genes already filtered?
# sc.pp.calculate_qc_metrics(
#     adata, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True
# )

In [None]:
# sc.pl.violin(
#     adata,
#     ["n_genes_by_counts", "total_counts", "pct_counts_mt"],
#     jitter=0.4,
#     multi_panel=True,
# )

In [None]:
# Copy the UMAP coordinates stored as obs columns into obsm['X_umap'],
# the key the embedding plot further down refers to via basis='X_umap'.
umap_columns = ['UMAP_1', 'UMAP_2']
adata.obsm['X_umap'] = adata.obs[umap_columns].to_numpy()


# Apply gene set scores to the single cells

In [None]:
# Restrict every program to the genes that are present in adata.var;
# programs that end up empty are reported and left out.
genes_in_dataset = set(adata.var.index)
filtered_gene_prog_dict = {}

for key, gene_set in gene_prog_dict.items():
    filtered_gene_set = set(gene_set) & genes_in_dataset
    if not filtered_gene_set:
        print(f'for prog {key} there are no genes left')
        continue
    filtered_gene_prog_dict[key] = list(filtered_gene_set)

In [None]:
from _weighted_score_genes import *

# prog31 = comps.loc[32][comps.loc[32] > 0]

# weighted_score = weighted_score_genes(adata, gene_list=prog31.index, weights = prog31.values , score_name="prog_KLF4_weighted")

# Score every program that still has genes in this dataset, one score per program
# named "prog_<key>".
# NOTE(review): `gene_set` (the genes filtered to adata.var) is not used here; the gene
# list and weights are taken straight from `comps` with a >= 0.05 cut-off, so genes
# absent from `adata` may be passed to weighted_score_genes - confirm the helper copes.
for key, gene_set in filtered_gene_prog_dict.items():
    print(f'key:{key}')

    # Weights of the current program, kept when >= 0.05 and sorted ascending
    program_weights = comps.loc[key]
    prog = program_weights[program_weights >= 0.05].sort_values()

    weighted_score_genes(
        adata,
        gene_list=prog.index,
        weights=prog.values,
        score_name=f"prog_{key}",
    )


In [None]:
# Score names ("prog_<key>") for every program kept after gene filtering
programs_to_plot = [f"prog_{key}" for key in filtered_gene_prog_dict]

In [None]:
# UMAP coloured by every program score.
# NOTE(review): no "logcounts" layer is created in the visible part of this notebook -
# confirm it exists or is not needed when colouring by these scores.
sc.pl.embedding(
    adata,
    basis='X_umap',
    color=programs_to_plot,
    layer="logcounts",
)

In [None]:
from _umap_plots import UMAPPlotter

# Set up the plotter for this dataset
plotter = UMAPPlotter(
    dataset_name="cancer-cell-2024",
    base_output_dir="figures",
    default_continuous_cmap='viridis',
    categorical_width_ratio=1.3,
    continuous_width_ratio=1.1,
)

# UMAP coloured by the "Cluster" annotation
plotter.plot_clusters(
    adata,
    cluster_key="Cluster",
    point_size=3,
    save=True,
    show=True,
)

# One UMAP per selected program score, all drawn with the same settings
# and in the same order as before.
for program_key in ("prog_32", "prog_31", "prog_27", "prog_19", "prog_4"):
    plotter.plot_program(
        adata,
        program_key=program_key,
        vmin="p00.1",
        vmax="p99.9",
        point_size=5,
        save=True,
        show=True,
    )

In [None]:
# Marker genes per fibroblast state; the keys are used as group labels in the
# matrix plots below, so they must be spelled as they should appear in the figure.
marker_genes = {
    'myofibroblast': ['ACTA2', 'TAGLN', 'CNN1', 'TPM1', 'MYH9'],
    'inflammatory': ['SPARC', 'COL3A1', 'COL1A1', 'COL1A2', 'CTHRC1'],
    'universal': ['IGFBP6', 'S100A10', 'DCN', 'AHNAK', 'ANXA1'],
    # was misspelled 'interferon-repsonse'
    'interferon-response': ['HLA-A', 'HLA-B', 'ISG15', 'GBP1', 'IFI6'],
}

In [None]:
# Marker-gene matrix plot over all clusters; each gene is scaled across groups
# (standard_scale="var") and the figure is saved under the given file name.
sc.pl.matrixplot(
    adata,
    marker_genes,
    groupby="Cluster",
    standard_scale="var",
    figsize=(5, 5),
    save='cancercell_2024_full_markerplot.pdf',
)

In [None]:
# Same marker-gene matrix plot, restricted to a hand-picked set of clusters
selected_clusters = ['c01', 'c03', 'c04', 'c16', 'c05', 'c19']
sc.pl.matrixplot(
    adata[adata.obs['Cluster'].isin(selected_clusters), :],
    marker_genes,
    groupby="Cluster",
    standard_scale="var",
    figsize=(5, 2),
    save='cancercell_2024_markerplot.pdf',
)

In [None]:
# Name the var (gene) index 'gene_name'. This only sets the index name; the index
# values themselves are not converted.
# NOTE(review): presumably needed so the object can be written to .h5ad below - confirm.
adata.var.index.name = 'gene_name'

In [None]:
# Show the var (gene) index as the cell output.
adata.var.index

In [None]:
# Persist the AnnData object, including the program scores added to it above.
adata.write(
    '/data/norman/southark/external_datasets/cancer_cell_fibro_atlas/250127_Fib_Atlas_cancercell2024_scored.h5ad'
)

In [None]:
def calculate_cluster_ratio_score(
    adata,
    numerator_genes=['PLAGL1'],
    denominator_genes=['KLF2', 'KLF4'],
    cluster_key='leiden',
    score_name='cluster_ratio_score',
    target_sum=1e4,
    use_raw=True
):
    """
    Calculate a per-cluster ratio of numerator-gene to denominator-gene expression.

    For every cluster the expression of all numerator genes (and, separately, of
    all denominator genes) is pooled over the cluster's cells and genes, averaged,
    and log1p-transformed; the score is ``numerator_mean / denominator_mean``
    (``inf`` when the denominator mean is not positive). The score is also written
    to ``adata.obs[score_name]`` so that every cell carries its cluster's score.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix. Modified in place: ``adata.obs[score_name]`` is set.
    numerator_genes : list
        List of genes for numerator.
    denominator_genes : list
        List of genes for denominator.
    cluster_key : str
        Name of the clustering annotation in adata.obs.
    score_name : str
        Name for the resulting score in adata.obs.
    target_sum : float
        Per-cell total that counts from ``adata.raw`` are rescaled to.
    use_raw : bool
        If True and ``adata.raw`` is set, counts are taken from ``adata.raw`` and
        rescaled per cell to ``target_sum``. Otherwise values come from ``adata.X``.
        With ``use_raw=False`` the values are taken to be log1p-transformed and are
        mapped back with expm1 before averaging.

    Returns
    -------
    pandas.DataFrame
        One row per cluster, sorted by ``ratio_score`` (descending), with columns
        ``numerator_mean``, ``denominator_mean``, ``ratio_score`` and one
        ``<gene>_mean`` column per gene. With ``use_raw=True`` the ``<gene>_mean``
        columns hold plain means; with ``use_raw=False`` they hold
        ``log1p(mean(expm1(values)))``, like the pooled means.

    Notes
    -----
    NOTE(review): with ``use_raw=True`` but no ``adata.raw``, values are read from
    ``adata.X`` and still averaged as-is (no expm1). Confirm ``adata.X`` is on a
    linear scale in that case, or pass ``use_raw=False``.
    """
    # Imported here because the notebook only runs `from scipy import io`, which
    # does not make the name `scipy` (or `scipy.sparse`) available.
    from scipy import sparse

    def to_dense(matrix):
        """Return `matrix` as a dense array, densifying scipy sparse input."""
        return matrix.toarray() if sparse.issparse(matrix) else np.asarray(matrix)

    raw_available = use_raw and adata.raw is not None
    if raw_available:
        # Per-cell totals are summed without densifying the whole raw matrix;
        # this works for sparse and dense storage alike.
        total_counts = np.asarray(adata.raw.X.sum(axis=1), dtype='float64').reshape(-1, 1)
        # Cells without any counts would give 0/0 = NaN; their selected counts are
        # all zero, so dividing by 1 keeps them at 0.
        total_counts[total_counts == 0] = 1.0

    def get_expression(genes):
        """Return a dense (cells x genes) expression array for `genes`."""
        if raw_available:
            return (to_dense(adata.raw[:, genes].X) / total_counts) * target_sum
        return to_dense(adata[:, genes].X)

    def pooled_mean(values):
        """log1p of the mean over all cells and genes, taken on the linear scale."""
        linear = values if use_raw else np.expm1(values)
        return np.log1p(np.mean(linear))

    def per_gene_mean(values):
        """One mean per gene (column); see Returns for the scale in each mode."""
        if use_raw:
            return np.mean(values, axis=0)
        # Back-transform each cell first, then average, then log1p. The previous
        # code applied expm1/log1p to the already-averaged value, which is a no-op.
        return np.log1p(np.mean(np.expm1(values), axis=0))

    # Get expression values
    num_expr = get_expression(numerator_genes)
    denom_expr = get_expression(denominator_genes)

    cluster_labels = adata.obs[cluster_key]
    cluster_stats = {}

    for cluster in cluster_labels.unique():
        # Plain boolean array so the row selection does not depend on the obs index
        mask = (cluster_labels == cluster).to_numpy()

        num_expr_cluster = num_expr[mask]
        denom_expr_cluster = denom_expr[mask]

        num_mean = pooled_mean(num_expr_cluster)
        denom_mean = pooled_mean(denom_expr_cluster)

        # Store statistics
        entry = {
            'numerator_mean': num_mean,
            'denominator_mean': denom_mean,
            'ratio_score': num_mean / denom_mean if denom_mean > 0 else np.inf
        }

        # Store individual gene expressions
        for gene, value in zip(numerator_genes, per_gene_mean(num_expr_cluster)):
            entry[f'{gene}_mean'] = value
        for gene, value in zip(denominator_genes, per_gene_mean(denom_expr_cluster)):
            entry[f'{gene}_mean'] = value

        cluster_stats[cluster] = entry

    stats_df = pd.DataFrame.from_dict(cluster_stats, orient='index')
    stats_df = stats_df.sort_values('ratio_score', ascending=False)

    # Add score to original adata object
    adata.obs[score_name] = adata.obs[cluster_key].map(stats_df['ratio_score'])
    adata.obs[score_name] = adata.obs[score_name].astype('float64')

    return stats_df


# Inputs for the cluster-level PLAGL1 vs. KLF2/KLF4 ratio
numerator_genes = ['PLAGL1']
denominator_genes = ['KLF2', 'KLF4']
cluster_key = 'Cluster'
score_name = 'PLAGL1_cluster_ratio'

# Compute the per-cluster statistics; this also writes adata.obs[score_name]
stats = calculate_cluster_ratio_score(
    adata,
    numerator_genes=numerator_genes,
    denominator_genes=denominator_genes,
    cluster_key=cluster_key,
    score_name=score_name,
)

# Print the statistics to see values for each cluster
print("\nCluster Statistics:")
print(stats)

# Keep a scaled copy of the expression matrix in its own layer
adata.layers['zscore'] = sc.pp.scale(adata, copy=True).X

# Genes shown in the first dot plot (the ratio itself gets its own plot)
genes_to_plot = numerator_genes + denominator_genes

print(genes_to_plot)

# Both dot plots are restricted to the same clusters
clusters_of_interest = ['c03', 'c04', 'c05', 'c16']

# Dot plot of the individual genes, taken from the 'zscore' layer
sc.pl.dotplot(
    adata[adata.obs[cluster_key].isin(clusters_of_interest), :],
    var_names=genes_to_plot,
    layer='zscore',
    vcenter=0,
    groupby=cluster_key,
    save='cancercell2024_plag_klf_expr_dotplot.pdf',
)

# Dot plot of the cluster-level ratio score (an obs column), centred at 1
sc.pl.dotplot(
    adata[adata.obs[cluster_key].isin(clusters_of_interest), :],
    var_names=score_name,
    layer='zscore',
    cmap='viridis',
    vcenter=1,
    groupby=cluster_key,
    save='cancercell2024_plag_klf_ratio_dotplot.pdf',
)