In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import os

## Helper function definitions

In [2]:
# Define function to calculate ECF
def calculate_ecf(adata, genes, cell_type, cell_type_col, exclude_type=None):
    """Compute the expressed-cell fraction (ECF) of each gene, in percent.

    A cell counts as expressing a gene when its value in ``adata.X`` is
    strictly greater than zero.

    Parameters
    ----------
    adata : AnnData-like
        Object exposing ``var_names``, ``obs`` and ``adata[:, gene].X``.
        ``X`` may be sparse (anything with a ``toarray()`` method) or dense.
    genes : iterable of str
        Genes to evaluate; the result has one row per entry, in the same order.
    cell_type : str
        Only used to name the output column ``f"{cell_type}_ECF"``.
    cell_type_col : str
        Column of ``adata.obs`` holding the cell-type labels. Only read when
        ``exclude_type`` is given.
    exclude_type : optional
        Label of a cell type to leave out. Cells carrying this label are
        removed from BOTH the expressing-cell count and the total, so the
        result is always within [0, 100].

    Returns
    -------
    pandas.DataFrame
        Columns ``Gene`` and ``f"{cell_type}_ECF"``. The ECF is NaN for genes
        absent from ``adata.var_names`` and 0 when no cells remain to count.
    """
    ecf_col = f'{cell_type}_ECF'

    # The set of cells that count is identical for every gene: build it once.
    keep_mask = None
    if exclude_type is not None:
        keep_mask = (adata.obs[cell_type_col] != exclude_type).to_numpy()

    gene_ecf_list = []
    for gene_name in genes:
        if gene_name not in adata.var_names:
            gene_ecf_list.append({'Gene': gene_name, ecf_col: np.nan})
            continue

        # Accept sparse and dense storage alike.
        column = adata[:, gene_name].X
        if hasattr(column, 'toarray'):
            column = column.toarray()
        gene_expression = np.asarray(column).ravel()

        # Drop excluded cells before counting so numerator and denominator
        # refer to the same population.
        if keep_mask is not None:
            gene_expression = gene_expression[keep_mask]

        total_cells = gene_expression.shape[0]
        expressed_cells = np.count_nonzero(gene_expression > 0)
        ecf = (expressed_cells / total_cells) * 100 if total_cells > 0 else 0

        gene_ecf_list.append({'Gene': gene_name, ecf_col: ecf})
    return pd.DataFrame(gene_ecf_list)

# Define function to calculate cell type percentage
def calculate_cell_type_percentage(genes, adata, cell_types, cell_type_col):
    """For each gene, compute the percentage of cells of each type expressing it.

    A cell counts as expressing a gene when its value in ``adata.X`` is
    strictly greater than zero.

    Parameters
    ----------
    genes : iterable of str
        Genes to evaluate; the result has one row per entry, in the same order.
    adata : AnnData-like
        Object exposing ``var_names``, ``obs`` and ``adata[:, gene].X``.
        ``X`` may be sparse (anything with a ``toarray()`` method) or dense.
    cell_types : iterable
        Labels to report; each becomes one output column.
    cell_type_col : str
        Column of ``adata.obs`` holding the cell-type labels.

    Returns
    -------
    pandas.DataFrame
        Column ``Gene`` followed by one column per cell type, holding
        ``100 * (#expressing cells of that type) / (#cells of that type)``
        rounded to 2 decimals. The value is 0 for a type with no cells and
        NaN for genes absent from ``adata.var_names``.
    """
    cell_types = list(cell_types)
    obs_labels = adata.obs[cell_type_col]

    # Membership masks and type sizes do not depend on the gene: compute once
    # instead of once per (gene, cell type) pair.
    type_masks = [(obs_labels == cell_type).to_numpy() for cell_type in cell_types]
    type_sizes = [np.sum(mask) for mask in type_masks]

    data = []
    for gene_name in genes:
        gene_data = {'Gene': gene_name}
        if gene_name in adata.var_names:
            # Accept sparse and dense storage alike.
            column = adata[:, gene_name].X
            if hasattr(column, 'toarray'):
                column = column.toarray()
            is_expressed = np.asarray(column).ravel() > 0

            for cell_type, mask, total_cells_in_type in zip(cell_types, type_masks, type_sizes):
                expressed_cells_in_type = np.sum(mask & is_expressed)
                if total_cells_in_type > 0:
                    percentage = (expressed_cells_in_type / total_cells_in_type) * 100
                else:
                    percentage = 0
                gene_data[cell_type] = round(percentage, 2)
        else:
            for cell_type in cell_types:
                gene_data[cell_type] = np.nan
        data.append(gene_data)

    return pd.DataFrame(data)

# Define function to create empty dataframe
def create_empty_df(genes, cell_types):
    """Return a DataFrame with one row per gene and one column per cell type,
    every entry initialised to the float 0.0."""
    zero_filled = pd.DataFrame(data=0.0, index=genes, columns=cell_types)
    return zero_filled

# Define function to calculate cell type percentage among expressed cells
def calculate_exp_cell_type_percentage(genes, adata, cell_types, cell_type_col):
    """For each gene, report how its expressing cells are split across cell types.

    Starts from a zero-filled genes x cell_types frame (``create_empty_df``).
    For every gene found in ``adata.var_names``, the cells with a value > 0
    are selected and the share (in percent, rounded to 2 decimals) of each
    label in ``adata.obs[cell_type_col]`` among them is written into the
    gene's row. Genes that are missing, or that no cell expresses, keep
    their all-zero row.

    Returns the filled DataFrame with any remaining NaN replaced by 0.
    """
    shares_df = create_empty_df(genes, cell_types)

    for gene in genes:
        if gene not in adata.var_names:
            continue

        positive = adata[:, gene].X > 0
        expressing = adata[positive, :]
        if not expressing.n_obs > 0:
            continue

        label_shares = expressing.obs[cell_type_col].value_counts(normalize=True) * 100
        for label, share in label_shares.items():
            # NOTE(review): a label that is not among the requested cell_types
            # is presumably appended as a new column by this assignment; the
            # fillna below then zeroes its other rows — confirm this is wanted.
            shares_df.loc[gene, label] = round(share, 2)

    shares_df.fillna(0, inplace=True)
    return shares_df

## Load adata

In [3]:
# Data without preprocessing
adata_gs = sc.read('data/CCA_Lung_geosketch.h5ad')

# Independent copies restricted to a single 'cell_type_subset' label each
gs_subset_labels = adata_gs.obs['cell_type_subset']
adata_gs_tumor = adata_gs[gs_subset_labels == 'tumor'].copy()
adata_gs_Tip_Cells = adata_gs[gs_subset_labels == 'Tip_Cells'].copy()

In [4]:
# Preprocessed data
adata_pre = sc.read('data/CCA_Lung_geosketch_prepro.h5ad')

# Independent copies restricted to a single 'cell_type_subset' label each
pre_subset_labels = adata_pre.obs['cell_type_subset']
adata_pre_tumor = adata_pre[pre_subset_labels == 'tumor'].copy()
adata_pre_Tip_Cells = adata_pre[pre_subset_labels == 'Tip_Cells'].copy()

In [5]:
# Load result files: every 'gcam_*.csv' under `path` becomes one DataFrame in `dfs`
path = 'res/'
dfs = {}
# os.listdir returns names in arbitrary order; sort so `dfs` is filled in a
# reproducible order from run to run.
file_list = sorted(f for f in os.listdir(path) if f.startswith("gcam_") and f.endswith(".csv"))

for file in file_list:
    # Key = the text between the leading 'gcam_' prefix and the first '_res'
    key = file[len("gcam_"):].split("_res")[0]
    # os.path.join keeps working if `path` is later given without a trailing slash
    df = pd.read_csv(os.path.join(path, file))
    dfs[key] = df

## Gene filtering

In [6]:
# Generate gene lists
typelist = 'cell_type_subset'   # obs column holding the cell-type labels
celltype = 'Tip_Cells'          # obs label, key into `dfs`, and column of the result table
tumor = 'TumorCell'             # column of the result table holding the tumor-side gene

adata_pre_celltype = adata_pre[adata_pre.obs[typelist] == celltype].copy()

# Gene pairs reported for this cell type
pair_table = dfs[celltype]
genelist1 = set(pair_table[celltype].tolist())
genelist2 = set(pair_table[tumor].tolist())

# Genes whose mean value over the subset is positive. to_df() builds a full
# DataFrame of X, so take each per-gene mean once and reuse it.
celltype_gene_mean = adata_pre_celltype.to_df().mean()
tumor_gene_mean = adata_pre_tumor.to_df().mean()
avg_genelist1 = set(celltype_gene_mean[celltype_gene_mean > 0].index.tolist())
avg_genelist2 = set(tumor_gene_mean[tumor_gene_mean > 0].index.tolist())

fil_genelist1 = list(genelist1 & avg_genelist1)
fil_genelist2 = list(genelist2 & avg_genelist2)

# Keep only pairs where both partners pass the mean > 0 filter,
# strongest pairs first.
fil = pair_table[pair_table[celltype].isin(fil_genelist1)]
fil = fil[fil[tumor].isin(fil_genelist2)]
Tip = fil.sort_values(by='Mean Normalized_Weight', ascending=False).reset_index(drop=True)

In [7]:
# Calculate ECF
tumor_genes = list(Tip['TumorCell'].unique())
tip_genes = list(Tip['Tip_Cells'].unique())
cell_types_gs = adata_gs.obs['cell_type_subset'].unique()

# NOTE(review): df_tumor_ecf_all / df_tip_ecf_all were read below without being
# defined in any earlier cell, so this cell raised NameError on a fresh kernel.
# They are reconstructed here from the 'Total_ECF' column the code expects
# (calculate_ecf names its column f'{cell_type}_ECF'); confirm that the ECF
# over all cells of adata_gs, with no excluded type, is what was intended.
df_tumor_ecf_all = calculate_ecf(adata_gs, tumor_genes, 'Total', 'cell_type_subset')
df_tip_ecf_all = calculate_ecf(adata_gs, tip_genes, 'Total', 'cell_type_subset')

df_tip_ecf_all_unique = df_tip_ecf_all.drop_duplicates(subset=['Gene'], keep='first').reset_index(drop=True)
df_tumor_ecf_all_unique = df_tumor_ecf_all.drop_duplicates(subset=['Gene'], keep='first').reset_index(drop=True)

# Calculate cell type percentage for TumorCell gene list.
# The total ECF is attached by gene name (not by row position) and from the
# de-duplicated table, the same way for both gene lists.
tumor_cell_type_percentage_df = calculate_cell_type_percentage(tumor_genes, adata_gs, cell_types_gs, 'cell_type_subset')
tumor_cell_type_percentage_df['Tumor_Total_ECF'] = tumor_cell_type_percentage_df['Gene'].map(
    df_tumor_ecf_all_unique.set_index('Gene')['Total_ECF']
)

# Calculate cell type percentage for Tip_Cells gene list
tip_cell_type_percentage_df = calculate_cell_type_percentage(tip_genes, adata_gs, cell_types_gs, 'cell_type_subset')
tip_cell_type_percentage_df['Tip_Total_ECF'] = tip_cell_type_percentage_df['Gene'].map(
    df_tip_ecf_all_unique.set_index('Gene')['Total_ECF']
)

# Set index and filter results: keep genes whose highest per-type percentage
# is reached in the target cell type (ties included). The ECF column is
# dropped before taking the row-wise maximum so it cannot win.
tumor_cell_type_percentage_df = tumor_cell_type_percentage_df.set_index('Gene')
tumor_row_max = tumor_cell_type_percentage_df.drop('Tumor_Total_ECF', axis=1).max(axis=1)
filtered_df_tumor = tumor_cell_type_percentage_df[tumor_cell_type_percentage_df['tumor'] == tumor_row_max]

tip_cell_type_percentage_df = tip_cell_type_percentage_df.set_index('Gene')
tip_row_max = tip_cell_type_percentage_df.drop('Tip_Total_ECF', axis=1).max(axis=1)
filtered_df_tip = tip_cell_type_percentage_df[tip_cell_type_percentage_df['Tip_Cells'] == tip_row_max]

In [8]:
# Genes that survived the per-cell-type filter on each side
gene_list_tumor = filtered_df_tumor.index.tolist()
gene_list_tip = filtered_df_tip.index.tolist()

# Keep the pairs whose tumor gene AND tip gene both survived
tumor_side_ok = Tip['TumorCell'].isin(gene_list_tumor)
tmp = Tip[tumor_side_ok]
tip_side_ok = tmp['Tip_Cells'].isin(gene_list_tip)
tmp = tmp[tip_side_ok]

# Of those, retain the pairs with a 'Count' of at least 4
common_pair = tmp.reset_index(drop=True).loc[lambda pairs: pairs['Count'] >= 4]

# Display with a clean index and 4-decimal rounding (common_pair itself is left as is)
common_pair.reset_index(drop=True).round(4)