# Load the data files

In [2]:
wes_raw = pd.read_csv("/Users/nxr042/research/visium/megabladder/results/all_int_bl_no_mgb1/WES-qiushi/updated_annovar_processed.txt",
                       sep='\t',quoting=csv.QUOTE_NONE)
display(wes_raw.head(2))

visium_dea_filelist = ["/Users/nxr042/research/visium/megabladder/results/all_int_bl_no_mgb1/res_0.8/Whole_bladder_MGBvsWT_Filtered.csv",
            "/Users/nxr042/research/visium/megabladder/results/all_int_bl_no_mgb1/res_0.8/Outer_layer_Filtered_diffgenes_fdr_1MGBvsWT.csv",
            "/Users/nxr042/research/visium/megabladder/results/all_int_bl_no_mgb1/res_0.8/Intermediate_Filtered_diffgenes_fdr_0MGBvsWT.csv",
            "/Users/nxr042/research/visium/megabladder/results/all_int_bl_no_mgb1/res_0.8/Vascular_Filtered_diffgenes_fdr_2MGBvsWT.csv",
            "/Users/nxr042/research/visium/megabladder/results/all_int_bl_no_mgb1/res_0.8/subSMC_Filtered_fdr_SMCvsOther.csv"]

def combine_visium_dea_data(filelist):
    """ 
    Combines multiple Visium Differential Expression Analysis (DEA) CSV files into a single DataFrame.
    
    Reads multiple CSV files, assigns each file a spatial region based on its filename, 
    and merges them into a single DataFrame with an added 'bladder_region' column.

    Parameters:
        filelist (list): A list of file paths to Visium DEA data CSV files.
    
    Returns:
        pd.DataFrame: A concatenated DataFrame containing all DEA data with an additional 'bladder_region' column.
    """
    visium = {} 
    for file in filelist:
        basename = os.path.basename(file)
        spatial_region = basename.split("_")[0]
        visium[spatial_region] = pd.read_csv(file)
        visium[spatial_region]["bladder_region"] = spatial_region
        display(visium[spatial_region].head(2))
    combined = pd.concat(visium.values())
    return(combined)

def grab_visium_dea(gene, data):
    """
    Extracts differential expression data for a specific gene from a Visium DEA DataFrame.

    Filters the provided data to return only the rows corresponding to the given gene, 
    while omitting certain statistical columns.

    Parameters:
        gene (str): The gene of interest.
        data (pd.DataFrame): A DataFrame containing Visium DEA data.
    
    Returns:
        pd.DataFrame: A DataFrame with DEA results for the specified gene, excluding selected columns.
    """
    df = pd.DataFrame()
    if "gene" not in data.columns:
        data.reset_index().rename(columns={df.index.name:'gene'})
    cols_2_omit = ["p_val", "pct.1", "pct.2"]
    sliced = data[data["gene"] == gene]
    mask = ~sliced.columns.isin(cols_2_omit)
    sliced = sliced.loc[:, mask]
    return(sliced)

visium_data = combine_visium_dea_data(visium_dea_filelist)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGeneWithVer,Gene.refGeneWithVer,GeneDetail.refGeneWithVer,ExonicFunc.refGeneWithVer,AAChange.refGeneWithVer,...,D177F1_B1,D177M1_B1,D177P1_B1,D178F1_B1,D178M1_B1,D178P1_B1,D179F1_B1,D179M1_B1,D179P1_B1,Column_369
0,chr1,924568,924568,T,A,exonic,SAMD11,.,nonsynonymous SNV,"""SAMD11:NM_001385640.1:exon1:c.137T>A:p.L46H,S...",...,"""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""",D125P1_B1
1,chr1,924573,924573,C,T,exonic,SAMD11,.,nonsynonymous SNV,"""SAMD11:NM_001385640.1:exon1:c.142C>T:p.P48S,S...",...,"""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""",D132P2_B1


Unnamed: 0,gene,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,bladder_region
0,Myh11,6.746602e-21,-4.88385,0.217,0.973,1.014217e-16,Whole
1,Actg2,1.1881779999999999e-20,-3.982674,0.326,0.991,1.786189e-16,Whole


Unnamed: 0,gene,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,p_val_adj_fdr,cluster,bladder_region
0,Myh11,2.712271e-08,-5.201319,0.154,1.0,0.000408,5.1e-05,1,Outer
1,Mylk,2.907453e-08,-2.759718,0.692,1.0,0.000437,5.1e-05,1,Outer


Unnamed: 0,gene,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,p_val_adj_fdr,cluster,bladder_region
0,Ibsp,4.15e-06,-4.123217,0.0,0.571,0.062419,0.002589,0,Intermediate
1,Cnn1,1.14e-07,-3.427077,0.038,0.762,0.001716,0.00016,0,Intermediate


Unnamed: 0,gene,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,p_val_adj_fdr,cluster,bladder_region
0,Col10a1,2.402534e-07,-5.35406,0.0,0.947,0.003612,0.002761,2,Vascular
1,Mmp13,7.822386e-07,-4.2705,0.0,0.895,0.011759,0.004496,2,Vascular


Unnamed: 0,gene,p_val,avg_log2FC,pct.1,pct.2,p_val_adj,p_val_adj_fdr,bladder_region
0,Col3a1,3.273982e-09,-0.804882,1.0,1.0,4.9e-05,3.2e-05,subSMC
1,Krt10,5.65012e-09,-1.121357,1.0,1.0,8.5e-05,3.2e-05,subSMC


## Exrtract only useful data to combine with WES

In [3]:
cols_2_extract = ["gene", "avg_log2FC", "p_val_adj", "bladder_region"]
mask = visium_data.columns.isin(cols_2_extract)
sliced = visium_data.loc[:, mask]
sliced['gene'] = sliced['gene'].str.upper()
sliced

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sliced['gene'] = sliced['gene'].str.upper()


Unnamed: 0,gene,avg_log2FC,p_val_adj,bladder_region
0,MYH11,-4.883850,1.014217e-16,Whole
1,ACTG2,-3.982674,1.786189e-16,Whole
2,ACTA2,-3.083006,1.287243e-14,Whole
3,MYL9,-2.544347,5.061845e-13,Whole
4,MYLK,-2.767663,5.469357e-12,Whole
...,...,...,...,...
40,PTP4A1,0.677543,1.000000e+00,subSMC
41,TACSTD2,-1.742884,1.000000e+00,subSMC
42,ITGA5,0.900331,1.000000e+00,subSMC
43,PARVB,-2.154998,1.000000e+00,subSMC


# Get Patient counts for each DEA genes from WES data

In [4]:
import pandas as pd
import re

wes_data = wes_raw.copy(deep=True)

# Identify patient and family columns
total_Dcols = len([col for col in wes_data.columns if re.match(r'^D[0-9]', col)])
patient_cols = [col for col in wes_data.columns if re.match(r'^D\d{3}P', col)]  # D followed by 3 digits and P (e.g., D123P)
family_cols = [col for col in wes_data.columns if re.match(r'^D\d{3}(?!P)', col)]       # D followed by 3 digits but not Pafter that (e.g., D001M*)

assert total_Dcols == len(patient_cols) + len(family_cols), "number of columns not totalling up correctly"

# Extract genotype prefixes (strip quotes and split at colon)
patient_genotypes = wes_data[patient_cols].replace(
    r'^"?(.*?):.*',  # Strip quotes and capture text before first colon
    r'\1', 
    regex=True
)
family_genotypes = wes_data[family_cols].replace(
    r'^"?(.*?):.*', 
    r'\1', 
    regex=True
)

# Define invalid genotypes
invalid_genotypes = ['./.', '0/0', '0|0']

# Create masks for valid genotypes
p_valid_mask = patient_genotypes.notna() & ~patient_genotypes.isin(invalid_genotypes)
f_valid_mask = family_genotypes.notna() & ~family_genotypes.isin(invalid_genotypes) 

# Function to generate comma-separated IDs
def get_ids(row_mask):
    return ', '.join(row_mask.index[row_mask])

wes_data['patient_count'] = p_valid_mask.sum(axis=1)
wes_data['patient_ids'] = p_valid_mask.apply(get_ids, axis=1)
wes_data['family_count'] = f_valid_mask.sum(axis=1)
wes_data['family_ids'] = f_valid_mask.apply(get_ids, axis=1)

In [5]:
wes_data

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGeneWithVer,Gene.refGeneWithVer,GeneDetail.refGeneWithVer,ExonicFunc.refGeneWithVer,AAChange.refGeneWithVer,...,D178M1_B1,D178P1_B1,D179F1_B1,D179M1_B1,D179P1_B1,Column_369,patient_count,patient_ids,family_count,family_ids
0,chr1,924568,924568,T,A,exonic,SAMD11,.,nonsynonymous SNV,"""SAMD11:NM_001385640.1:exon1:c.137T>A:p.L46H,S...",...,"""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""",D125P1_B1,1,D125P1_B1,0,
1,chr1,924573,924573,C,T,exonic,SAMD11,.,nonsynonymous SNV,"""SAMD11:NM_001385640.1:exon1:c.142C>T:p.P48S,S...",...,"""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""",D132P2_B1,1,D132P2_B1,0,
2,chr1,924628,924628,C,T,exonic,SAMD11,.,nonsynonymous SNV,"""SAMD11:NM_001385640.1:exon1:c.197C>T:p.P66L,S...",...,"""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""",D132P2_B1,1,D132P2_B1,0,
3,chr1,924733,924733,C,T,exonic,SAMD11,.,nonsynonymous SNV,"""SAMD11:NM_001385640.1:exon1:c.302C>T:p.A101V,...",...,"""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""",D078F1_B1,0,,1,D078F1_B1
4,chr1,924930,924930,C,A,exonic,SAMD11,.,nonsynonymous SNV,"""SAMD11:NM_001385640.1:exon1:c.499C>A:p.R167S,...",...,"""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""","""./.:0,0:0:0:0,0,0""",D133P1_B1,1,D133P1_B1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33588,chrX,154357624,154357624,-,GTGATCTGGACAGCCAGCAGGCCCTCCCCGGCGTCCTTTGCA,splicing,FLNA,NM_001110556.2:exon29:c.4756-1->TGCAAAGGACGCCG...,.,.,...,"""0/0:31,0:31:90:0,90,1113""","""0/0:31,0:31:90:0,90,1085""","""0/0:34,0:34:93:0,93,1161""","""0/0:32,0:32:90:0,90,1239""","""0/0:30,0:30:90:0,90,1234""",D081P1_B1,1,D081P1_B1,0,
33589,chrX,154358356,154358444,CTGTGAGGGATTGGTGTTGTGAGCAGTCAGACAGGTTCTCAGCATC...,-,splicing,FLNA,NM_001110556.2:exon27:r.spl;NM_001110556.2:exo...,.,.,...,"""0/0:31,0:31:90:.:.:0,90,1139:.""","""0/0:32,0:32:90:.:.:0,90,1167:.""","""0/0:30,0:30:90:.:.:0,90,1171:.""","""0/0:40,0:40:90:.:.:0,90,1320:.""","""0/0:34,0:34:90:.:.:0,90,1350:.""",D081P1_B1,1,D081P1_B1,0,
33590,chrX,154365260,154365348,CTGCCAAGACAAGGAGGGCCTCAGGCCTGCCCAGCAGTGAACCCGG...,-,splicing,FLNA,NM_001110556.2:exon10:r.spl;NM_001110556.2:exo...,.,.,...,"""0/0:43,0:43:90:0,90,1350""","""0/0:33,0:33:90:0,90,1350""","""0/0:31,0:31:90:0,90,1063""","""0/0:31,0:31:90:0,90,1350""","""0/0:35,0:35:90:0,90,1289""",D081P1_B1,1,D081P1_B1,0,
33591,chrX,154405656,154405656,-,GTAG,splicing,DNASE1L1,NM_006730.4:exon2:UTR5;NM_001009934.2:exon2:UT...,.,.,...,"""0/0:30,0:30:90:.:.:0,90,1169:.""","""0/0:34,0:34:90:.:.:0,90,1236:.""","""0/0:30,0:30:90:.:.:0,90,1111:.""","""0/0:32,0:32:93:.:.:0,93,996:.""","""0/0:34,0:34:90:.:.:0,90,1350:.""",D105P1_B1,1,D105P1_B1,0,


In [28]:
cols_dtype_convert = ["gnomad41_exome_AF", "gnomad41_exome_faf99", "gnomad41_exome_AF_raw", "Polyphen2_HDIV_score", 
                      "SIFT_score", "AlphaMissense_score", "MutationAssessor_score", "MutationTaster_score", "CADD_phred"]
wes_data[["Gene.refGeneWithVer"] + [i for i in wes_data.columns if re.match(r'Alpha', i)] + cols_dtype_convert]

Unnamed: 0,Gene.refGeneWithVer,AlphaMissense_score,AlphaMissense_rankscore,AlphaMissense_pred,gnomad41_exome_AF,gnomad41_exome_faf99,gnomad41_exome_AF_raw,Polyphen2_HDIV_score,SIFT_score,AlphaMissense_score.1,MutationAssessor_score,MutationTaster_score,CADD_phred
0,SAMD11,0.299,0.52921,B,0.0,0.0,0.000029,0.0,0.0,0.299,0.0,0.0,21.70
1,SAMD11,0.088,0.11893,B,0.0,0.0,0.000000,0.0,0.0,0.088,0.0,0.0,22.30
2,SAMD11,0.099,0.16843,B,0.0,0.0,0.000006,0.0,0.0,0.099,0.0,0.0,18.18
3,SAMD11,0.110,0.21011,B,0.0,0.0,0.000011,0.0,0.0,0.110,0.0,0.0,17.72
4,SAMD11,1.000,0.99395,P,0.0,0.0,0.000013,0.0,0.0,1.000,0.0,0.0,33.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
33588,FLNA,0.000,.,.,0.0,0.0,0.000000,0.0,0.0,0.000,0.0,0.0,0.00
33589,FLNA,0.000,.,.,0.0,0.0,0.000000,0.0,0.0,0.000,0.0,0.0,0.00
33590,FLNA,0.000,.,.,0.0,0.0,0.000000,0.0,0.0,0.000,0.0,0.0,0.00
33591,DNASE1L1,0.000,.,.,0.0,0.0,0.000000,0.0,0.0,0.000,0.0,0.0,0.00


## Filter the WES data

In [49]:
cols_dtype_convert = ["gnomad41_exome_AF", "gnomad41_exome_faf99", "gnomad41_exome_AF_raw", "Polyphen2_HDIV_score", "SIFT_score", 
                      "AlphaMissense_score", "MutationAssessor_score", "MutationTaster_score", "CADD_phred"]
wes_data[cols_dtype_convert] = wes_data[cols_dtype_convert].replace('.','0').astype(float)
af_cut = 0.001# %0.1
af_filtered = wes_data[wes_data["gnomad41_exome_AF"] < af_cut] # AF < 0.001%

# Conditions to test, we are including if values are zero because 
# 1. in the previous step we converted '.' tp 0 and 
# 2. The indels or frameshifts will have 0s in all the filters
conditions = [
    (af_filtered["Polyphen2_HDIV_score"] > 0.9) | (af_filtered["Polyphen2_HDIV_score"] == 0),          # Condition 1
    (af_filtered["SIFT_score"] < 0.1) | (af_filtered["SIFT_score"] == 0),                              # Condition 2
    (af_filtered["AlphaMissense_score"] > 0.5) | (af_filtered["AlphaMissense_score"] == 0),            # Condition 3
    (af_filtered["MutationAssessor_score"] > 1.90) | (af_filtered["MutationAssessor_score"] == 0),     # Condition 4 # 
    (af_filtered["MutationTaster_score"] > 0.5) | (af_filtered["MutationTaster_score"] == 0),                                                         # Condition 5  
    (af_filtered["CADD_phred"] > 20) | (af_filtered["CADD_phred"] == 0),                                                                    # Condition 6
    
]
# Sum the True con ditions for each row (True = 1, False = 0)
count_met = sum(conditions)
# Create a mask where at least 4 conditions are True
mask = count_met >= 5
# Filter the DataFrame using the mask
filtered_wes_data = af_filtered[mask]

In [50]:
filtered_gene = filtered_wes_data[filtered_wes_data["Gene.refGeneWithVer"] == "CNN2"]
display(filtered_gene[["AAChange.refGeneWithVer"] + cols_dtype_convert])

unfiltered_gene = wes_data[wes_data["Gene.refGeneWithVer"] == "CNN2"]
display(unfiltered_gene[["AAChange.refGeneWithVer"] + cols_dtype_convert])

Unnamed: 0,AAChange.refGeneWithVer,gnomad41_exome_AF,gnomad41_exome_faf99,gnomad41_exome_AF_raw,Polyphen2_HDIV_score,SIFT_score,AlphaMissense_score,MutationAssessor_score,MutationTaster_score,CADD_phred
27311,"""CNN2:NM_001303499.2:exon2:c.104G>A:p.R35H,CNN...",2.9e-05,2e-05,2.9e-05,0.982,0.058,0.318,2.855,0.988787,23.7
27312,"""CNN2:NM_201277.3:exon6:c.695C>A:p.P232H,CNN2:...",5.6e-05,4.2e-05,0.0049,0.998,0.006,0.324,2.005,0.990314,26.3
27314,"""CNN2:NM_201277.3:exon6:c.720_721insAC:p.A241T...",0.0,0.0,7.9e-05,0.0,0.0,0.0,0.0,0.0,0.0
27316,"""CNN2:NM_201277.3:exon6:c.723_724del:p.D242Wfs...",0.0,0.0,7.8e-05,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,AAChange.refGeneWithVer,gnomad41_exome_AF,gnomad41_exome_faf99,gnomad41_exome_AF_raw,Polyphen2_HDIV_score,SIFT_score,AlphaMissense_score,MutationAssessor_score,MutationTaster_score,CADD_phred
27311,"""CNN2:NM_001303499.2:exon2:c.104G>A:p.R35H,CNN...",2.943e-05,1.971e-05,2.9e-05,0.982,0.058,0.318,2.855,0.988787,23.7
27312,"""CNN2:NM_201277.3:exon6:c.695C>A:p.P232H,CNN2:...",5.56e-05,4.213e-05,0.0049,0.998,0.006,0.324,2.005,0.990314,26.3
27313,"""CNN2:NM_201277.3:exon6:c.715A>C:p.T239P,CNN2:...",6.911e-07,0.0,0.0024,0.001,0.382,0.052,-0.205,1.0,5.225
27314,"""CNN2:NM_201277.3:exon6:c.720_721insAC:p.A241T...",0.0,0.0,7.9e-05,0.0,0.0,0.0,0.0,0.0,0.0
27315,"""CNN2:NM_201277.3:exon6:c.721G>A:p.A241T,CNN2:...",1.383e-06,9e-08,0.0023,0.501,0.524,0.068,1.59,1.0,17.75
27316,"""CNN2:NM_201277.3:exon6:c.723_724del:p.D242Wfs...",0.0,0.0,7.8e-05,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
#sample_counts_df = filtered_wes_data.groupby(['Gene.refGeneWithVer'], as_index=False)[['patient_count', 'family_count']].sum()
sample_counts_df = filtered_wes_data.groupby(['Gene.refGeneWithVer'], as_index=False).agg({
    'patient_count': 'sum',
    'family_count': 'sum',
    'patient_ids': lambda x: ', '.join(
        [str(s).strip() for s in x if pd.notna(s) and str(s).strip() != '']),
    'family_ids': lambda x: ', '.join(
        [str(s).strip() for s in x if pd.notna(s) and str(s).strip() != ''])
}).reset_index()

sample_counts_df.rename(columns={"Gene.refGeneWithVer": "gene"}, errors="raise", inplace=True)
for groupname, group_df in sliced.groupby("bladder_region"):
    print(groupname)
    common_genes = pd.concat([group_df.set_index('gene'),sample_counts_df.set_index('gene')], axis=1, join='inner')
    display(common_genes)
    common_genes.to_csv(f"/Users/nxr042/research/visium/megabladder/results/all_int_bl_no_mgb1/WES-qiushi/{groupname}-WES-{af_cut}_patient_counts_df.csv")
    print(common_genes.shape)

Intermediate


Unnamed: 0_level_0,avg_log2FC,p_val_adj,bladder_region,index,patient_count,family_count,patient_ids,family_ids
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MYH11,-3.397872,4.67e-07,Intermediate,1997,1,1,D013P1_B1,D013M1_B1
ACTG2,-3.29618,8.95e-07,Intermediate,47,3,0,"D034P1_B1, D074P1_B1, D065P1_B1",
POLR1A,-2.193822,0.08350236,Intermediate,2502,0,1,,D040F1_B1
MYLK,-2.028758,0.000151267,Intermediate,2009,1,0,D165P1_B1,
MYL9,-1.93252,5.07e-06,Intermediate,2008,1,1,D156P1_B1,D040M1_B1
ARHGEF11,-1.658454,0.1152607,Intermediate,236,1,0,D164P1_B1,
IGFBP2,-1.467422,0.03615104,Intermediate,1479,0,2,,"D109M2_B1, D109M2_B1"


(7, 8)
Outer


Unnamed: 0_level_0,avg_log2FC,p_val_adj,bladder_region,index,patient_count,family_count,patient_ids,family_ids
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MYH11,-5.201319,0.000408,Outer,1997,1,1,D013P1_B1,D013M1_B1
MYLK,-2.759718,0.000437,Outer,2009,1,0,D165P1_B1,
COL1A2,1.253755,0.000439,Outer,687,1,0,D150P1_B1,
ACTG2,-3.668688,0.000524,Outer,47,3,0,"D034P1_B1, D074P1_B1, D065P1_B1",
MYL9,-2.319767,0.000633,Outer,2008,1,1,D156P1_B1,D040M1_B1
FLNA,-1.519349,0.001699,Outer,1171,5,1,"D081P1_B1, D088P1_B1, D019P1_B1, D019P2_B1, D0...",D018B3_B1
FN1,1.396596,0.006217,Outer,1178,1,0,D005P1_B1,
LRRC17,2.039854,0.01197,Outer,1765,1,0,D007P1_B1,
CNN2,-1.120957,0.013178,Outer,669,62,15,"D092P1_B1, D001P1_B1, D005P1_B1, D006P1_B1, D0...","D013M1_B1, D018B2_B1, D018B3_B1, D018M1_B1, D0..."
PPP1R12A,-1.870127,0.029501,Outer,2529,2,0,"D018P1_B1, D108P1_B1",


(17, 8)
Vascular


Unnamed: 0_level_0,avg_log2FC,p_val_adj,bladder_region,index,patient_count,family_count,patient_ids,family_ids
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


(0, 8)
Whole


Unnamed: 0_level_0,avg_log2FC,p_val_adj,bladder_region,index,patient_count,family_count,patient_ids,family_ids
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MYH11,-4.88385,1.014217e-16,Whole,1997,1,1,D013P1_B1,D013M1_B1
ACTG2,-3.982674,1.786189e-16,Whole,47,3,0,"D034P1_B1, D074P1_B1, D065P1_B1",
MYL9,-2.544347,5.061845e-13,Whole,2008,1,1,D156P1_B1,D040M1_B1
MYLK,-2.767663,5.469357e-12,Whole,2009,1,0,D165P1_B1,
PPP1R12A,-1.753351,5.688667e-07,Whole,2529,2,0,"D018P1_B1, D108P1_B1",
FLNA,-1.203795,6.005021e-06,Whole,1171,5,1,"D081P1_B1, D088P1_B1, D019P1_B1, D019P2_B1, D0...",D018B3_B1
CNN2,-1.017152,6.83392e-05,Whole,669,62,15,"D092P1_B1, D001P1_B1, D005P1_B1, D006P1_B1, D0...","D013M1_B1, D018B2_B1, D018B3_B1, D018M1_B1, D0..."
LMOD1,-2.932242,0.002717616,Whole,1740,1,0,D098P1_B1,
SLC38A1,-3.335785,0.003194307,Whole,2964,1,2,D109P1_B1,"D109M1_B1, D109M3_B1"
KRT6A,2.529611,0.003805985,Whole,1678,2,0,"D116P1_B1, D119P1_B1",


(12, 8)
subSMC


Unnamed: 0_level_0,avg_log2FC,p_val_adj,bladder_region,index,patient_count,family_count,patient_ids,family_ids
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
KRT10,-1.121357,8.5e-05,subSMC,1669,2,0,"D014P1_B1, D138P1_B1",
MYH11,1.256399,0.001776,subSMC,1997,1,1,D013P1_B1,D013M1_B1
COL1A2,-0.870309,0.005751,subSMC,687,1,0,D150P1_B1,
MYLK,0.991432,0.011603,subSMC,2009,1,0,D165P1_B1,
ACTG2,1.173604,0.049401,subSMC,47,3,0,"D034P1_B1, D074P1_B1, D065P1_B1",
FLNA,0.753236,0.092131,subSMC,1171,5,1,"D081P1_B1, D088P1_B1, D019P1_B1, D019P2_B1, D0...",D018B3_B1
MYL9,0.962548,0.097923,subSMC,2008,1,1,D156P1_B1,D040M1_B1
SPTBN1,-0.848132,0.984124,subSMC,3093,0,1,,D179M1_B1
PPP1R12A,0.911623,1.0,subSMC,2529,2,0,"D018P1_B1, D108P1_B1",
FNDC3A,-2.164905,1.0,subSMC,1181,1,0,D127P1_B1,


(10, 8)


In [10]:
unique_genes_df1 = sliced['gene'].dropna().unique()
unique_genes_df2 = sample_counts_df['gene'].dropna().unique()

# Find intersection (genes present in both)
common_genes = set(unique_genes_df1) & set(unique_genes_df2)
print(f"Common genes: {common_genes}")

Common genes: {'LMOD1', 'IGFBP2', 'KRT6A', 'ACTG2', 'POLR1A', 'FN1', 'DCN', 'TCERG1L', 'PPP1R12A', 'THBS2', 'MYLK', 'LRRC17', 'ARHGEF11', 'TNNT2', 'AOC3', 'FNDC3A', 'ACTN1', 'SLC38A1', 'TSNAX', 'KRT1', 'CNN2', 'FLNA', 'JPH2', 'COL12A1', 'COL1A2', 'MYL9', 'KRT10', 'CNN1', 'DZIP1L', 'MYH11', 'TMED2', 'SPTBN1', 'PPP1R12B'}
