In [58]:
import pandas as pd
import os

# Project root for this analysis. The working directory is switched here so
# that the relative 'custom_pipeline/...' paths used in later cells resolve
# against it.
# NOTE(review): hard-coded absolute path — adjust before running on another machine.
wd_dir = '/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_CUTandTAG'
os.chdir(wd_dir)

In [59]:
def find_intersection(file1_path, file2_path, gene_list=None):
    """
    Find intersection between two CSV files based on gene, Endogenous_Promoter,
    and Exogenous_Promoter columns.

    Both files are read with ``pd.read_csv`` and inner-joined on those three
    columns. A three-line row-count summary is printed as a side effect.

    Args:
        file1_path (str): Path to first CSV file
        file2_path (str): Path to second CSV file
        gene_list (list-like, optional): When given, only rows of the FIRST
            file whose ``gene`` value is in this collection are kept before
            merging. The second file is never filtered. Defaults to None
            (no filtering).

    Returns:
        pandas.DataFrame: Inner merge of the two tables. Non-key columns that
        exist in both files are suffixed with ``_file1`` / ``_file2``.

    Raises:
        KeyError: If either file lacks one of the three key columns.
    """
    # Columns that must agree for two rows to count as a match
    columns_to_compare = ['gene', 'Endogenous_Promoter', 'Exogenous_Promoter']
    flag_columns = columns_to_compare[1:]

    # Read the CSV files
    df1 = pd.read_csv(file1_path)
    df2 = pd.read_csv(file2_path)

    if gene_list is not None:
        # Filter df1 to only include genes from gene_list
        df1 = df1[df1['gene'].isin(gene_list)]

    # Ensure boolean values are consistent. `.assign` builds a new frame, so
    # we never write into the filtered slice above (which would raise
    # SettingWithCopyWarning on pandas versions without copy-on-write).
    # NOTE(review): astype(bool) maps NaN and any non-empty string to True;
    # this assumes the flag columns parse as clean booleans / 0-1 — confirm.
    df1 = df1.assign(**{col: df1[col].astype(bool) for col in flag_columns})
    df2 = df2.assign(**{col: df2[col].astype(bool) for col in flag_columns})

    # Merge the dataframes on the specified columns (default inner join)
    merged_df = pd.merge(df1,
                         df2,
                         on=columns_to_compare,
                         suffixes=('_file1', '_file2'))

    # Summary of the results; the file 1 count is taken AFTER gene_list filtering
    print(f"Total rows in file 1: {len(df1)}")
    print(f"Total rows in file 2: {len(df2)}")
    print(f"Number of matching rows: {len(merged_df)}")

    return merged_df

## Neuron dysregulated genes

In [60]:
# Load gene lists from CSV files: the 'gene' column of each file becomes a
# plain Python list.
upregulated = pd.read_csv('custom_pipeline/DATA/neuron_upregulated_genes.csv')['gene'].tolist()
downregulated = pd.read_csv('custom_pipeline/DATA/neuron_downregulated_genes.csv')['gene'].tolist() 

# Both directions concatenated (not de-duplicated).
disregulated = upregulated + downregulated

In [61]:
# Neuron inputs for find_intersection. Paths are relative to the working
# directory set at the top of the notebook. Only file1 is filtered by the
# optional gene list; file2 is merged as-is.
file1_path = 'custom_pipeline/results/Neuron_peak_analysis_annotated_clean.csv'
file2_path = 'custom_pipeline/DATA/allgenes_NEU_total.csv'

In [62]:
%%script false --no-raise-error
# Disabled cell: the %%script magic hands this body to the shell command
# `false` instead of the Python kernel, so nothing below is executed.
# Remove the magic line to re-enable it.
# Find intersection without restricting file 1 to a gene list
result = find_intersection(file1_path, file2_path)

# Display the first few rows of the intersection
result.head()

# save the results
# path_to_save = 'custom_pipeline/results'
# result.to_csv(f'{path_to_save}/matches_Neuron_file2.csv', index=False)

In [63]:
# Find intersection, keeping only file 1 rows whose gene is in the combined
# up- plus down-regulated list
result = find_intersection(file1_path, file2_path, disregulated)

Total rows in file 1: 3109
Total rows in file 2: 383
Number of matching rows: 372


In [64]:
# Preview the first rows of the merged table
result.head()

Unnamed: 0,location,baseMean_file1,Endogenous_Promoter,Exogenous_Promoter,chrom,coords,start,end,gene,baseMean_file2,log2FoldChange,Direction
0,chr13:12096677-12096805,36.13,False,True,chr13,12096677-12096805,12096677,12096805,Ryr2,3463.711688,-0.508089,DOWN
1,chr3:130315981-130316311,52.43,False,True,chr3,130315981-130316311,130315981,130316311,Col25a1,1176.761395,-0.950117,DOWN
2,chr5:141836791-141837462,39.44,False,True,chr5,141836791-141837462,141836791,141837462,Sdk1,216.344835,-0.683171,DOWN
3,chr9:60414018-60414689,121.26,True,False,chr9,60414018-60414689,60414018,60414689,Thsd4,187.100745,-0.575236,DOWN
4,chr13:46517193-46517674,84.35,False,True,chr13,46517193-46517674,46517193,46517674,Cap2,3241.927847,-0.637369,DOWN


In [65]:
# Intersection with file 1 restricted to the up-regulated gene list only
result_up = find_intersection(file1_path, file2_path, upregulated)

Total rows in file 1: 429
Total rows in file 2: 383
Number of matching rows: 43


In [66]:
# Intersection with file 1 restricted to the down-regulated gene list only
result_down = find_intersection(file1_path, file2_path, downregulated)

Total rows in file 1: 2680
Total rows in file 2: 383
Number of matching rows: 329


## NSC dysregulated genes

In [67]:
# Load gene lists from CSV files: the 'gene' column of each file becomes a
# plain Python list. This rebinds upregulated / downregulated / disregulated,
# replacing whatever they held before this cell ran.
upregulated = pd.read_csv('custom_pipeline/DATA/nsc_upregulated_genes.csv')['gene'].tolist()
downregulated = pd.read_csv('custom_pipeline/DATA/nsc_downregulated_genes.csv')['gene'].tolist() 

# Both directions concatenated (not de-duplicated).
disregulated = upregulated + downregulated

In [68]:
# NSC inputs for find_intersection. This rebinds file1_path / file2_path, so
# every call below this cell uses the NSC files. Only file1 is filtered by
# the optional gene list; file2 is merged as-is.
file1_path = 'custom_pipeline/results/NSC_peak_analysis_annotated_clean.csv'
file2_path = 'custom_pipeline/DATA/allgenes_NSC_total.csv'

In [69]:
%%script false --no-raise-error
# Disabled cell: the %%script magic hands this body to the shell command
# `false` instead of the Python kernel, so nothing below is executed.
# Remove the magic line to re-enable it.
# Find intersection without restricting file 1 to a gene list
result = find_intersection(file1_path, file2_path)

# Display the first few rows of the intersection
result.head()

# save the results
# path_to_save = 'custom_pipeline/results'
# result.to_csv(f'{path_to_save}/matches_NSC_file2.csv', index=False)

In [70]:
# Find intersection, keeping only file 1 rows whose gene is in the combined
# up- plus down-regulated list
result = find_intersection(file1_path, file2_path, disregulated)

Total rows in file 1: 7889
Total rows in file 2: 3071
Number of matching rows: 609


In [71]:
# Preview the first rows of the merged table
result.head()

Unnamed: 0,location,baseMean_file1,Endogenous_Promoter,Exogenous_Promoter,chrom,coords,start,end,gene,baseMean_file2,log2FoldChange,Direction
0,chr18:12269975-12270216,48.49,True,False,chr18,12269975-12270216,12269975,12270216,Ankrd29,725.629024,1.282269,UP
1,chr14:67769750-67770300,74.82,True,False,chr14,67769750-67770300,67769750,67770300,Dock5,220.478208,1.649975,UP
2,chr2:27969165-27969462,49.74,False,True,chr2,27969165-27969462,27969165,27969462,Col5a1,278.615894,2.333655,UP
3,chr18:34839111-34839371,28.21,True,False,chr18,34839111-34839371,34839111,34839371,Kdm3b,3248.113447,0.583268,UP
4,chr1:33907599-33908138,26.62,True,False,chr1,33907599-33908138,33907599,33908138,Bend6,139.956085,-0.82275,DOWN


In [72]:
# Intersection with file 1 restricted to the up-regulated gene list only
result_up = find_intersection(file1_path, file2_path, upregulated)

Total rows in file 1: 5166
Total rows in file 2: 3071
Number of matching rows: 395


In [73]:
# Intersection with file 1 restricted to the down-regulated gene list only
result_down = find_intersection(file1_path, file2_path, downregulated)

Total rows in file 1: 2723
Total rows in file 2: 3071
Number of matching rows: 214
