In [93]:
import pandas as pd
import os

# Project root. Later cells use relative 'custom_pipeline/...' paths, so the
# kernel's working directory is moved here before anything is read.
# NOTE(review): absolute, user-specific cluster path — the notebook will not
# run on another machine/account without editing this line.
wd_dir = '/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_CUTandTAG'
os.chdir(wd_dir)

In [94]:
def find_intersection(file1_path, file2_path, gene_list=None):
    """
    Inner-join two CSV files on gene, Endogenous_Promoter and Exogenous_Promoter.

    Steps, in order:
      1. read both files and check that the three key columns exist;
      2. optionally restrict file 1 to the genes in ``gene_list``;
      3. cast both promoter columns to ``bool`` in each frame;
      4. print a duplicate report for each frame (rows sharing the same key);
      5. keep the first row per key in each frame;
      6. merge on the key and print row counts.

    The bool cast happens *before* the duplicate report so that the report
    describes exactly the rows that step 5 collapses.

    Parameters
    ----------
    file1_path, file2_path : str or path-like
        Anything ``pd.read_csv`` accepts.
    gene_list : list-like of gene names, optional
        When given, only rows of file 1 whose ``gene`` value is in this
        collection are kept. File 2 is never filtered.

    Returns
    -------
    pandas.DataFrame
        One row per key present in both (de-duplicated) files. Non-key columns
        that exist in both files are suffixed ``_file1`` / ``_file2``.

    Raises
    ------
    KeyError
        If either file lacks one of the three key columns.
    """
    columns_to_compare = ['gene', 'Endogenous_Promoter', 'Exogenous_Promoter']
    promoter_dtypes = {'Endogenous_Promoter': bool, 'Exogenous_Promoter': bool}

    # Read the CSV files
    df1 = pd.read_csv(file1_path)
    df2 = pd.read_csv(file2_path)

    # Fail early with a readable message instead of a bare KeyError raised
    # from inside pandas further down.
    for label, frame in (('file 1', df1), ('file 2', df2)):
        missing = [col for col in columns_to_compare if col not in frame.columns]
        if missing:
            raise KeyError(f"{label} is missing required column(s): {missing}")

    if gene_list is not None:
        # Filter df1 to only include genes from gene_list
        df1 = df1[df1['gene'].isin(gene_list)]

    # Ensure boolean values are consistent. ``astype`` returns a new frame, so
    # the filtered slice of df1 is never written to in place (the original
    # column assignment on that slice is the SettingWithCopyWarning pattern).
    # NOTE: ``astype(bool)`` maps missing values (NaN) to True.
    df1 = df1.astype(promoter_dtypes)
    df2 = df2.astype(promoter_dtypes)

    # Print duplicate information
    _report_duplicates(df1, 1, columns_to_compare)
    _report_duplicates(df2, 2, columns_to_compare)

    # One row per key on each side; otherwise the merge would emit the cross
    # product of repeated keys.
    df1 = df1.drop_duplicates(subset=columns_to_compare)
    df2 = df2.drop_duplicates(subset=columns_to_compare)

    # Merge the dataframes on the specified columns
    merged_df = pd.merge(df1,
                         df2,
                         on=columns_to_compare,
                         suffixes=('_file1', '_file2'))

    # Summary of the results (row counts are taken after de-duplication)
    print(f"\nTotal rows in file 1: {len(df1)}")
    print(f"Total rows in file 2: {len(df2)}")
    print(f"Number of matching rows: {len(merged_df)}")

    return merged_df


def _report_duplicates(df, file_number, subset):
    """Print how many rows of ``df`` share their ``subset`` values with another row, plus a preview."""
    print(f"\nChecking for duplicates in file {file_number}:")
    duplicates = df[df.duplicated(subset=subset, keep=False)]
    print(f"Number of duplicate rows in file {file_number}: {len(duplicates)}")
    if len(duplicates) > 0:
        print(f"\nExample duplicates from file {file_number}:")
        print(duplicates.head())

## Neuron dysregulated genes

In [95]:
# Gene names for the neuron up- and down-regulated sets, taken from the
# 'gene' column of each CSV.
upregulated = pd.read_csv('custom_pipeline/DATA/neuron_upregulated_genes.csv').loc[:, 'gene'].to_list()
downregulated = pd.read_csv('custom_pipeline/DATA/neuron_downregulated_genes.csv').loc[:, 'gene'].to_list()

# Plain concatenation of the two lists (upregulated first; repeats are kept).
disregulated = [*upregulated, *downregulated]

In [96]:
# Inputs for the neuron comparison: file 1 is the annotated peak table,
# file 2 the per-gene table it is matched against.
file1_path, file2_path = (
    'custom_pipeline/results/Neuron_peak_analysis_annotated_clean.csv',
    'custom_pipeline/DATA/allgenes_NEU_total.csv',
)

In [97]:
# %%script false --no-raise-error
# # Find intersection
# result = find_intersection(file1_path, file2_path)

# # Display the first few rows of the intersection
# result.head()

# # save the results
# # path_to_save = 'custom_pipeline/results'
# # result.to_csv(f'{path_to_save}/matches_Neuron_file2.csv', index=False)

In [98]:
# Intersect file 1, restricted to the combined up + down gene list, with file 2.
result = find_intersection(
    file1_path=file1_path,
    file2_path=file2_path,
    gene_list=disregulated,
)


Checking for duplicates in file 1:
Number of duplicate rows in file 1: 2716

Example duplicates from file 1:
                   location  Endogenous_Promoter  Exogenous_Promoter chrom  \
134  chr1:39194050-39194209                 True                True  chr1   
135  chr1:39318185-39318402                 True                True  chr1   
202  chr1:55405521-55406190                 True                True  chr1   
203  chr1:55501177-55501443                 True                True  chr1   
204  chr1:55529009-55529252                 True                True  chr1   

                coords     start       end   gene  
134  39194050-39194209  39194050  39194209  Npas2  
135  39318185-39318402  39318185  39318402  Npas2  
202  55405521-55406190  55405521  55406190  Plcl1  
203  55501177-55501443  55501177  55501443  Plcl1  
204  55529009-55529252  55529009  55529252  Plcl1  

Checking for duplicates in file 2:
Number of duplicate rows in file 2: 0

Total rows in file 1: 957
Total ro

In [99]:
# Preview the first five merged rows.
result.head(n=5)

Unnamed: 0,location,Endogenous_Promoter,Exogenous_Promoter,chrom,coords,start,end,gene,baseMean,log2FoldChange,Direction
0,chr1:55405521-55406190,True,True,chr1,55405521-55406190,55405521,55406190,Plcl1,361.918949,-0.726832,DOWN
1,chr1:62706498-62707027,True,True,chr1,62706498-62707027,62706498,62707027,Nrp2,1460.062193,-0.5416,DOWN
2,chr1:89070006-89070435,True,True,chr1,89070006-89070435,89070006,89070435,Sh3bp4,630.23125,-0.553894,DOWN
3,chr1:91413167-91413489,True,True,chr1,91413167-91413489,91413167,91413489,Hes6,429.655435,0.631325,UP
4,chr1:106581988-106582206,True,True,chr1,106581988-106582206,106581988,106582206,Bcl2,456.720686,-0.81981,DOWN


In [100]:
# Intersection with file 1 restricted to the upregulated genes only.
result_up = find_intersection(
    file1_path=file1_path,
    file2_path=file2_path,
    gene_list=upregulated,
)


Checking for duplicates in file 1:
Number of duplicate rows in file 1: 272

Example duplicates from file 1:
                      location  Endogenous_Promoter  Exogenous_Promoter  \
778   chr1:165302813-165303223                 True                True   
779   chr1:165320625-165320886                 True                True   
1288   chr10:59950736-59950913                 True                True   
1289   chr10:59951745-59952140                 True                True   
1846     chr11:5906232-5906436                 True                True   

      chrom               coords      start        end    gene  
778    chr1  165302813-165303223  165302813  165303223  Gpr161  
779    chr1  165320625-165320886  165320625  165320886  Gpr161  
1288  chr10    59950736-59950913   59950736   59950913   Ddit4  
1289  chr10    59951745-59952140   59951745   59952140   Ddit4  
1846  chr11      5906232-5906436    5906232    5906436     Gck  

Checking for duplicates in file 2:
Number of dupl

In [101]:
# Intersection with file 1 restricted to the downregulated genes only.
result_down = find_intersection(
    file1_path=file1_path,
    file2_path=file2_path,
    gene_list=downregulated,
)


Checking for duplicates in file 1:
Number of duplicate rows in file 1: 2444

Example duplicates from file 1:
                   location  Endogenous_Promoter  Exogenous_Promoter chrom  \
134  chr1:39194050-39194209                 True                True  chr1   
135  chr1:39318185-39318402                 True                True  chr1   
202  chr1:55405521-55406190                 True                True  chr1   
203  chr1:55501177-55501443                 True                True  chr1   
204  chr1:55529009-55529252                 True                True  chr1   

                coords     start       end   gene  
134  39194050-39194209  39194050  39194209  Npas2  
135  39318185-39318402  39318185  39318402  Npas2  
202  55405521-55406190  55405521  55406190  Plcl1  
203  55501177-55501443  55501177  55501443  Plcl1  
204  55529009-55529252  55529009  55529252  Plcl1  

Checking for duplicates in file 2:
Number of duplicate rows in file 2: 0

Total rows in file 1: 663
Total ro

## NSC dysregulated genes

In [102]:
# Gene names for the NSC up- and down-regulated sets, taken from the
# 'gene' column of each CSV. These rebind the names used in the neuron section.
upregulated = pd.read_csv('custom_pipeline/DATA/nsc_upregulated_genes.csv').loc[:, 'gene'].to_list()
downregulated = pd.read_csv('custom_pipeline/DATA/nsc_downregulated_genes.csv').loc[:, 'gene'].to_list()

# Plain concatenation of the two lists (upregulated first; repeats are kept).
disregulated = [*upregulated, *downregulated]

In [103]:
# Inputs for the NSC comparison: file 1 is the annotated peak table,
# file 2 the per-gene table it is matched against.
file1_path, file2_path = (
    'custom_pipeline/results/NSC_peak_analysis_annotated_clean.csv',
    'custom_pipeline/DATA/allgenes_NSC_total.csv',
)

In [104]:
# %%script false --no-raise-error
# # Find intersection
# result = find_intersection(file1_path, file2_path)

# # Display the first few rows of the intersection
# result.head()

# # save the results
# # path_to_save = 'custom_pipeline/results'
# # result.to_csv(f'{path_to_save}/matches_NSC_file2.csv', index=False)

In [105]:
# Intersect file 1, restricted to the combined up + down gene list, with file 2.
result = find_intersection(
    file1_path=file1_path,
    file2_path=file2_path,
    gene_list=disregulated,
)


Checking for duplicates in file 1:
Number of duplicate rows in file 1: 2909

Example duplicates from file 1:
                  location  Endogenous_Promoter  Exogenous_Promoter chrom  \
10  chr1:21052731-21053109                 True                True  chr1   
11  chr1:21074185-21074736                 True                True  chr1   
12  chr1:21079128-21079704                 True                True  chr1   
13  chr1:21411207-21412021                 True                True  chr1   
14  chr1:21804970-21805472                 True                True  chr1   

               coords     start       end   gene  
10  21052731-21053109  21052731  21053109  Tram2  
11  21074185-21074736  21074185  21074736  Tram2  
12  21079128-21079704  21079128  21079704  Tram2  
13  21411207-21412021  21411207  21412021  Kcnq5  
14  21804970-21805472  21804970  21805472  Kcnq5  

Checking for duplicates in file 2:
Number of duplicate rows in file 2: 0

Total rows in file 1: 3446
Total rows in file 

In [106]:
# Preview the first five merged rows.
result.head(n=5)

Unnamed: 0,location,Endogenous_Promoter,Exogenous_Promoter,chrom,coords,start,end,gene,baseMean,log2FoldChange,Direction
0,chr1:15805370-15805982,True,True,chr1,15805370-15805982,15805370,15805982,Terf1,757.125075,-0.736643,DOWN
1,chr1:21052731-21053109,True,True,chr1,21052731-21053109,21052731,21053109,Tram2,607.910969,0.663319,UP
2,chr1:24230437-24230931,True,True,chr1,24230437-24230931,24230437,24230931,Col9a1,88.459047,-3.204749,DOWN
3,chr1:34150571-34151106,True,True,chr1,34150571-34151106,34150571,34151106,Dst,10318.09246,0.640716,UP
4,chr1:37864763-37865107,True,True,chr1,37864763-37865107,37864763,37865107,Tsga10,473.432002,-0.646423,DOWN


In [107]:
# Intersection with file 1 restricted to the upregulated genes only.
result_up = find_intersection(
    file1_path=file1_path,
    file2_path=file2_path,
    gene_list=upregulated,
)


Checking for duplicates in file 1:
Number of duplicate rows in file 1: 2171

Example duplicates from file 1:
                  location  Endogenous_Promoter  Exogenous_Promoter chrom  \
10  chr1:21052731-21053109                 True                True  chr1   
11  chr1:21074185-21074736                 True                True  chr1   
12  chr1:21079128-21079704                 True                True  chr1   
13  chr1:21411207-21412021                 True                True  chr1   
14  chr1:21804970-21805472                 True                True  chr1   

               coords     start       end   gene  
10  21052731-21053109  21052731  21053109  Tram2  
11  21074185-21074736  21074185  21074736  Tram2  
12  21079128-21079704  21079128  21079704  Tram2  
13  21411207-21412021  21411207  21412021  Kcnq5  
14  21804970-21805472  21804970  21805472  Kcnq5  

Checking for duplicates in file 2:
Number of duplicate rows in file 2: 0

Total rows in file 1: 1945
Total rows in file 

In [108]:
# Intersection with file 1 restricted to the downregulated genes only.
result_down = find_intersection(
    file1_path=file1_path,
    file2_path=file2_path,
    gene_list=downregulated,
)


Checking for duplicates in file 1:
Number of duplicate rows in file 1: 738

Example duplicates from file 1:
                    location  Endogenous_Promoter  Exogenous_Promoter  chrom  \
77    chr1:65178166-65178729                 True                True   chr1   
78    chr1:65178885-65179453                 True                True   chr1   
283  chr10:21930917-21931239                 True                True  chr10   
284  chr10:21994280-21994792                 True                True  chr10   
288  chr10:26772951-26773279                 True                True  chr10   

                coords     start       end      gene  
77   65178166-65178729  65178166  65178729      Idh1  
78   65178885-65179453  65178885  65179453      Idh1  
283  21930917-21931239  21930917  21931239      Sgk1  
284  21994280-21994792  21994280  21994792      Sgk1  
288  26772951-26773279  26772951  26773279  Arhgap18  

Checking for duplicates in file 2:
Number of duplicate rows in file 2: 0

Total

   location              Endogenous_Promoter Exogenous_Promoter   chrom  coords               start      end         gene     baseMean      log2FoldChange   Direction
0  chr1:55405521-55406190        True                True         chr1   55405521-55406190    55405521   55406190    Plcl1    361.918949    -0.726832        DOWN
1  chr1:62706498-62707027        True                True         chr1   62706498-62707027    62706498   62707027    Nrp2     1460.062193   -0.541600        DOWN
2  chr1:89070006-89070435        True                True         chr1   89070006-89070435    89070006   89070435    Sh3bp4   630.231250    -0.553894        DOWN
3  chr1:91413167-91413489        True                True         chr1   91413167-91413489    91413167   91413489    Hes6     429.655435    0.631325         UP
4  chr1:106581988-106582206      True                True         chr1   106581988-106582206  106581988  106582206   Bcl2     456.720686    -0.819810        DOWN