In [52]:
import pandas as pd
import os

# Absolute, machine-specific path; it must be edited before running this
# notebook on another system. Presumably the project root — the relative
# 'custom_pipeline/results/...' paths used in later cells resolve against it.
wd_dir = '/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_CUTandTAG'
# Changes the working directory for the whole kernel process, so every
# relative path opened after this cell is interpreted relative to wd_dir.
os.chdir(wd_dir)

# Read the working directory back after the change; current_dir is not
# referenced by the other cells visible here.
current_dir = os.getcwd()

In [53]:
def clean_intergenic_records(input_file, output_file):
    """
    Drop 'Intergenic' rows from an annotated CSV and write the result.

    Every row whose 'gene_symbol' value equals 'Intergenic' is discarded.
    In the remaining table the 'gene' column is renamed to 'location' and
    the 'gene_symbol' column is renamed to 'gene'. The table is then written
    to output_file without the index column, and a short summary of the row
    counts is printed. Nothing is returned.

    Parameters:
    input_file (str): Path to input CSV file
    output_file (str): Path to output CSV file
    """
    raw = pd.read_csv(input_file)

    # Boolean mask selecting the rows to keep (anything not 'Intergenic').
    keep_mask = raw['gene_symbol'].ne('Intergenic')

    # Old column name -> new column name. The former 'gene' column becomes
    # 'location' so that 'gene_symbol' can take over the name 'gene'.
    column_map = {'gene': 'location', 'gene_symbol': 'gene'}
    kept = raw[keep_mask].rename(columns=column_map)

    # Write first, report afterwards: if the write fails no summary is shown.
    kept.to_csv(output_file, index=False)

    n_before = len(raw)
    n_after = len(kept)
    n_dropped = n_before - n_after
    print(f"Processed {input_file}:")
    print(f"- Original records: {n_before}")
    print(f"- Records removed: {n_dropped}")
    print(f"- Records remaining: {n_after}")
    print(f"- Cleaned file saved as: {output_file}\n")

In [54]:
# Text inserted before the '.csv' extension to form each output filename.
sufix = '_clean'

# (input CSV, output CSV) pairs; paths are relative to the working directory.
files_to_process = [
    (
        'custom_pipeline/results/Neuron_peak_analysis_annotated.csv',
        f'custom_pipeline/results/Neuron_peak_analysis_annotated{sufix}.csv',
    ),
    (
        'custom_pipeline/results/NSC_peak_analysis_annotated.csv',
        f'custom_pipeline/results/NSC_peak_analysis_annotated{sufix}.csv',
    ),
]

# Best-effort batch: a failure on one file is reported and the loop moves on
# to the next file instead of aborting the whole cell.
for input_file, output_file in files_to_process:
    try:
        clean_intergenic_records(input_file, output_file)
    except Exception as e:
        print(f"Error processing {input_file}: {str(e)}")

Processed custom_pipeline/results/Neuron_peak_analysis_annotated.csv:
- Original records: 94013
- Records removed: 35512
- Records remaining: 58501
- Cleaned file saved as: custom_pipeline/results/Neuron_peak_analysis_annotated_clean.csv

Processed custom_pipeline/results/NSC_peak_analysis_annotated.csv:
- Original records: 25357
- Records removed: 7666
- Records remaining: 17691
- Cleaned file saved as: custom_pipeline/results/NSC_peak_analysis_annotated_clean.csv

