In [1]:
import unittest
import pandas as pd
from pandas.testing import assert_frame_equal
import os

In [2]:
def extract_tf_binding_regions(files, tf_name, genome_field=0):
    """Collect the rows belonging to one transcription factor, grouped by genome.

    Each file is read as a header-less, tab-separated table. The text before
    the first ':' in column 3 is taken as the transcription-factor name and
    stored in a new ``'TF'`` column; only rows whose ``'TF'`` equals
    ``tf_name`` are kept.

    Args:
        files: Iterable of paths to tab-separated files with at least four
            columns.
        tf_name: Transcription-factor name to keep (exact, case-sensitive
            match).
        genome_field: Index of the '_'-separated token of the file's base
            name that is used as the genome key. The default ``0`` keeps the
            historical behaviour (first token). For a name such as
            ``remap2022_nr_macs2_hg38_v1_0.bed`` the first token is
            ``remap2022``, so pass ``3`` to key on ``hg38`` instead.

    Returns:
        dict mapping each genome key to a DataFrame of the matching rows.
        Files that share a key are concatenated in input order, and rows keep
        the index they had in their source file. A key is present even when
        no row matched (its frame is then empty).

    Raises:
        IndexError: If a file name has no token at ``genome_field``.
    """
    frames_by_genome = {}
    for file in files:
        df = pd.read_csv(file, sep='\t', header=None)
        # Non-string cells (e.g. NaN produced by an empty name field) are
        # passed through unchanged: they can never equal a string tf_name,
        # and a single malformed row no longer aborts the whole extraction.
        df['TF'] = df[3].map(
            lambda name: name.split(':', 1)[0] if isinstance(name, str) else name
        )
        df_filtered = df[df['TF'] == tf_name]
        genome_name = os.path.basename(file).split('_')[genome_field]
        frames_by_genome.setdefault(genome_name, []).append(df_filtered)

    # Concatenate once per genome instead of re-copying the accumulated
    # frame every time another file for the same genome is read.
    return {
        genome: pd.concat(frames)
        for genome, frames in frames_by_genome.items()
    }

In [None]:
class TestExtractTFBindingRegions(unittest.TestCase):
    """Smoke test for ``extract_tf_binding_regions`` on local BED files.

    The input files are read from the ``remap2022/`` directory relative to
    the current working directory; the test errors out if they are missing.
    """

    def setUp(self):
        # Specify the files and transcription factor name
        self.files = ['remap2022/remap2022_nr_macs2_hg38_v1_0.bed', 'remap2022/remap2022_nr_macs2_mm10_v1_0.bed']
        self.tf_name = 'IRF4'

    def test_extract_tf_binding_regions(self):
        """Every returned frame must contain only rows for the requested TF."""
        # Run the function
        result = extract_tf_binding_regions(self.files, self.tf_name)

        self.assertIsInstance(result, dict)
        self.assertTrue(result, 'expected at least one genome key in the result')

        # NOTE(review): both file names start with 'remap2022', and the
        # function keys on the first '_'-separated token of the file name by
        # default, so with these inputs both files end up under one key.
        for genome, df in result.items():
            with self.subTest(genome=genome):
                self.assertIn('TF', df.columns)
                self.assertTrue(
                    (df['TF'] == self.tf_name).all(),
                    f'{genome}: found rows whose TF is not {self.tf_name}',
                )

            # Kept for manual inspection of the first few rows per genome.
            print(f'Genome: {genome}')
            print(df.head())

if __name__ == '__main__':
    # Run the tests in-process:
    #   * argv is supplied explicitly so unittest does not parse the host
    #     process's own command line (only the program-name slot is given);
    #   * exit=False stops unittest from calling sys.exit() when it finishes.
    unittest.main(
        argv=['first-arg-is-ignored'],
        exit=False,
    )