In [24]:
import sys
import os

In [25]:
def read_list_file(filename):
    """Read a list file and return its contents as a set."""
    if not os.path.exists(filename):
        print(f"Error: File '{filename}' does not exist.")
        sys.exit(1)
    
    with open(filename, 'r') as f:
        # Strip whitespace and filter out empty lines
        return {line.strip() for line in f if line.strip()}

In [26]:
path1 =  "/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Sammy/figures/genelists/"
file1_names = ["endo.txt", "NEU_exo.txt", "NSC_exo.txt"]
file1 = [
        f"{path1}{file1_name}" for file1_name in file1_names
]

path2 = "/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_Sammy/gene_lists/"
file2_names = ["endo_enriched_gene_list_nsc_vs_neu.txt", "nsc_enriched_gene_list_exo_vs_endo.txt", "neu_enriched_gene_list_exo_vs_endo.txt"]
file2 = [
    f"{path2}{file2_name}" for file2_name in file2_names
]

In [27]:
for i, (file1_name, file2_name) in enumerate(zip(file1, file2)):
    list1 = read_list_file(file1_name)
    list2 = read_list_file(file2_name)

    # Items in both lists
    common = list1.intersection(list2)

    # Items unique to each list
    only_in_list1 = list1 - list2
    only_in_list2 = list2 - list1

    results = {
        'common': common,
        'only_in_list1': only_in_list1,
        'only_in_list2': only_in_list2,
        'list1_count': len(list1),
        'list2_count': len(list2),
        'common_count': len(common)
    }

    """Print the comparison results in a readable format."""
    print(f"\nComparison between '{file1_names[i]}' and '{file2_names[i]}':")
    print(f"Total items in '{file1_names[i]}': {results['list1_count']}")
    print(f"Total items in '{file2_names[i]}': {results['list2_count']}")
    print(f"Items in both lists: {results['common_count']}")

    print(f"\nPercentage of '{file1_names[i]}' items in '{file2_names[i]}': "
            f"{results['common_count'] / results['list1_count'] * 100:.2f}%")
    print(f"Percentage of '{file2_names[i]}' items in '{file1_names[i]}': "
            f"{results['common_count'] / results['list2_count'] * 100:.2f}%")
    
    print("#############################################################################")


Comparison between 'endo.txt' and 'endo_enriched_gene_list_nsc_vs_neu.txt':
Total items in 'endo.txt': 4405
Total items in 'endo_enriched_gene_list_nsc_vs_neu.txt': 4979
Items in both lists: 4403

Percentage of 'endo.txt' items in 'endo_enriched_gene_list_nsc_vs_neu.txt': 99.95%
Percentage of 'endo_enriched_gene_list_nsc_vs_neu.txt' items in 'endo.txt': 88.43%
#############################################################################

Comparison between 'NEU_exo.txt' and 'nsc_enriched_gene_list_exo_vs_endo.txt':
Total items in 'NEU_exo.txt': 793
Total items in 'nsc_enriched_gene_list_exo_vs_endo.txt': 2425
Items in both lists: 186

Percentage of 'NEU_exo.txt' items in 'nsc_enriched_gene_list_exo_vs_endo.txt': 23.46%
Percentage of 'nsc_enriched_gene_list_exo_vs_endo.txt' items in 'NEU_exo.txt': 7.67%
#############################################################################

Comparison between 'NSC_exo.txt' and 'neu_enriched_gene_list_exo_vs_endo.txt':
Total items in 'NSC_exo.txt