In [2]:
import pickle
from itertools import combinations
import pandas as pd
import csv


def extract_accessions_csv(csv_file_path):
    """
    Collect version-less identifiers from the second column of a CSV file.

    The second field of each record is expected to look like ``P12345.2``;
    only the text before the first ``.`` is kept. Records are parsed with the
    ``csv`` module so that a quoted field containing a comma does not shift
    the column positions.

    Args:
        csv_file_path (str): Path to the CSV file. No header row is skipped:
            every record with at least two fields contributes one entry.

    Returns:
        List[str]: Identifiers in file order, duplicates preserved. Blank
        lines, records with fewer than two fields, and records whose second
        field is empty are skipped.
    """
    identifiers = []
    # newline='' leaves line-ending handling to the csv module.
    with open(csv_file_path, 'r', newline='') as file:
        for fields in csv.reader(file):
            if len(fields) < 2:
                continue  # blank line or record without an identifier column
            # Drop padding around the value, then the ".<version>" suffix.
            base_id = fields[1].strip().split('.')[0]
            if base_id:
                identifiers.append(base_id)
    return identifiers


def extract_accessions_tsv(tsv_file, column_name):
    """
    Extract one column of a headered TSV file, dropping isoform suffixes.

    Args:
        tsv_file (str): Path to the TSV file; its first row is the header.
        column_name (str): Header of the column to extract
            (e.g. "Entry" or "Accession").

    Returns:
        List[str]: One base identifier per data row, in file order, with
        duplicates preserved. Everything from the first '-' onward is
        removed, so "P12345-2" becomes "P12345". Rows that have no value in
        the requested column (short rows or empty cells) are skipped.

    Raises:
        KeyError: If a data row is read and the header has no ``column_name``.
    """
    accession_entries = []

    with open(tsv_file, newline='') as file:
        reader = csv.DictReader(file, delimiter='\t')
        for row in reader:
            iso_id = row[column_name]
            if iso_id is None:
                # DictReader fills cells missing from a short row with None.
                continue
            base_id = iso_id.strip().split('-')[0]  # remove isoform marker
            if base_id:
                accession_entries.append(base_id)

    return accession_entries


In [None]:
# Export file and parser for every search strategy, listed in loading order.
# CSV exports are read positionally; TSV exports are read by column header.
_search_exports = {
    "ARM10_HUMAN_ARM_REPEAT_PSI3": (
        extract_accessions_csv,
        ("datasets_uniprot/ARM10_HUMAN_ARM_REPEAT_PSI3.csv",),
    ),
    "ARM10_HUMAN_FULL_PSI10": (
        extract_accessions_csv,
        ("datasets_uniprot/ARM10_HUMAN_FULL_PSI10.csv",),
    ),
    "uniprot_text_search": (
        extract_accessions_tsv,
        ("datasets_uniprot/uniprot_text_search.tsv", "Entry"),
    ),
    "blast_ARM10_HUMAN": (
        extract_accessions_tsv,
        ("datasets_uniprot/blast_ARM10_HUMAN.tsv", "Accession"),
    ),
    "uniprot_repeat_ARM": (
        extract_accessions_tsv,
        ("datasets_uniprot/uniprotkb_repeat_ARM.tsv", "Entry"),
    ),
}
_search_hits = {
    label: parser(*parser_args)
    for label, (parser, parser_args) in _search_exports.items()
}

# Bind each accession list to the name the rest of the notebook uses.
ARM10_HUMAN_ARM_REPEAT_PSI3 = _search_hits["ARM10_HUMAN_ARM_REPEAT_PSI3"]
ARM10_HUMAN_FULL_PSI10 = _search_hits["ARM10_HUMAN_FULL_PSI10"]
uniprot_text_search = _search_hits["uniprot_text_search"]
blast_ARM10_HUMAN = _search_hits["blast_ARM10_HUMAN"]
uniprot_repeat_ARM = _search_hits["uniprot_repeat_ARM"]

# All five accession lists, in the order they were loaded.
datasets = list(_search_hits.values())

In [16]:
# For every search strategy, keep only the accessions that none of the other
# four strategies returned.
_hit_sets = {
    "ARM10_HUMAN_ARM_REPEAT_PSI3": set(ARM10_HUMAN_ARM_REPEAT_PSI3),
    "ARM10_HUMAN_FULL_PSI10": set(ARM10_HUMAN_FULL_PSI10),
    "uniprot_text_search": set(uniprot_text_search),
    "blast_ARM10_HUMAN": set(blast_ARM10_HUMAN),
    "uniprot_repeat_ARM": set(uniprot_repeat_ARM),
}
_exclusive_hits = {
    label: members - set().union(
        *(other for other_label, other in _hit_sets.items() if other_label != label)
    )
    for label, members in _hit_sets.items()
}

# Keep the per-dataset names available for later cells.
unique_ARM10_HUMAN_ARM_REPEAT_PSI3 = _exclusive_hits["ARM10_HUMAN_ARM_REPEAT_PSI3"]
unique_ARM10_HUMAN_FULL_PSI10 = _exclusive_hits["ARM10_HUMAN_FULL_PSI10"]
unique_uniprot_text_search = _exclusive_hits["uniprot_text_search"]
unique_blast_ARM10_HUMAN = _exclusive_hits["blast_ARM10_HUMAN"]
unique_uniprot_repeat_ARM = _exclusive_hits["uniprot_repeat_ARM"]

# Report each exclusive set together with its size.
print("Unique entries in each dataset:")
for _label, _members in _exclusive_hits.items():
    print(f"{_label} ({len(_members)} entries):", _members)

Unique entries in each dataset:
ARM10_HUMAN_ARM_REPEAT_PSI3 (0 entries): set()
ARM10_HUMAN_FULL_PSI10 (36 entries): {'P30153', 'Q15021', 'O75155', 'O95373', 'Q92616', 'P42345', 'Q5S007', 'P63010', 'Q7Z4Q2', 'Q9H3U1', 'O75031', 'Q5VYK3', 'Q96KG9', 'P30154', 'O14787', 'Q6ZUA9', 'O60518', 'Q6PJG6', 'Q14974', 'Q5UIP0', 'Q99460', 'O00410', 'Q96JX3', 'Q9NRP7', 'Q86WZ0', 'Q6AI08', 'Q6NUP7', 'Q9H892', 'Q8N122', 'Q92973', 'Q9H173', 'Q8TEX9', 'Q86VP6', 'Q08AM6', 'Q7Z460', 'Q10567'}
uniprot_text_search (94 entries): {'H0YGQ5', 'Q5TGZ0', 'Q15388', 'A0A804HJ95', 'A0A5F9ZH09', 'Q96LK7', 'Q07666', 'A0A0A0MT17', 'H7BYZ4', 'C9J625', 'A2A2M9', 'E5RHK3', 'F5GWV0', 'A5YKK6', 'H7C5B7', 'A0A804HIT2', 'B7Z7A1', 'B5MC89', 'B3KW05', 'B4DLR0', 'A0A1D5RMU3', 'B7Z979', 'F5H4H6', 'A0A140VJU9', 'Q5H9D8', 'E9PR92', 'O15169', 'B6ZDE5', 'P61586', 'B4DFB8', 'H7BZA2', 'Q16891', 'Q13033', 'Q9NSA3', 'A0A0A0MSY0', 'C9J2I1', 'H0Y4P5', 'F5H2X2', 'Q15910', 'A0A0A0MRT8', 'P36402', 'O43815', 'A0A7P0Z468', 'Q15022', 'E5RJ86', 'Q

In [None]:

# Raw list lengths: repeated accessions are still counted here.
# NOTE(review): the heading below says "isomorphism"; presumably "isoforms"
# was meant — confirm before changing the printed text.
print("Total entries (including duplicates due to isomorphism):")
for _label, _hits in (
    ("Text based search:", uniprot_text_search),
    ("PSI-BLAST with full ARM10:", ARM10_HUMAN_FULL_PSI10),
    ("PSI BLAST with ARM repeat:", ARM10_HUMAN_ARM_REPEAT_PSI3),
    ("blast with ARM10 full:", blast_ARM10_HUMAN),
    ("Uniprot structure search:", uniprot_repeat_ARM),
):
    print(_label, len(_hits))

_all_hit_lists = (
    ARM10_HUMAN_ARM_REPEAT_PSI3,
    ARM10_HUMAN_FULL_PSI10,
    uniprot_text_search,
    blast_ARM10_HUMAN,
    uniprot_repeat_ARM,
)

# Accessions reported by all five searches.
common_entries = set.intersection(*map(set, _all_hit_lists))

# Accessions reported by at least one search.
_pooled_hits = []
for _hits in _all_hit_lists:
    _pooled_hits.extend(_hits)
total_entries = set(_pooled_hits)

print("Common Entries:", common_entries, " ", len(common_entries))
print("Total unique entries:", len(total_entries))

# Distinct accessions per dataset, i.e. list length after de-duplication.
print("Unique entries in each dataset:")
for _label, _hits in (
    ("ARM10_HUMAN_ARM_REPEAT_PSI3:", ARM10_HUMAN_ARM_REPEAT_PSI3),
    ("ARM10_HUMAN_FULL_PSI10:", ARM10_HUMAN_FULL_PSI10),
    ("uniprot_text_search:", uniprot_text_search),
    ("blast_ARM10_HUMAN:", blast_ARM10_HUMAN),
    ("Uniprot structure search:", uniprot_repeat_ARM),
):
    print(_label, len(set(_hits)))



Total entries (including duplicates due to isomorphism):
Text based search: 159
PSI-BLAST with full ARM10: 135
PSI BLAST with ARM repeat: 7
blast with ARM10 full: 115
Uniprot structure search: 174
Common Entries: {'Q8N2F6', 'Q7L311', 'Q9UH62', 'Q9P291', 'Q5H9R4'}   5
Total unique entries: 345
Unique entries in each dataset:
ARM10_HUMAN_ARM_REPEAT_PSI3: 7
ARM10_HUMAN_FULL_PSI10: 82
uniprot_text_search: 159
blast_ARM10_HUMAN: 63
Uniprot structure search: 174


In [10]:
# Pairwise overlap between the five search strategies.
# Each accession list is reduced to a set so repeated hits are counted once.
# NOTE: this rebinds `datasets`, which the loading cell bound to a plain list
# of the raw accession lists.
datasets = {
    "ARM10_HUMAN_ARM_REPEAT_PSI3": set(ARM10_HUMAN_ARM_REPEAT_PSI3),
    "ARM10_HUMAN_FULL_PSI10": set(ARM10_HUMAN_FULL_PSI10),
    "uniprot_text_search": set(uniprot_text_search),
    "blast_ARM10_HUMAN": set(blast_ARM10_HUMAN),
    "uniprot_repeat_ARM": set(uniprot_repeat_ARM)
}

# Build the whole matrix in one step so the counts are stored as integers;
# filling an empty DataFrame cell by cell leaves every column as object dtype.
# Cell (row, column) is the number of accessions present in both datasets, so
# the diagonal holds each dataset's distinct-entry count.
overlap_matrix = pd.DataFrame(
    [
        [len(row_entries & col_entries) for col_entries in datasets.values()]
        for row_entries in datasets.values()
    ],
    index=list(datasets),
    columns=list(datasets),
)

print("Overlap Matrix:")
display(overlap_matrix)



Overlap Matrix:


Unnamed: 0,ARM10_HUMAN_ARM_REPEAT_PSI3,ARM10_HUMAN_FULL_PSI10,uniprot_text_search,blast_ARM10_HUMAN,uniprot_repeat_ARM
ARM10_HUMAN_ARM_REPEAT_PSI3,7,7,6,6,5
ARM10_HUMAN_FULL_PSI10,7,82,23,16,40
uniprot_text_search,6,23,159,26,47
blast_ARM10_HUMAN,6,16,26,63,24
uniprot_repeat_ARM,5,40,47,24,174


In [None]:



# Persist every accession found by at least one search as the "side 1" ID list.
# The set is sorted first so the pickled list has a stable order; iterating a
# set of strings directly can yield a different order on each kernel start.
with open('../jobs_AF/side1_uniprot_ids.pkl', 'wb') as file:
    pickle.dump(sorted(total_entries), file)
   
   

In [3]:
   
# Write the single-element "side 2" ID list.
# NOTE(review): "TCF7L2" looks like a gene symbol rather than a UniProt
# accession, although the target file is named *_uniprot_ids — confirm that
# the consumer of this pickle expects this value.
side2_ids = ["TCF7L2"]
with open('../jobs_AF/side2_uniprot_ids.pkl', 'wb') as side2_handle:
    pickle.dump(side2_ids, side2_handle)

In [None]:
# UniProt exports for the annotation-based queries; all four are read from
# their "Entry" column. The ARM export is the same file that the first loading
# cell reads into `uniprot_repeat_ARM`.
_annotation_exports = {
    "ARM": "datasets_uniprot/uniprotkb_repeat_ARM.tsv",
    "PFAM": "datasets_uniprot/uniprotkb_xref_pfam_PF00514_2025_05_02.tsv",
    "INTERPRO": "datasets_uniprot/uniprotkb_xref_interpro_IPR000225_2025_05_02.tsv",
    "ARM_PFAM_INTERPRO": "datasets_uniprot/uniprotkb_interpro_PFAM_ARM_2025_05_02.tsv",
}
_annotation_hits = {
    label: extract_accessions_tsv(export_path, "Entry")
    for label, export_path in _annotation_exports.items()
}

# Bind each accession list to the name the following cells use.
ARM = _annotation_hits["ARM"]
PFAM = _annotation_hits["PFAM"]
INTERPRO = _annotation_hits["INTERPRO"]
ARM_PFAM_INTERPRO = _annotation_hits["ARM_PFAM_INTERPRO"]

In [14]:
# For ARM, PFAM and INTERPRO, keep only the accessions that neither of the
# other two lists contains. ARM_PFAM_INTERPRO takes no part in this comparison.
_annotation_sets = {
    "ARM": set(ARM),
    "PFAM": set(PFAM),
    "INTERPRO": set(INTERPRO),
}
_exclusive_annotations = {
    label: members - set().union(
        *(other for other_label, other in _annotation_sets.items() if other_label != label)
    )
    for label, members in _annotation_sets.items()
}

# Keep the per-dataset names available for later cells.
unique_ARM = _exclusive_annotations["ARM"]
unique_PFAM = _exclusive_annotations["PFAM"]
unique_INTERPRO = _exclusive_annotations["INTERPRO"]

# Report each exclusive set together with its size.
print("Unique entries in each dataset:")
for _label, _members in _exclusive_annotations.items():
    print(f"{_label} ({len(_members)} entries):", _members)

Unique entries in each dataset:
ARM (9 entries): {'Q6ZWH5', 'Q8N2F6', 'Q6P1M9', 'Q6SZW1', 'Q8NA31', 'Q9Y263', 'Q5T9G4', 'Q9NZL4', 'Q8WYA6'}
PFAM (0 entries): set()
INTERPRO (29 entries): {'A8K5M7', 'B7Z5M1', 'B2RC26', 'J3QKQ5', 'B3KT98', 'B4DR79', 'A8K0G3', 'C9J1E7', 'B7Z752', 'Q96LK7', 'A0A087WYD1', 'P63010', 'B7Z5P0', 'Q14974', 'A0A087X2F3', 'B4DWG4', 'A0A0A0MSI1', 'Q68DI0', 'A0A087WU93', 'A8K916', 'A0A087WZQ6', 'J3KTM9', 'Q96DC7', 'A0A087X253', 'B2RBR9', 'Q1MX18', 'B7Z979', 'A0A140VJE8', 'Q10567'}


In [13]:
# Raw length of each accession list (repeated accessions are not collapsed).
print("Number of entries in each dataset:")
for _label, _entries in (
    ("ARM:", ARM),
    ("PFAM:", PFAM),
    ("INTERPRO:", INTERPRO),
    ("ARM_PFAM_INTERPRO:", ARM_PFAM_INTERPRO),
):
    print(_label, len(_entries))

Number of entries in each dataset:
ARM: 174
PFAM: 173
INTERPRO: 221
ARM_PFAM_INTERPRO: 230


In [None]:
# Pairwise overlap between the annotation-based datasets.
# Each accession list is reduced to a set so repeated entries are counted once.
additional_datasets = {
    "ARM": set(ARM),
    "PFAM": set(PFAM),
    "INTERPRO": set(INTERPRO),
    "ARM_PFAM_INTERPRO": set(ARM_PFAM_INTERPRO)
}

# Build the whole matrix in one step so the counts are stored as integers;
# filling an empty DataFrame cell by cell leaves every column as object dtype.
# Cell (row, column) is the number of accessions present in both datasets, so
# the diagonal holds each dataset's distinct-entry count.
additional_overlap_matrix = pd.DataFrame(
    [
        [len(row_entries & col_entries) for col_entries in additional_datasets.values()]
        for row_entries in additional_datasets.values()
    ],
    index=list(additional_datasets),
    columns=list(additional_datasets),
)

# Rendered with display(), like the first overlap matrix, to get the rich
# table output instead of the plain-text repr.
print("Overlap Matrix for Additional Datasets:")
display(additional_overlap_matrix)

Overlap Matrix for Additional Datasets:
                   ARM PFAM INTERPRO ARM_PFAM_INTERPRO
ARM                174  146      165               174
PFAM               146  173      173               173
INTERPRO           165  173      221               221
ARM_PFAM_INTERPRO  174  173      221               230
