In [2]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
import itertools
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from matplotlib.colors import Normalize, ListedColormap
from matplotlib.cm import ScalarMappable
import ast
import re

In [None]:
# first load tables

domain_table = pd.read_csv('/Volumes/PGH-Backup/domains/IPS/all_IPS_results.tsv', sep='\t', header=None)

display(domain_table, domain_table.shape)

In [None]:
# Group by the enzyme name and aggregate the domain column as a list
grouped_df = domain_table.groupby(0)[5].apply(list).reset_index()


In [None]:
display(grouped_df)

In [None]:
grouped_df_clean = grouped_df

In [None]:
grouped_df_clean[0] = grouped_df_clean[0].str.split('|').str[0]

display(grouped_df_clean)

In [None]:
grouped_df_clean.to_csv('/Volumes/PGH-Backup/domains/IPS/all_IPS_results_grouped.tsv', sep='\t', index=False, header=False)

**Begin Merging with cluster table**

In [None]:
## Temporary swap to a windows enviorment, paths changed accordingly
# grouped_df_clean = pd.read_csv("E:\\domains\\IPS\\all_IPS_results_grouped.tsv", sep='\t', header=None)
grouped_df_clean = pd.read_csv("/Volumes/PGH-Backup/domains/IPS/all_IPS_results_grouped.tsv", sep='\t', header=None)

display(grouped_df_clean)

In [9]:
def format_domain_table(df):
    """
    Function to format the domain table for future use
    """
    # Rename columns
    df.rename(columns={0: 'FullIdentifier', 1: 'Domains'}, inplace=True)
    
    # Split the FullIdentifier column into separate columns
    df['Uniref'] = df['FullIdentifier'].str.split('_').str[2]

    df['Enzyme'] = df['FullIdentifier'].str.split('_').str[0]

    df.drop_duplicates(subset='Uniref', inplace=True)

    enzymes = df['Enzyme'].unique()

    return df, enzymes

In [None]:
# read in cluster map
# cluster_map = pd.read_csv("E:\\clustering\\newest_cluster_maps\\catted_maps.tsv", sep='\t', index_col=0, header=None)
cluster_map = pd.read_csv("/Volumes/PGH-Backup/clustering/newest_cluster_maps/catted_maps.tsv", sep='\t', index_col=0, header=None)

display(cluster_map)

In [13]:
def format_cluster_map(df):
    """
    Function to format the cluster map for future use, return list of unique enzymes
    """
    # Rename columns
    df.rename(columns={1: 'unclustered', 2: 'mmseqs', 3: 'foldseek'}, inplace=True)

    df.drop_duplicates(subset='unclustered', inplace=True)

    return df

In [11]:
def clean_and_convert(domain_string):
    domain_string = [ast.literal_eval(x) for x in domain_string]
    
    domain_string = [
        item
        for sublist in domain_string
        for item in sublist
    ]
    
    return domain_string
    

In [None]:
# Subset by enzymes and merge with cluster map

for enzyme in enzymes:
    grouped_df_clean_subset = grouped_df_clean[grouped_df_clean['Enzyme'] == enzyme]
    merged_df = pd.merge(grouped_df_clean_subset, cluster_map, left_on='Uniref', right_on='unclustered', how='left')
    merged_df = merged_df.dropna(subset=['foldseek'])

    if merged_df.shape[0] > 0:
        grouped_merged_domain_cluster = merged_df.groupby('foldseek').agg({
                'Domains': list,    # Aggregate Domains into a list
                'Uniref': list,     # Aggregate Uniref into a list
                'Enzyme': set       # Aggregate Enzyme into a set (to remove duplicates)
            }).reset_index()
        
        grouped_merged_domain_cluster['member_count'] = grouped_merged_domain_cluster['Uniref'].apply(len)

        grouped_merged_domain_cluster['Domains'] = grouped_merged_domain_cluster['Domains'].apply(
            lambda x: clean_and_convert(x) if isinstance(x, list) else x)
        
        print(grouped_merged_domain_cluster.head())

        grouped_merged_domain_cluster.to_csv(f"/Volumes/PGH-Backup/domains/IPS/{enzyme}_IPS_results_grouped.tsv", sep='\t', index=False)

**Generate Network Plots**

In [None]:
def calculate_domain_similarity(subset_df):

    subset_df = pd.read_csv(subset_df, sep='\t')
    
    # Initialize to track how many proteins have each domain in each cluster
    domain_occurrences = defaultdict(lambda: defaultdict(int))

    # Iterate over the dataframe to count domain presence per protein
    for index, row in subset_df.iterrows():
        cluster_id = row['foldseek']
        domains = row['Domains']

        # Check if 'domains' is a valid non-empty list or string
        if isinstance(domains, str) and domains.strip() != '':
            try:
                # Safely evaluate the string into a list
                unique_domains = set(eval(domains))
            except:
                # Skip any domains that can't be evaluated
                continue
        elif isinstance(domains, (list, tuple, np.ndarray)) and len(domains) > 0:
            unique_domains = set(domains)
        else:
            # Skip if 'domains' is None, NaN, empty, or not a valid type
            continue

        # Count each domain in the set
        for domain in unique_domains:
            domain_occurrences[cluster_id][domain] += 1

    # Calculate the percentage of proteins with each domain in each cluster
    domain_percentages = {}
    for cluster, domains_dict in domain_occurrences.items():
        member_count = subset_df.loc[subset_df['foldseek'] == cluster, 'member_count'].values[0]
        domain_percentages[cluster] = {domain: count / member_count for domain, count in domains_dict.items()}

    # Convert domain percentages to a matrix for similarity calculation
    # Create a list of all unique domains across all clusters
    all_domains = set(domain for cluster_domains in domain_percentages.values() for domain in cluster_domains.keys())

    # Create a matrix of domain percentages for each cluster
    cluster_ids = list(domain_percentages.keys())
    domain_matrix = np.zeros((len(cluster_ids), len(all_domains)))

    # Mapping of cluster IDs and domain indices to facilitate matrix population
    cluster_idx_map = {cluster_id: idx for idx, cluster_id in enumerate(cluster_ids)}
    domain_idx_map = {domain: idx for idx, domain in enumerate(all_domains)}

    # Populate the matrix with domain percentages
    for cluster_id, domains_dict in domain_percentages.items():
        for domain, percentage in domains_dict.items():
            cluster_idx = cluster_idx_map[cluster_id]
            domain_idx = domain_idx_map[domain]
            domain_matrix[cluster_idx, domain_idx] = percentage

    # Calculate cosine similarity between clusters based on domain matrix
    cosine_sim = cosine_similarity(domain_matrix)

    # Convert the similarity matrix to edge list for significant similarities
    # We consider a similarity significant if it's above 0.1 (can adjust)
    significant_similarity_threshold = 0.8
    significant_edges = []
    for i in range(len(cluster_ids)):
        for j in range(i+1, len(cluster_ids)):
            if cosine_sim[i, j] > significant_similarity_threshold:
                significant_edges.append((cluster_ids[i], cluster_ids[j], cosine_sim[i, j]))
        
    return significant_edges


In [None]:
def plot_network(enzyme_type, subset_df, significant_edges, association_table=None, title=None, global_min_size=None, global_max_size=None):
    """ Plot the network graph for each unique enzyme type. Adjust node colors based on
        the enzyme type and node sizes based on cluster sizes. """
    
    subset_df = pd.read_csv(subset_df, sep='\t')

    # Create the network graph
    G_adjusted_similarity = nx.Graph()

    # Subset cluster_ids and edges based on the enzyme type in subset_df
    cluster_ids_subset = subset_df['foldseek'].tolist()
    edges_subset = [(u, v, w) for u, v, w in significant_edges if u in cluster_ids_subset and v in cluster_ids_subset]

    # Add nodes (clusters) for the subset
    G_adjusted_similarity.add_nodes_from(cluster_ids_subset)

    # Add edges with weights based on cosine similarity for the subset
    G_adjusted_similarity.add_weighted_edges_from(edges_subset)

    # --- NEW: Get the connected components (subclusters) ---
    connected_components = list(nx.connected_components(G_adjusted_similarity))

    # Rank the connected components by their size (number of nodes)
    connected_components_sorted = sorted(connected_components, key=len, reverse=True)

    # Keep the top 5 largest clusters, color them, and set the rest to grey
    top_n = 5
    cmap = plt.get_cmap('tab10')  # Use a colormap with 10 distinct colors
    cluster_colors = {i: cmap(i / top_n) for i in range(top_n)}  # Assign colors to top 5 clusters
    grey_color = 'grey'

    # Create a mapping of node to its subcluster color
    node_color_map = {}
    for i, component in enumerate(connected_components_sorted):
        if i < top_n:
            # Assign a color from the colormap to top 5 clusters
            for node in component:
                node_color_map[node] = cluster_colors[i]
        else:
            # Assign grey color to the remaining smaller clusters
            for node in component:
                node_color_map[node] = grey_color

    # Get the cluster sizes from the 'member_count' column in the subset_df
    cluster_sizes = subset_df.set_index('foldseek')['member_count'].to_dict()

    # Normalize cluster sizes globally
    if global_min_size is None:
        global_min_size = min(cluster_sizes.values())
    if global_max_size is None:
        global_max_size = max(cluster_sizes.values())

    min_size = 20
    max_size = 1000
    node_sizes = [
        ((cluster_sizes[node] - global_min_size) / (global_max_size - global_min_size) * (max_size - min_size) + min_size)
        if node in cluster_sizes else min_size
        for node in G_adjusted_similarity.nodes()
    ]

    # --- NEW: Extract node colors based on cluster assignment ---
    node_colors = [node_color_map[node] for node in G_adjusted_similarity.nodes()]

    # Visualize the adjusted network for the enzyme type
    plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(G_adjusted_similarity, seed=42, k=0.5)  # Adjust the 'k' parameter to control node spacing

    # Draw the network
    nodes = nx.draw_networkx_nodes(G_adjusted_similarity, pos, node_color=node_colors, node_size=node_sizes, alpha=0.8)
    nx.draw_networkx_edges(G_adjusted_similarity, pos, alpha=0.5)
    nx.draw_networkx_labels(G_adjusted_similarity, pos, font_size=5, alpha=0.7)

    # --- NEW: Add title and legend ---
    if title is not None:
        plt.title(f"{title} - {enzyme_type}")
    else:
        plt.title(f"Adjusted Network Graph for {enzyme_type}")
    plt.axis('off')

    # --- NEW: Add legend showing only top 5 clusters ---
    handles = [
        plt.Line2D([0], [0], marker='o', color='w', label=f'Cluster {i+1}', 
                   markerfacecolor=cluster_colors[i], markersize=10)
        for i in range(top_n)
    ]
    handles.append(plt.Line2D([0], [0], marker='o', color='w', label=f'Other Clusters', 
                   markerfacecolor=grey_color, markersize=10))
    
    plt.legend(handles=handles, title='Subclusters', bbox_to_anchor=(1.05, 1), loc='upper left')

    # Save the plot for each enzyme type
    plt.savefig(f'./{enzyme_type}_adjusted_network.png', dpi=600, bbox_inches='tight')

    return G_adjusted_similarity

In [None]:
def extract_top_domains(subset_df, G_adjusted_similarity, top_n=5, top_domains=3):
    subset_df = pd.read_csv(subset_df, sep='\t')
    
    # Get the connected components (clusters)
    connected_components = list(nx.connected_components(G_adjusted_similarity))
    
    # Sort the connected components by size (number of nodes) in descending order
    connected_components_sorted = sorted(connected_components, key=len, reverse=True)
    
    # Limit to the top_n largest clusters
    top_connected_components = connected_components_sorted[:top_n]

    # Create a dictionary to store domains for each of the top_n clusters
    cluster_domains = {}

    for i, component in enumerate(top_connected_components):
        cluster_name = f'Cluster_{i+1}'
        domain_counter = Counter()  # Use a Counter to track domain frequencies
        for node in component:
            domains_str = subset_df.loc[subset_df['foldseek'] == node, 'Domains'].values[0]
            try:
                domains = eval(domains_str)  # Safely evaluate domain strings
                domain_counter.update(domains)  # Count domain occurrences
            except:
                continue

        # Get the top N most common domains in the cluster
        cluster_domains[cluster_name] = domain_counter.most_common(top_domains)

    # Print and return the top 3 domains for each cluster
    for cluster, top_domains in cluster_domains.items():
        print(f"Top {len(top_domains)} domains in {cluster}: {top_domains}")

    return cluster_domains

In [None]:
enzymes = ['Amidase', 'DD-carboxypeptidase', 'DD-endopeptidase', 'DL-endopeptidase', 
           'Glucosaminidase', 'LD-carboxypeptidase', 'Muramidase']

path = f'/Volumes/PGH-Backup/domains/IPS/clustered/{enzyme}_IPS_results_grouped.tsv'

In [None]:
# Modify this part of the code to pass in global min/max sizes across all enzymes
global_min_size = float('inf')
global_max_size = float('-inf')

# Loop through each enzyme type and calculate global min/max cluster sizes first
for enzyme in enzymes:
    path = f'/Volumes/PGH-Backup/domains/IPS/clustered/{enzyme}_IPS_results_grouped.tsv'
    
    subset_df = pd.read_csv(path, sep='\t')
    cluster_sizes = subset_df['member_count'].values

    global_min_size = min(global_min_size, min(cluster_sizes))
    global_max_size = max(global_max_size, max(cluster_sizes))

# Now, loop again to plot each enzyme network with consistent global node size scaling
for enzyme in enzymes:
    path = f'/Volumes/PGH-Backup/domains/IPS/clustered/{enzyme}_IPS_results_grouped.tsv'
    
    significant_edges = calculate_domain_similarity(path)

    G_adjusted_similarity = plot_network(
        enzyme_type=enzyme,
        subset_df=path,
        significant_edges=significant_edges,
        title=None,
        global_min_size=global_min_size,
        global_max_size=global_max_size
    )
    
    print(f"Clusters for {enzyme}")
    cluster_domains = extract_top_domains(path, G_adjusted_similarity)


---

### Domain Comparison with clustering methods, how different are domain inclusion stats between sequence clustering and structural?

**Question: How do domain percentages change from sequence clustering to foldseek clustering?**
1. merge domain table with cluster maps
2. groupby mmseqs cluster reps
3. clean formatting for domain lists
4. calculate stats on % domain inclusion, do same for foldseek clusters
5. visualizations to compare 

**Step 1: merge domain table w cluster maps**

In [27]:
# grouped_df_clean = pd.read_csv("E:/domains/IPS/clustered/all_IPS_results_grouped.tsv", sep='\t', header=None)

# mac
grouped_df_clean = pd.read_csv("/Volumes/PGH-Backup/domains/IPS/clustered/all_IPS_results_grouped.tsv", sep='\t', header=None)

display(grouped_df_clean)

Unnamed: 0,0,1
0,Amidase_UniRef100_A0A009ES59,['N-acetylmuramoyl-L-alanine amidase']
1,Amidase_UniRef100_A0A009FUX6,['N-acetylmuramoyl-L-alanine amidase']
2,Amidase_UniRef100_A0A009H4S4,['N-acetylmuramoyl-L-alanine amidase']
3,Amidase_UniRef100_A0A009HT94,['N-acetylmuramoyl-L-alanine amidase']
4,Amidase_UniRef100_A0A009L0R9,['N-acetylmuramoyl-L-alanine amidase']
...,...,...
707784,UC118_WP_253005939.1,"['NlpC/P60 family', 'LysM domain', 'LysM domai..."
707785,UC118_WP_255820014.1,"['LysM domain', 'LysM domain', 'LysM domain', ..."
707786,UC118_WP_263296879.1,"['NlpC/P60 family', 'LysM domain', 'LysM domai..."
707787,UC118_WP_263297069.1,"['LysM domain', 'LysM domain', 'LysM domain', ..."


In [28]:
new_grouped_df_clean, enzymes = format_domain_table(grouped_df_clean)

display(new_grouped_df_clean)

Unnamed: 0,FullIdentifier,Domains,Uniref,Enzyme
0,Amidase_UniRef100_A0A009ES59,['N-acetylmuramoyl-L-alanine amidase'],A0A009ES59,Amidase
1,Amidase_UniRef100_A0A009FUX6,['N-acetylmuramoyl-L-alanine amidase'],A0A009FUX6,Amidase
2,Amidase_UniRef100_A0A009H4S4,['N-acetylmuramoyl-L-alanine amidase'],A0A009H4S4,Amidase
3,Amidase_UniRef100_A0A009HT94,['N-acetylmuramoyl-L-alanine amidase'],A0A009HT94,Amidase
4,Amidase_UniRef100_A0A009L0R9,['N-acetylmuramoyl-L-alanine amidase'],A0A009L0R9,Amidase
...,...,...,...,...
707784,UC118_WP_253005939.1,"['NlpC/P60 family', 'LysM domain', 'LysM domai...",253005939.1,UC118
707785,UC118_WP_255820014.1,"['LysM domain', 'LysM domain', 'LysM domain', ...",255820014.1,UC118
707786,UC118_WP_263296879.1,"['NlpC/P60 family', 'LysM domain', 'LysM domai...",263296879.1,UC118
707787,UC118_WP_263297069.1,"['LysM domain', 'LysM domain', 'LysM domain', ...",263297069.1,UC118


In [29]:
new_grouped_df_clean['Domains'] = new_grouped_df_clean['Domains'].apply(
            lambda x: clean_and_convert(x) if isinstance(x, list) else x)

In [30]:
# Find unqiue elements in a list. If they appear multiple times, adjust the string to reflect that

def find_unique_elements(lst):
    """
    Function to find unique elements in a list. If they appear multiple times, adjust the string to reflect that.
    """
    
    element_counts = Counter(lst)
    adjusted_elements = []
    
    for element in lst:
        if element_counts[element] > 1:
            adjusted_elements.append(f"{element} ({element_counts[element]})")
        else:
            adjusted_elements.append(element)

    adjusted_elements = list(set(adjusted_elements))
    
    return adjusted_elements

In [31]:
new_grouped_df_clean['Domains'] = new_grouped_df_clean['Domains'].apply(find_unique_elements)

display(new_grouped_df_clean)

Unnamed: 0,FullIdentifier,Domains,Uniref,Enzyme
0,Amidase_UniRef100_A0A009ES59,"[l (3), m (3), t, ' (2), y (2), L, i (2), ], d...",A0A009ES59,Amidase
1,Amidase_UniRef100_A0A009FUX6,"[l (3), m (3), t, ' (2), y (2), L, i (2), ], d...",A0A009FUX6,Amidase
2,Amidase_UniRef100_A0A009H4S4,"[l (3), m (3), t, ' (2), y (2), L, i (2), ], d...",A0A009H4S4,Amidase
3,Amidase_UniRef100_A0A009HT94,"[l (3), m (3), t, ' (2), y (2), L, i (2), ], d...",A0A009HT94,Amidase
4,Amidase_UniRef100_A0A009L0R9,"[l (3), m (3), t, ' (2), y (2), L, i (2), ], d...",A0A009L0R9,Amidase
...,...,...,...,...
707784,UC118_WP_253005939.1,"[f, L (3), 6, d (3), m (4), ' (8), a (4), M (3...",253005939.1,UC118
707785,UC118_WP_255820014.1,"[f, L (3), 6, d (3), m (4), ' (8), a (4), M (3...",255820014.1,UC118
707786,UC118_WP_263296879.1,"[f, L (3), 6, d (3), m (4), ' (8), a (4), M (3...",263296879.1,UC118
707787,UC118_WP_263297069.1,"[f, L (3), 6, d (3), m (4), ' (8), a (4), M (3...",263297069.1,UC118


In [14]:
#cluster_map = pd.read_csv("E:/clustering/newest_cluster_maps/catted_maps.tsv", sep='\t', index_col=0, header=None)

# mac
cluster_map = pd.read_csv("/Volumes/PGH-Backup/clustering/newest_cluster_maps/catted_maps.tsv", sep='\t', index_col=0, header=None)

clean_cluster_map = format_cluster_map(cluster_map)

display(clean_cluster_map)

Unnamed: 0_level_0,unclustered,mmseqs,foldseek
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,A0A432IFZ8,A0A432IFZ8,A0A534TJ56
1,A0A369Y4I6,A0A432IFZ8,A0A534TJ56
2,E3HA38,A0A432IFZ8,A0A534TJ56
3,UPI0004050AE8,A0A369PUK8,A0A455SSC1
4,A0A924USN5,A0A369PUK8,A0A455SSC1
...,...,...,...
331605,R5VGU4,R5VGU4,R5VGU4
331606,T0DIM4,T0DIM4,A0A5N7MBX0
331607,UPI000EA3697F,T0DIM4,A0A5N7MBX0
331608,A0A2S3QPZ4,T0DIM4,A0A5N7MBX0


In [15]:
# Drop foldseek column
clean_cluster_map.drop(columns=['foldseek'], inplace=True)

In [16]:
print(enzymes)

['Amidase' 'DD-carboxypeptidase' 'DD-endopeptidase' 'DL-endopeptidase'
 'Diadenylate' 'Glucosaminidase' 'LD-carboxypeptidase' 'LD-endopeptidase'
 'Muramidase' 'SagA' 'UC118']


**Steps 2 & 3: group by mmseqs & format domain info**

In [None]:
for enzyme in enzymes:
    grouped_df_clean_subset = new_grouped_df_clean[new_grouped_df_clean['Enzyme'] == enzyme]
    merged_df = pd.merge(grouped_df_clean_subset, cluster_map, left_on='Uniref', right_on='unclustered', how='left')
    merged_df = merged_df.dropna(subset=['mmseqs'])

    if merged_df.shape[0] > 0:
        grouped_merged_domain_cluster = merged_df.groupby('mmseqs').agg({
                'Domains': list,    # Aggregate Domains into a list
                'Uniref': list,     # Aggregate Uniref into a list
                'Enzyme': set       # Aggregate Enzyme into a set (to remove duplicates)
            }).reset_index()
        
        grouped_merged_domain_cluster['member_count'] = grouped_merged_domain_cluster['Uniref'].apply(len)

        grouped_merged_domain_cluster['Domains'] = grouped_merged_domain_cluster['Domains'].apply(
            lambda x: clean_and_convert(x) if isinstance(x, list) else x)
        
        print(grouped_merged_domain_cluster.head())

        grouped_merged_domain_cluster.to_csv(f"/Volumes/PGH-Backup/domains/IPS/{enzyme}_IPS_results_grouped_mmseqs.tsv", sep='\t', index=False)

**Step 4: Calculate stats for domain inclusion for mmseqs clusters compared to foldseek**

In [4]:
test_view = pd.read_csv("/Volumes/PGH-Backup/domains/IPS/mmseqs_groups/Amidase_IPS_results_grouped_mmseqs.tsv", sep='\t')

display(test_view)

Unnamed: 0,mmseqs,Domains,Uniref,Enzyme,member_count
0,A0A010NMG8,"['ell wall binding domain 2 (CWB2)', 'ell wall...","['A0A010NMG8', 'A0A010PUG6', 'A0A233V2K4', 'A0...",{'Amidase'},17
1,A0A010PSL2,"['Cysteine-rich secretory protein family', 'el...","['A0A010PSL2', 'A0A233VWJ9', 'A0A6I2SEB3', 'B0...",{'Amidase'},4
2,A0A010YT92,"['N-acetylmuramoyl-L-alanine amidase', 'N-acet...","['A0A010YT92', 'UPI000240E03A', 'UPI0004BCB526...",{'Amidase'},4
3,A0A010ZI67,"['N-acetylmuramoyl-L-alanine amidase', 'N-acet...","['A0A010ZI67', 'A0A7X7BLJ5', 'A0A7X9T7G7', 'A0...",{'Amidase'},7
4,A0A011Q2X5,"['N-acetylmuramoyl-L-alanine amidase', 'N-acet...","['A0A011Q2X5', 'A0A963P6Q4', 'W7WK78']",{'Amidase'},3
...,...,...,...,...,...
23115,X8ECC6,['N-acetylmuramoyl-L-alanine amidase'],['X8ECC6'],{'Amidase'},1
23116,X8HN28,"['Choline-binding repeat', 'Choline-binding re...","['UPI0004B1C17B', 'X8HN28']",{'Amidase'},2
23117,X8ISN4,"['N-acetylmuramoyl-L-alanine amidase', 'N-acet...","['A0A7Y8VSE7', 'X8ISN4']",{'Amidase'},2
23118,Z4WVM5,"['N-acetylmuramoyl-L-alanine amidase', 'N-acet...","['A0A069ZJT9', 'A0A076ILX0', 'A0A0A2DYZ4', 'A0...",{'Amidase'},207


In [5]:
test_view['Domains'] = test_view['Domains'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Now proceed with your logic
for idx, row in test_view.iterrows():
    unique_domains = set(row['Domains'])  # Get unique domains from each row

    # For each unique domain in the row
    for domain in unique_domains:
        if domain not in test_view.columns:  # If the domain is not already a column
            test_view[domain] = 0.0  # Initialize the column with 0

        # Fill the column with the proportion of the domain in the current row
        test_view.at[idx, domain] = row['Domains'].count(domain) / row['member_count']

# Display the updated dataframe
display(test_view)

  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[domain] = 0.0  # Initialize the column with 0
  test_view[do

Unnamed: 0,mmseqs,Domains,Uniref,Enzyme,member_count,Family of unknown function (DUF5633),ell wall binding domain 2 (CWB2),Cysteine-rich secretory protein family,Mannosyl-glycoprotein endo-beta-N-acetylglucosaminidase,N-acetylmuramoyl-L-alanine amidase,...,Domain of unknown function (DUF4280),Domain of unknown function (DUF4347),"5'-nucleotidase, C-terminal domain",GTP-binding GTPase Middle Region,Family of unknown function (DUF6541),Glycosyl hydrolase family 46,Phage lysozyme,Terminase RNaseH-like domain,Domain of unknown function (DUF4062),NACHT domain
0,A0A010NMG8,"[ell wall binding domain 2 (CWB2), ell wall bi...","['A0A010NMG8', 'A0A010PUG6', 'A0A233V2K4', 'A0...",{'Amidase'},17,0.058824,3.0,0.882353,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A0A010PSL2,"[Cysteine-rich secretory protein family, ell w...","['A0A010PSL2', 'A0A233VWJ9', 'A0A6I2SEB3', 'B0...",{'Amidase'},4,0.000000,3.0,1.000000,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A0A010YT92,"[N-acetylmuramoyl-L-alanine amidase, N-acetylm...","['A0A010YT92', 'UPI000240E03A', 'UPI0004BCB526...",{'Amidase'},4,0.000000,0.0,0.000000,0.25,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A0A010ZI67,"[N-acetylmuramoyl-L-alanine amidase, N-acetylm...","['A0A010ZI67', 'A0A7X7BLJ5', 'A0A7X9T7G7', 'A0...",{'Amidase'},7,0.000000,0.0,0.000000,0.00,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A0A011Q2X5,"[N-acetylmuramoyl-L-alanine amidase, N-acetylm...","['A0A011Q2X5', 'A0A963P6Q4', 'W7WK78']",{'Amidase'},3,0.000000,0.0,0.000000,0.00,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23115,X8ECC6,[N-acetylmuramoyl-L-alanine amidase],['X8ECC6'],{'Amidase'},1,0.000000,0.0,0.000000,0.00,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23116,X8HN28,"[Choline-binding repeat, Choline-binding repea...","['UPI0004B1C17B', 'X8HN28']",{'Amidase'},2,0.000000,0.0,0.000000,1.00,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23117,X8ISN4,"[N-acetylmuramoyl-L-alanine amidase, N-acetylm...","['A0A7Y8VSE7', 'X8ISN4']",{'Amidase'},2,0.000000,0.0,0.000000,0.00,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23118,Z4WVM5,"[N-acetylmuramoyl-L-alanine amidase, N-acetylm...","['A0A069ZJT9', 'A0A076ILX0', 'A0A0A2DYZ4', 'A0...",{'Amidase'},207,0.000000,0.0,0.000000,0.00,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# What is the average frequency of domain inclusion in each cluster?
