In [2]:
import json
import pandas as pd
#display all columns




In [None]:
import json
import pandas as pd
pd.set_option('display.max_columns', None)
def test_output(output_df: pd.DataFrame):
    """
    Process and validate model outputs against edge clusters.
    
    This function performs several validation steps:
    1. Merges output data with cluster data
    2. Parses and validates the model outputs
    3. Checks if outputs match input edges
    4. Validates edge type consistency
    5. Verifies representative edge presence
    
    Args:
        output_df (pd.DataFrame): DataFrame containing model outputs with 'embedding' column
        
    Returns:
        tuple: (processed_df, rerun_df)
            - processed_df: DataFrame with validated results
            - rerun_df: DataFrame containing entries that need to be reprocessed
    """
    # Rename column for clarity
    output_df.rename(columns={'embedding': 'output_string'}, inplace=True)

    # Load cluster data
    cluster_df = pd.read_parquet('path/to/parquet')  # edge_clusters_6_max_clust_30.parquet
    cluster_df = cluster_df[cluster_df.cluster_size > 1]
    
    print(len(cluster_df))

    # Create input string format: "index: list_of_edges"
    cluster_df["input"] = cluster_df.apply(lambda row: f"{row.name}: {row['ids_in_cluster']}", axis=1)
    cluster_df["id"] = cluster_df["cluster_id"].astype(str)
    
    # Merge output with cluster data
    merged_df = pd.merge(output_df, cluster_df[["id", "ids_in_cluster"]], on='id', how='inner')

    def load_dict(dict_string):
        """
        Safely parse a dictionary string to Python object.
        
        Args:
            dict_string (str): String representation of a dictionary
            
        Returns:
            dict/list: Parsed object or empty list if parsing fails
        """
        try:
            return json.loads(dict_string)
        except Exception:
            return []

    # Parse model outputs
    merged_df["finetuned_output"] = merged_df["output_string"].apply(load_dict)
    merged_df["finetuned_output_parsed"] = merged_df["finetuned_output"].apply(lambda x: isinstance(x, dict))
    
    print(merged_df["finetuned_output_parsed"].value_counts())

    # Separate failed parses for reprocessing
    rerun_df = merged_df[merged_df["finetuned_output_parsed"] == False]
    merged_df = merged_df[merged_df["finetuned_output_parsed"] == True]

    def convert_to_list(row):
        """
        Convert dictionary values to Python list.
        
        Args:
            row (dict): Dictionary containing model output
            
        Returns:
            list: Converted list or None if conversion fails
        """
        try:
            return eval(list(row.values())[0])
        except:
            return None

    # Process outputs into lists
    merged_df["finetuned_list"] = merged_df["finetuned_output"].apply(convert_to_list)
    merged_df["finetuned_list"] = merged_df["finetuned_list"].apply(lambda x: x[0] if isinstance(x, tuple) else x)

    # Add failed conversions to rerun dataset
    rerun_df = pd.concat([rerun_df.copy(), merged_df[merged_df["finetuned_list"].isnull()]].copy())
    merged_df = merged_df[merged_df["finetuned_list"].notnull()]

    print("length of merged_df:", len(merged_df))
    print("length of rerun:", len(rerun_df))

    # Flatten nested lists
    merged_df["flattened_list"] = merged_df["finetuned_list"].apply(lambda x: [item for sublist in x for item in sublist])

    def flatten_list_to_strings(item):
        """
        Recursively flatten nested lists/tuples into a list of strings.
        
        Args:
            item: Nested structure of lists/tuples
            
        Returns:
            list: Flattened list of strings
        """
        if isinstance(item, (list, tuple)):
            return [str_item for sublist in item for str_item in flatten_list_to_strings(sublist)]
        else:
            return [str(item)]

    merged_df["flattened_list"] = merged_df["flattened_list"].apply(flatten_list_to_strings)

    # Validate flattened list structure
    assert merged_df["flattened_list"].apply(lambda x: isinstance(x, list)).all()
    assert merged_df["flattened_list"].apply(lambda x: all(isinstance(item, str) for item in x)).all()

    def check_if_outputs_matches_input(row):
        """
        Verify if all items in flattened_list exist in the input cluster.
        
        Args:
            row (pd.Series): DataFrame row containing flattened_list and ids_in_cluster
            
        Returns:
            bool: True if all outputs match inputs, False otherwise
        """
        flattened_list = row["flattened_list"]
        checks = []
        
        for item in flattened_list:
            try:
                if item.strip("*: ") in row["ids_in_cluster"]:
                    checks.append(True)
                else:
                    checks.append(False)
            except:
                print("error")
                print(item)
                print("flattened_list:", flattened_list)
                print("output")
                
                checks.append(False)
        if False in checks:
            return False
        else:
            return True

    merged_df["output_matches_input"] = merged_df.apply(check_if_outputs_matches_input, axis=1)

    def check_if_representative_is_in_finetuned_list(row):
        """
        Check if each group in finetuned_list has a representative edge (marked with **).
        
        Args:
            row (pd.Series): DataFrame row containing finetuned_list
            
        Returns:
            bool: True if all groups have a representative, False otherwise
        """
        list_of_checks = []
        for item in row["finetuned_list"]:
            group_has_representative = False
            for entry in item:
                if "**" in entry:
                    group_has_representative = True
        
            list_of_checks.append(group_has_representative)
        if False in list_of_checks:
            return False
        else:
            return True

    def check_that_all_edges_have_same_types(row):
        """
        Verify that all edges in each group have consistent source and target types.
        
        Args:
            row (pd.Series): DataFrame row containing finetuned_list
            
        Returns:
            bool: True if all edges in each group have consistent types, False otherwise
        """
        for item in row["finetuned_list"]:
            source_types = set()
            target_types = set()
            for edge in item:
                # Handle case where edge might be a list or string
                if isinstance(edge, list):
                    # If it's a list, skip it or handle it appropriately
                    continue
                
                edge_str = edge.strip("*: ")
                try:
                    source_type = edge_str.split("]")[0].split("[")[1]
                    target_type = edge_str.split("[")[2].split("]")[0]
                    source_types.add(source_type)
                    target_types.add(target_type)
                except:
                    print(f"Error processing edge: {edge}")
                    continue
                    
            if len(source_types) > 1 or len(target_types) > 1:
                return False
        return True

    # Add validation columns
    merged_df["finetuned_representative_in_list"] = merged_df.apply(check_if_representative_is_in_finetuned_list, axis=1)
    merged_df["all_edges_have_same_types"] = merged_df.apply(check_that_all_edges_have_same_types, axis=1)
    
    return merged_df, rerun_df

# Load and process the output data
output_df = pd.read_parquet('path/to/parquet')  # v8_full_output_20250331_141507.parquet
merged_df, rerun_df = test_output(output_df)

In [4]:

merged_df

Unnamed: 0,id,output_string,ids_in_cluster,finetuned_output,finetuned_output_parsed,finetuned_list,flattened_list,output_matches_input,finetuned_representative_in_list,all_edges_have_same_types
0,0,"{""0"":""[[\""**[organ] correlated with [metabolit...","[[organ] correlated with [metabolic pathway], ...","{'0': '[[""**[organ] correlated with [metabolit...",True,"[[**[organ] correlated with [metabolite]**, [o...","[**[organ] correlated with [metabolite]**, [or...",True,True,True
1,1,"{""1"":""[]""}","[[organ] mediates [metabolite], [organ] suppor...",{'1': '[]'},True,[],[],True,True,True
2,2,"{""2"":""[[\""**[organ] leads to [metabolite]**\"",...","[[metabolite] self-organizes along [organ], [o...","{'2': '[[""**[organ] leads to [metabolite]**"", ...",True,"[[**[organ] leads to [metabolite]**, [organ] l...","[**[organ] leads to [metabolite]**, [organ] le...",True,True,True
3,3,"{""3"":""[[\""**[organ] involves [metabolite]**\"",...","[[organ] involves [metabolite], [organ] affect...","{'3': '[[""**[organ] involves [metabolite]**"", ...",True,"[[**[organ] involves [metabolite]**, [organ] i...","[**[organ] involves [metabolite]**, [organ] in...",True,True,True
4,4,"{""4"":""[[\""**[organ] related to [metabolite]**\...","[[organ] related to [metabolite], [organ] rela...","{'4': '[[""**[organ] related to [metabolite]**""...",True,"[[**[organ] related to [metabolite]**, [organ]...","[**[organ] related to [metabolite]**, [organ] ...",True,True,True
...,...,...,...,...,...,...,...,...,...,...
98555,116971,"{""116971"":""[[\""**[gene] are classified into [p...","[[gene] are classified into [protein domain], ...","{'116971': '[[""**[gene] are classified into [p...",True,[[**[gene] are classified into [protein domain...,[**[gene] are classified into [protein domain]...,False,True,True
98556,116972,"{""116972"":""[[\""**[gene] differ in [protein dom...","[[gene] features [protein domain], [gene] diff...","{'116972': '[[""**[gene] differ in [protein dom...",True,"[[**[gene] differ in [protein domain]**, [gene...","[**[gene] differ in [protein domain]**, [gene]...",True,True,True
98557,116973,"{""116973"":""[[\""**[gene] evolved to [protein do...","[[gene] belonged to [protein domain], [gene] e...","{'116973': '[[""**[gene] evolved to [protein do...",True,"[[**[gene] evolved to [protein domain]**, [gen...","[**[gene] evolved to [protein domain]**, [gene...",True,True,True
98558,116974,"{""116974"":""[]""}","[[gene] predict [protein domain], [gene] ident...",{'116974': '[]'},True,[],[],True,True,True


In [4]:
rerun_df

Unnamed: 0,id,finetuned_output,ids_in_cluster,finetuned_output_parsed,finetuned_list
2464,2937,[],[[phenotype] is the limiting step in the forma...,False,
4656,5527,[],"[[gene] consist mainly of [metabolite], [gene]...",False,
5626,6664,[],"[[gene] as intermediates in [metabolite], [gen...",False,
6459,7659,[],"[[gene] were down-regulated in [metabolite], [...",False,
6832,8102,[],"[[enzyme] catalyzes [metabolite], [enzyme] cat...",False,
...,...,...,...,...,...
97074,115198,"{'115198': '[[""**[protein] dimerize with [gene...","[[protein] dimerize with [gene], [protein] dis...",True,
97349,115524,"{'115524': '[[""**[gene] couples to [protein]**...","[[gene] couples to [protein], [gene] are partn...",True,
97967,116274,"{'116274': '[[""**[gene] inserts [protein compl...","[[gene] inserts [protein complex], [gene] is i...",True,
98008,116326,"{'116326': '[[""**[gene] identifies [protein co...","[[gene] identifies [protein complex], [gene] l...",True,


In [5]:
merged_df["all_edges_have_same_types"].value_counts()


all_edges_have_same_types
True     91100
False     7020
Name: count, dtype: int64

In [None]:
wrong_edges_df = merged_df[merged_df["all_edges_have_same_types"] == False]
wrong_edges_df



Unnamed: 0,id,output_string,ids_in_cluster,finetuned_output,finetuned_output_parsed,finetuned_list,flattened_list,output_matches_input,finetuned_representative_in_list,all_edges_have_same_types
119,134,"{""134"":""[[\""**[organ] are strong sinks of [met...","[[organ] are strong sinks of [metabolite], [or...","{'134': '[[""**[organ] are strong sinks of [met...",True,[[**[organ] are strong sinks of [metabolite]**...,"[**[organ] are strong sinks of [metabolite]**,...",True,True,False
146,163,"{""163"":""[[\""**[metabolite] fails to accumulate...",[[metabolite] failed to reduce the growth of [...,"{'163': '[[""**[metabolite] fails to accumulate...",True,[[**[metabolite] fails to accumulate in [organ...,[**[metabolite] fails to accumulate in [organ]...,True,True,False
151,168,"{""168"":""[[\""**[organism] unable to accumulate ...","[[organism] unable to accumulate [metabolite],...","{'168': '[[""**[organism] unable to accumulate ...",True,[[**[organism] unable to accumulate [metabolit...,[**[organism] unable to accumulate [metabolite...,True,True,False
154,171,"{""171"":""[[\""**[organ] cannot provide sufficien...",[[metabolite] is not sufficient to attract [or...,"{'171': '[[""**[organ] cannot provide sufficien...",True,[[**[organ] cannot provide sufficient [metabol...,[**[organ] cannot provide sufficient [metaboli...,True,True,False
186,208,"{""208"":""[[\""**[metabolite] did not change in [...","[[metabolite] did not change in [organism], [o...","{'208': '[[""**[metabolite] did not change in [...",True,[[**[metabolite] did not change in [organism]*...,[**[metabolite] did not change in [organism]**...,True,True,False
...,...,...,...,...,...,...,...,...,...,...
98461,116850,"{""116850"":""[[\""**[protein domain] decreased in...",[[protein domain] decreased interaction intens...,"{'116850': '[[""**[protein domain] decreased in...",True,[[**[protein domain] decreased interaction int...,[**[protein domain] decreased interaction inte...,True,True,False
98465,116854,"{""116854"":""[[\""**[protein domain] interact wit...","[[protein domain] interact with [gene], [gene]...","{'116854': '[[""**[protein domain] interact wit...",True,"[[**[protein domain] interact with [gene]**, [...","[**[protein domain] interact with [gene]**, [g...",True,True,False
98496,116892,"{""116892"":""[[\""**[protein domain] characterize...","[[protein domain] classified into [gene], [pro...","{'116892': '[[""**[protein domain] characterize...",True,"[[**[protein domain] characterizes [gene]**, [...","[**[protein domain] characterizes [gene]**, [p...",True,True,False
98512,116918,"{""116918"":""[[\""**[gene] has a [protein domain]...","[[gene] has five [protein domain], [gene] has ...","{'116918': '[[""**[gene] has a [protein domain]...",True,"[[**[gene] has a [protein domain]**, [gene] ha...","[**[gene] has a [protein domain]**, [gene] has...",True,True,False


In [11]:
rerun_df = pd.concat([merged_df[merged_df["finetuned_representative_in_list"] == False], rerun_df]) # add rows where the representative is not in the finetuned list
merged_df = merged_df[merged_df["finetuned_representative_in_list"] == True] # remove rows where the representative is not in the finetuned list
merged_df.loc[:, ["id", "output_string", "ids_in_cluster"]].to_parquet('path/to/parquet') # output to parquet

In [37]:
def extract_edge_mapping(row):
    """Performed on dataframe with columns:
    - finetuned_list: list of lists of edges
    - ids_in_cluster: list of edges
    """
    edge_mapping = {}

    
    
    #check that the finetuned_list is flattened
    for item in row["finetuned_list"]:
        # Skip if item is not properly formatted
        if not item:
            continue
            
        # Find representative edge (the one containing **)
        representative = None
        for edge in item:
            if isinstance(edge, list):
                
                print(edge)
                print(item)
                print(row["finetuned_list"])
                print("next")
                continue
            if "**" in edge:
                representative = edge.strip("*: ")
                break
                
        if not representative:
            continue
            
        # Map all edges in the group to the representative
        for edge in item:
            if isinstance(edge, list):
                continue
            edge_clean = edge.strip("*: ")
            edge_mapping[edge_clean] = representative


    return edge_mapping

merged_df["edge_mapping"] = merged_df.apply(extract_edge_mapping, axis=1)





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df["edge_mapping"] = merged_df.apply(extract_edge_mapping, axis=1)


In [38]:
#combine the edge mapping into a single dictionary
edge_mapping = {}
for index, row in merged_df.iterrows():
    edge_mapping.update(row["edge_mapping"])


len(edge_mapping)

515207

In [39]:
kg_df = pd.read_parquet('path/to/kg.parquet') # nodes_remapped_kg.parquet
initial_length = len(kg_df)

In [40]:
def map_edge_to_kg(row):
    source_type = row["mapped_source_type"]
    target_type = row["mapped_target_type"]
    edge = row["interaction"]
    
    input_edge = f"[{source_type}] {edge.strip("*: ")} [{target_type}]"

    try:
        mapped_edge_type = edge_mapping[input_edge]
    except:
        mapped_edge_type = input_edge

    try:
        map_source_type = mapped_edge_type.split("]")[0][1:].strip(" ")
        map_target_type = mapped_edge_type.split("[")[2][:-1].strip(" ")
        map_edge = mapped_edge_type.split("]")[1].split("[")[0].strip(" ")
    except IndexError:
        return edge
    
    
    if map_source_type.strip(" ") == source_type and map_target_type.strip(" ") == target_type:
        return map_edge
    else:
        return edge
    
    


kg_df["mapped_edge"] = kg_df.apply(map_edge_to_kg, axis=1)



In [41]:
(kg_df["mapped_edge"] != kg_df["interaction"]).value_counts()


False    3475279
True     1343960
Name: count, dtype: int64

In [49]:
print(kg_df["interaction"].nunique())
print(kg_df["mapped_edge"].nunique())
print(kg_df["interaction"].nunique() - kg_df["mapped_edge"].nunique())





461199
352138
109061


In [43]:
final_length = len(kg_df)
assert initial_length == final_length

In [44]:
kg_df.to_parquet("path/to/parquet") # 20250331_nodes_and_edges_remapped_kg.parquet