In [2]:
import os
import json
import time
from datetime import datetime

import pandas as pd
#display all columns
pd.set_option('display.max_columns', None)
import numpy as np

import tiktoken
import gzip
from openai import OpenAI


In [3]:
def load_output_jsonl(file_path):
    # Read the JSONL file and parse each line as JSON
    print(file_path)
    with open(file_path, 'rt') as f:  
        data = []
        for line in f:

                json_obj = json.loads(line)
                embedding = json_obj['response']['body']['choices'][0]['message']['content']
                data.append({
                    'id': json_obj['custom_id'],
                    'embedding': embedding
                })

    # Convert to DataFrame
    df = pd.DataFrame(data)
    return df


def test_output(output_df: pd.DataFrame):
    output_df.rename(columns={'embedding': 'output_string'}, inplace=True)

    cluster_df = pd.read_parquet('/home/mads/connectome/data/embeddings/edge_embeddings/clustering/edge_clusters_6_max_clust_30.parquet')
    cluster_df = cluster_df[cluster_df.cluster_size > 1]
    
    print(len(cluster_df))

    #create the input string. the string should be "index: the list of edges as a string"
    cluster_df["input"] = cluster_df.apply(lambda row: f"{row.name}: {row['ids_in_cluster']}", axis=1)

    cluster_df["id"] = cluster_df["cluster_id"].astype(str)
    merged_df = pd.merge(output_df, cluster_df[["id", "ids_in_cluster"]], on='id', how='inner')


    # evaluate the dictionary strings in o3_output and finetuned_output
    def load_dict(dict_string):
        try:
            return json.loads(dict_string)
        except Exception as e:
            #print(f"Failed to parse {dict_string}")
            #print(e)
            return []


    merged_df["finetuned_output"] = merged_df["output_string"].apply(load_dict)

    merged_df["finetuned_output_parsed"] = merged_df["finetuned_output"].apply(lambda x: isinstance(x, dict))

    
    print(merged_df["finetuned_output_parsed"].value_counts())

    rerun_df = merged_df[merged_df["finetuned_output_parsed"] == False]

    merged_df = merged_df[merged_df["finetuned_output_parsed"] == True]





    def convert_to_list(row):
        try:
            return eval(list(row.values())[0])
        except:
            #print(row)
            return None

    merged_df["finetuned_list"] = merged_df["finetuned_output"].apply(convert_to_list)
    #if "finetuned_list" is a tuple, take the first element
    merged_df["finetuned_list"] = merged_df["finetuned_list"].apply(lambda x: x[0] if isinstance(x, tuple) else x)

    #add merged_df["finetuned_list"].isnull() to needs_rerun and remove rows from merged_df where finetuned_list is None
    rerun_df = pd.concat([rerun_df.copy(), merged_df[merged_df["finetuned_list"].isnull()]].copy())
    merged_df = merged_df[merged_df["finetuned_list"].notnull()]

    print("length of mergeed_df:",  len(merged_df))
    print("length of rerun:",len(rerun_df))
    #flatten the list



    merged_df["flattened_list"] = merged_df["finetuned_list"].apply(lambda x: [item for sublist in x for item in sublist])

    def flatten_list_to_strings(item):
        """
        Recursively flatten a nested structure (lists/tuples) into a single list of strings.
        Non-string items will be converted to strings.
        
        Args:
            item: Any Python object that might contain nested lists/tuples
            
        Returns:
            list: A flattened list containing only strings
        """
        if isinstance(item, (list, tuple)):
            return [
                str_item 
                for sublist in item 
                for str_item in flatten_list_to_strings(sublist)
            ]
        else:
            return [str(item)]

    # You can then replace your assertions with:
    merged_df["flattened_list"] = merged_df["flattened_list"].apply(flatten_list_to_strings)



    #check that the flattened list is flattened, by checking that all entries are strings
    assert merged_df["flattened_list"].apply(lambda x: isinstance(x, list)).all()
    assert merged_df["flattened_list"].apply(lambda x: all(isinstance(item, str) for item in x)).all()

    def check_if_outputs_matches_input(row):
        flattened_list = row["flattened_list"]
        checks = []
        
        for item in flattened_list:
            try:
                if item.strip("*: ") in row["ids_in_cluster"]:
                    checks.append(True)
                else:
                    checks.append(False)
            except:
                print("error")
                print(item)
                print("flattened_list:", flattened_list)
                print("output")
                
                checks.append(False)
        if False in checks:
            return False
        else:
            return True

    merged_df["output_matches_input"] = merged_df.apply(check_if_outputs_matches_input, axis=1)





    def check_if_representative_is_in_finetuned_list(row):
        list_of_checks = []
        for item in row["finetuned_list"]:
            group_has_representative = False
            for entry in item:
                if "**" in entry:
                    group_has_representative = True
        
            list_of_checks.append(group_has_representative)
        if False in list_of_checks:
            return False
        else:
            return True
        

    def check_that_all_edges_have_same_types(row):
        
        for item in row["finetuned_list"]:
            #print(item)
            source_types = set()
            target_types = set()
            for edge in item:
                edge = edge.strip("*: ")
                try:
                    source_type = edge.split("]")[0].split("[")[1]
                    target_type = edge.split("[")[2].split("]")[0]
                    source_types.add(source_type)
                    target_types.add(target_type)
                except:
                    print(edge)
            if len(source_types) > 1 or len(target_types) > 1:

                return False
        return True


    def check_that_all_edges_have_same_types(row):
        for item in row["finetuned_list"]:
            source_types = set()
            target_types = set()
            for edge in item:
                # Handle case where edge might be a list or string
                if isinstance(edge, list):
                    # If it's a list, skip it or handle it appropriately
                    continue
                
                edge_str = edge.strip("*: ")
                try:
                    source_type = edge_str.split("]")[0].split("[")[1]
                    target_type = edge_str.split("[")[2].split("]")[0]
                    source_types.add(source_type)
                    target_types.add(target_type)
                except:
                    print(f"Error processing edge: {edge}")
                    continue
                    
            if len(source_types) > 1 or len(target_types) > 1:
                return False
        return True



    merged_df["finetuned_representative_in_list"] = merged_df.apply(check_if_representative_is_in_finetuned_list, axis=1)
    merged_df["all_edges_have_same_types"] = merged_df.apply(check_that_all_edges_have_same_types, axis=1)
    return merged_df, rerun_df




output_df = pd.read_parquet('/home/mads/connectome/data/predictions/v8_full_output_20250331_141507.parquet')

merged_df, rerun_df = test_output(output_df)




98560
finetuned_output_parsed
True     98491
False       69
Name: count, dtype: int64
length of mergeed_df: 98120
length of rerun: 440
Error processing edge: [
Error processing edge: [
Error processing edge: m
Error processing edge: e
Error processing edge: t
Error processing edge: a
Error processing edge: b
Error processing edge: o
Error processing edge: l
Error processing edge: i
Error processing edge: t
Error processing edge: e
Error processing edge: ]
Error processing edge:  
Error processing edge: s
Error processing edge: p
Error processing edge: e
Error processing edge: e
Error processing edge: d
Error processing edge: s
Error processing edge:  
Error processing edge: u
Error processing edge: p
Error processing edge:  
Error processing edge: [
Error processing edge: p
Error processing edge: h
Error processing edge: e
Error processing edge: n
Error processing edge: o
Error processing edge: t
Error processing edge: y
Error processing edge: p
Error processing edge: e
Error processin

In [4]:

merged_df

Unnamed: 0,id,output_string,ids_in_cluster,finetuned_output,finetuned_output_parsed,finetuned_list,flattened_list,output_matches_input,finetuned_representative_in_list,all_edges_have_same_types
0,0,"{""0"":""[[\""**[organ] correlated with [metabolit...","[[organ] correlated with [metabolic pathway], ...","{'0': '[[""**[organ] correlated with [metabolit...",True,"[[**[organ] correlated with [metabolite]**, [o...","[**[organ] correlated with [metabolite]**, [or...",True,True,True
1,1,"{""1"":""[]""}","[[organ] mediates [metabolite], [organ] suppor...",{'1': '[]'},True,[],[],True,True,True
2,2,"{""2"":""[[\""**[organ] leads to [metabolite]**\"",...","[[metabolite] self-organizes along [organ], [o...","{'2': '[[""**[organ] leads to [metabolite]**"", ...",True,"[[**[organ] leads to [metabolite]**, [organ] l...","[**[organ] leads to [metabolite]**, [organ] le...",True,True,True
3,3,"{""3"":""[[\""**[organ] involves [metabolite]**\"",...","[[organ] involves [metabolite], [organ] affect...","{'3': '[[""**[organ] involves [metabolite]**"", ...",True,"[[**[organ] involves [metabolite]**, [organ] i...","[**[organ] involves [metabolite]**, [organ] in...",True,True,True
4,4,"{""4"":""[[\""**[organ] related to [metabolite]**\...","[[organ] related to [metabolite], [organ] rela...","{'4': '[[""**[organ] related to [metabolite]**""...",True,"[[**[organ] related to [metabolite]**, [organ]...","[**[organ] related to [metabolite]**, [organ] ...",True,True,True
...,...,...,...,...,...,...,...,...,...,...
98555,116971,"{""116971"":""[[\""**[gene] are classified into [p...","[[gene] are classified into [protein domain], ...","{'116971': '[[""**[gene] are classified into [p...",True,[[**[gene] are classified into [protein domain...,[**[gene] are classified into [protein domain]...,False,True,True
98556,116972,"{""116972"":""[[\""**[gene] differ in [protein dom...","[[gene] features [protein domain], [gene] diff...","{'116972': '[[""**[gene] differ in [protein dom...",True,"[[**[gene] differ in [protein domain]**, [gene...","[**[gene] differ in [protein domain]**, [gene]...",True,True,True
98557,116973,"{""116973"":""[[\""**[gene] evolved to [protein do...","[[gene] belonged to [protein domain], [gene] e...","{'116973': '[[""**[gene] evolved to [protein do...",True,"[[**[gene] evolved to [protein domain]**, [gen...","[**[gene] evolved to [protein domain]**, [gene...",True,True,True
98558,116974,"{""116974"":""[]""}","[[gene] predict [protein domain], [gene] ident...",{'116974': '[]'},True,[],[],True,True,True


In [4]:
rerun_df

Unnamed: 0,id,finetuned_output,ids_in_cluster,finetuned_output_parsed,finetuned_list
2464,2937,[],[[phenotype] is the limiting step in the forma...,False,
4656,5527,[],"[[gene] consist mainly of [metabolite], [gene]...",False,
5626,6664,[],"[[gene] as intermediates in [metabolite], [gen...",False,
6459,7659,[],"[[gene] were down-regulated in [metabolite], [...",False,
6832,8102,[],"[[enzyme] catalyzes [metabolite], [enzyme] cat...",False,
...,...,...,...,...,...
97074,115198,"{'115198': '[[""**[protein] dimerize with [gene...","[[protein] dimerize with [gene], [protein] dis...",True,
97349,115524,"{'115524': '[[""**[gene] couples to [protein]**...","[[gene] couples to [protein], [gene] are partn...",True,
97967,116274,"{'116274': '[[""**[gene] inserts [protein compl...","[[gene] inserts [protein complex], [gene] is i...",True,
98008,116326,"{'116326': '[[""**[gene] identifies [protein co...","[[gene] identifies [protein complex], [gene] l...",True,


In [5]:
merged_df["all_edges_have_same_types"].value_counts()


all_edges_have_same_types
True     91100
False     7020
Name: count, dtype: int64

In [None]:
wrong_edges_df = merged_df[merged_df["all_edges_have_same_types"] == False]
wrong_edges_df



Unnamed: 0,id,output_string,ids_in_cluster,finetuned_output,finetuned_output_parsed,finetuned_list,flattened_list,output_matches_input,finetuned_representative_in_list,all_edges_have_same_types
119,134,"{""134"":""[[\""**[organ] are strong sinks of [met...","[[organ] are strong sinks of [metabolite], [or...","{'134': '[[""**[organ] are strong sinks of [met...",True,[[**[organ] are strong sinks of [metabolite]**...,"[**[organ] are strong sinks of [metabolite]**,...",True,True,False
146,163,"{""163"":""[[\""**[metabolite] fails to accumulate...",[[metabolite] failed to reduce the growth of [...,"{'163': '[[""**[metabolite] fails to accumulate...",True,[[**[metabolite] fails to accumulate in [organ...,[**[metabolite] fails to accumulate in [organ]...,True,True,False
151,168,"{""168"":""[[\""**[organism] unable to accumulate ...","[[organism] unable to accumulate [metabolite],...","{'168': '[[""**[organism] unable to accumulate ...",True,[[**[organism] unable to accumulate [metabolit...,[**[organism] unable to accumulate [metabolite...,True,True,False
154,171,"{""171"":""[[\""**[organ] cannot provide sufficien...",[[metabolite] is not sufficient to attract [or...,"{'171': '[[""**[organ] cannot provide sufficien...",True,[[**[organ] cannot provide sufficient [metabol...,[**[organ] cannot provide sufficient [metaboli...,True,True,False
186,208,"{""208"":""[[\""**[metabolite] did not change in [...","[[metabolite] did not change in [organism], [o...","{'208': '[[""**[metabolite] did not change in [...",True,[[**[metabolite] did not change in [organism]*...,[**[metabolite] did not change in [organism]**...,True,True,False
...,...,...,...,...,...,...,...,...,...,...
98461,116850,"{""116850"":""[[\""**[protein domain] decreased in...",[[protein domain] decreased interaction intens...,"{'116850': '[[""**[protein domain] decreased in...",True,[[**[protein domain] decreased interaction int...,[**[protein domain] decreased interaction inte...,True,True,False
98465,116854,"{""116854"":""[[\""**[protein domain] interact wit...","[[protein domain] interact with [gene], [gene]...","{'116854': '[[""**[protein domain] interact wit...",True,"[[**[protein domain] interact with [gene]**, [...","[**[protein domain] interact with [gene]**, [g...",True,True,False
98496,116892,"{""116892"":""[[\""**[protein domain] characterize...","[[protein domain] classified into [gene], [pro...","{'116892': '[[""**[protein domain] characterize...",True,"[[**[protein domain] characterizes [gene]**, [...","[**[protein domain] characterizes [gene]**, [p...",True,True,False
98512,116918,"{""116918"":""[[\""**[gene] has a [protein domain]...","[[gene] has five [protein domain], [gene] has ...","{'116918': '[[""**[gene] has a [protein domain]...",True,"[[**[gene] has a [protein domain]**, [gene] ha...","[**[gene] has a [protein domain]**, [gene] has...",True,True,False


In [7]:
rerun_df

Unnamed: 0,id,output_string,ids_in_cluster,finetuned_output,finetuned_output_parsed,finetuned_list
2464,2937,"{""2937"":""[[\""**[metabolite] is limiting and re...",[[phenotype] is the limiting step in the forma...,[],False,
4656,5527,"{""5527"":""[[\""**[gene] consist of [metabolite]*...","[[gene] consist mainly of [metabolite], [gene]...",[],False,
5626,6664,"{""6664"":""[[\""**[gene] encode enzyme activities...","[[gene] as intermediates in [metabolite], [gen...",[],False,
6459,7659,"{""7659"":""[[\""**[gene] were downregulated by [m...","[[gene] were down-regulated in [metabolite], [...",[],False,
6832,8102,"{""8102"":""[[\""**[enzyme] catalyzes [metabolite]...","[[enzyme] catalyzes [metabolite], [enzyme] cat...",[],False,
...,...,...,...,...,...,...
97074,115198,"{""115198"":""[[\""**[protein] dimerize with [gene...","[[protein] dimerize with [gene], [protein] dis...","{'115198': '[[""**[protein] dimerize with [gene...",True,
97349,115524,"{""115524"":""[[\""**[gene] couples to [protein]**...","[[gene] couples to [protein], [gene] are partn...","{'115524': '[[""**[gene] couples to [protein]**...",True,
97967,116274,"{""116274"":""[[\""**[gene] inserts [protein compl...","[[gene] inserts [protein complex], [gene] is i...","{'116274': '[[""**[gene] inserts [protein compl...",True,
98008,116326,"{""116326"":""[[\""**[gene] identifies [protein co...","[[gene] identifies [protein complex], [gene] l...","{'116326': '[[""**[gene] identifies [protein co...",True,


In [8]:
merged_df

Unnamed: 0,id,output_string,ids_in_cluster,finetuned_output,finetuned_output_parsed,finetuned_list,flattened_list,output_matches_input,finetuned_representative_in_list,all_edges_have_same_types
0,0,"{""0"":""[[\""**[organ] correlated with [metabolit...","[[organ] correlated with [metabolic pathway], ...","{'0': '[[""**[organ] correlated with [metabolit...",True,"[[**[organ] correlated with [metabolite]**, [o...","[**[organ] correlated with [metabolite]**, [or...",True,True,True
1,1,"{""1"":""[]""}","[[organ] mediates [metabolite], [organ] suppor...",{'1': '[]'},True,[],[],True,True,True
2,2,"{""2"":""[[\""**[organ] leads to [metabolite]**\"",...","[[metabolite] self-organizes along [organ], [o...","{'2': '[[""**[organ] leads to [metabolite]**"", ...",True,"[[**[organ] leads to [metabolite]**, [organ] l...","[**[organ] leads to [metabolite]**, [organ] le...",True,True,True
3,3,"{""3"":""[[\""**[organ] involves [metabolite]**\"",...","[[organ] involves [metabolite], [organ] affect...","{'3': '[[""**[organ] involves [metabolite]**"", ...",True,"[[**[organ] involves [metabolite]**, [organ] i...","[**[organ] involves [metabolite]**, [organ] in...",True,True,True
4,4,"{""4"":""[[\""**[organ] related to [metabolite]**\...","[[organ] related to [metabolite], [organ] rela...","{'4': '[[""**[organ] related to [metabolite]**""...",True,"[[**[organ] related to [metabolite]**, [organ]...","[**[organ] related to [metabolite]**, [organ] ...",True,True,True
...,...,...,...,...,...,...,...,...,...,...
98555,116971,"{""116971"":""[[\""**[gene] are classified into [p...","[[gene] are classified into [protein domain], ...","{'116971': '[[""**[gene] are classified into [p...",True,[[**[gene] are classified into [protein domain...,[**[gene] are classified into [protein domain]...,False,True,True
98556,116972,"{""116972"":""[[\""**[gene] differ in [protein dom...","[[gene] features [protein domain], [gene] diff...","{'116972': '[[""**[gene] differ in [protein dom...",True,"[[**[gene] differ in [protein domain]**, [gene...","[**[gene] differ in [protein domain]**, [gene]...",True,True,True
98557,116973,"{""116973"":""[[\""**[gene] evolved to [protein do...","[[gene] belonged to [protein domain], [gene] e...","{'116973': '[[""**[gene] evolved to [protein do...",True,"[[**[gene] evolved to [protein domain]**, [gen...","[**[gene] evolved to [protein domain]**, [gene...",True,True,True
98558,116974,"{""116974"":""[]""}","[[gene] predict [protein domain], [gene] ident...",{'116974': '[]'},True,[],[],True,True,True


In [11]:
rerun_df = pd.concat([merged_df[merged_df["finetuned_representative_in_list"] == False], rerun_df])
merged_df = merged_df[merged_df["finetuned_representative_in_list"] == True]
merged_df.loc[:, ["id", "output_string", "ids_in_cluster"]].to_parquet('/home/mads/connectome/data/predictions/v8_merged_20250407_141507.parquet')

In [35]:
rerun_df

Unnamed: 0,id,finetuned_output,ids_in_cluster,finetuned_output_parsed,finetuned_list,flattened_list,output_matches_input,finetuned_representative_in_list,all_edges_have_same_types
4200,4974,"{'4974': '[[""**[metabolite] hastens [phenotype...","[[metabolite] hastens [phenotype], [metabolite...",True,"[[**[metabolite] hastens [phenotype]**, [metab...","[**[metabolite] hastens [phenotype]**, [metabo...",False,False,True
10442,12403,"{'12403': '[[""**[protein] impairs [metabolite]...",[[protein complex] impairs production of [meta...,True,"[[**[protein] impairs [metabolite]**, [protein...","[**[protein] impairs [metabolite]**, [protein]...",False,False,True
12118,14380,"{'14380': '[[""**[metabolite] allocated to [tis...","[[metabolite] allocated to [tissue], [metaboli...",True,"[[**[metabolite] allocated to [tissue]**, [met...","[**[metabolite] allocated to [tissue]**, [meta...",False,False,True
12347,14652,"{'14652': '[[""**[metabolite] blocks [subcellul...",[[metabolite] blocks [subcellular compartment]...,True,[[**[metabolite] blocks [subcellular compartme...,[**[metabolite] blocks [subcellular compartmen...,False,False,True
12462,14788,"{'14788': '[[""**[metabolite] identifies [subce...",[[metabolite] identifies [subcellular compartm...,True,[[**[metabolite] identifies [subcellular compa...,[**[metabolite] identifies [subcellular compar...,False,False,True
...,...,...,...,...,...,...,...,...,...
97074,115198,"{'115198': '[[""**[protein] dimerize with [gene...","[[protein] dimerize with [gene], [protein] dis...",True,,,,,
97349,115524,"{'115524': '[[""**[gene] couples to [protein]**...","[[gene] couples to [protein], [gene] are partn...",True,,,,,
97967,116274,"{'116274': '[[""**[gene] inserts [protein compl...","[[gene] inserts [protein complex], [gene] is i...",True,,,,,
98008,116326,"{'116326': '[[""**[gene] identifies [protein co...","[[gene] identifies [protein complex], [gene] l...",True,,,,,


In [36]:
merged_df

Unnamed: 0,id,finetuned_output,ids_in_cluster,finetuned_output_parsed,finetuned_list,flattened_list,output_matches_input,finetuned_representative_in_list,all_edges_have_same_types
0,0,"{'0': '[[""**[organ] correlated with [metabolit...","[[organ] correlated with [metabolic pathway], ...",True,"[[**[organ] correlated with [metabolite]**, [o...","[**[organ] correlated with [metabolite]**, [or...",True,True,True
1,1,{'1': '[]'},"[[organ] mediates [metabolite], [organ] suppor...",True,[],[],True,True,True
2,2,"{'2': '[[""**[organ] leads to [metabolite]**"", ...","[[metabolite] self-organizes along [organ], [o...",True,"[[**[organ] leads to [metabolite]**, [organ] l...","[**[organ] leads to [metabolite]**, [organ] le...",True,True,True
3,3,"{'3': '[[""**[organ] involves [metabolite]**"", ...","[[organ] involves [metabolite], [organ] affect...",True,"[[**[organ] involves [metabolite]**, [organ] i...","[**[organ] involves [metabolite]**, [organ] in...",True,True,True
4,4,"{'4': '[[""**[organ] related to [metabolite]**""...","[[organ] related to [metabolite], [organ] rela...",True,"[[**[organ] related to [metabolite]**, [organ]...","[**[organ] related to [metabolite]**, [organ] ...",True,True,True
...,...,...,...,...,...,...,...,...,...
98555,116971,"{'116971': '[[""**[gene] are classified into [p...","[[gene] are classified into [protein domain], ...",True,[[**[gene] are classified into [protein domain...,[**[gene] are classified into [protein domain]...,False,True,True
98556,116972,"{'116972': '[[""**[gene] differ in [protein dom...","[[gene] features [protein domain], [gene] diff...",True,"[[**[gene] differ in [protein domain]**, [gene...","[**[gene] differ in [protein domain]**, [gene]...",True,True,True
98557,116973,"{'116973': '[[""**[gene] evolved to [protein do...","[[gene] belonged to [protein domain], [gene] e...",True,"[[**[gene] evolved to [protein domain]**, [gen...","[**[gene] evolved to [protein domain]**, [gene...",True,True,True
98558,116974,{'116974': '[]'},"[[gene] predict [protein domain], [gene] ident...",True,[],[],True,True,True


In [37]:
def extract_edge_mapping(row):
    """Performed on dataframe with columns:
    - finetuned_list: list of lists of edges
    - ids_in_cluster: list of edges
    """
    edge_mapping = {}

    
    
    #check that the finetuned_list is flattened
    for item in row["finetuned_list"]:
        # Skip if item is not properly formatted
        if not item:
            continue
            
        # Find representative edge (the one containing **)
        representative = None
        for edge in item:
            if isinstance(edge, list):
                
                print(edge)
                print(item)
                print(row["finetuned_list"])
                print("next")
                continue
            if "**" in edge:
                representative = edge.strip("*: ")
                break
                
        if not representative:
            continue
            
        # Map all edges in the group to the representative
        for edge in item:
            if isinstance(edge, list):
                continue
            edge_clean = edge.strip("*: ")
            edge_mapping[edge_clean] = representative


    return edge_mapping

merged_df["edge_mapping"] = merged_df.apply(extract_edge_mapping, axis=1)





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df["edge_mapping"] = merged_df.apply(extract_edge_mapping, axis=1)


In [38]:
#combine the edge mapping into a single dictionary
edge_mapping = {}
for index, row in merged_df.iterrows():
    edge_mapping.update(row["edge_mapping"])


len(edge_mapping)

515207

In [39]:
kg_df = pd.read_parquet('/home/mads/connectome/data/KG/nodes_remapped_kg.parquet')
initial_length = len(kg_df)

In [40]:
def map_edge_to_kg(row):
    source_type = row["mapped_source_type"]
    target_type = row["mapped_target_type"]
    edge = row["interaction"]
    
    input_edge = f"[{source_type}] {edge.strip("*: ")} [{target_type}]"

    try:
        mapped_edge_type = edge_mapping[input_edge]
    except:
        mapped_edge_type = input_edge

    try:
        map_source_type = mapped_edge_type.split("]")[0][1:].strip(" ")
        map_target_type = mapped_edge_type.split("[")[2][:-1].strip(" ")
        map_edge = mapped_edge_type.split("]")[1].split("[")[0].strip(" ")
    except IndexError:
        return edge
    
    
    if map_source_type.strip(" ") == source_type and map_target_type.strip(" ") == target_type:
        return map_edge
    else:
        return edge
    
    


kg_df["mapped_edge"] = kg_df.apply(map_edge_to_kg, axis=1)



In [41]:
(kg_df["mapped_edge"] != kg_df["interaction"]).value_counts()


False    3475279
True     1343960
Name: count, dtype: int64

In [49]:
print(kg_df["interaction"].nunique())
print(kg_df["mapped_edge"].nunique())
print(kg_df["interaction"].nunique() - kg_df["mapped_edge"].nunique())





461199
352138
109061


In [43]:
final_length = len(kg_df)
assert initial_length == final_length

In [44]:
kg_df.to_parquet("/home/mads/connectome/data/KG/20250331_nodes_and_edges_remapped_kg.parquet")

In [45]:
kg_df

Unnamed: 0,source,source type,interaction,target,target type,gene,pubmedID,p_source,species,basis,source_extracted_definition,source_generated_definition,target_extracted_definition,target_generated_definition,source type disamb,target type disamb,mapped_source,mapped_target,mapped_source_type,mapped_target_type,mapped_edge
0,SUCROSE TRANSPORTER 5,gene,supplies,A. thaliana embryos,organism,BIO2,23031218,23031218_abstract,Arabidopsis thaliana,Uptake measurements of radiolabelled biotin in...,A gene encoding a transporter that facilitates...,,Embryos from the plant species Arabidopsis tha...,,gene,organism,SUCROSE TRANSPORTER 5,A. thaliana embryo,gene,organism,supplies
1,SUCROSE TRANSPORTER 5,gene,affect,Triacylglycerol accumulation,phenotype,BIO2,23031218,23031218_abstract,Arabidopsis thaliana,TAG content analysis in suc5 mutants,A gene encoding a transporter that facilitates...,,The buildup of triacylglycerols in plant tissu...,,gene,phenotype,SUCROSE TRANSPORTER 5,Triacylglycerol accumulation,gene,phenotype,affect
2,SUC5 protein(s),gene,represents,sucrose/H+ symporter,protein,BIO2,23031218,23031218_abstract,Arabidopsis thaliana,Functional analysis of SUC5 protein,"Proteins encoded by the SUC5 gene, responsible...",,A protein that facilitates the co-transport of...,,gene,protein,SUC5 gene(s),sucrose/H+ symporter,gene,protein,represents
3,SUC5,gene,transports,biotin,metabolite,BIO2,23031218,23031218_abstract,Arabidopsis thaliana,Uptake measurements of radiolabelled biotin in...,A gene encoding a sucrose transporter that als...,,A water-soluble vitamin (Vitamin B7) essential...,,gene,metabolite,SUC5,biotin,gene,metabolite,transport
4,SUC5,gene,localizes to,plasma membrane,subcellular compartment,BIO2,23031218,23031218_abstract,Arabidopsis thaliana,Subcellular localization studies of SUC5 protein,A gene encoding a sucrose transporter that als...,,The biological membrane that separates the int...,,gene,subcellular compartment,SUC5,plasmamembrane,gene,subcellular compartment,localizes to
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4819234,Transgenic poplar lines,mutant,had greater levels of resistance to,NaCl,metabolite,STO1,24929937,24929937_abstract,Populus tremula × P. alba,Greenhouse trials showing greater resistance t...,Poplar plants genetically modified to express ...,,"Sodium chloride, commonly used to induce salt ...",,mutant,metabolite,Transgenic poplar lines,NaCl solution,mutant,metabolite,more resistant to
4819235,Transgenic poplar lines,mutant,had greater levels of resistance than,wt,organism,STO1,24929937,24929937_abstract,Populus tremula × P. alba,Greenhouse trials showing greater resistance t...,Poplar plants genetically modified to express ...,,"Wild-type, the standard or non-modified versio...",,mutant,organism,Transgenic poplar lines,WT (wildtype),mutant,organism,was more resistant than
4819236,RT-PCR,treatment,indicated variation in,relative abundance of STO1 transcript,gene,STO1,24929937,24929937_abstract,Populus tremula × P. alba,RT-PCR analysis of STO1 transcript levels in t...,Reverse transcription polymerase chain reactio...,,The level of STO1 gene expression measured in ...,,treatment,gene,RT-PCR,relative abundance of STO1 transcript,treatment,gene,indicated variation in
4819237,variation in relative abundance of STO1 transc...,gene,coincided with,tolerance to salt,phenotype,STO1,24929937,24929937_abstract,Populus tremula × P. alba,RT-PCR analysis of STO1 transcript levels in t...,Differences in the expression levels of the ST...,,The ability of plants to survive and grow in s...,,gene,phenotype,variation in relative abundance of STO1 transc...,tolerance to salt,gene,phenotype,coincided with


In [57]:
# remove duplicates of mapped_edge, mapped target type and mapped source type
kg_removed_duplicates_df = kg_df.drop_duplicates(subset=["mapped_target_type", "mapped_source_type", "mapped_source", "mapped_target", "mapped_edge"])








In [59]:
kg_removed_duplicates_df

Unnamed: 0,source,source type,interaction,target,target type,gene,pubmedID,p_source,species,basis,source_extracted_definition,source_generated_definition,target_extracted_definition,target_generated_definition,source type disamb,target type disamb,mapped_source,mapped_target,mapped_source_type,mapped_target_type,mapped_edge
0,SUCROSE TRANSPORTER 5,gene,supplies,A. thaliana embryos,organism,BIO2,23031218,23031218_abstract,Arabidopsis thaliana,Uptake measurements of radiolabelled biotin in...,A gene encoding a transporter that facilitates...,,Embryos from the plant species Arabidopsis tha...,,gene,organism,SUCROSE TRANSPORTER 5,A. thaliana embryo,gene,organism,supplies
1,SUCROSE TRANSPORTER 5,gene,affect,Triacylglycerol accumulation,phenotype,BIO2,23031218,23031218_abstract,Arabidopsis thaliana,TAG content analysis in suc5 mutants,A gene encoding a transporter that facilitates...,,The buildup of triacylglycerols in plant tissu...,,gene,phenotype,SUCROSE TRANSPORTER 5,Triacylglycerol accumulation,gene,phenotype,affect
2,SUC5 protein(s),gene,represents,sucrose/H+ symporter,protein,BIO2,23031218,23031218_abstract,Arabidopsis thaliana,Functional analysis of SUC5 protein,"Proteins encoded by the SUC5 gene, responsible...",,A protein that facilitates the co-transport of...,,gene,protein,SUC5 gene(s),sucrose/H+ symporter,gene,protein,represents
3,SUC5,gene,transports,biotin,metabolite,BIO2,23031218,23031218_abstract,Arabidopsis thaliana,Uptake measurements of radiolabelled biotin in...,A gene encoding a sucrose transporter that als...,,A water-soluble vitamin (Vitamin B7) essential...,,gene,metabolite,SUC5,biotin,gene,metabolite,transport
4,SUC5,gene,localizes to,plasma membrane,subcellular compartment,BIO2,23031218,23031218_abstract,Arabidopsis thaliana,Subcellular localization studies of SUC5 protein,A gene encoding a sucrose transporter that als...,,The biological membrane that separates the int...,,gene,subcellular compartment,SUC5,plasmamembrane,gene,subcellular compartment,localizes to
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4819234,Transgenic poplar lines,mutant,had greater levels of resistance to,NaCl,metabolite,STO1,24929937,24929937_abstract,Populus tremula × P. alba,Greenhouse trials showing greater resistance t...,Poplar plants genetically modified to express ...,,"Sodium chloride, commonly used to induce salt ...",,mutant,metabolite,Transgenic poplar lines,NaCl solution,mutant,metabolite,more resistant to
4819235,Transgenic poplar lines,mutant,had greater levels of resistance than,wt,organism,STO1,24929937,24929937_abstract,Populus tremula × P. alba,Greenhouse trials showing greater resistance t...,Poplar plants genetically modified to express ...,,"Wild-type, the standard or non-modified versio...",,mutant,organism,Transgenic poplar lines,WT (wildtype),mutant,organism,was more resistant than
4819236,RT-PCR,treatment,indicated variation in,relative abundance of STO1 transcript,gene,STO1,24929937,24929937_abstract,Populus tremula × P. alba,RT-PCR analysis of STO1 transcript levels in t...,Reverse transcription polymerase chain reactio...,,The level of STO1 gene expression measured in ...,,treatment,gene,RT-PCR,relative abundance of STO1 transcript,treatment,gene,indicated variation in
4819237,variation in relative abundance of STO1 transc...,gene,coincided with,tolerance to salt,phenotype,STO1,24929937,24929937_abstract,Populus tremula × P. alba,RT-PCR analysis of STO1 transcript levels in t...,Differences in the expression levels of the ST...,,The ability of plants to survive and grow in s...,,gene,phenotype,variation in relative abundance of STO1 transc...,tolerance to salt,gene,phenotype,coincided with
