## Check whether all entities and edges are valid and follow the rules

In [3]:
import pandas as pd

# Load the tab-separated entity table from the working directory.
# low_memory=False makes pandas parse the file in one pass instead of chunks.
entities = pd.read_csv(
    "./entities.tsv",
    sep="\t",
    low_memory=False,
)

In [25]:
# Normalise the cross-reference column: missing values become empty strings,
# then every pipe-delimited string is split into a list kept in "xrefs_list".
entities["xrefs"] = entities["xrefs"].fillna(value="")
entities["xrefs_list"] = entities["xrefs"].str.split(pat="|")

# Preferred identifier prefixes per entity label, most preferred first.
# A label that is missing from this mapping has no preference order, so
# choose_id() falls back to the first candidate for it.
id_priority = {
    "Disease": ["MONDO", "MESH", "UMLS"],
    "Symptom": ["SYMP", "UMLS"],
    "Compound": ["DrugBank", "MESH"],
}


def choose_id(ids, label):
    """Pick the canonical identifier for an entity out of ``ids``.

    The prefixes listed for ``label`` in ``id_priority`` are tried in order;
    the first identifier starting with the highest-ranked prefix wins. When
    no identifier matches any prefix (or ``label`` has no entry), the first
    candidate is returned.

    Args:
        ids: Iterable of identifier strings. Lists and tuples keep their
            order; sets and frozensets are sorted first, because their
            iteration order for strings is not stable between interpreter
            runs and would otherwise make the result non-reproducible.
        label: Entity label used to look up the prefix priority.

    Returns:
        The selected identifier string.

    Raises:
        IndexError: If ``ids`` is empty.
    """
    if isinstance(ids, (set, frozenset)):
        candidates = sorted(ids)
    else:
        candidates = list(ids)

    for prefix in id_priority.get(label, []):
        for candidate in candidates:
            if candidate.startswith(prefix):
                return candidate
    return candidates[0]

def _is_missing(value):
    """Return True for NaN/None/NA scalars and for the empty string."""
    return bool(pd.isna(value)) or value == ""


def _find_root(parent, node):
    """Return the representative of ``node`` in the disjoint-set ``parent``."""
    root = node
    while parent[root] != root:
        root = parent[root]
    # Path compression: point every node on the walked path at the root.
    while parent[node] != root:
        parent[node], node = root, parent[node]
    return root


def _merge_rows(related_rows, label):
    """Collapse several rows describing the same entity into one record.

    Args:
        related_rows: DataFrame holding the rows to merge (two or more).
        label: Entity label, forwarded to ``choose_id``.

    Returns:
        A dict with the same keys as the columns of ``related_rows``.
    """
    # Ordered and de-duplicated, so choose_id() sees a reproducible sequence.
    ids = [
        value
        for value in dict.fromkeys(related_rows["id"].tolist())
        if not _is_missing(value)
    ]
    # Rows linked only through xrefs may all lack an id; keep the first row's.
    main_id = choose_id(ids, label) if ids else related_rows.iloc[0]["id"]

    identifiers = {str(value) for value in ids}
    for xrefs in related_rows["xrefs_list"]:
        if isinstance(xrefs, list):
            identifiers.update(str(x) for x in xrefs if not _is_missing(x))

    merged = related_rows.iloc[0].to_dict()
    for column in related_rows.columns:
        if column in ("id", "xrefs", "xrefs_list"):
            continue
        values = [
            value
            for value in related_rows[column].dropna().unique()
            if value != ""
        ]
        if len(values) > 1:
            # str() keeps the join working for numeric columns.
            merged[column] = "|".join(str(value) for value in values)
        elif values:
            # A single distinct value keeps its original type.
            merged[column] = values[0]
        # With no usable value the first row's (missing) entry is kept.

    # Sorted so the output does not depend on set iteration order.
    merged["xrefs"] = "|".join(sorted(identifiers))
    merged["id"] = main_id
    return merged


def _merge_group(group, label):
    """Merge the rows of one label group that share an id or an xref.

    Rows are linked whenever the id or any xref of one equals the id or any
    xref of another. Links are followed transitively with a disjoint-set, so
    every row ends up in exactly one output record, no matter how many
    identifiers connect it to other rows.

    Args:
        group: DataFrame with the columns "id", "xrefs" and "xrefs_list".
        label: Entity label of the group.

    Returns:
        A list of dict records: merged records first, then the rows that
        were not linked to any other row, in their original order.
    """
    # Positions are used instead of index labels, so a non-unique index
    # cannot confuse the row lookup.
    parent = list(range(len(group)))
    first_seen = {}  # identifier -> position of the first row mentioning it

    rows = zip(group["id"].tolist(), group["xrefs_list"].tolist())
    for position, (row_id, xrefs) in enumerate(rows):
        identifiers = [row_id]
        if isinstance(xrefs, list):
            identifiers.extend(xrefs)
        for identifier in identifiers:
            if _is_missing(identifier):
                continue
            owner = first_seen.setdefault(identifier, position)
            # A row that repeats its own id in its xrefs links to itself
            # only and therefore stays a singleton.
            if owner != position:
                parent[_find_root(parent, position)] = _find_root(parent, owner)

    components = {}
    for position in range(len(group)):
        components.setdefault(_find_root(parent, position), []).append(position)

    records = []
    singletons = []
    for positions in components.values():
        if len(positions) == 1:
            singletons.extend(positions)
        else:
            records.append(_merge_rows(group.iloc[positions], label))
    records.extend(group.iloc[singletons].to_dict(orient="records"))
    return records


# Build the de-duplicated entity table. Labels without an entry in
# id_priority are copied through unchanged; the others are merged per group.
merged_rows = []

# NOTE(review): groupby() skips rows whose "label" is missing by default, so
# such rows never reach merged_df -- confirm that this is intended.
for label, group in entities.groupby("label"):
    if label not in id_priority:
        merged_rows.extend(group.to_dict(orient="records"))
        continue
    merged_rows.extend(_merge_group(group, label))

merged_df = pd.DataFrame(merged_rows)
# Remove the helper xrefs_list column (its lists are unhashable, so it has to
# go before drop_duplicates); errors="ignore" covers an empty result.
merged_df = merged_df.drop(columns=["xrefs_list"], errors="ignore").drop_duplicates()

In [18]:
# Show the merged entity table as the cell's output.
merged_df

array(['Symptom', 'Disease', 'Compound', 'Metabolite', 'Gene',
       'MolecularFunction', 'CellularComponent', 'BiologicalProcess'],
      dtype=object)