In [8]:
# Whitelist of entity types: a relation is only kept when both its
# source_type and its target_type appear in this list.
valid_entity_types = [
    "Compound", "Disease", "Gene", "Metabolite",
    "Pathway", "Anatomy", "Symptom", "PharmacologicClass",
    "BiologicalProcess", "CellularComponent", "MolecularFunction",
]

In [9]:
import pandas as pd

# Load the relation table from the CSV export.
df = pd.read_csv("./customdb-v20240329.csv", sep=",")

# Drop every row whose source_id or target_id contains the placeholder
# "Unknown:Unknown". `regex=False` makes this a literal substring test and
# `na=False` treats a missing ID as "no match", so a NaN cannot break the
# `~` inversion below (rows with a missing ID are therefore kept).
unknown_id = "Unknown:Unknown"
has_unknown_id = df["target_id"].str.contains(
    unknown_id, regex=False, na=False
) | df["source_id"].str.contains(unknown_id, regex=False, na=False)
# `.copy()` gives an independent frame so adding the `idx` column does not
# write into a slice of the frame returned by read_csv.
df = df[~has_unknown_id].copy()
df["idx"] = df.index


def _has_valid_types(frame):
    """Return a boolean mask: both source_type and target_type are whitelisted."""
    return frame["source_type"].isin(valid_entity_types) & frame[
        "target_type"
    ].isin(valid_entity_types)


# A relation is "valid" when its relation_type already ends in a
# "<letters>:<letters>" suffix preceded by at least one ':' AND both entity
# types are whitelisted. A missing relation_type counts as not matching.
has_typed_suffix = df["relation_type"].str.contains(
    ".*:+[a-zA-Z]+:[a-zA-Z]+$", regex=True, na=False
)
is_valid = has_typed_suffix & _has_valid_types(df)

valid_relations = df[is_valid]
print(f"Valid relations: {valid_relations.shape[0]}")
# Everything that is not valid; taking the complement of the mask avoids a
# per-row membership test against valid_relations.index.
invalid_relations = df[~is_valid].copy()
print(f"Invalid relations: {invalid_relations.shape[0]}")

# Replace 'Protein' with 'Gene' in the 'source_type' and 'target_type' columns
type_columns = ["source_type", "target_type"]
invalid_relations[type_columns] = invalid_relations[type_columns].replace(
    "Protein", "Gene"
)

# Remove all rows which still have an invalid source_type or target_type
invalid_relations = invalid_relations[_has_valid_types(invalid_relations)].copy()

print(f"Invalid relations after fixing: {invalid_relations.shape[0]}")

# Append "::<source_type>:<target_type>" to each remaining relation_type.
# Vectorised string concatenation also works when the frame is empty, where a
# row-wise apply(axis=1) would not return a Series.
# NOTE(review): a row that already carried a typed suffix but was rejected
# only for its entity types receives a second suffix here - confirm whether
# such rows can occur in the input.
invalid_relations["relation_type"] = (
    invalid_relations["relation_type"]
    + "::"
    + invalid_relations["source_type"]
    + ":"
    + invalid_relations["target_type"]
)

Valid relations: 989
Invalid relations: 186
Invalid relations after fixing: 186


In [10]:
# List each distinct rewritten relation_type once, in order of first appearance.
print(pd.unique(invalid_relations["relation_type"]))

['increased_by::BiologicalProcess:Disease'
 'associated_with::Pathway:Disease'
 'reduced_by::BiologicalProcess:Disease' 'reduced_by::Gene:Disease'
 'associated_with::Compound:Disease' 'associated_with::Symptom:Disease'
 'associated_with::Metabolite:Disease'
 'associated_with::BiologicalProcess:Disease'
 'biomarker::Compound:Disease' 'biomarker::Metabolite:Disease'
 'inhibited_by::BiologicalProcess:Disease'
 'associated_with::Anatomy:Disease' 'biomarker::Gene:Disease'
 'biomarker::BiologicalProcess:Disease'
 'biomarker::CellularComponent:Disease'
 'biomarker::Disease:BiologicalProcess'
 'associated_with::CellularComponent:Disease' 'treats::Compound:Disease'
 'treats::Metabolite:Disease' 'associated_with::Disease:Compound'
 'biomarker::Pathway:Disease' 'associated_with::Disease:BiologicalProcess'
 'reduced_by::Metabolite:Disease' 'induced_by::BiologicalProcess:Disease'
 'increased_by::Symptom:Disease' 'biomarker::Anatomy:Disease'
 'reduced_by::Pathway:Disease' 'increased_by::Metabolite:D

In [7]:
# Show the rows whose rewritten relation_type is exactly
# "associated_with::Gene:Anatomy".
print(
    invalid_relations.loc[
        invalid_relations["relation_type"].eq("associated_with::Gene:Anatomy")
    ]
)

                      relation_type source_name source_type    source_id  \
1308  associated_with::Gene:Anatomy       IFN-γ        Gene  ENTREZ:3458   
1329  associated_with::Gene:Anatomy         TNF        Gene  ENTREZ:7124   
1330  associated_with::Gene:Anatomy       IL-1β        Gene  ENTREZ:3553   
1333  associated_with::Gene:Anatomy       IL-1β        Gene  ENTREZ:3553   
1334  associated_with::Gene:Anatomy         TNF        Gene  ENTREZ:7124   
1335  associated_with::Gene:Anatomy       IFN-γ        Gene  ENTREZ:3458   

                target_name target_type       target_id  \
1308  alveolar regeneration     Anatomy  UBERON:0002169   
1329  alveolar regeneration     Anatomy  UBERON:0002169   
1330     pulmonary function     Anatomy    MESH:D008168   
1333  alveolar regeneration     Anatomy  UBERON:0002169   
1334     pulmonary function     Anatomy    MESH:D008168   
1335     pulmonary function     Anatomy    MESH:D008168   

                                           key_senten