## CustomDB

Download the curated knowledge from prophet-studio.3steps.cn and upload it to the `customdb` folder.

You can connect to the database and fetch the related records using the following SQL query:

```sql
SELECT relation_type,source_name,source_type,source_id,target_name,target_type,target_id,key_sentence,pmid FROM biomedgps_knowledge_curation WHERE curator = '1635231996@qq.com' AND source_id != 'Unknown:Unknown' AND target_id != 'Unknown:Unknown';
```


### Reformat the curated knowledge into the BioMedGPS format

In [8]:
# Entity types accepted for the source_type / target_type columns of a relation.
valid_entity_types = [
    "Compound", "Disease", "Gene", "Metabolite",
    "Pathway", "Anatomy", "Symptom", "PharmacologicClass",
    "BiologicalProcess", "CellularComponent", "MolecularFunction",
]

In [9]:
import pandas as pd

# Load the curated relations exported as CSV.
df = pd.read_csv("./customdb-v20240329.csv", sep=",")

# Drop every row whose target_id or source_id contains the "Unknown:Unknown" placeholder.
# regex=False matches the text literally; na=False keeps the mask boolean when an id is
# missing (such rows are kept here, not silently dropped).
df = df[~df["target_id"].str.contains("Unknown:Unknown", regex=False, na=False)]
df = df[~df["source_id"].str.contains("Unknown:Unknown", regex=False, na=False)]
# Keep the original row label as a regular column.
df["idx"] = df.index

# A relation is "valid" when its relation_type already ends with a "<Type>:<Type>" suffix
# and both of its entity types are listed in valid_entity_types.
has_type_suffix = df["relation_type"].str.contains(
    ".*:+[a-zA-Z]+:[a-zA-Z]+$", regex=True, na=False
)
has_valid_types = df["source_type"].isin(valid_entity_types) & df["target_type"].isin(
    valid_entity_types
)
is_valid = has_type_suffix & has_valid_types

valid_relations = df[is_valid]
print(f"Valid relations: {valid_relations.shape[0]}")
# Everything that is not valid; .copy() so the fixes below never write into a view of df.
invalid_relations = df[~is_valid].copy()
print(f"Invalid relations: {invalid_relations.shape[0]}")

# Replace 'Protein' with 'Gene' in 'source_type' and 'target_type' columns
invalid_relations["source_type"] = invalid_relations["source_type"].replace(
    "Protein", "Gene"
)
invalid_relations["target_type"] = invalid_relations["target_type"].replace(
    "Protein", "Gene"
)

# Remove all rows which still have an invalid source_type or target_type
invalid_relations = invalid_relations[
    invalid_relations["source_type"].isin(valid_entity_types)
    & invalid_relations["target_type"].isin(valid_entity_types)
].copy()

print(f"Invalid relations after fixing: {invalid_relations.shape[0]}")

# Rebuild relation_type as "<label>::<source_type>:<target_type>".
# A row can land here only because of its entity types (e.g. Protein) while its
# relation_type already carries a type suffix; strip that suffix first so the result
# never ends up with two suffixes. Labels without a suffix are left unchanged.
relation_label = invalid_relations["relation_type"].str.replace(
    ":+[a-zA-Z]+:[a-zA-Z]+$", "", regex=True
)
# Vectorised concatenation also works on an empty frame, unlike apply(axis=1).
invalid_relations["relation_type"] = (
    relation_label
    + "::"
    + invalid_relations["source_type"]
    + ":"
    + invalid_relations["target_type"]
)

Valid relations: 989
Invalid relations: 186
Invalid relations after fixing: 186


In [10]:
# Show the distinct relation_type values among the repaired (previously invalid) relations.
print(invalid_relations["relation_type"].unique())

['increased_by::BiologicalProcess:Disease'
 'associated_with::Pathway:Disease'
 'reduced_by::BiologicalProcess:Disease' 'reduced_by::Gene:Disease'
 'associated_with::Compound:Disease' 'associated_with::Symptom:Disease'
 'associated_with::Metabolite:Disease'
 'associated_with::BiologicalProcess:Disease'
 'biomarker::Compound:Disease' 'biomarker::Metabolite:Disease'
 'inhibited_by::BiologicalProcess:Disease'
 'associated_with::Anatomy:Disease' 'biomarker::Gene:Disease'
 'biomarker::BiologicalProcess:Disease'
 'biomarker::CellularComponent:Disease'
 'biomarker::Disease:BiologicalProcess'
 'associated_with::CellularComponent:Disease' 'treats::Compound:Disease'
 'treats::Metabolite:Disease' 'associated_with::Disease:Compound'
 'biomarker::Pathway:Disease' 'associated_with::Disease:BiologicalProcess'
 'reduced_by::Metabolite:Disease' 'induced_by::BiologicalProcess:Disease'
 'increased_by::Symptom:Disease' 'biomarker::Anatomy:Disease'
 'reduced_by::Pathway:Disease' 'increased_by::Metabolite:D

In [7]:
# Inspect the repaired rows whose relation_type is exactly "associated_with::Gene:Anatomy".
print(
    invalid_relations[
        invalid_relations["relation_type"] == "associated_with::Gene:Anatomy"
    ]
)

                      relation_type source_name source_type    source_id  \
1308  associated_with::Gene:Anatomy       IFN-γ        Gene  ENTREZ:3458   
1329  associated_with::Gene:Anatomy         TNF        Gene  ENTREZ:7124   
1330  associated_with::Gene:Anatomy       IL-1β        Gene  ENTREZ:3553   
1333  associated_with::Gene:Anatomy       IL-1β        Gene  ENTREZ:3553   
1334  associated_with::Gene:Anatomy         TNF        Gene  ENTREZ:7124   
1335  associated_with::Gene:Anatomy       IFN-γ        Gene  ENTREZ:3458   

                target_name target_type       target_id  \
1308  alveolar regeneration     Anatomy  UBERON:0002169   
1329  alveolar regeneration     Anatomy  UBERON:0002169   
1330     pulmonary function     Anatomy    MESH:D008168   
1333  alveolar regeneration     Anatomy  UBERON:0002169   
1334     pulmonary function     Anatomy    MESH:D008168   
1335     pulmonary function     Anatomy    MESH:D008168   

                                           key_senten

### Merge the formatted relations into one file

In [1]:
import pandas as pd

# Tab-separated relation tables to combine, in concatenation order.
files = [
    "./formatted_customdb_v20240329.tsv",
    "./formatted_fuscc_bcdb_v20240923.tsv",
    "./formatted_malacards_mecfs.tsv",
    "./formatted_treatme_survey_compounds.tsv",
    "./formatted_treatme_survey_phenotypes.tsv",
]

# Read every table, stack them row-wise with a fresh index, and write the result
# as a single tab-separated file without the index column.
merged = pd.concat(
    list(map(lambda path: pd.read_csv(path, sep="\t"), files)),
    ignore_index=True,
)
merged.to_csv("formatted_merged_all.tsv", sep="\t", index=False)


In [2]:
import os
import os.path as osp
import subprocess


def format_customdb(filename):
    """Run the `graph-builder` command line tool on a customdb relations file.

    All paths are derived from the current working directory: the project root is
    taken to be two directory levels above it. The command is given
    `<root>/relations/customdb/<filename>` as input (`-d`),
    `<root>/formatted_relations/customdb` as output directory (`-o`),
    `<root>/entities.tsv` as entity file (`-f`) and `log.txt` inside the output
    directory as log file (`-l`).

    Args:
        filename: Name of the relations file inside `<root>/relations/customdb`.

    Raises:
        SystemExit: With status 1 when the project root cannot be determined, when
            `graph-builder` is not found, or when the command fails. An error
            message is printed first.
    """
    try:
        root_dir = osp.dirname(osp.dirname(os.getcwd()))
    except OSError as e:
        # os.getcwd() fails when the working directory is no longer accessible.
        print(f"Failed to determine project root: {e}")
        # `exit` is an interactive helper injected by `site` (and rebound by IPython);
        # raising SystemExit directly gives the same status without relying on it.
        raise SystemExit(1)
    print(f"Project root directory: {root_dir}")

    database = "customdb"
    relations_path = osp.join(root_dir, "relations", database, filename)
    output_dir = osp.join(root_dir, "formatted_relations", database)
    entities_path = osp.join(root_dir, "entities.tsv")
    log_file = osp.join(output_dir, "log.txt")

    command = [
        "graph-builder",
        "--database",
        database,
        "-d",
        relations_path,
        "-o",
        output_dir,
        "-f",
        entities_path,
        "-n",
        "20",
        "--download",
        "--skip",
        "-l",
        log_file,
        "--debug",
    ]

    print("Executing command:", " ".join(command))

    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        subprocess.run(command, check=True)
    except FileNotFoundError:
        print(
            "Error: 'graph-builder' command not found. Make sure it is installed and available in the PATH."
        )
        raise SystemExit(1)
    except subprocess.CalledProcessError as e:
        print(f"Error: Command execution failed with return code {e.returncode}")
        # The command's output is not captured, so e.output is normally None;
        # only print it when present and point to the log file passed via -l.
        if e.output:
            print(f"Output: {e.output}")
        print(f"Log file: {log_file}")
        raise SystemExit(1)
    except Exception as e:
        print(f"Unexpected error: {e}")
        raise SystemExit(1)

In [5]:
# Run graph-builder on the merged relations file (formatted_merged_all.tsv).
format_customdb("formatted_merged_all.tsv")

Project root directory: /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data
Executing command: graph-builder --database customdb -d /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/relations/customdb/formatted_merged_all.tsv -o /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/customdb -f /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/entities.tsv -n 20 --download --skip -l /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/customdb/log.txt --debug


2025-03-01 07:25:16 - cli:171 - INFO - Run jobs with (output_dir: /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/customdb, db file/directory: /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/relations/customdb/formatted_merged_all.tsv, databases: ('customdb',), download: True, skip: True)
2025-03-01 07:25:18 - base_parser:229 - INFO - Using allow_ignore_checking_errors=all to ignore the checking errors.
2025-03-01 07:25:18 - customdb_parser:99 - INFO - Get 1152 relations
2025-03-01 07:25:18 - base_parser:478 - INFO - Found 1152 relations.
2025-03-01 07:25:18 - base_parser:778 - INFO - Found entity id map file, skip to generate it. If you want to regenerate it, please delete the file: /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/formatted_relations/customdb/customdb.entity_id_map.json
2025-03-01 07:25:18 - base_parser:480 - INFO - Found 748 entity ids in entity id map.
2025-03-01 07:25:18 - base_parser:494 - INFO - T