# Fibromine
An interactive multi-omics data mining tool for Idiopathic Pulmonary Fibrosis

### Step 1: Read all files and merge them

In [2]:
import pandas as pd
import os
from glob import glob

# Set the path: Place your Excel file in this directory
folder_path = "./expressionData/"
all_files = glob(os.path.join(folder_path, "*.xlsx"))
all_files

['./expressionData/Korfei13_expressionData_2025-04-25.xlsx',
 './expressionData/Niu17_expressionData_2025-04-25.xlsx',
 './expressionData/Tian19_expressionData_2025-04-25.xlsx',
 './expressionData/ODwyer17_expressionData_2025-04-25.xlsx',
 './expressionData/Saraswat20_expressionData_2025-04-25.xlsx',
 './expressionData/Todd19_expressionData_2025-04-25.xlsx',
 './expressionData/Moodley19_expressionData_2025-04-25.xlsx',
 './expressionData/Korfei11_expressionData_2025-04-25.xlsx',
 './expressionData/Foster14_expressionData_2025-04-25.xlsx']

In [3]:
# Merge by row (column structure is exactly the same)
dfs = [pd.read_excel(file) for file in all_files]
fibromine_df = pd.concat(dfs, ignore_index=True)

# save file
fibromine_df.to_csv("fibromine_expression_data.tsv", sep='\t', index=False)

In [4]:
fibromine_df

Unnamed: 0,UniprotAC,Symbol,ENSGid,ExpressionDirection,DatasetID,Comparison
0,P60709,ACTB,ENSG00000075624,Up,Korfei13,IPF_vs_Ctrl
1,P28838,LAP3,ENSG00000002549,Up,Korfei13,IPF_vs_Ctrl
2,P62937,PPIA,ENSG00000196262,Up,Korfei13,IPF_vs_Ctrl
3,Q01469,FABP5,ENSG00000164687,Down,Korfei13,IPF_vs_Ctrl
4,P60174,TPI1,ENSG00000111669,Up,Korfei13,IPF_vs_Ctrl
...,...,...,...,...,...,...
809,P12111,COL6A3,ENSG00000163359,Up,Foster14,IPF_vs_Ctrl
810,P01833,PIGR,ENSG00000162896,Down,Foster14,IPF_vs_Ctrl
811,Q6IQ49,SDE2,ENSG00000143751,Up,Foster14,IPF_vs_Ctrl
812,P55774,CCL18,ENSG00000275385,Up,Foster14,IPF_vs_Ctrl


### Step 2: Reformat the Fibromine data as the BioMedGPS format
More details on the data format can be found [here](https://open-prophetdb.github.io/biomedgps-data/graph_data_index/#knowledge-graph-file).

Examples:

| relation_type                  | resource | source_id | source_type | target_id   | target_type | source_name                    | target_name |
|--------------------------------|----------|-----------|-------------|-------------|-------------|--------------------------------|-------------|
| DGIDB::INHIBITOR::Gene:Compound| DGIDB    | ENTREZ:4311 | Gene        | MESH:D015244| Compound    | membrane metalloendopeptidase  | Thiorphan   |
| DGIDB::INHIBITOR::Gene:Compound| DGIDB    | ENTREZ:4311 | Gene        | MESH:C097292| Compound    | membrane metalloendopeptidase  | aladotrilat |

In [6]:
import pandas as pd

# We assume the entity file is already generated and placed in the ROOT_DIR/graph_data/entities.tsv. The ROOT_DIR is the root directory of the BioMedGPS Data Repository.
entity_file = "/Users/zhuzhixing/KG/biomedgps-data/graph_data/entities.tsv"

entity_df = pd.read_csv(entity_file, sep="\t", low_memory=False)

In [7]:
entity_df

Unnamed: 0,id,label,name,description,resource,synonyms,pmids,taxid,xrefs
0,CLO:0000000,CellLine,cell line cell culturing,a maintaining cell culture process that keeps ...,CLO,,,,
1,CLO:0000001,CellLine,cell line cell,A cultured cell that is part of a cell line - ...,CLO,,,,
2,CLO:0000002,CellLine,suspension cell line culturing,suspension cell line culturing is a cell line ...,CLO,,,,
3,CLO:0000003,CellLine,adherent cell line culturing,adherent cell line culturing is a cell line cu...,CLO,,,,
4,CLO:0000004,CellLine,cell line cell modification,a material processing that modifies an existin...,CLO,,,,
...,...,...,...,...,...,...,...,...,...
936596,WikiPathways:WP88,Pathway,Toll-like receptor signaling,Toll-like receptors (TLRs) are a class of prot...,WikiPathways,,,10090.0,
936597,WikiPathways:WP89,Pathway,FAS pathway and stress induction of HSP regula...,This pathway describes the Fas induced apoptos...,WikiPathways,,,10116.0,
936598,WikiPathways:WP93,Pathway,IL-4 signaling pathway,,WikiPathways,,,10090.0,
936599,WikiPathways:WP94,Pathway,Hepatocyte growth factor receptor signaling,Signaling pathway of the Hepatocyte Growth Fac...,WikiPathways,,,10116.0,


In [12]:
result = entity_df[entity_df.apply(lambda row: row.astype(str).str.contains('MONDO:0002771').any(), axis=1)]
result

Unnamed: 0,id,label,name,description,resource,synonyms,pmids,taxid,xrefs
717523,MONDO:0002771,Disease,pulmonary fibrosis,Chronic progressive interstitial lung disorder...,Mondo,fibrosis of lung|pulmonary interstitial fibrosis,,,DOID:3770|EFO:0009448|MEDGEN:11028|MESH:D01165...


Unnamed: 0,id,label,name,description,resource,synonyms,pmids,taxid,xrefs


In [8]:
df = pd.read_csv("fibromine_expression_data.tsv", sep="\t", low_memory=False)

In [9]:
df

Unnamed: 0,UniprotAC,Symbol,ENSGid,ExpressionDirection,DatasetID,Comparison
0,P60709,ACTB,ENSG00000075624,Up,Korfei13,IPF_vs_Ctrl
1,P28838,LAP3,ENSG00000002549,Up,Korfei13,IPF_vs_Ctrl
2,P62937,PPIA,ENSG00000196262,Up,Korfei13,IPF_vs_Ctrl
3,Q01469,FABP5,ENSG00000164687,Down,Korfei13,IPF_vs_Ctrl
4,P60174,TPI1,ENSG00000111669,Up,Korfei13,IPF_vs_Ctrl
...,...,...,...,...,...,...
809,P12111,COL6A3,ENSG00000163359,Up,Foster14,IPF_vs_Ctrl
810,P01833,PIGR,ENSG00000162896,Down,Foster14,IPF_vs_Ctrl
811,Q6IQ49,SDE2,ENSG00000143751,Up,Foster14,IPF_vs_Ctrl
812,P55774,CCL18,ENSG00000275385,Up,Foster14,IPF_vs_Ctrl


In [13]:
df["ExpressionDirection"].unique()

array(['Up', 'Down'], dtype=object)

In [19]:
relation_type_map = {
    "Up": "Hetionet::DuG::Disease:Gene",
    "Down": "Hetionet::DdG::Disease:Gene",
}
relation_type_map

{'Up': 'Hetionet::DuG::Disease:Gene', 'Down': 'Hetionet::DdG::Disease:Gene'}

In [23]:
df["relation_type"] = df["ExpressionDirection"].map(relation_type_map)

In [24]:
df

Unnamed: 0,UniprotAC,Symbol,ENSGid,ExpressionDirection,DatasetID,Comparison,PMID,key_sentence,relation_type
0,P60709,ACTB,ENSG00000075624,Up,Korfei13,IPF_vs_Ctrl,23659799,UniprotAC: P60709｜ENSGid ENSG00000075624,Hetionet::DuG::Disease:Gene
1,P28838,LAP3,ENSG00000002549,Up,Korfei13,IPF_vs_Ctrl,23659799,UniprotAC: P28838｜ENSGid ENSG00000002549,Hetionet::DuG::Disease:Gene
2,P62937,PPIA,ENSG00000196262,Up,Korfei13,IPF_vs_Ctrl,23659799,UniprotAC: P62937｜ENSGid ENSG00000196262,Hetionet::DuG::Disease:Gene
3,Q01469,FABP5,ENSG00000164687,Down,Korfei13,IPF_vs_Ctrl,23659799,UniprotAC: Q01469｜ENSGid ENSG00000164687,Hetionet::DdG::Disease:Gene
4,P60174,TPI1,ENSG00000111669,Up,Korfei13,IPF_vs_Ctrl,23659799,UniprotAC: P60174｜ENSGid ENSG00000111669,Hetionet::DuG::Disease:Gene
...,...,...,...,...,...,...,...,...,...
809,P12111,COL6A3,ENSG00000163359,Up,Foster14,IPF_vs_Ctrl,25541672,UniprotAC: P12111｜ENSGid ENSG00000163359,Hetionet::DuG::Disease:Gene
810,P01833,PIGR,ENSG00000162896,Down,Foster14,IPF_vs_Ctrl,25541672,UniprotAC: P01833｜ENSGid ENSG00000162896,Hetionet::DdG::Disease:Gene
811,Q6IQ49,SDE2,ENSG00000143751,Up,Foster14,IPF_vs_Ctrl,25541672,UniprotAC: Q6IQ49｜ENSGid ENSG00000143751,Hetionet::DuG::Disease:Gene
812,P55774,CCL18,ENSG00000275385,Up,Foster14,IPF_vs_Ctrl,25541672,UniprotAC: P55774｜ENSGid ENSG00000275385,Hetionet::DuG::Disease:Gene


In [15]:
# Construct a mapping dictionary from DatasetID to PMID
pmid_map = {
    "Tian19": 30774578,
    "ODwyer17": 28440314,
    "Korfei13": 23659799,
    "Moodley19": 31393655,
    "Niu17": 28122020,
    "Todd19": 31640794,
    "Saraswat20": 32385381,
    "Foster14": 25541672,
    "Korfei11": 21319792
}
# 2. 添加新列
df["PMID"] = df["DatasetID"].map(pmid_map)

In [16]:
df

Unnamed: 0,UniprotAC,Symbol,ENSGid,ExpressionDirection,DatasetID,Comparison,PMID
0,P60709,ACTB,ENSG00000075624,Up,Korfei13,IPF_vs_Ctrl,23659799
1,P28838,LAP3,ENSG00000002549,Up,Korfei13,IPF_vs_Ctrl,23659799
2,P62937,PPIA,ENSG00000196262,Up,Korfei13,IPF_vs_Ctrl,23659799
3,Q01469,FABP5,ENSG00000164687,Down,Korfei13,IPF_vs_Ctrl,23659799
4,P60174,TPI1,ENSG00000111669,Up,Korfei13,IPF_vs_Ctrl,23659799
...,...,...,...,...,...,...,...
809,P12111,COL6A3,ENSG00000163359,Up,Foster14,IPF_vs_Ctrl,25541672
810,P01833,PIGR,ENSG00000162896,Down,Foster14,IPF_vs_Ctrl,25541672
811,Q6IQ49,SDE2,ENSG00000143751,Up,Foster14,IPF_vs_Ctrl,25541672
812,P55774,CCL18,ENSG00000275385,Up,Foster14,IPF_vs_Ctrl,25541672


In [17]:
# Combine more columns into the 'key_sentence' field, using '｜' as the delimiter
df["key_sentence"] = (
    "UniprotAC: " + df["UniprotAC"].fillna("N/A") + "｜" +
    "ENSGid " + df["ENSGid"].fillna("N/A") 
)
df

Unnamed: 0,UniprotAC,Symbol,ENSGid,ExpressionDirection,DatasetID,Comparison,PMID,key_sentence
0,P60709,ACTB,ENSG00000075624,Up,Korfei13,IPF_vs_Ctrl,23659799,UniprotAC: P60709｜ENSGid ENSG00000075624
1,P28838,LAP3,ENSG00000002549,Up,Korfei13,IPF_vs_Ctrl,23659799,UniprotAC: P28838｜ENSGid ENSG00000002549
2,P62937,PPIA,ENSG00000196262,Up,Korfei13,IPF_vs_Ctrl,23659799,UniprotAC: P62937｜ENSGid ENSG00000196262
3,Q01469,FABP5,ENSG00000164687,Down,Korfei13,IPF_vs_Ctrl,23659799,UniprotAC: Q01469｜ENSGid ENSG00000164687
4,P60174,TPI1,ENSG00000111669,Up,Korfei13,IPF_vs_Ctrl,23659799,UniprotAC: P60174｜ENSGid ENSG00000111669
...,...,...,...,...,...,...,...,...
809,P12111,COL6A3,ENSG00000163359,Up,Foster14,IPF_vs_Ctrl,25541672,UniprotAC: P12111｜ENSGid ENSG00000163359
810,P01833,PIGR,ENSG00000162896,Down,Foster14,IPF_vs_Ctrl,25541672,UniprotAC: P01833｜ENSGid ENSG00000162896
811,Q6IQ49,SDE2,ENSG00000143751,Up,Foster14,IPF_vs_Ctrl,25541672,UniprotAC: Q6IQ49｜ENSGid ENSG00000143751
812,P55774,CCL18,ENSG00000275385,Up,Foster14,IPF_vs_Ctrl,25541672,UniprotAC: P55774｜ENSGid ENSG00000275385


In [55]:
import pandas as pd

# We assume the entity file is already generated and placed in the ROOT_DIR/graph_data/entities.tsv. The ROOT_DIR is the root directory of the BioMedGPS Data Repository.
entity_file = "/Users/zhuzhixing/KG/biomedgps-data/graph_data/entities.tsv"

entity_df = pd.read_csv(entity_file, sep="\t", low_memory=False)

In [56]:
entity_df

Unnamed: 0,id,label,name,description,resource,synonyms,pmids,taxid,xrefs
0,CLO:0000000,CellLine,cell line cell culturing,a maintaining cell culture process that keeps ...,CLO,,,,
1,CLO:0000001,CellLine,cell line cell,A cultured cell that is part of a cell line - ...,CLO,,,,
2,CLO:0000002,CellLine,suspension cell line culturing,suspension cell line culturing is a cell line ...,CLO,,,,
3,CLO:0000003,CellLine,adherent cell line culturing,adherent cell line culturing is a cell line cu...,CLO,,,,
4,CLO:0000004,CellLine,cell line cell modification,a material processing that modifies an existin...,CLO,,,,
...,...,...,...,...,...,...,...,...,...
936596,WikiPathways:WP88,Pathway,Toll-like receptor signaling,Toll-like receptors (TLRs) are a class of prot...,WikiPathways,,,10090.0,
936597,WikiPathways:WP89,Pathway,FAS pathway and stress induction of HSP regula...,This pathway describes the Fas induced apoptos...,WikiPathways,,,10116.0,
936598,WikiPathways:WP93,Pathway,IL-4 signaling pathway,,WikiPathways,,,10090.0,
936599,WikiPathways:WP94,Pathway,Hepatocyte growth factor receptor signaling,Signaling pathway of the Hepatocyte Growth Fac...,WikiPathways,,,10116.0,


In [57]:
entity_df[entity_df["id"] == "MONDO:0100430"]

Unnamed: 0,id,label,name,description,resource,synonyms,pmids,taxid,xrefs
739181,MONDO:0100430,Disease,fibrotic liver disease,A liver disease characterized by the presence ...,Mondo,hepatic fibrosis (disease)|liver fibrosis (dis...,,,MONDO:0100430


In [25]:
formatted_df = pd.DataFrame()
formatted_df["source_name"] = df["Symbol"]
formatted_df["source_type"] = "Gene"
formatted_df["target_name"] = "IPF"
formatted_df["target_type"] = "Disease"


source_ids = []
target_ids = []

for _, row in df.iterrows():
    source_id = entity_df.loc[
        (entity_df["name"] == row["Symbol"]) & (entity_df["label"] == "Gene"), "id"
    ]

    if not source_id.empty:
        source_ids.append(source_id.values[0])
    else:
        source_ids.append(None)

    target_id = entity_df.loc[
        (entity_df["name"] == "pulmonary fibrosis") & (entity_df["label"] == "Disease"),
        "id",
    ]

    if not target_id.empty:
        target_ids.append(target_id.values[0])
    else:
        target_ids.append(None)

formatted_df["source_id"] = source_ids
formatted_df["target_id"] = target_ids

formatted_df["relation_type"] =df["relation_type"]
formatted_df["resource"] = "FIBROMINE"

invalid_formatted_df = formatted_df[formatted_df["source_id"].isna() | formatted_df["target_id"].isna()]
invalid_formatted_df.to_csv("invalid_formatted_fibromine.tsv", index=False, sep="\t")

formatted_df = formatted_df[~formatted_df["source_id"].isna() & ~formatted_df["target_id"].isna()]
formatted_df.to_csv("formatted_fibromine.tsv", index=False, sep="\t")

In [28]:
import os
import os.path as osp
import subprocess


def format_fibromine(filename):
    def get_project_root():
        try:
            return osp.dirname(osp.dirname(os.getcwd()))
        except Exception as e:
            raise RuntimeError(f"Failed to determine project root: {e}")

    try:
        root_dir = get_project_root()
        print(f"Project root directory: {root_dir}")
    except RuntimeError as e:
        print(e)
        exit(1)

    database = "customdb"
    relations_path = osp.join(
        root_dir,
        "relations",
        "Fibromine",
        filename,
    )
    output_dir = osp.join(
        root_dir, "formatted_relations", "Fibromine"
    )
    entities_path = osp.join(root_dir, "entities.tsv")
    log_file = osp.join(output_dir, "log.txt")
    relation_types_file = osp.join(root_dir, "relation_types.tsv")

    command = [
        "graph-builder",
        "--database",
        database,
        "-d",
        relations_path,
        "-o",
        output_dir,
        "-f",
        entities_path,
        "-n",
        "20",
        "--download",
        "--skip",
        "-l",
        log_file,
        "--debug",
        "--relation-type-dict-fpath",
        relation_types_file,
    ]

    print("Executing command:", " ".join(command))

    try:
        subprocess.run(command, check=True)
    except FileNotFoundError:
        print(
            "Error: 'graph-builder' command not found. Make sure it is installed and available in the PATH."
        )
        exit(1)
    except subprocess.CalledProcessError as e:
        print(f"Error: Command execution failed with return code {e.returncode}")
        print(f"Output: {e.output}")
        exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}")
        exit(1)

In [29]:
format_fibromine("formatted_fibromine.tsv")

Project root directory: /Users/zhuzhixing/KG/biomedgps-data/graph_data
Executing command: graph-builder --database customdb -d /Users/zhuzhixing/KG/biomedgps-data/graph_data/relations/Fibromine/formatted_fibromine.tsv -o /Users/zhuzhixing/KG/biomedgps-data/graph_data/formatted_relations/Fibromine -f /Users/zhuzhixing/KG/biomedgps-data/graph_data/entities.tsv -n 20 --download --skip -l /Users/zhuzhixing/KG/biomedgps-data/graph_data/formatted_relations/Fibromine/log.txt --debug --relation-type-dict-fpath /Users/zhuzhixing/KG/biomedgps-data/graph_data/relation_types.tsv


2025-04-25 20:38:22 - cli:171 - INFO - Run jobs with (output_dir: /Users/zhuzhixing/KG/biomedgps-data/graph_data/formatted_relations/Fibromine, db file/directory: /Users/zhuzhixing/KG/biomedgps-data/graph_data/relations/Fibromine/formatted_fibromine.tsv, databases: ('customdb',), download: True, skip: True)
2025-04-25 20:38:25 - base_parser:229 - INFO - Using allow_ignore_checking_errors=all to ignore the checking errors.
2025-04-25 20:38:25 - customdb_parser:104 - INFO - Get 814 relations
2025-04-25 20:38:25 - base_parser:478 - INFO - Found 814 relations.
2025-04-25 20:38:25 - base_parser:797 - INFO - Start to get entity id map.
2025-04-25 20:38:35 - base_parser:831 - INFO - The number of deduped entity type ids: 695
2025-04-25 20:44:08 - base_parser:841 - INFO - The number of entity ids: 695
2025-04-25 20:44:09 - base_parser:480 - INFO - Found 695 entity ids in entity id map.
2025-04-25 20:44:09 - base_parser:494 - INFO - The number of relations before dropna: 814
2025-04-25 20:44:09