### KEGG

In [None]:
import io

# Import Biopython modules to interact with KEGG
from Bio import SeqIO
from Bio.KEGG import REST
from Bio.KEGG.KGML import KGML_parser
import pandas as pd

In [None]:
def to_df(result):
    return pd.read_table(io.StringIO(result), header=None)

In [None]:
# Get all entries in the PATHWAY database for K. setae as a dataframe
result = REST.kegg_list("pathway", "hsa").read()
df = to_df(result)

In [None]:
df.shape

In [None]:
result = REST.kegg_get("hsa01521").read()
result

In [None]:
# Split the content into lines
lines = result.split('\n')

print(lines)

# Initialize variables to store extracted information
entry = ""
name = ""
description = ""
references = []

def get_value(line):
    items = line.split()
    if len(items) >= 2:
        return " ".join(items[1:])
    else:
        return ""

# Iterate through the lines to extract the fields
for line in lines:
    if line.startswith("ENTRY"):
        entry = get_value(line)
    elif line.startswith("NAME"):
        name = get_value(line)
    elif line.startswith("DESCRIPTION"):
        description = get_value(line)
    elif line.startswith("REFERENCE"):
        references.append(get_value(line))
        

# Print the extracted information
print("ENTRY:", entry)
print("NAME:", name)
print("DESCRIPTION:", description)
print("REFERENCES:", references)

### Embeddings

In [None]:
from transformers import AutoModel, AutoTokenizer
from sklearn.decomposition import PCA
import torch

# Load pre-trained BERT model and tokenizer
# model_name = "dmis-lab/biobert-large-cased-v1.1-squad"
model_name = "/Users/jy006/Downloads/RoBERTa-large-PM-M3-Voc/RoBERTa-large-PM-M3-Voc-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


def gen_embeddings(text, n_dims=768):
    # Tokenize the text
    tokens = tokenizer.tokenize(text)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Get contextual embeddings
    with torch.no_grad():
        inputs = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension
        outputs = model(inputs)
        entity_embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling for entity-level embeddings

    # Assuming you have 'entity_embeddings' with shape (num_entities, 768)
    # where num_entities is the number of entities and 768 is the original embedding dimension

    # Create a PCA object with the desired number of components (400)
    # pca = PCA(n_components=n_dims)

    # Fit the PCA model to your entity embeddings
    # reduced_embeddings = pca.fit_transform(entity_embeddings)

    # reduced_embeddings now contains the embeddings with reduced dimensionality (400)
    # return reduced_embeddings
    return entity_embeddings[0]

def get_similarity(embedding1, embedding2):
    return embedding1.dot(embedding2) / (torch.norm(embedding1) * torch.norm(embedding2))

def get_similarities(embedding, embeddings):
    return [get_similarity(embedding, embedding2) for embedding2 in embeddings]

In [None]:
diseases = ["long COVID-19", "Hypotension, Orthostatic", "Postural Orthostatic Tachycardia Syndrome", "Lung Cancer", "Inflamation", "fever", "headache", "chronic fatigue", "Diarrhea", "TP53", "EGFR", "Naltrexone", "Rubbish", "fracture", "Brain Injury"]

disease_embeddings = [gen_embeddings(disease) for disease in diseases]

In [None]:
mecfs_embedding = gen_embeddings("myalgic encephalomyelitis/chronic fatigue syndrome")

In [None]:
# Get the similarity scores for all diseases and sort them in descending order
scores = get_similarities(mecfs_embedding, disease_embeddings)
scores = [(diseases[i], scores[i]) for i in range(len(diseases))]
scores.sort(key=lambda x: x[1], reverse=True)

scores

### Statistics

In [2]:
import pandas as pd

hdsn = pd.read_csv("./graph_data/formatted_relations/hsdn/raw_hsdn.tsv", sep="\t")
hdsn["resource"] = "CuratedKG"
drkg = pd.read_csv("./graph_data/formatted_relations/drkg/raw_drkg.tsv", sep="\t")
custom = pd.read_csv("/Users/jy006/Downloads/publications.txt", sep="\t")
custom["resource"] = "CuratedKG"
malacards = pd.read_csv("/Users/jy006/Downloads/malacards.txt", sep="\t")
malacards["resource"] = "CuratedKG"

#### DRKG

In [13]:
# DRKG
s_drkg_df = drkg[["source_id", "source_type"]]
s_drkg_df.columns = ["id", "label"]
t_drkg_df = drkg[["target_id", "target_type"]]
t_drkg_df.columns = ["id", "label"]

drkg_df = pd.concat([s_drkg_df, t_drkg_df]).drop_duplicates().reset_index(drop=True)
drkg_grouped_df = drkg_df.groupby(["label"]).size().reset_index(name="count")
drkg_grouped_df = drkg_grouped_df.sort_values(by=["count"], ascending=True)

print(drkg_grouped_df.to_string(index=False))

# use plotly to plot the distribution of relations, but don't include the "Total" row and ignore the count less than 10
import plotly.express as px

import plotly.io as pio
pio.renderers.default = 'vscode'

fig = px.bar(drkg_grouped_df, x="label", y="count", title="Distribution of entities [DRKG]")
fig.update_traces(texttemplate='%{value:.2s}', textposition='outside')

             label  count
               Tax    215
PharmacologicClass    345
           Anatomy    400
           Symptom    415
 CellularComponent   1391
           Pathway   1822
 MolecularFunction   2884
               Atc   4048
           Disease   5103
        SideEffect   5701
 BiologicalProcess  11381
          Compound  24313
              Gene  39220


In [4]:
# Generate a new column with the source and target types and don't care about the order
drkg["source_target"] = drkg.apply(lambda row: row["source_type"] + "_" + row["target_type"] if row["source_type"] < row["target_type"] else row["target_type"] + "_" + row["source_type"], axis=1)

drkg_relations_grouped_df = drkg.groupby(["source_target", "resource"]).size().reset_index(name="count")
drkg_relations_grouped_df["total_count"] = drkg_relations_grouped_df.groupby(["resource"])["count"].transform("sum")

drkg_relations_grouped_df = drkg_relations_grouped_df.sort_values(by=["count"], ascending=True)

print(drkg_relations_grouped_df.to_string(index=False))

              source_target resource   count  total_count
               Disease_Gene   bioarx     461        84756
            Disease_Disease Hetionet     543      2250197
Compound_PharmacologicClass Hetionet    1029      2250197
           Compound_Disease Hetionet    1145      2250197
              Compound_Gene   INTACT    1805       256151
            Disease_Symptom Hetionet    3357      2250197
            Anatomy_Disease Hetionet    3602      2250197
           Compound_Disease DRUGBANK    4968      1424790
          Compound_Compound Hetionet    6486      2250197
                   Gene_Tax     GNBR   14663       335369
               Atc_Compound DRUGBANK   15750      1424790
              Compound_Gene DRUGBANK   24801      1424790
              Compound_Gene   bioarx   25666        84756
              Compound_Gene    DGIDB   26290        26290
               Disease_Gene Hetionet   27977      2250197
              Compound_Gene Hetionet   51429      2250197
              

In [5]:
# use plotly to plot the distribution of relations, but don't include the "Total" row and ignore the count less than 10
import plotly.express as px

import plotly.io as pio
pio.renderers.default = 'vscode'

color_map = {
    "CuratedKG": "#636efa",
    "bioarx": "#EF553B",
    "Hetionet": "#00cc96",
    "INTACT": "#ab63fa",
    "DRUGBANK": "#FFA15A",
    "DGIDB": "#19d3f3",
    "GNBR": "#FF6692",
    "STRING": "#B6E880"
}

drkg_relations_grouped_df = drkg_relations_grouped_df.sort_values(by=["count"], ascending=True)

kept_relations = ["Compound_Compound", "Compound_Disease", "Compound_Gene", "Disease_Disease", "Disease_Gene", "Disease_Symptom", "Gene_Gene", "Gene_Pathway", "Pathway_Pathway"]

drkg_relations_grouped_df = drkg_relations_grouped_df[drkg_relations_grouped_df["source_target"].isin(kept_relations)]

fig = px.bar(drkg_relations_grouped_df, x="source_target", y="count", title="Distribution of relations [DRKG]", color="resource", color_discrete_map=color_map)
# Order the x axis by the label of the relations
fig.update_xaxes(categoryorder="array", categoryarray=kept_relations)
# fig.update_traces(texttemplate='%{value:.2s}', textposition='inside')
fig.show()

#### CustomKG

In [15]:
s_entity = pd.concat([hdsn[["source_id", "source_type"]], drkg[["source_id", "source_type"]], custom[["source_id", "source_type"]], malacards[["source_id", "source_type"]]])
s_entity = s_entity.rename(columns={"source_id": "id", "source_type": "label"})
t_entity = pd.concat([hdsn[["target_id", "target_type"]], drkg[["target_id", "target_type"]], custom[["target_id", "target_type"]], malacards[["target_id", "target_type"]]])
t_entity = t_entity.rename(columns={"target_id": "id", "target_type": "label"})

entity = pd.concat([pd.DataFrame(s_entity), pd.DataFrame(t_entity)])
entity = entity.drop_duplicates()
entity_grouped_df = entity.groupby(["label"]).size().reset_index(name="count")

# total = entity_grouped_df["count"].sum()

# Add total number of entities as a new row
# entity_grouped_df.loc[len(entity_grouped_df)] = ["Total", total]
entity_grouped_df = entity_grouped_df.sort_values(by=["count"], ascending=True)

print(entity_grouped_df.to_string(index=False))

             label  count
           Microbe      1
           Protein      1
        Metabolite     13
               Tax    215
PharmacologicClass    346
           Anatomy    411
           Symptom    466
 CellularComponent   1395
           Pathway   1856
 MolecularFunction   2886
               Atc   4048
        SideEffect   5701
           Disease   6208
 BiologicalProcess  11397
          Compound  24337
              Gene  39222


In [7]:
# use plotly to plot the distribution of relations, but don't include the "Total" row and ignore the count less than 10
import plotly.express as px

import plotly.io as pio
pio.renderers.default = 'vscode'

fig = px.bar(entity_grouped_df[1:-1][entity_grouped_df[1:-1]["count"] > 100], x="label", y="count", title="Distribution of entities [CustomKG]")
fig.update_traces(texttemplate='%{value:.2s}', textposition='outside')

In [8]:
relations = pd.concat([hdsn, drkg, custom, malacards])
relations = relations.drop_duplicates()

# Generate a new column with the source and target types and don't care about the order
relations["source_target"] = relations.apply(lambda row: row["source_type"] + "_" + row["target_type"] if row["source_type"] < row["target_type"] else row["target_type"] + "_" + row["source_type"], axis=1)

relations_grouped_df = relations.groupby(["source_target", "resource"]).size().reset_index(name="count")
relations_grouped_df = relations_grouped_df.sort_values(by=["count"], ascending=True)

print(relations_grouped_df.to_string(index=False))

                      source_target  resource   count
                       Gene_Pathway CuratedKG       1
                   Anatomy_Compound CuratedKG       1
          MolecularFunction_Symptom CuratedKG       1
                    Pathway_Symptom CuratedKG       1
                       Anatomy_Gene CuratedKG       1
                    Microbe_Pathway CuratedKG       1
          BiologicalProcess_Protein CuratedKG       1
                 Metabolite_Symptom CuratedKG       1
                   Compound_Symptom CuratedKG       1
         Disease_PharmacologicClass CuratedKG       1
          BiologicalProcess_Pathway CuratedKG       2
                    Protein_Symptom CuratedKG       2
                   Compound_Pathway CuratedKG       2
                    Disease_Protein CuratedKG       2
                   Compound_Microbe CuratedKG       2
                          Gene_Gene CuratedKG       3
                 Metabolite_Pathway CuratedKG       3
                    Microbe_

In [9]:
# use plotly to plot the distribution of relations, but don't include the "Total" row and ignore the count less than 10
import plotly.express as px

import plotly.io as pio
pio.renderers.default = 'vscode'

kept_relations = ["Compound_Compound", "Compound_Disease", "Compound_Gene", "Disease_Disease", "Disease_Gene", "Disease_Symptom", "Gene_Gene", "Gene_Pathway", "Pathway_Pathway"]

color_map = {
    "CuratedKG": "#636efa",
    "bioarx": "#EF553B",
    "Hetionet": "#00cc96",
    "INTACT": "#ab63fa",
    "DRUGBANK": "#FFA15A",
    "DGIDB": "#19d3f3",
    "GNBR": "#FF6692",
    "STRING": "#B6E880"
}

relations_grouped_df = relations_grouped_df[relations_grouped_df["source_target"].isin(kept_relations)]

fig = px.bar(relations_grouped_df, x="source_target", y="count", title="Distribution of relations [CustomKG]", color="resource", color_discrete_map=color_map)
# Order the x axis by the label of the relations
fig.update_xaxes(categoryorder="array", categoryarray=kept_relations)
# fig.update_traces(texttemplate='%{value:.2s}', textposition='inside')
fig.show()

# Extract the colors assigned to the bars
bar_colors = fig.data

# Output the colors
print(bar_colors)

(Bar({
    'alignmentgroup': 'True',
    'hovertemplate': 'resource=CuratedKG<br>source_target=%{x}<br>count=%{y}<extra></extra>',
    'legendgroup': 'CuratedKG',
    'marker': {'color': '#636efa', 'pattern': {'shape': ''}},
    'name': 'CuratedKG',
    'offsetgroup': 'CuratedKG',
    'orientation': 'v',
    'showlegend': True,
    'textposition': 'auto',
    'x': array(['Gene_Pathway', 'Gene_Gene', 'Compound_Compound', 'Compound_Gene',
                'Disease_Disease', 'Compound_Disease', 'Disease_Gene',
                'Disease_Symptom'], dtype=object),
    'xaxis': 'x',
    'y': array([     1,      3,      4,     30,    105,    177,    212, 140719]),
    'yaxis': 'y'
}), Bar({
    'alignmentgroup': 'True',
    'hovertemplate': 'resource=bioarx<br>source_target=%{x}<br>count=%{y}<extra></extra>',
    'legendgroup': 'bioarx',
    'marker': {'color': '#EF553B', 'pattern': {'shape': ''}},
    'name': 'bioarx',
    'offsetgroup': 'bioarx',
    'orientation': 'v',
    'showlegend': True,

In [16]:
# Merge entity_grouped_df and drkg_grouped_df and add a new column to indicate the source
entity_grouped_df["source"] = "CustomKG"
# Remove the "Total" row
drkg_grouped_df["source"] = "DRKG"

# Concatenate the two dataframes
merged_df = pd.concat([entity_grouped_df, drkg_grouped_df])

# Plot the distribution of entities
fig = px.bar(merged_df, x="label", y="count", title="Distribution of entities", color="source", barmode="group")
fig.update_traces(texttemplate='%{value:.2s}', textposition='outside')
fig.show()

In [1]:
def check_relation_exists(row, relations_df):
    source_type, source_id = row["source"].split("::")
    target_type, target_id = row["target"].split("::")
    option1 = (source_id, source_type, target_id, target_type) in relations_df.index

    target_type, target_id = row["source"].split("::")
    source_type, source_id = row["target"].split("::")
    option2 = (source_id, source_type, target_id, target_type) in relations_df.index
    return option1 or option2

In [3]:
import pandas as pd

def merge_scores_with_entities_relations(
    scores: pd.DataFrame,
    entities: pd.DataFrame,
    relations: pd.DataFrame,
    target: str = "head",
) -> pd.DataFrame:
    """Merge the topkpd with the entities and relations dataframes

    Args:
        scores (pd.DataFrame): The scores dataframe, which have four columns: head, rel, tail, score
        entities (pd.DataFrame): The entities dataframe
        relations (pd.DataFrame): The relations dataframe

    Returns:
        pd.DataFrame: The merged dataframe
    """
    # Join the topkpd with the entity dataframe
    df = entities.copy()
    relations_df = relations.copy()

    scores = scores.rename(
        columns={"head": "source", "tail": "target", "rel": "relation"}
    )

    column_name = "source" if target == "head" else "target"

    df["node_id"] = entities["label"] + "::" + entities["id"]
    merged = scores.merge(df, left_on=column_name, right_on="node_id")

    relations_df.set_index(
        ["source_id", "source_type", "target_id", "target_type"], inplace=True
    )

    # Apply the function with axis=1 to check if the relationship exists
    merged["status"] = merged.apply(check_relation_exists, args=(relations_df,), axis=1)
    return merged

In [5]:
import os

rootdir = os.getcwd()
entity_file = os.path.join(rootdir, "graph_data/entities.tsv")
entities = pd.read_csv(entity_file, sep="\t", dtype=str)

relations_df = pd.read_csv(
    os.path.join(
        rootdir,
        "datasets/drkg+hsdn+custom+malacards/raw_relations.tsv",
    ),
    sep="\t",
    dtype=str,
)

In [6]:
mecfs_scores = pd.read_csv(
    "/Users/jy006/Downloads/mecfs_scores_1000.tsv", sep="\t", dtype=str
)

# Merge scores with entities, scores have four columns: head, rel, tail, score
mecfs_scores_with_entities = merge_scores_with_entities_relations(
    mecfs_scores, entities, relations_df, target="head"
)

mecfs_scores_with_entities.to_csv(
    "/Users/jy006/Downloads/mecfs_scores_with_entities.tsv", sep="\t", index=False
)

  option1 = (source_id, source_type, target_id, target_type) in relations_df.index
  option2 = (source_id, source_type, target_id, target_type) in relations_df.index


In [7]:
import numpy as np
import pandas as pd

# 模拟数据：假设有5个实体和3种关系
num_entities = 5
num_relations = 3
embedding_dim = 10  # 嵌入维度

# 随机生成实体和关系的嵌入
np.random.seed(0)
entity_embeddings = np.random.rand(num_entities, embedding_dim)
relation_embeddings = np.random.rand(num_relations, embedding_dim)

# 模拟的三元组数据集
# 假设我们关注的目标节点是实体0，我们将考虑所有以实体0为尾实体的三元组
target_entity = 0
triplets = [
    ("entity_%s" % h, "relation_%s" % r, "entity_%s" % target_entity)
    for h in range(num_entities)
    for r in range(num_relations)
]

# Save to dataframe

triplets_df = pd.DataFrame(triplets)
triplets_df.to_csv(
    "/Users/jy006/Downloads/biomedgps/triplets.tsv", sep="\t", index=False, header=False
)

entity_embeddings_df = pd.DataFrame(entity_embeddings)
entity_embeddings_df["embedding"] = entity_embeddings_df.apply(
    lambda row: "|".join([str(x) for x in row.values]), axis=1
)
entity_embeddings_df["id"] = ["entity_%s" % i for i in entity_embeddings_df.index.to_list()]
entity_embeddings_df = entity_embeddings_df[["id", "embedding"]]
entity_embeddings_df.to_csv(
    "/Users/jy006/Downloads/biomedgps/entity_embeddings.tsv", sep="\t", index=False
)

relation_embeddings_df = pd.DataFrame(relation_embeddings)
relation_embeddings_df["embedding"] = relation_embeddings_df.apply(
    lambda row: "|".join([str(x) for x in row.values]), axis=1
)
relation_embeddings_df["id"] = ["relation_%s" % i for i in relation_embeddings_df.index.to_list()]
relation_embeddings_df = relation_embeddings_df[["id", "embedding"]]
relation_embeddings_df.to_csv(
    "/Users/jy006/Downloads/biomedgps/relation_embeddings.tsv", sep="\t", index=False
)

### Annotation

In [2]:
import os
import pandas as pd

rootdir = os.getcwd()
entity_file = os.path.join(rootdir, "graph_data/entities.tsv")
entities = pd.read_csv(entity_file, sep="\t", dtype=str)

formatted_drkg = pd.read_csv(
    os.path.join(
        rootdir,
        "datasets/rawdata/drkg/formatted_drkg.tsv",
    ),
    sep="\t",
    dtype=str,
)

formatted_custom_mecfs = pd.read_csv(
    os.path.join(
        rootdir,
        "datasets/custom/formatted_custom_mecfs.tsv",
    ),
    sep="\t",
    dtype=str,
)

formatted_hsdn = pd.read_csv(
    os.path.join(
        rootdir,
        "datasets/custom/formatted_hsdn.tsv",
    ),
    sep="\t",
    dtype=str,
)

formatted_malacards = pd.read_csv(
    os.path.join(
        rootdir,
        "datasets/custom/formatted_malacards.tsv",
    ),
    sep="\t",
    dtype=str,
)

relations_df = pd.concat(
    [formatted_drkg, formatted_custom_mecfs, formatted_hsdn, formatted_malacards]
)

relations_df = relations_df[[
    "relation_type", "resource", "pmids", "key_sentence",
    "source_id", "source_type",	"target_id", "target_type"
]]

In [3]:
# Annotate relations_df with the entities dataframe
entities = entities[['id', 'label', 'name', 'description']]
relations_df = relations_df.merge(entities, left_on=["source_id", "source_type"], right_on=["id", "label"])
relations_df = relations_df.rename(columns={"name": "source_name", "description": "source_description"})
relations_df = relations_df.drop(columns=["id", "label"])
relations_df = relations_df.merge(entities, left_on=["target_id", "target_type"], right_on=["id", "label"])
relations_df = relations_df.rename(columns={"name": "target_name", "description": "target_description"})
relations_df = relations_df.drop(columns=["id", "label"])

# Save to file
relations_df.to_csv(
    "/Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/knowledge_graph_annotated.tsv", sep="\t", index=False
)

In [None]:
relations_df = relations_df.drop(columns=["source_description", "target_description"])
# Save to file
relations_df.to_csv(
    "/Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/knowledge_graph.tsv",
    sep="\t",
    index=False,
)