In [1]:
import grape
import pandas as pd

2024-01-27 14:12:31.900236: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
INFO:pykeen.utils:Using opt_einsum


In [2]:
from grape.datasets.kgobo import NCBITAXON

In [3]:
# Instantiate the NCBITaxon graph once and throw the result away.
# NOTE(review): presumably this call is only here so that grape downloads the KG-OBO
# files under ./graphs/kgobo/NCBITAXON/ that the next cell reads with pandas — confirm.
_ = NCBITAXON()

In [4]:
# Read the NCBITaxon KGX node table (tab-separated) into a DataFrame.
# `low_memory=False` makes pandas parse the file in one pass instead of in chunks.
# NOTE(review): `ncbi` is not referenced by any later cell visible in this notebook.
ncbi = pd.read_csv(
    "./graphs/kgobo/NCBITAXON/2023-09-14/ncbitaxon_kgx_tsv/ncbitaxon_kgx_tsv_nodes.tsv",
    sep="\t",
    low_memory=False,
)

In [5]:
# Load the frozen metadata table (gzip-compressed CSV; pandas infers the compression
# from the ".gz" suffix). Later cells use its `organism_wikidata` and
# `organism_taxonomy_ncbiid` columns.
species = pd.read_csv(
    "./data/molecules/230106_frozen_metadata.csv.gz",
    low_memory=False,
)

In [6]:
# Derive a CURIE-style Wikidata identifier ("wd:Q12345") for every row by pulling the
# Q-number out of the `organism_wikidata` column.
# `expand=False` makes `str.extract` return a Series rather than a one-column DataFrame,
# so this is a plain Series-to-column assignment instead of relying on pandas accepting
# a single-column frame as a column value. Rows without a Q-number become NaN.
species['wd_taxon'] = 'wd:' + species['organism_wikidata'].str.extract(r"(Q\d+)", expand=False)

In [7]:
# Keep one row per NCBI taxon id, considering only rows that can form a complete
# Wikidata <-> NCBI mapping.
# Rows whose `wd_taxon` is NaN (no Q-number could be extracted) are dropped together with
# rows lacking an NCBI id. Otherwise they would end up as NaN source nodes in the edge
# table built from `species`, and could even win the de-duplication below over a valid
# row that shares the same NCBI id.
# NOTE(review): de-duplicating on the NCBI id alone keeps a single Wikidata item per
# taxon id; if several Wikidata items share an id, only the first gets an edge — confirm
# this is intended.
species = (
    species
    .dropna(subset=['organism_taxonomy_ncbiid', 'wd_taxon'])
    .drop_duplicates(subset='organism_taxonomy_ncbiid')
)

In [8]:
# Edge table bridging the two identifier spaces: one `biolink:same_as` edge per row of
# `species`, from the Wikidata CURIE to the matching "NCBITaxon:<id>" CURIE.
# The NCBI id is cast to int first so the CURIE carries no trailing ".0".
wd_to_ncbi_edges = pd.DataFrame({
    'wikidata': species['wd_taxon'],
    'ncbi': 'NCBITaxon:' + species['organism_taxonomy_ncbiid'].astype(int).astype(str),
    'type': 'biolink:same_as',
})

In [9]:
# Node table for the bridge graph: every distinct endpoint of `wd_to_ncbi_edges`, tagged
# with a node category. Wikidata endpoints and NCBI endpoints get different categories.
# NOTE(review): the NCBITaxon nodes are labelled `biolink:OrganismalEntity` while the
# Wikidata nodes are labelled `biolink:OrganismTaxon` — confirm these categories match the
# node types used by the graphs this one is later merged with.
ncbi_node = pd.concat(
    [
        wd_to_ncbi_edges[[endpoint_column]]
        .rename(columns={endpoint_column: 'node'})
        .assign(type=category)
        for endpoint_column, category in (
            ('wikidata', 'biolink:OrganismTaxon'),
            ('ncbi', 'biolink:OrganismalEntity'),
        )
    ]
).drop_duplicates()

In [10]:
from grape import Graph

In [11]:
# Build a directed graph that contains only the Wikidata -> NCBITaxon bridge edges.
wd_to_ncbi_graph = Graph.from_pd(
    # Edge table and the columns holding source, destination and edge type.
    edges_df=wd_to_ncbi_edges,
    edge_src_column='wikidata',
    edge_dst_column='ncbi',
    edge_type_column='type',
    # Node table and the columns holding node name and node type.
    nodes_df=ncbi_node,
    node_name_column='node',
    node_type_column='type',
    directed=True,
)

In [12]:
# Load the LOTUS graph as a directed graph from tab-separated node and edge lists
# (both files carry a header row despite the ".csv" extension).
# Edge weights are not loaded: the `weights_column_number=3` option is left disabled.
lotus = Graph.from_csv(
    name="LOTUS",
    directed=True,
    # Node list: column 0 = node name, column 1 = node type.
    node_path="./data/full_lotus_nodes.csv",
    node_list_separator="\t",
    node_list_header=True,
    nodes_column_number=0,
    node_list_node_types_column_number=1,
    # Edge list: column 0 = source, column 1 = destination, column 2 = edge type.
    edge_path="./data/full_lotus_edges.csv",
    edge_list_separator="\t",
    edge_list_header=True,
    sources_column_number=0,
    destinations_column_number=1,
    edge_list_edge_types_column_number=2,
)

In [13]:
# Load the NCBITaxon graph, this time keeping the result so it can be merged below.
# (An earlier cell already called NCBITAXON() once and discarded the result.)
ncbi_graph = NCBITAXON()

In [14]:
# Merge the three graphs with the `|` operator: LOTUS, the Wikidata <-> NCBI bridge, and the
# NCBITaxon graph converted via `to_directed()`.
# NOTE(review): presumably the conversion is needed because the other two graphs were
# built with directed=True and the operands must agree — confirm.
lotus_with_ncbi = (
    lotus
    | wd_to_ncbi_graph
    | ncbi_graph.to_directed()
)

In [15]:
# Write the node list of the merged graph to disk, with a header row:
# column 0 ("nodes") holds the node name, column 1 ("type") holds the node type.
lotus_with_ncbi.dump_nodes(
    path="./data/lotus_with_ncbi_nodes.csv",
    header=True,
    nodes_column="nodes",
    nodes_column_number=0,
    node_type_column="type",
    node_types_column_number=1,
)

In [16]:
# Write the directed edge list of the merged graph to disk, with a header row;
# the edge type goes into column 2 under the name "edge_type".
lotus_with_ncbi.dump_edges(
    path="./data/lotus_with_ncbi_edges.csv",
    directed=True,
    header=True,
    edge_type_column='edge_type',
    edge_types_column_number=2,
)

In [17]:
# Produce a cleaned copy of the merged graph in two steps: drop singleton nodes, then
# filter connected components with `top_k_components=1`.
# NOTE(review): presumably this keeps only the single largest component — confirm
# against the grape documentation.
lotus_with_ncbi_cleaned = (
    lotus_with_ncbi
    .remove_singleton_nodes()
    .remove_components(top_k_components=1)
)

In [18]:
# Persist the cleaned graph using the same layout as the uncleaned dump, to "*_clean_*" files.

# Node list: column 0 ("nodes") = node name, column 1 ("type") = node type.
lotus_with_ncbi_cleaned.dump_nodes(
    path="./data/lotus_with_ncbi_clean_nodes.csv",
    header=True,
    nodes_column="nodes",
    nodes_column_number=0,
    node_type_column="type",
    node_types_column_number=1,
)

# Directed edge list: edge type in column 2 under the name "edge_type".
lotus_with_ncbi_cleaned.dump_edges(
    path="./data/lotus_with_ncbi_clean_edges.csv",
    directed=True,
    header=True,
    edge_type_column='edge_type',
    edge_types_column_number=2,
)