## Extract relationships from a ttl file and output them as a biomedgps tsv file.

### 1. Dependencies

In [50]:
import re
import os
import rdflib
import pandas as pd

def extract(ttl_file, dest_dir):
    """Convert one WikiPathways TTL file into a BioMedGPS relationship TSV.

    The file is parsed as Turtle. If none of its ``wp:organism`` values is
    human (NCBITaxon_9606), mouse (NCBITaxon_10090) or rat (NCBITaxon_10116),
    a message is printed and nothing is written.

    For every subject under ``http://rdf.wikipathways.org/Pathway/``:

    * subjects whose URI contains ``Interaction`` and that have both
      ``wp:source`` and ``wp:target`` yield ``DirectedInteraction`` rows plus
      ``InPathway`` rows for their endpoints;
    * other subjects whose URI contains ``Complex`` yield ``InPathway`` rows
      for each typed ``wp:participants`` member and a ``ComplexBinding`` row
      for every unordered pair of members.

    Identifiers that do not start with a known prefix are reported with a
    printed warning but are still written.

    Args:
        ttl_file: Path to the ``.ttl`` file. Its base name without the
            ``.ttl`` suffix is used as the pathway id and output file name.
        dest_dir: Directory that receives ``<name>.tsv``; it must exist.

    Returns:
        None. The de-duplicated rows are written as a tab-separated file.
    """
    # Parse the TTL file. Turtle documents are UTF-8 by specification, and the
    # context manager closes the handle instead of leaking it.
    g = rdflib.Graph()
    with open(ttl_file, "r", encoding="utf-8") as ttl_handle:
        g.parse(data=ttl_handle.read(), format="turtle")

    # Namespaces and constants
    wp = rdflib.Namespace("http://vocabularies.wikipathways.org/wp#")
    rdf = rdflib.namespace.RDF
    pathway_prefix = "http://rdf.wikipathways.org/Pathway"
    supported_taxa = {
        "http://purl.obolibrary.org/obo/NCBITaxon_9606",
        "http://purl.obolibrary.org/obo/NCBITaxon_10090",
        "http://purl.obolibrary.org/obo/NCBITaxon_10116",
    }
    valid_id_prefixes = ("ENTREZ", "HMDB", "CHEBI", "ENSEMBL", "WP", "UniProt")

    pathway_name = os.path.basename(ttl_file).removesuffix(".ttl")
    pathway_id = "Wikipathways:" + pathway_name

    def extract_id(uri):
        """Return the last ``/``-separated segment of a URI."""
        return uri.split("/")[-1]

    def get_type(graph, node):
        """Map the node's rdf:type values to "Metabolite", "Gene" or None."""
        types = [str(t).split("#")[-1] for t in graph.objects(node, rdf.type)]
        if "Metabolite" in types:
            return "Metabolite"
        if "GeneProduct" in types or "Protein" in types:
            return "Gene"
        return None

    def format_id(node, graph):
        """Return ``(identifier, entity_type)`` for a graph node.

        Nodes that cannot be mapped to a prefixed identifier are returned
        unchanged together with whatever type was detected (possibly None).
        """
        local_id = extract_id(node)
        entity_type = get_type(graph, node)
        if entity_type == "Gene":
            if "ncbigene" in node:
                return f"ENTREZ:{local_id}", "Gene"
            if "ensembl" in node:
                return f"ENSEMBL:{local_id}", "Gene"
            if "uniprot" in node:
                # Prefer the Entrez cross-reference when the graph has one.
                entrez_id = graph.value(rdflib.URIRef(node), wp.bdbEntrezGene)
                if entrez_id:
                    return f"ENTREZ:{extract_id(str(entrez_id))}", "Gene"
                return f"UniProt:{local_id}", "Gene"
        elif entity_type == "Metabolite":
            # Test the local id, not the full URI: a URI never starts with
            # "HMDB", so checking the URI made this shortcut unreachable.
            if local_id.startswith("HMDB"):
                return f"HMDB:{local_id}", "Metabolite"
            hmdb_id = graph.value(rdflib.URIRef(node), wp.bdbHmdb)
            if hmdb_id:
                return f"HMDB:{extract_id(str(hmdb_id))}", "Metabolite"
        return node, entity_type

    def is_invalid_id(identifier):
        """Return True when the identifier lacks every known prefix."""
        return not identifier.startswith(valid_id_prefixes)

    def in_pathway_row(entity_id, entity_type):
        """Build the row linking an entity to the current pathway."""
        return [
            entity_id,
            pathway_id,
            entity_type,
            "Pathway",
            f"Wikipathways::InPathway::{entity_type}:Pathway",
            "Wikipathways",
        ]

    def relation_row(kind, source_id, source_type, target_id, target_type):
        """Build an entity-to-entity row of the given relation kind."""
        return [
            source_id,
            target_id,
            source_type,
            target_type,
            f"Wikipathways::{kind}::{source_type}:{target_type}",
            "Wikipathways",
        ]

    query = """
    PREFIX wp: <http://vocabularies.wikipathways.org/wp#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?organism ?organismName
    WHERE {
    ?s wp:organism ?organism ;
        wp:organismName ?organismName .
    }
    """

    is_human_mouse_rat = False
    for row in g.query(query):
        print("\t", row.organism, row.organismName)
        if row.organism and str(row.organism) in supported_taxa:
            is_human_mouse_rat = True
            break

    if not is_human_mouse_rat:
        print(f"Skipping {ttl_file} as it is not a human, mouse, or rat pathway")
        return

    # List to hold extracted relationships
    relationships = []

    # Iterating the graph yields a subject once per triple it appears in, so
    # visit each distinct subject only once. First-seen order is kept, which
    # gives the same rows after drop_duplicates() without the repeated work.
    for s in dict.fromkeys(subject for subject, _, _ in g):
        subject_uri = str(s)
        if not subject_uri.startswith(pathway_prefix + "/"):
            continue

        if "Interaction" in subject_uri:
            sources = list(g.objects(s, wp.source))
            targets = list(g.objects(s, wp.target))
            # NOTE(review): interactions lacking a source or a target are
            # dropped here and never reach the participants branch below,
            # because that branch is an ``elif`` -- confirm this is intended.
            if not (sources and targets):
                continue

            for source in sources:
                source_id, source_type = format_id(source, g)
                # A source inside the pathway namespace is not an entity.
                source_is_internal = pathway_prefix in str(source)

                for target in targets:
                    target_id, target_type = format_id(target, g)

                    if is_invalid_id(source_id) or is_invalid_id(target_id):
                        print(f"Foud invalid id: {source_id} or {target_id} at {ttl_file}")

                    if source_is_internal:
                        relationships.append(in_pathway_row(target_id, target_type))
                        continue

                    if pathway_prefix in str(target):
                        relationships.append(in_pathway_row(source_id, source_type))
                        continue

                    # NOTE(review): endpoints whose type is None still produce
                    # rows here, unlike participants below -- confirm.
                    relationships.append(
                        relation_row(
                            "DirectedInteraction",
                            source_id,
                            source_type,
                            target_id,
                            target_type,
                        )
                    )
                    relationships.append(in_pathway_row(target_id, target_type))

                # Skip the membership row for pathway-internal sources; it
                # used to be emitted with a raw URI and a None type.
                if not source_is_internal:
                    relationships.append(in_pathway_row(source_id, source_type))

        elif "Complex" in subject_uri:
            # "ComplexBinding" also contains "Complex", and "Interaction" was
            # already handled above, so this single test covers the branch.
            all_items = []
            for participant in g.objects(s, wp.participants):
                p_id, p_type = format_id(participant, g)
                if p_type is None:
                    continue

                if is_invalid_id(p_id):
                    print(f"Foud invalid id: {p_id} at {ttl_file}")
                all_items.append((p_id, p_type))
                relationships.append(in_pathway_row(p_id, p_type))

            # Generate all unordered pairs of participants
            for i, (source_id, source_type) in enumerate(all_items):
                for target_id, target_type in all_items[i + 1:]:
                    relationships.append(
                        relation_row(
                            "ComplexBinding",
                            source_id,
                            source_type,
                            target_id,
                            target_type,
                        )
                    )

    # Convert to DataFrame
    columns = [
        "source_id",
        "target_id",
        "source_type",
        "target_type",
        "relation_type",
        "resource",
    ]
    df = pd.DataFrame(relationships, columns=columns).drop_duplicates()

    df.to_csv(os.path.join(dest_dir, pathway_name + ".tsv"), index=False, sep="\t")

### 2. Download the ttl file from the following link and unzip it.

https://data.wikipathways.org/current/rdf/

Unzip the archive; it creates a `wp/` folder containing one ttl file per pathway, which is used as the input folder in step 3.

In [2]:
import os

# Fetch the WikiPathways RDF archive with wget, then unpack it in the current
# working directory. The archive file name is the last path segment of the
# link, which is the name wget saves the download under.
link = "https://data.wikipathways.org/current/rdf/wikipathways-20240610-rdf-wp.zip"
archive_name = link.rsplit("/", 1)[-1]
os.system("wget " + link)
os.system("unzip " + archive_name)

--2024-06-14 10:39:38--  https://data.wikipathways.org/current/rdf/wikipathways-20240610-rdf-wp.zip
Resolving data.wikipathways.org (data.wikipathways.org)... 185.199.111.153, 185.199.108.153, 185.199.109.153, ...
Connecting to data.wikipathways.org (data.wikipathways.org)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13983770 (13M) [application/zip]
Saving to: ‘wikipathways-20240610-rdf-wp.zip’

     0K .......... .......... .......... .......... ..........  0% 1.63M 8s
    50K .......... .......... .......... .......... ..........  0% 2.69M 7s
   100K .......... .......... .......... .......... ..........  1% 4.57M 5s
   150K .......... .......... .......... .......... ..........  1% 3.04M 5s
   200K .......... .......... .......... .......... ..........  1% 3.16M 5s
   250K .......... .......... .......... .......... ..........  2% 4.74M 4s
   300K .......... .......... .......... .......... ..........  2% 3.25M 4s
   350K .......... .....

Archive:  wikipathways-20240610-rdf-wp.zip
   creating: wp/
  inflating: wp/WP508.ttl            
  inflating: wp/WP4206.ttl           
  inflating: wp/WP427.ttl            
  inflating: wp/WP5224.ttl           
  inflating: wp/WP85.ttl             
  inflating: wp/WP1541.ttl           
  inflating: wp/WP4944.ttl           
  inflating: wp/WP4853.ttl           
  inflating: wp/WP2860.ttl           
  inflating: wp/WP3185.ttl           
  inflating: wp/WP2944.ttl           
  inflating: wp/WP1591.ttl           
  inflating: wp/WP5043.ttl           
  inflating: wp/WP5420.ttl           
  inflating: wp/WP332.ttl            
  inflating: wp/WP2113.ttl           
  inflating: wp/WP5181.ttl           
  inflating: wp/WP5199.ttl           
  inflating: wp/WP4545.ttl           
  inflating: wp/WP2212.ttl           
  inflating: wp/WP3248.ttl           
  inflating: wp/WP5143.ttl           
  inflating: wp/WP1016.ttl           
  inflating: wp/WP2839.ttl           
  inflating: wp/WP4504.ttl  

0

### 3. List and convert all ttl files to tsv files.

In [51]:
import shutil

# Input: the unpacked ``wp`` folder; output: a fresh ``wp_extracted`` folder,
# both resolved against the current working directory.
wp_input_dir = os.path.join(os.getcwd(), "wp")
wp_dest_dir = os.path.join(os.getcwd(), "wp_extracted")

# Start from an empty output directory. The path is removed through the
# standard library rather than an interpolated ``rm -r`` shell string, which
# would split on spaces in the working directory and delete the wrong targets.
if os.path.islink(wp_dest_dir) or os.path.isfile(wp_dest_dir):
    os.remove(wp_dest_dir)
elif os.path.isdir(wp_dest_dir):
    shutil.rmtree(wp_dest_dir)
os.makedirs(wp_dest_dir, exist_ok=True)

ttl_files = [
    os.path.join(wp_input_dir, f)
    for f in os.listdir(wp_input_dir)
    if f.endswith(".ttl") and f.startswith("WP")
]
for ttl_file in ttl_files:
    # NOTE(review): only WP5347 is converted; this looks like a debugging
    # filter -- remove the condition to convert every pathway file.
    if ttl_file.endswith("WP5347.ttl"):
        print(f"Processing {ttl_file}")
        extract(ttl_file, wp_dest_dir)

Processing /Users/jy006/Documents/Code/BioMedGPS/biomedgps-data/graph_data/relations/wikipathways/wp/WP5347.ttl
	 http://purl.obolibrary.org/obo/NCBITaxon_9606 Homo sapiens
