In [10]:
import rdflib
import csv
import time

# Relative path of the CSV file read by the triple-generation cell.
ruta_csv = "../bbdd/datos_mircancer"

# Create the graph and load the existing Turtle file into it.
ncrna2gene = rdflib.Graph()
ncrna2gene.parse(source="ncrna2gene.txt", format="turtle")

<Graph identifier=Nafea39ef5c1744cab6ab78ad75a0d760 (<class 'rdflib.graph.Graph'>)>

In [11]:
start = time.time()

# Bind prefixes to namespaces so the serialized output uses compact names.
# Every prefix and every namespace appears exactly once in this mapping:
# binding the same prefix to two namespaces (or the same namespace under two
# prefixes) leaves rdflib to rename or replace one of the bindings.
namespace_prefixes = {
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "owl": "http://www.w3.org/2002/07/owl#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "dcterms": "http://purl.org/dc/terms/",
    "obo": "http://purl.obolibrary.org/obo/",
    "sio": "http://semanticscience.org/resource/",
    "faldo": "http://biohackathon.org/resource/faldo#",
    "ncrna2genebg": "http://rdf.biogateway.eu/ncrna2gene/9606/",
    "genebg": "http://rdf.biogateway.eu/gene/9606/",
    "ncrnabg": "http://rdf.biogateway.eu/ncrna/9606/",
    "assembly": "https://www.ncbi.nlm.nih.gov/assembly/",
    "schema": "http://schema.org/",
    "bao": "http://www.bioassayontology.org/bao#",
    "ncbi": "https://www.ncbi.nlm.nih.gov/",
    "id": "http://identifiers.org/",
    # Host is w3id.org (the previous value "wx3id.org" was a typo).
    "biolink": "https://w3id.org/biolink/vocab/",
}
for prefix, namespace_uri in namespace_prefixes.items():
    ncrna2gene.bind(prefix, rdflib.URIRef(namespace_uri))

# Map the name of each source database to the URI used to identify it.
database_ids = dict(
    RefSeq="http://purl.obolibrary.org/obo/NCIT_C45335",
    Ensembl="http://purl.obolibrary.org/obo/NCIT_C45763",
    miRBase="http://purl.obolibrary.org/obo/MI_2358",
    miRCancer="http://mircancer.ecu.edu/",
    LNCipedia="https://lncipedia.org/download",
)


# Database that every triple added in this run is attributed to. It is set
# before the loop because the output file name below is derived from it, and
# that must work even when no CSV row passes the filters.
data_source = "miRCancer"
database_uri = database_ids.get(data_source, "http://purl.obolibrary.org/obo/NCIT_C15426")

# Terms that are identical for every row, built once instead of per iteration.
SKOS = rdflib.namespace.SKOS
database_ref = rdflib.URIRef(database_uri)
statement_predicate = rdflib.URIRef("http://semanticscience.org/resource/SIO_010080")
evidence_origin = rdflib.URIRef("http://schema.org/evidenceOrigin")
sio_000253 = rdflib.URIRef("http://semanticscience.org/resource/SIO_000253")

with open(ruta_csv, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Skip rows that carry no usable gene identifier.
        if row['Gene_ID'] in (None, '', 'NA'):
            continue
        gene_id = row['Gene_ID']

        # Keep only the last path segment in case the ID is given as a URI.
        # A missing or empty ID is skipped: it would either raise on .split
        # or produce URIs with an empty local name.
        ncrna_id = (row['ncRNA_ID'] or '').split('/')[-1]
        if not ncrna_id:
            continue

        # NOTE(review): the statement URI does not start with the namespace
        # bound to the "ncrna2genebg" prefix (".../ncrna2gene/9606/"); the
        # existing identifier scheme is kept as is -- confirm it is intended.
        label = f"bgw!{ncrna_id}--hgncsymbol!9606/{gene_id}"
        definition = f"Non-coding RNA {ncrna_id} encoded by gene {gene_id}"
        statement_uri = f"http://rdf.biogateway.eu/ncrna2gene/{label}"

        ncRNA = rdflib.URIRef(f"http://rdf.biogateway.eu/ncrna/9606/{ncrna_id}")
        gene = rdflib.URIRef(f"http://rdf.biogateway.eu/gene/9606/{gene_id}")
        ncrna2gen = rdflib.URIRef(statement_uri)
        ncrna2gene_source = rdflib.URIRef(f"{statement_uri}#{data_source}")

        # The relation is modelled as a class that specialises rdf:Statement;
        # the source-specific node is typed as an instance of that class.
        ncrna2gene.add((ncrna2gen, rdflib.RDF.type, rdflib.OWL.Class))
        ncrna2gene.add((ncrna2gene_source, rdflib.RDF.type, ncrna2gen))
        ncrna2gene.add((ncrna2gen, rdflib.RDFS.subClassOf, rdflib.RDF.Statement))
        ncrna2gene.add((ncrna2gen, rdflib.RDF.predicate, statement_predicate))

        # Human-readable label and definition, with and without the source.
        ncrna2gene.add((ncrna2gen, SKOS.prefLabel, rdflib.Literal(label)))
        ncrna2gene.add((ncrna2gene_source, SKOS.prefLabel, rdflib.Literal(f"{label} according to {data_source}.")))
        ncrna2gene.add((ncrna2gen, SKOS.definition, rdflib.Literal(f"{definition}.")))
        ncrna2gene.add((ncrna2gene_source, SKOS.definition, rdflib.Literal(f"{definition} according to {data_source}.")))

        # Subject and object of the reified statement.
        ncrna2gene.add((ncrna2gen, rdflib.RDF.object, gene))
        ncrna2gene.add((ncrna2gen, rdflib.RDF.subject, ncRNA))

        # Provenance: link the source-specific node to the database URI.
        ncrna2gene.add((ncrna2gene_source, evidence_origin, database_ref))
        ncrna2gene.add((ncrna2gene_source, sio_000253, database_ref))

# NOTE(review): the file is written in Turtle syntax although its name ends
# in ".rdf"; the name is left unchanged in case other steps rely on it.
output_name = f'ncrna2gene_{data_source.lower()}.rdf'
ncrna2gene.serialize(destination=output_name, format='turtle')

end = time.time()


In [5]:
# Elapsed wall-clock time of the generation cell, converted from seconds to minutes.
total_time = (end - start) / 60

# Report which file was written and how long it took.
mensaje = f"El fichero '{output_name}' ha sido creado en {total_time:.2f} minutos."
print(mensaje)

El fichero 'ncrna2gene_ensembl.rdf' ha sido creado en 0.83 minutos.
