From 3b0fc3e11e5e7cf3ace7354fc0797cde37426f29 Mon Sep 17 00:00:00 2001 From: TomConlin Date: Fri, 20 May 2016 04:01:47 -0700 Subject: [PATCH] cross scv links exist --- dipper/sources/ClinVarXML_alpha.py | 96 ++++++++++++++++-------------- 1 file changed, 51 insertions(+), 45 deletions(-) diff --git a/dipper/sources/ClinVarXML_alpha.py b/dipper/sources/ClinVarXML_alpha.py index fd50bdcf..fd549784 100755 --- a/dipper/sources/ClinVarXML_alpha.py +++ b/dipper/sources/ClinVarXML_alpha.py @@ -12,10 +12,14 @@ import sys import gzip import hashlib +import logging import argparse import xml.etree.ElementTree as ET # import Requests + +logger = logging.getLogger(__name__) + # from dipper import curie_map # hangs on to stale data? @@ -27,15 +31,15 @@ # The name of the ingest we are doing IPATH = re.split(r'/', os.path.realpath(__file__)) -(INAME, dotpy) = re.split(r'\.', IPATH[-1].lower()) +(INAME, DOTPY) = re.split(r'\.', IPATH[-1].lower()) RPATH = '/' + '/'.join(IPATH[1:-3]) FILES = {'f1': 'ClinVarFullRelease_00-latest.xml.gz'} # I am not positive allowing the slash is ligit CURIERE = re.compile(r'^.*:[A-Za-z0-9_][A-Za-z0-9_./]*[A-Za-z0-9_]$') - -ENIGMA = 'https://submit.ncbi.nlm.nih.gov/ft/byid/hxnfuuxx/enigma_rules_2015-03-26.pdf' +ENIGMA = \ + 'https://submit.ncbi.nlm.nih.gov/ft/byid/hxnfuuxx/enigma_rules_2015-03-26.pdf' # handle arguments for IO argparser = argparse.ArgumentParser() @@ -82,7 +86,7 @@ 'foaf': 'http://xmlns.com/foaf/0.1/', '_': 'https://monarchinitiave.org/.well-known/genid/', 'BFO': 'http://purl.obolibrary.org/obo/BFO_', - 'ERO' : 'http://purl.obolibrary.org/obo/ERO_', + 'ERO': 'http://purl.obolibrary.org/obo/ERO_', 'GENO': 'http://purl.obolibrary.org/obo/GENO_', 'GO': 'http://purl.obolibrary.org/obo/GO_', 'RO': 'http://purl.obolibrary.org/obo/RO_', @@ -93,9 +97,9 @@ 'NCBITaxon': 'http://purl.obolibrary.org/obo/NCBITaxon_', 'NCBIGene': 'http://www.ncbi.nlm.nih.gov/gene/', 'MmusDv': 'http://purl.obolibrary.org/obo/MmusDv_', - 'OBAN': 'http://purl.org/oban/', 'owl': 'http://www.w3.org/2002/07/owl#', 'OBO': 'http://purl.obolibrary.org/obo/', + 'OIO': 'http://www.geneontology.org/formats/oboInOwl#', 'SEPIO': 'http://purl.obolibrary.org/obo/SEPIO_', 'ClinVar': 'http://www.ncbi.nlm.nih.gov/clinvar/', 'ClinVarVariant': 'http://www.ncbi.nlm.nih.gov/clinvar/variation/', @@ -128,7 +132,7 @@ def make_spo(sub, prd, obj): if match is not None and objcuri in CURIEMAP: objt = '<' + CURIEMAP[objcuri] + objid + '>' - elif(obj.isnumeric()): + elif obj.isnumeric(): objt = obj else: objt = '"' + obj.strip('"') + '"' @@ -140,7 +144,7 @@ def make_spo(sub, prd, obj): def scv_link(scv_sig): ''' Creates links between SCV based on their pathonnicty significancce - + # GENO:0000840 - GENO:0000840 --> equivalent_to SEPIO:0000098 # GENO:0000841 - GENO:0000841 --> equivalent_to SEPIO:0000098 # GENO:0000843 - GENO:0000843 --> equivalent_to SEPIO:0000098 @@ -158,6 +162,7 @@ def scv_link(scv_sig): 'GENO:0000841': 2, # likely pathogenic 'GENO:0000844': 4, # likely benign 'GENO:0000843': 8} # benign + lnk = { 0: 'SEPIO:0000098', 1: 'SEPIO:0000099', @@ -166,14 +171,15 @@ def scv_link(scv_sig): 4: 'SEPIO:0000099', 6: 'SEPIO:0000101', 7: 'SEPIO:0000100'} - - for scv_a in scv_sig.keys(): - scv_av = scv_sig[scv_a] - scv_sig.remove(scv_a) + keys = sorted(scv_sig.keys()) + for scv_a in keys: + scv_av = scv_sig.pop(scv_a) for scv_b in scv_sig.keys(): - link = lnk[abs(sig[scv_av] - sig[scv_sig[scv_b]])] - print(make_spo(scv_a, link, scv_b)) - print(make_spo(scv_b, link, scv_a)) + if scv_av in sig and scv_sig[scv_b] in sig: + link = lnk[abs(sig[scv_av] - sig[scv_sig[scv_b]])] + print(make_spo(scv_a, link, scv_b)) + print(make_spo(scv_b, link, scv_a)) + ################################################################ # CONSTANTS once at the beginning (would be better not at all). @@ -208,7 +214,6 @@ def scv_link(scv_sig): print(make_spo('OBO:RO_0003303', 'rdf:type', 'owl:ObjectProperty')) print(make_spo('OBO:GENO_0000418', 'rdf:type', 'owl:ObjectProperty')) - # larval stage term mapping file # will want namespace I expect. # strips comments and blank lines @@ -227,19 +232,18 @@ def scv_link(scv_sig): TREE = ET.parse(fh) ReleaseSet = TREE.getroot() if ReleaseSet.get('Type') != 'full': - print("Not a full release", file=sys.stderr) + logger.warning('Not a full release') sys.exit(-1) rs_dated = ReleaseSet.get('Dated') # "2016-03-01 (date_last_seen) for ClinVarSet in ReleaseSet.findall('ClinVarSet[RecordStatus]'): if ClinVarSet.find('RecordStatus').text != 'current': - print( - ClinVarSet.get('ID') + " " + - rs_dated + ' .', file=sys.stderr) + logger.warning( + ClinVarSet.get('ID') + " is not current as of " + rs_dated) continue # or break? - # collect svc significance calls + # collect svc significance calls within a rcv pathocalls = {} # There is only one RCV per ClinVarSet @@ -257,8 +261,9 @@ def scv_link(scv_sig): # I do not expect we care as we shouldn't keep the RCV. if RCVAssertion.find('RecordStatus').text != 'current': - print( - rcv_acc + " " + rs_dated, file=sys.stderr) + logger.warning( + rcv_acc + " " + rs_dated) + continue # Child elements # @@ -288,9 +293,9 @@ def scv_link(scv_sig): RCV_MeasureSet.findall('Measure'): rcv_variant_type = TT.get(RCV_Measure.get('Type')) if rcv_variant_type is None: - print( + logger.warning( rcv_acc + " UNKNOWN VARIANT TYPE " + - RCV_Measure.get('Type').text, file=sys.stderr) + RCV_Measure.get('Type').text) continue RCV_VariantName = RCV_Measure.find( @@ -298,8 +303,8 @@ def scv_link(scv_sig): if RCV_VariantName is not None: rcv_variant_label = RCV_VariantName.text else: - print( - rcv_acc + " VARIANT MISSING LABEL", file=sys.stderr) + logger.warning( + rcv_acc + " VARIANT MISSING LABEL") # /RCV/MeasureSet/Measure/Name/ElementValue/[@Type="Preferred"] ####################################################################### @@ -327,7 +332,7 @@ def scv_link(scv_sig): rcv_disease_label = RCV_TraitName.text # print(rcv_acc + ' ' + rcv_disease_label) else: - print(rcv_acc + " MISSING DISEASE NAME ", file=sys.stderr) + logger.warning(rcv_acc + " MISSING DISEASE NAME") # Prioritize OMIM for RCV_Trait in RCV_TraitSet.findall('Trait[@Type="Disease"]'): @@ -368,10 +373,10 @@ def scv_link(scv_sig): for RCV_Trait in\ RCV_TraitSet.findall('Trait[@Type="Disease"]'): for RCV_TraitXRef in RCV_Trait.findall('XRef'): - print( + logger.warning( rcv_acc + " UNKNOWN DISEASE DB:\t" + RCV_TraitXRef.get('DB') + ":" + - RCV_TraitXRef.get('ID'), file=sys.stderr) + RCV_TraitXRef.get('ID')) # 82372 MedGen # 58 EFO # 1 Human Phenotype Ontology @@ -382,7 +387,7 @@ def scv_link(scv_sig): if rcv_disease_db is None or rcv_disease_id is None or \ rcv_disease_label is None or rcv_variant_id is None or \ rcv_variant_type is None or rcv_variant_label is None: - print(rcv_acc + " RCV IS WONKY, BYEBYE", file=sys.stderr) + logger.warning(rcv_acc + " RCV IS WONKY, BYEBYE") continue rcv_disease_curi = rcv_disease_db + ':' + rcv_disease_id @@ -391,11 +396,9 @@ def scv_link(scv_sig): # Descend into each SCV grouped with the current RCV ####################################################################### - # keep a collection of an RCV's associations and their patho call - # when the collection is complete, i.e when pathocalls isn't empty here + # keep a collection of a SCV's associations and patho significance call + # when the collection is complete, # interlink based on patho call - if len(pathocalls) > 0: - link_scv(pathocalls) pathocalls = {} @@ -518,14 +521,14 @@ def scv_link(scv_sig): 'AttributeSet/Attribute[@Type="AssertionMethod"]') if SCV_Attribute is not None: scv_assert_method = SCV_Attribute.text - # this string needs to be mapped to a class curie + # this string needs to be mapped to a curie if scv_assert_method in TT: scv_assert_id = TT[scv_assert_method] # TRIPLES specified_by # <:_assertion_id> print(make_spo( _assertion_id, 'SEPIO:0000041', scv_assert_id)) - # + # # print(make_spo( @@ -546,7 +549,6 @@ def scv_link(scv_sig): # SCV_ReviewStatus = ClinicalSignificance.find('ReviewStatus') # if SCV_ReviewStatus is not None: # scv_review = SCV_ReviewStatus.text - SCV_Description = ClinicalSignificance.find('Description') SCV_Citation = \ ClinicalSignificance.find('Citation/ID[@Source="PubMed"]') @@ -565,7 +567,8 @@ def scv_link(scv_sig): 'literature-based study')) scv_significance = scv_geno = None - if SCV_Description: + SCV_Description = ClinicalSignificance.find('Description') + if SCV_Description is not None: scv_significance = SCV_Description.text scv_geno = TT[scv_significance] if scv_geno is not None: @@ -576,14 +579,15 @@ def scv_link(scv_sig): monarch_assoc, 'OBAN:association_has_predicate', scv_geno)) - # store association's significance to compare w/sibs - pathocalls.update((monarch_assoc, scv_geno)) # - print(make_spo(rcv_variant_id, scv_geno, rcv_disease_curi)) + print(make_spo('ClinVarVariant:' + rcv_variant_id, scv_geno, rcv_disease_curi)) # . print(make_spo( monarch_assoc, 'OIO:hasdbxref', 'ClinVar:' + rcv_acc)) + # store association's significance to compare w/sibs + pathocalls[monarch_assoc] = scv_geno + # scv_assert_type = SCV_Assertion.find('Assertion').get('Type') # check scv_assert_type == 'variation to disease'? @@ -615,7 +619,9 @@ def scv_link(scv_sig): # blank node _provenance_id = \ '_:' + \ - hashlib.md5((_evidence_id + scv_evidence_type).encode('utf-8')).hexdigest()[1:17] + hashlib.md5( + (_evidence_id + scv_evidence_type). + encode('utf-8')).hexdigest()[1:17] # TRIPLES # has_provenance # <_evidence_id><_provenence_id> @@ -688,9 +694,7 @@ def scv_link(scv_sig): SCV_NCBI = SCV_Measure.find('XRef[@DB="Gene"]') if SCV_NCBI is not None: - scv_ncbigene_id = '\ - NCBIGene:' + SCV_NCBI.get('ID') - + scv_ncbigene_id = 'NCBIGene:' + SCV_NCBI.get('ID') # TRIPLES # print(make_spo( @@ -713,3 +717,5 @@ def scv_link(scv_sig): # if SCV_CiteId: # scv_citesource = SCV_CiteId.get('Source') # scv_citeid = SCV_CiteId.text + # print("write out any scv links for ", pathocalls) + scv_link(pathocalls)