Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
TomConlin committed Apr 5, 2016
1 parent 6587f21 commit 4443792
Showing 1 changed file with 63 additions and 79 deletions.
142 changes: 63 additions & 79 deletions dipper/sources/ClinVarXML_alpha.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,20 @@

# http://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/sample_xml/RCV000077146.xml

# I'm running in another dir so you will have to also
# have the xml and the mapping file there too
# I'm running in another dir, so be sure
# to have the xml and the mapping file there too

# FILENAME = 'BRCA_ClinVarSet.xml.gz'
#
FILENAME = 'ClinVarFullRelease_00-latest.xml.gz'

# TODO: Edge labels done once at the beginning
# <SEPIO:0000007><rdfs:label><'has_supporting_evidence'> .
# <SEPIO:0000011><rdfs:label><'has_provenance'> .
# <SEPIO:0000017><rdfs:label><'has_agent'> .
# <SEPIO:0000095><rdfs:label><'before_date'> .
# <SEPIO:0000041><rdfs:label><'specified_by'> .

# scv_assertcount = scv_measurecount = scv_traitcount = scv_citecount = 0
# rs_cvset = 0

# larval stage term mapping file
# will want namespace I expect.
Expand Down Expand Up @@ -51,10 +55,11 @@
continue # or break?

# There is only one RCV per ClinVarSet
rcv_variant_id = rcv_variant_type = None
rcv_disease_db = rcv_disease_id = None
RCVAssertion = ClinVarSet.find('ReferenceClinVarAssertion')
rcv_variant_id = rcv_variant_type = rcv_variant_label = None
rcv_disease_db = rcv_disease_id = rcv_disease_label = None
rcv_disease_curi = None

RCVAssertion = ClinVarSet.find('ReferenceClinVarAssertion')
rcv_created = RCVAssertion.get('DateCreated')
rcv_updated = RCVAssertion.get('DateLastUpdated')
rcv_id = RCVAssertion.get('ID')
Expand All @@ -66,7 +71,6 @@
if RCVAssertion.find('RecordStatus').text != 'current':
print(
rcv_acc + " <is not current on> " + rs_dated, file=sys.stderr)
pass

# Child elements
#
Expand All @@ -89,8 +93,7 @@

RCV_MeasureSet = RCVAssertion.find('MeasureSet')
# Note: it is a "set" but have only seen a half dozen with two,
# all of type: copy number gain SO:0001742t.
rcv_variant_id = rcv_variant_type = rcv_variant_label = None
# all of type: copy number gain SO:0001742
rcv_variant_id = RCV_MeasureSet.get('ID')

for RCV_Measure in \
Expand All @@ -101,6 +104,7 @@
rcv_acc + " UNKNOWN VARIANT TYPE " +
RCV_Measure.get('Type').text, file=sys.stderr)
continue

RCV_VariantName = RCV_Measure.find(
'Name/ElementValue[@Type="Preferred"]')
if RCV_VariantName is not None:
Expand All @@ -115,16 +119,12 @@
# reluctantly starting with the RCV disease,
# not the SCV traits as submitted, due to time constraints

# RCV_TraitSet = RCVAssertion.find('TraitSet')

for RCV_TraitSet in RCVAssertion.findall('TraitSet'):

# /ReleaseSet/ClinVarSet/ReferenceClinVarAssertion/
# TraitSet/Trait[@Type="Disease"]/@ID
# /RCV/TraitSet/Trait[@Type="Disease"]/@ID
# 144,327 2016-Mar

# /ReleaseSet/ClinVarSet/ReferenceClinVarAssertion/
# TraitSet/Trait[@Type="Disease"]/XRef/@DB
# /RCV/TraitSet/Trait[@Type="Disease"]/XRef/@DB
# 29 Human Phenotype Ontology
# 82 EFO
# 659 Gene
Expand All @@ -138,7 +138,7 @@

if RCV_TraitName is not None:
rcv_disease_label = RCV_TraitName.text
print("rcv_disease_label: ", rcv_disease_label)
# print(rcv_acc + ' ' + rcv_disease_label)
else:
print(rcv_acc + " MISSING DISEASE NAME ", file=sys.stderr)

Expand All @@ -152,7 +152,7 @@
break

# Accept Orphanet if no OMIM
if rcv_disease_db is None:
if rcv_disease_db is None or rcv_disease_id is None:
for RCV_Trait in \
RCV_TraitSet.findall('Trait[@Type="Disease"]'):
if rcv_disease_db is not None:
Expand All @@ -169,26 +169,27 @@
for RCV_Trait in\
RCV_TraitSet.findall('Trait[@Type="Disease"]'):
for RCV_TraitXRef in RCV_Trait.findall('XRef'):
print(
rcv_acc + "\tUNKNOWN DISEASE DB:\t" +
RCV_TraitXRef.get('DB') + "\t" +
RCV_TraitXRef.get('ID'), file=sys.stderr)
# print(
# rcv_acc + " UNKNOWN DISEASE DB:\t" +
# RCV_TraitXRef.get('DB') + ":" +
# RCV_TraitXRef.get('ID'), file=sys.stderr)
# 82372 MedGen
# 58 EFO
# 1 Human Phenotype Ontology
break

# Check that we have enough info from the RCV
# to justify parsing the related SCVs
if rcv_disease_db is None or rcv_disease_id is None or \
rcv_variant_id is None or rcv_variant_type is None:
rcv_disease_label is None or rcv_variant_id is None or \
rcv_variant_type is None or rcv_variant_label is None:
print(rcv_acc + " ERROR IS WONKY BYEBYE", file=sys.stderr)
continue

rcv_disease_curi = rcv_disease_db + '.' + rcv_disease_id
# else:
# print(rcv_acc + "\t" + rcv_variant_id + "\t" + rcv_variant_type)
# print(rcv_acc + "\t" + rcv_disease_db + "\t" + rcv_disease_id)
rcv_disease_curi = rcv_disease_db + ':' + rcv_disease_id

#######################################################################
# Descend into each SCV grouped with the current RCV
#######################################################################

for SCV_Assertion in ClinVarSet.findall('ClinVarAssertion'):
Expand Down Expand Up @@ -216,33 +217,29 @@

scv_id = SCV_Assertion.get('ID')
monarch_id = hashlib.md5(
rcv_id.encode('utf-8') + scv_id.encode('utf-8')).hexdigest()[1:17]

(rcv_id + scv_id).encode('utf-8')).hexdigest()[1:17]
monarch_assoc = 'MONARCH:' + monarch_id

ClinVarAccession = SCV_Assertion.find('ClinVarAccession')
scv_acc = ClinVarAccession.get('Acc')
scv_accver = int(ClinVarAccession.get('Version'))
scv_orgid = ClinVarAccession.get('OrgID')
scv_updated = ClinVarAccession.get('DateUpdated')

# blank node identifiers
evidence_id = r'_' + monarch_assoc + '->evidence'
provenance_id = r'_' + monarch_assoc + '->provenance'
evidence_id = '_' + monarch_assoc + '->evidence'
provenance_id = '_' + monarch_assoc + '->provenance'
assertion_id = provenance_id + '->assertion'

# TODO: Edge labels done once at the beginning
# <SEPIO:0000007><rdfs:label><'has_supporting_evidence'> .
# <SEPIO:0000011><rdfs:label><'has_provenance'> .
# <SEPIO:0000017><rdfs:label><'has_agent'> .
# <SEPIO:0000095><rdfs:label><'before_date'> .
# <SEPIO:0000041><rdfs:label><'specified_by'> .

# TRIPLES
# <monarch_assoc><rdf:type><OBAN:association> .
# <monarch_assoc><association_has_subject><ClinVarVariant:rcv_variant_id> .
# <ClinVarVariant:rcv_variant_id><rdfs:label><rcv_variant_label> .
# <ClinVarVariant:rcv_variant_id><rdf:type><rcv_variant_type> .
# <monarch_assoc><association_has_object><rcv_disease_db:rcv_disease_id> .
# <rcv_disease_db:rcv_disease_id><rdfs:label><rcv_disease_label> .

# <monarch_assoc><SEPIO:0000007><_:evidence_id> .
# <monarch_assoc><SEPIO:0000011><_:provenance_id> .

# <_:evidence_id><rdf:type><SEPIO:0000000> .
# <_:evidence_id><rdfs:label><'evidence line'> .
# <_:provenance_id><rdf:type><SEPIO:0000003> .
Expand All @@ -251,22 +248,9 @@
# <_:provenance_id><had_output><_:assertion_id> .
# <_:assertion_id><rdf:type><SEPIO:0000001> .
# <_:assertion_id><rdfs:label><'assertion'> .

# scv_name = SCV_Assertion.get('SubmissionName')
# ClinVarSubmissionID
ClinVarAccession = SCV_Assertion.find('ClinVarAccession')
scv_acc = ClinVarAccession.get('Acc')
scv_accver = int(ClinVarAccession.get('Version'))
scv_orgid = ClinVarAccession.get('OrgID')
scv_updated = ClinVarAccession.get('DateUpdated')

# TRIPLES
# TODO CURI for
# CVS: = 'http://www.ncbi.nlm.nih.gov/clinvar/submitters/'
#
# <_:assertion_id><dc:identifier><scv_acc + '.' + scv_accver>
# <_:provenance_id><SEPIO:0000017><CVS:scv_orgid> .
# <CVS:scv_orgid><rdf:type><FOAF:organization> .
# <_:provenance_id><SEPIO:0000017><ClinVar:submitters/scv_orgid> .
# <ClinVar:submitters/scv_orgid><rdf:type><FOAF:organization> .
# <_:provenance_id><SEPIO:0000095><scv_updated> .

# /SCV/AttributeSet/Attribute[@Type="AssertionMethod"]
Expand All @@ -285,8 +269,8 @@
# SEPIO:0000037 'variant classification guideline' #class label

# scv_type = ClinVarAccession.get('Type') # assert == 'SCV' ?
# AdditionalSubmitters
# RecordStatus # assert =='current' ?

ClinicalSignificance = SCV_Assertion.find('ClinicalSignificance')
# scv_eval_date = ClinicalSignificance.get('DateLastEvaluated')
# SCV_ReviewStatus = ClinicalSignificance.find('ReviewStatus')
Expand Down Expand Up @@ -319,45 +303,44 @@

for SCV_ObsIn in SCV_Assertion.findall('ObservedIn'):

# /*/*/*/ObservedIn/Sample
# /*/*/*/ObservedIn/Method
# /*/*/*/ObservedIn/ObservedData
# /*/*/*/ObservedIn/TraitSet
# /*/*/*/ObservedIn/Citation
# /*/*/*/ObservedIn/Co-occurrenceSet
# /*/*/*/ObservedIn/Comment
# /*/*/*/ObservedIn/XRef
# /SCV/ObservedIn/Sample
# /SCV/ObservedIn/Method
# /SCV/ObservedIn/ObservedData
# /SCV/ObservedIn/TraitSet
# /SCV/ObservedIn/Citation
# /SCV/ObservedIn/Co-occurrenceSet
# /SCV/ObservedIn/Comment
# /SCV/ObservedIn/XRef

# Sample/Origin
# Sample/Species@TaxonomyId="9606" is a constant
# scv_affectedstatus = \
# SCV_ObsIn.find('Sample').find('AffectedStatus').text

# Method/NamePlatform
# Method/TypePlatform
# Method/Description
# Method/SourceType
# Method/MethodType
# SCV/ObservedIn/Method/MethodType
# /SCV/ObservedIn/Method/NamePlatform
# /SCV/ObservedIn/Method/TypePlatform
# /SCV/ObservedIn/Method/Description
# /SCV/ObservedIn/Method/SourceType
# /SCV/ObservedIn/Method/MethodType
# /SCV/ObservedIn/Method/MethodType
for SCV_OIMT in SCV_ObsIn.findall('Method/MethodType'):
if 'not provided' != SCV_OIMT.text:
scv_evidence_type = onto_map[SCV_OIMT.text]
# TODO need 'not provided' mapping
# TODO need 'not provided' mapping? probably not.

# TRIPLES
# has_supporting_process
# <_:evidence_id><SEPIO:0000085><scv_evidence_type>
# TRIPLES
# has_supporting_process
# <_:evidence_id><SEPIO:0000085><scv_evidence_type>

# ObservedData/Attribute@Type
# ObservedData/Attribute@integerValue
# /SCV/ObservedIn/ObservedData/Attribute@Type
# /SCV/ObservedIn/ObservedData/Attribute@integerValue

# Trait being taken from RCV instead (for now)
# for SCV_TraitSet in SCV_ObsIn.findall('TraitSet'):

# /*/*/*/*/TraitSet/Comment
# /*/*/*/*/TraitSet/Trait

# for SCV_Trait in SCV_TraitSet.findall('Trait'):

# # /*/*/*/*/*/Trait/Name
# # /*/*/*/*/*/Trait/Symbol
# # /*/*/*/*/*/Trait/TraitRelationship
Expand Down Expand Up @@ -406,7 +389,8 @@

SCV_NCBI = SCV_Measure.find('XRef[@DB="Gene"]')
if SCV_NCBI is not None:
scv_ncbigene_id = 'NCBIGene:' + SCV_NCBI.get('ID')
scv_ncbigene_id = '\
NCBIGene:' + SCV_NCBI.get('ID')

# TRIPLES
# <rcv_variant_id><GENO:0000418><scv_ncbigene_id>
Expand Down

0 comments on commit 4443792

Please sign in to comment.