Merge 770cd16 into 4b1c854

monarch-initiative · Apr 15, 2016 · 7a72f8c · 7a72f8c
2 parents 4b1c854 + 770cd16
commit 7a72f8c
Show file tree

Hide file tree

Showing 14 changed files with 775 additions and 158 deletions.
diff --git a/README.md b/README.md
@@ -27,12 +27,12 @@ like [Protege](http://protege.stanford.edu/).
 ## Requirements
 * [Python 3](https://www.python.org/downloads/) or higher (and therefore pip3 if using pip)
 * One of the unit tests requires
-[owltools](https://code.google.com/p/owltools/wiki/InstallOWLTools) be available on your path.  You could modify
+[owltools](https://github.com/owlcollab/owltools) be available on your path.  You could modify
 the code to skip this, if necessary
 * Running make test requires nosetests (if on OS X you may need to `sudo pip3 install nose`)
 
 * Required external python packages:
-    * [rdflib](https://code.google.com/p/rdflib/)
+    * [rdflib](https://pypi.python.org/pypi/rdflib)
     * isodate
     * roman
     * pyyaml
@@ -43,9 +43,9 @@ the code to skip this, if necessary
     * [python-docx](https://github.com/python-openxml/python-docx)
     * beautifulsoup4
     * GitPython
-    * [biopython](https://github.com/biopython/biopython)
     * intermine
     * pysftp
+    * [Requests](http://requests.readthedocs.org/en/master/)
 
 Note, Dipper imports source modules dynamically at runtime.  As a result it is possible to build a core set
 of requirements and add source specific dependencies as needed.  Presently this is only implemented with pip requirements
@@ -61,8 +61,6 @@ To install dependencies for all sources:
 
 If you encounter any errors installing these packages using Homebrew, it could be due to [a current known issue in upgrading to pip3](https://github.com/Homebrew/homebrew/issues/25752). In this case, first force reinstall pip2 (````pip2 install --upgrade --force-reinstall pip````) and then install the package using pip3 (e.g. ````pip3 install psycopg2````).
 
-* The OMIM source requires the 'compress' and 'uncompress' system commands to be available, for LZW decompression.  
-(This may be a problem for windows users.) 
 
 * Some of the parsers require login and/or connection credentials with the remote system.  In those cases
  you will need to add the credentials to a conf.json file.  Please see individual parsers for details.   
@@ -140,7 +138,7 @@ Furthermore, we wanted to provide the bioinformatics community with a set of scr
 get started transforming these standard data sources. 
 
 A manuscript is in preparation.  In the meantime, if you use any of our code or derived data, please cite 
-this repository and the [Monarch Initiative](http://www.monarchinitiative.org).
+this repository and the [Monarch Initiative](https://monarchinitiative.org).
 
 ## Identifiers
 Throughout the Monarch web application, we display external entities using their human-friendly labels

diff --git a/dipper/clinvar_alpha_word_ontology.txt b/dipper/clinvar_alpha_word_ontology.txt
@@ -0,0 +1,70 @@
+# Format:
+# the string to match, some tabs, the ontology identifier to return.
+# anything after an octothorpe (#) is a comment; comments and empty lines are ignored
+
+
+# Variant (Subject) Sequence Types  
+single nucleotide variant	SO:0001483
+Deletion					SO:0000159
+copy number gain			SO:0001742
+copy number loss			SO:0001743
+Duplication					SO:1000035
+Insertion					SO:0000667
+Indel						SO:1000032
+Variation					SO:0001059
+Microsatellite				GENO:0000847
+protein only				GENO:0000848
+inversion					SO:1000036
+Translocation				SO:0000199
+Tandem duplication			SO:1000173
+Complex						SO:0001784 
+fusion						SO:0001882
+Structural variant 			SO:0001785
+NT expansion				SO:1000039
+short repeat  				GENO:0000846
+undetermined variant  		SO:0001059
+
+# Association types
+benign								GENO:0000843  # benign
+Benign								GENO:0000843  # benign
+conflicting data from submitters	GENO:0000845  # uncertain significance
+Likely benign						GENO:0000844  # likely benign
+likely pathogenic					GENO:0000841  # likely pathogenic
+Likely pathogenic					GENO:0000841  # likely pathogenic
+no known pathogenicity				GENO:0000845  # uncertain significance
+non-pathogenic						GENO:0000845  # uncertain significance
+pathogenic							GENO:0000840  # pathogenic
+Pathogenic							GENO:0000840  # pathogenic
+Pathogenic/Likely pathogenic		GENO:0000840  # pathogenic
+pathologic							GENO:0000840  # pathogenic
+probable-non-pathogenic				GENO:0000844  # likely benign
+probable-pathogenic					GENO:0000841  # likely pathogenic
+probably not pathogenic				GENO:0000844  # likely benign
+Suspected Benign					GENO:0000844  # likely benign
+uncertain							GENO:0000845  # uncertain significance
+Uncertain significance				GENO:0000845  # uncertain significance
+unknown								GENO:0000845  # uncertain significance
+Unknown								GENO:0000845  # uncertain significance
+variant of unknown significance		GENO:0000845  # uncertain significance
+MUT									GENO:0000845  # uncertain significance 
+likely pathogenic - adrenal pheochromocytoma	GENO:0000841  # likely pathogenic
+
+# punting to RCV on disease for now
+# (would be a little under 10,000 terms to map here
+# maybe down to 3,500 if clustered)
+
+
+# evidence & provenance
+# SEPIO
+clinical testing					SEPIO:0000067
+literature only						SEPIO:0000080
+reference population 				SEPIO:0000102
+research							SEPIO:0000066
+curation							SEPIO:0000081
+in vitro							SEPIO:0000073
+in vivo								SEPIO:0000074
+case-control						SEPIO:0000071
+
+# provenance
+ENIGMA BRCA1/2 Classification Criteria (2015)						SEPIO:1000001
+Counsyl Autosomal Dominant Disease Classification criteria (2015)	SEPIO:1000002
diff --git a/dipper/models/assoc/Association.py b/dipper/models/assoc/Association.py
@@ -42,7 +42,7 @@ class Assoc:
         'towards': 'RO:0002503',
         'has_subject': 'OBAN:association_has_subject',
         'has_object': 'OBAN:association_has_object',
-        'has_predicate': 'OBAN:association_has_object_property',
+        'has_predicate': 'OBAN:association_has_predicate',
         'is_about': 'IAO:00000136',
         'has_evidence': 'RO:0002558',
         'has_source': 'dc:source',

diff --git a/dipper/sources/ClinVar.py b/dipper/sources/ClinVar.py
@@ -286,7 +286,10 @@ def _get_variants(self, limit):
 
                 # they use -1 to indicate unknown gene
                 if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
-                    gene_id = ':'.join(('NCBIGene', str(gene_num)))
+                    if re.match(r'^Gene:', gene_num):
+                        gene_num = "NCBI" + gene_num
+                    else:
+                        gene_id = ':'.join(('NCBIGene', str(gene_num)))
 
                 # FIXME there are some "variants" that are actually haplotypes
                 # probably will get taken care of when we switch to processing
@@ -384,8 +387,10 @@ def _get_variants(self, limit):
                         m = re.match(r"(Orphanet:ORPHA(?:\s*ORPHA)?)", p)
                         if m is not None and len(m.groups()) > 0:
                             p = re.sub(m.group(1), 'Orphanet:', p.strip())
-                        elif re.match(r'SNOMED CT', p):
-                            p = re.sub(r'SNOMED CT', 'SNOMED', p.strip())
+                        elif re.match(r"ORPHA:", p):
+                            p = re.sub(r'ORPHA:', 'Orphanet:', p.strip())
+                        elif re.match(r'SNOMED .*', p):
+                            p = re.sub(r'SNOMED .*', 'SNOMED', p.strip())
 
                         assoc = G2PAssoc(self.name, seqalt_id, p.strip())
                         assoc.add_association_to_graph(g)