From 6fe840de41d246366e1f820b160eacee007081c6 Mon Sep 17 00:00:00 2001 From: kshefchek Date: Tue, 2 Feb 2016 13:41:20 -0800 Subject: [PATCH] refactors postgres functions into new PostgreSQLSource class, cleans up setup.py and requirments (remove docx and biopython), updates to README --- README.md | 21 ++-- dipper/sources/MGI.py | 6 +- dipper/sources/MPD.py | 2 - dipper/sources/PostgreSQLSource.py | 149 +++++++++++++++++++++++++++++ dipper/sources/Source.py | 136 +------------------------- requirements.txt | 2 - requirements/core.txt | 6 ++ requirements/coriell.txt | 1 + requirements/genereviews.txt | 1 + requirements/hpoannotations.txt | 1 + requirements/mgi.txt | 1 + requirements/zfin.txt | 1 + setup.py | 4 +- 13 files changed, 182 insertions(+), 149 deletions(-) create mode 100644 dipper/sources/PostgreSQLSource.py create mode 100644 requirements/core.txt create mode 100644 requirements/coriell.txt create mode 100644 requirements/genereviews.txt create mode 100644 requirements/hpoannotations.txt create mode 100644 requirements/mgi.txt create mode 100644 requirements/zfin.txt diff --git a/README.md b/README.md index 9af497c1..aeff47d7 100644 --- a/README.md +++ b/README.md @@ -29,20 +29,29 @@ like [Protege](http://protege.stanford.edu/). * One of the unit tests requires [owltools](https://code.google.com/p/owltools/wiki/InstallOWLTools) be available on your path. You could modify the code to skip this, if necessary -* unit tests require nosetests (if on OS X you may need to `sudo pip3 install nose`) +* Running make test requires nosetests (if on OS X you may need to `sudo pip3 install nose`) * Required external python packages: - * [psycopg2](http://initd.org/psycopg/) * [rdflib](https://code.google.com/p/rdflib/) * isodate * roman - * [python-docx](https://github.com/python-openxml/python-docx) * pyyaml - * pysftp - * [biopython](https://github.com/biopython/biopython) - * docx + + +* Optional source specific python packages: + * [psycopg2](http://initd.org/psycopg/) + * [python-docx](https://github.com/python-openxml/python-docx) * beautifulsoup4 * GitPython + * intermine + * pysftp + +Note, Dipper imports source modules dynamically at runtime. As a result it is possible to build a core set +of requirements and add source specific dependencies as needed. Presently this only implemented with pip requirements +files. For example to build dependencies for MGI: + + pip3 install -r requirements/core.txt + pip3 install -r requirements/mgi.txt If you encounter any errors installing these packages using Homebrew, it could be due to [a curent known issue in upgrading to pip3](https://github.com/Homebrew/homebrew/issues/25752). In this case, first force reinstall pip2 (````pip2 install --upgrade --force-reinstall pip````) and then install the package using pip3 (eg. ````pip3 install psycopg2````.) diff --git a/dipper/sources/MGI.py b/dipper/sources/MGI.py index 838f1b74..bc5948bf 100644 --- a/dipper/sources/MGI.py +++ b/dipper/sources/MGI.py @@ -4,7 +4,7 @@ import logging import re -from dipper.sources.Source import Source +from dipper.sources.PostgreSQLSource import PostgreSQLSource from dipper.models.assoc.Association import Assoc from dipper.models.Dataset import Dataset from dipper.models.assoc.G2PAssoc import G2PAssoc @@ -19,7 +19,7 @@ logger = logging.getLogger(__name__) -class MGI(Source): +class MGI(PostgreSQLSource): """ This is the [Mouse Genome Informatics](http://www.informatics.jax.org/) resource, from which we process genotype and phenotype data about laboratory mice. @@ -127,7 +127,7 @@ class MGI(Source): } def __init__(self): - Source.__init__(self, 'mgi') + super().__init__('mgi') self.namespaces.update(curie_map.get()) # update the dataset object with details about this resource diff --git a/dipper/sources/MPD.py b/dipper/sources/MPD.py index ca8f2000..2bc794cf 100644 --- a/dipper/sources/MPD.py +++ b/dipper/sources/MPD.py @@ -2,8 +2,6 @@ import re import logging import io -import math -import zipfile from zipfile import ZipFile from dipper.models.Provenance import Provenance diff --git a/dipper/sources/PostgreSQLSource.py b/dipper/sources/PostgreSQLSource.py new file mode 100644 index 00000000..929e7801 --- /dev/null +++ b/dipper/sources/PostgreSQLSource.py @@ -0,0 +1,149 @@ +import psycopg2 +import logging +import os +from dipper.sources.Source import Source + + +logger = logging.getLogger(__name__) + + +class PostgreSQLSource(Source): + """ + Class for interfacing with remote Postgres databases + """ + + def __init__(self, name=None): + super().__init__(name) + return + + def fetch_from_pgdb(self, tables, cxn, limit=None, force=False): + """ + Will fetch all Postgres tables from the specified database in the cxn connection parameters. + This will save them to a local file named the same as the table, in tab-delimited format, including a header. + :param tables: Names of tables to fetch + :param cxn: database connection details + :param limit: A max row count to fetch for each table + :return: None + """ + + con = None + try: + con = psycopg2.connect(host=cxn['host'], database=cxn['database'], port=cxn['port'], + user=cxn['user'], password=cxn['password']) + cur = con.cursor() + for t in tables: + logger.info("Fetching data from table %s", t) + self._getcols(cur, t) + query = ' '.join(("SELECT * FROM", t)) + countquery = ' '.join(("SELECT COUNT(*) FROM", t)) + if limit is not None: + query = ' '.join((query, "LIMIT", str(limit))) + countquery = ' '.join((countquery, "LIMIT", str(limit))) + + outfile = '/'.join((self.rawdir,t)) + + filerowcount = -1 + tablerowcount = -1 + if not force: + # check local copy. assume that if the # rows are the same, that the table is the same + # TODO may want to fix this assumption + if os.path.exists(outfile): + # get rows in the file + filerowcount = self.file_len(outfile) + logger.info("rows in local file: %s", filerowcount) + + # get rows in the table + # tablerowcount=cur.rowcount + cur.execute(countquery) + tablerowcount = cur.fetchone()[0] + + if force or filerowcount < 0 or (filerowcount-1) != tablerowcount: # rowcount-1 because there's a header + if force: + logger.info("Forcing download of %s", t) + else: + logger.info("%s local (%d) different from remote (%d); fetching.", t, filerowcount, tablerowcount) + # download the file + logger.info("COMMAND:%s", query) + outputquery = "COPY ({0}) TO STDOUT WITH DELIMITER AS '\t' CSV HEADER".format(query) + with open(outfile, 'w') as f: + cur.copy_expert(outputquery, f) + else: + logger.info("local data same as remote; reusing.") + + finally: + if con: + con.close() + return + + def fetch_query_from_pgdb(self, qname, query, con, cxn, limit=None, force=False): + """ + Supply either an already established connection, or connection parameters. + The supplied connection will override any separate cxn parameter + :param qname: The name of the query to save the output to + :param query: The SQL query itself + :param con: The already-established connection + :param cxn: The postgres connection information + :param limit: If you only want a subset of rows from the query + :return: + """ + if con is None and cxn is None: + logger.error("ERROR: you need to supply connection information") + return + if con is None and cxn is not None: + con = psycopg2.connect(host=cxn['host'], database=cxn['database'], port=cxn['port'], + user=cxn['user'], password=cxn['password']) + + outfile = '/'.join((self.rawdir, qname)) + cur = con.cursor() + countquery = ' '.join(("SELECT COUNT(*) FROM (", query, ") x")) # wrap the query to get the count + if limit is not None: + countquery = ' '.join((countquery, "LIMIT", str(limit))) + + # check local copy. assume that if the # rows are the same, that the table is the same + filerowcount = -1 + tablerowcount = -1 + if not force: + if os.path.exists(outfile): + # get rows in the file + filerowcount = self.file_len(outfile) + logger.info("INFO: rows in local file: %s", filerowcount) + + # get rows in the table + # tablerowcount=cur.rowcount + cur.execute(countquery) + tablerowcount = cur.fetchone()[0] + + if force or filerowcount < 0 or (filerowcount-1) != tablerowcount: # rowcount-1 because there's a header + if force: + logger.info("Forcing download of %s", qname) + else: + logger.info("%s local (%s) different from remote (%s); fetching.", qname, filerowcount, tablerowcount) + # download the file + logger.debug("COMMAND:%s", query) + outputquery = "COPY ({0}) TO STDOUT WITH DELIMITER AS '\t' CSV HEADER".format(query) + with open(outfile, 'w') as f: + cur.copy_expert(outputquery, f) + # Regenerate row count to check integrity + filerowcount = self.file_len(outfile) + if (filerowcount-1) != tablerowcount: + raise Exception("Download from MGI failed, %s != %s", (filerowcount-1), tablerowcount) + else: + logger.info("local data same as remote; reusing.") + + return + + # TODO generalize this to a set of utils + def _getcols(self, cur, table): + """ + Will execute a pg query to get the column names for the given table. + :param cur: + :param table: + :return: + """ + query = ' '.join(("SELECT * FROM", table, "LIMIT 0")) # for testing + + cur.execute(query) + colnames = [desc[0] for desc in cur.description] + logger.info("COLS (%s): %s", table, colnames) + + return \ No newline at end of file diff --git a/dipper/sources/Source.py b/dipper/sources/Source.py index 95b13142..572bae5f 100644 --- a/dipper/sources/Source.py +++ b/dipper/sources/Source.py @@ -1,4 +1,4 @@ -import psycopg2 +#import psycopg2 __author__ = 'nicole' @@ -230,7 +230,7 @@ def checkIfRemoteIsNewer(self, remote, local, headers): resp_header = response.getheaders() size = resp_header.get('Content-length') last_modified = resp_header.get('last-modified') # check me - except : + except: resp_header = None size = 0 last_modified = None @@ -326,122 +326,6 @@ def fetch_from_url(self, remotefile, localfile, is_dl_forced, headers=None): logger.info("file created: %s", time.asctime(time.localtime(st[ST_CTIME]))) return - def fetch_from_pgdb(self, tables, cxn, limit=None, force=False): - """ - Will fetch all Postgres tables from the specified database in the cxn connection parameters. - This will save them to a local file named the same as the table, in tab-delimited format, including a header. - :param tables: Names of tables to fetch - :param cxn: database connection details - :param limit: A max row count to fetch for each table - :return: None - """ - - con = None - try: - con = psycopg2.connect(host=cxn['host'], database=cxn['database'], port=cxn['port'], - user=cxn['user'], password=cxn['password']) - cur = con.cursor() - for t in tables: - logger.info("Fetching data from table %s", t) - self._getcols(cur, t) - query = ' '.join(("SELECT * FROM", t)) - countquery = ' '.join(("SELECT COUNT(*) FROM", t)) - if limit is not None: - query = ' '.join((query, "LIMIT", str(limit))) - countquery = ' '.join((countquery, "LIMIT", str(limit))) - - outfile = '/'.join((self.rawdir,t)) - - filerowcount = -1 - tablerowcount = -1 - if not force: - # check local copy. assume that if the # rows are the same, that the table is the same - # TODO may want to fix this assumption - if os.path.exists(outfile): - # get rows in the file - filerowcount = self.file_len(outfile) - logger.info("rows in local file: %s", filerowcount) - - # get rows in the table - # tablerowcount=cur.rowcount - cur.execute(countquery) - tablerowcount = cur.fetchone()[0] - - if force or filerowcount < 0 or (filerowcount-1) != tablerowcount: # rowcount-1 because there's a header - if force: - logger.info("Forcing download of %s", t) - else: - logger.info("%s local (%d) different from remote (%d); fetching.", t, filerowcount, tablerowcount) - # download the file - logger.info("COMMAND:%s", query) - outputquery = "COPY ({0}) TO STDOUT WITH DELIMITER AS '\t' CSV HEADER".format(query) - with open(outfile, 'w') as f: - cur.copy_expert(outputquery, f) - else: - logger.info("local data same as remote; reusing.") - - finally: - if con: - con.close() - return - - def fetch_query_from_pgdb(self, qname, query, con, cxn, limit=None, force=False): - """ - Supply either an already established connection, or connection parameters. - The supplied connection will override any separate cxn parameter - :param qname: The name of the query to save the output to - :param query: The SQL query itself - :param con: The already-established connection - :param cxn: The postgres connection information - :param limit: If you only want a subset of rows from the query - :return: - """ - if con is None and cxn is None: - logger.error("ERROR: you need to supply connection information") - return - if con is None and cxn is not None: - con = psycopg2.connect(host=cxn['host'], database=cxn['database'], port=cxn['port'], - user=cxn['user'], password=cxn['password']) - - outfile = '/'.join((self.rawdir, qname)) - cur = con.cursor() - countquery = ' '.join(("SELECT COUNT(*) FROM (", query, ") x")) # wrap the query to get the count - if limit is not None: - countquery = ' '.join((countquery, "LIMIT", str(limit))) - - # check local copy. assume that if the # rows are the same, that the table is the same - filerowcount = -1 - tablerowcount = -1 - if not force: - if os.path.exists(outfile): - # get rows in the file - filerowcount = self.file_len(outfile) - logger.info("INFO: rows in local file: %s", filerowcount) - - # get rows in the table - # tablerowcount=cur.rowcount - cur.execute(countquery) - tablerowcount = cur.fetchone()[0] - - if force or filerowcount < 0 or (filerowcount-1) != tablerowcount: # rowcount-1 because there's a header - if force: - logger.info("Forcing download of %s", qname) - else: - logger.info("%s local (%s) different from remote (%s); fetching.", qname, filerowcount, tablerowcount) - # download the file - logger.debug("COMMAND:%s", query) - outputquery = "COPY ({0}) TO STDOUT WITH DELIMITER AS '\t' CSV HEADER".format(query) - with open(outfile, 'w') as f: - cur.copy_expert(outputquery, f) - # Regenerate row count to check integrity - filerowcount = self.file_len(outfile) - if (filerowcount-1) != tablerowcount: - raise Exception("Download from MGI failed, %s != %s", (filerowcount-1), tablerowcount) - else: - logger.info("local data same as remote; reusing.") - - return - def process_xml_table(self, elem, table_name, processing_function, limit): """ This is a convenience function to process the elements of an xml document, when the xml is used @@ -545,22 +429,6 @@ def compare_local_remote_bytes(self, remotefile, localfile): local_size, remotefile, remote_size) return is_equal - # TODO generalize this to a set of utils - def _getcols(self, cur, table): - """ - Will execute a pg query to get the column names for the given table. - :param cur: - :param table: - :return: - """ - query = ' '.join(("SELECT * FROM", table, "LIMIT 0")) # for testing - - cur.execute(query) - colnames = [desc[0] for desc in cur.description] - logger.info("COLS (%s): %s", table, colnames) - - return - def file_len(self, fname): with open(fname) as f: l = sum(1 for line in f) diff --git a/requirements.txt b/requirements.txt index d0850162..c308b9df 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,8 +5,6 @@ roman python-docx pyyaml pysftp -biopython -docx beautifulsoup4 GitPython intermine diff --git a/requirements/core.txt b/requirements/core.txt new file mode 100644 index 00000000..f96fef0f --- /dev/null +++ b/requirements/core.txt @@ -0,0 +1,6 @@ +rdflib +isodate +roman +python-docx +pyyaml +biopython \ No newline at end of file diff --git a/requirements/coriell.txt b/requirements/coriell.txt new file mode 100644 index 00000000..89c07e1d --- /dev/null +++ b/requirements/coriell.txt @@ -0,0 +1 @@ +pysftp \ No newline at end of file diff --git a/requirements/genereviews.txt b/requirements/genereviews.txt new file mode 100644 index 00000000..041f722c --- /dev/null +++ b/requirements/genereviews.txt @@ -0,0 +1 @@ +beautifulsoup4 \ No newline at end of file diff --git a/requirements/hpoannotations.txt b/requirements/hpoannotations.txt new file mode 100644 index 00000000..5779f396 --- /dev/null +++ b/requirements/hpoannotations.txt @@ -0,0 +1 @@ +GitPython \ No newline at end of file diff --git a/requirements/mgi.txt b/requirements/mgi.txt new file mode 100644 index 00000000..ddb37e19 --- /dev/null +++ b/requirements/mgi.txt @@ -0,0 +1 @@ +psycopg2 \ No newline at end of file diff --git a/requirements/zfin.txt b/requirements/zfin.txt new file mode 100644 index 00000000..c44287fd --- /dev/null +++ b/requirements/zfin.txt @@ -0,0 +1 @@ +intermine \ No newline at end of file diff --git a/setup.py b/setup.py index d4242249..442a11c0 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ version='0.0.1', description='Data Ingest Pipeline', packages=find_packages(), - install_requires=['psycopg2', 'rdflib', 'isodate', 'roman', 'python-docx', 'pyyaml', 'pysftp', 'biopython', - 'docx', 'beautifulsoup4', 'GitPython', 'intermine'], + install_requires=['psycopg2', 'rdflib', 'isodate', 'roman', 'python-docx', 'pyyaml', 'pysftp', + 'beautifulsoup4', 'GitPython', 'intermine'], include_package_data=True )