From 6fe840de41d246366e1f820b160eacee007081c6 Mon Sep 17 00:00:00 2001
From: kshefchek <kshefchek@gmail.com>
Date: Tue, 2 Feb 2016 13:41:20 -0800
Subject: [PATCH] refactors postgres functions into new PostgreSQLSource class,
 cleans up setup.py and requirments (remove docx and biopython), updates to
 README

---
 README.md                          |  21 ++--
 dipper/sources/MGI.py              |   6 +-
 dipper/sources/MPD.py              |   2 -
 dipper/sources/PostgreSQLSource.py | 149 +++++++++++++++++++++++++++++
 dipper/sources/Source.py           | 136 +-------------------------
 requirements.txt                   |   2 -
 requirements/core.txt              |   6 ++
 requirements/coriell.txt           |   1 +
 requirements/genereviews.txt       |   1 +
 requirements/hpoannotations.txt    |   1 +
 requirements/mgi.txt               |   1 +
 requirements/zfin.txt              |   1 +
 setup.py                           |   4 +-
 13 files changed, 182 insertions(+), 149 deletions(-)
 create mode 100644 dipper/sources/PostgreSQLSource.py
 create mode 100644 requirements/core.txt
 create mode 100644 requirements/coriell.txt
 create mode 100644 requirements/genereviews.txt
 create mode 100644 requirements/hpoannotations.txt
 create mode 100644 requirements/mgi.txt
 create mode 100644 requirements/zfin.txt

diff --git a/README.md b/README.md
index 9af497c1..aeff47d7 100644
--- a/README.md
+++ b/README.md
@@ -29,20 +29,29 @@ like [Protege](http://protege.stanford.edu/).
 * One of the unit tests requires
 [owltools](https://code.google.com/p/owltools/wiki/InstallOWLTools) be available on your path.  You could modify
 the code to skip this, if necessary
-* unit tests require nosetests (if on OS X you may need to `sudo pip3 install nose`)
+* Running make test requires nosetests (if on OS X you may need to `sudo pip3 install nose`)
 
 * Required external python packages:
-    * [psycopg2](http://initd.org/psycopg/) 
     * [rdflib](https://code.google.com/p/rdflib/)
     * isodate
     * roman
-    * [python-docx](https://github.com/python-openxml/python-docx)
     * pyyaml
-    * pysftp
-    * [biopython](https://github.com/biopython/biopython)
-    * docx
+
+    
+* Optional source specific python packages:
+    * [psycopg2](http://initd.org/psycopg/)
+    * [python-docx](https://github.com/python-openxml/python-docx)
     * beautifulsoup4
     * GitPython
+    * intermine
+    * pysftp
+    
+Note, Dipper imports source modules dynamically at runtime.  As a result it is possible to build a core set
+of requirements and add source specific dependencies as needed.  Presently this only implemented with pip requirements
+files. For example to build dependencies for MGI:
+
+        pip3 install -r requirements/core.txt
+        pip3 install -r requirements/mgi.txt
     
 If you encounter any errors installing these packages using Homebrew, it could be due to [a curent known issue in upgrading to  pip3](https://github.com/Homebrew/homebrew/issues/25752). In this case, first force reinstall pip2 (````pip2 install --upgrade --force-reinstall pip````) and then install the package using pip3 (eg. ````pip3 install psycopg2````.)
 
diff --git a/dipper/sources/MGI.py b/dipper/sources/MGI.py
index 838f1b74..bc5948bf 100644
--- a/dipper/sources/MGI.py
+++ b/dipper/sources/MGI.py
@@ -4,7 +4,7 @@
 import logging
 import re
 
-from dipper.sources.Source import Source
+from dipper.sources.PostgreSQLSource import PostgreSQLSource
 from dipper.models.assoc.Association import Assoc
 from dipper.models.Dataset import Dataset
 from dipper.models.assoc.G2PAssoc import G2PAssoc
@@ -19,7 +19,7 @@
 logger = logging.getLogger(__name__)
 
 
-class MGI(Source):
+class MGI(PostgreSQLSource):
     """
     This is the [Mouse Genome Informatics](http://www.informatics.jax.org/) resource,
     from which we process genotype and phenotype data about laboratory mice.
@@ -127,7 +127,7 @@ class MGI(Source):
     }
 
     def __init__(self):
-        Source.__init__(self, 'mgi')
+        super().__init__('mgi')
         self.namespaces.update(curie_map.get())
 
         # update the dataset object with details about this resource
diff --git a/dipper/sources/MPD.py b/dipper/sources/MPD.py
index ca8f2000..2bc794cf 100644
--- a/dipper/sources/MPD.py
+++ b/dipper/sources/MPD.py
@@ -2,8 +2,6 @@
 import re
 import logging
 import io
-import math
-import zipfile
 from zipfile import ZipFile
 from dipper.models.Provenance import Provenance
 
diff --git a/dipper/sources/PostgreSQLSource.py b/dipper/sources/PostgreSQLSource.py
new file mode 100644
index 00000000..929e7801
--- /dev/null
+++ b/dipper/sources/PostgreSQLSource.py
@@ -0,0 +1,149 @@
+import psycopg2
+import logging
+import os
+from dipper.sources.Source import Source
+
+
+logger = logging.getLogger(__name__)
+
+
+class PostgreSQLSource(Source):
+    """
+    Class for interfacing with remote Postgres databases
+    """
+
+    def __init__(self, name=None):
+        super().__init__(name)
+        return
+
+    def fetch_from_pgdb(self, tables, cxn, limit=None, force=False):
+        """
+        Will fetch all Postgres tables from the specified database in the cxn connection parameters.
+        This will save them to a local file named the same as the table, in tab-delimited format, including a header.
+        :param tables: Names of tables to fetch
+        :param cxn: database connection details
+        :param limit: A max row count to fetch for each table
+        :return: None
+        """
+
+        con = None
+        try:
+            con = psycopg2.connect(host=cxn['host'], database=cxn['database'], port=cxn['port'],
+                                   user=cxn['user'], password=cxn['password'])
+            cur = con.cursor()
+            for t in tables:
+                logger.info("Fetching data from table %s", t)
+                self._getcols(cur, t)
+                query = ' '.join(("SELECT * FROM", t))
+                countquery = ' '.join(("SELECT COUNT(*) FROM", t))
+                if limit is not None:
+                    query = ' '.join((query, "LIMIT", str(limit)))
+                    countquery = ' '.join((countquery, "LIMIT", str(limit)))
+
+                outfile = '/'.join((self.rawdir,t))
+
+                filerowcount = -1
+                tablerowcount = -1
+                if not force:
+                    # check local copy.  assume that if the # rows are the same, that the table is the same
+                    # TODO may want to fix this assumption
+                    if os.path.exists(outfile):
+                        # get rows in the file
+                        filerowcount = self.file_len(outfile)
+                        logger.info("rows in local file: %s", filerowcount)
+
+                    # get rows in the table
+                    # tablerowcount=cur.rowcount
+                    cur.execute(countquery)
+                    tablerowcount = cur.fetchone()[0]
+
+                if force or filerowcount < 0 or (filerowcount-1) != tablerowcount:  # rowcount-1 because there's a header
+                    if force:
+                        logger.info("Forcing download of %s", t)
+                    else:
+                        logger.info("%s local (%d) different from remote (%d); fetching.", t, filerowcount, tablerowcount)
+                    # download the file
+                    logger.info("COMMAND:%s", query)
+                    outputquery = "COPY ({0}) TO STDOUT WITH DELIMITER AS '\t' CSV HEADER".format(query)
+                    with open(outfile, 'w') as f:
+                        cur.copy_expert(outputquery, f)
+                else:
+                    logger.info("local data same as remote; reusing.")
+
+        finally:
+            if con:
+                con.close()
+        return
+
+    def fetch_query_from_pgdb(self, qname, query, con, cxn, limit=None, force=False):
+        """
+        Supply either an already established connection, or connection parameters.
+        The supplied connection will override any separate cxn parameter
+        :param qname:  The name of the query to save the output to
+        :param query:  The SQL query itself
+        :param con:  The already-established connection
+        :param cxn: The postgres connection information
+        :param limit: If you only want a subset of rows from the query
+        :return:
+        """
+        if con is None and cxn is None:
+            logger.error("ERROR: you need to supply connection information")
+            return
+        if con is None and cxn is not None:
+            con = psycopg2.connect(host=cxn['host'], database=cxn['database'], port=cxn['port'],
+                                   user=cxn['user'], password=cxn['password'])
+
+        outfile = '/'.join((self.rawdir, qname))
+        cur = con.cursor()
+        countquery = ' '.join(("SELECT COUNT(*) FROM (", query, ") x"))  # wrap the query to get the count
+        if limit is not None:
+            countquery = ' '.join((countquery, "LIMIT", str(limit)))
+
+        # check local copy.  assume that if the # rows are the same, that the table is the same
+        filerowcount = -1
+        tablerowcount = -1
+        if not force:
+            if os.path.exists(outfile):
+                # get rows in the file
+                filerowcount = self.file_len(outfile)
+                logger.info("INFO: rows in local file: %s", filerowcount)
+
+            # get rows in the table
+            # tablerowcount=cur.rowcount
+            cur.execute(countquery)
+            tablerowcount = cur.fetchone()[0]
+
+        if force or filerowcount < 0 or (filerowcount-1) != tablerowcount:  # rowcount-1 because there's a header
+            if force:
+                logger.info("Forcing download of %s", qname)
+            else:
+                logger.info("%s local (%s) different from remote (%s); fetching.", qname, filerowcount, tablerowcount)
+            # download the file
+            logger.debug("COMMAND:%s", query)
+            outputquery = "COPY ({0}) TO STDOUT WITH DELIMITER AS '\t' CSV HEADER".format(query)
+            with open(outfile, 'w') as f:
+                cur.copy_expert(outputquery, f)
+            # Regenerate row count to check integrity
+            filerowcount = self.file_len(outfile)
+            if (filerowcount-1) != tablerowcount:
+                raise Exception("Download from MGI failed, %s != %s", (filerowcount-1), tablerowcount)
+        else:
+            logger.info("local data same as remote; reusing.")
+
+        return
+
+        # TODO generalize this to a set of utils
+    def _getcols(self, cur, table):
+        """
+        Will execute a pg query to get the column names for the given table.
+        :param cur:
+        :param table:
+        :return:
+        """
+        query = ' '.join(("SELECT * FROM", table, "LIMIT 0"))  # for testing
+
+        cur.execute(query)
+        colnames = [desc[0] for desc in cur.description]
+        logger.info("COLS (%s): %s", table, colnames)
+
+        return
\ No newline at end of file
diff --git a/dipper/sources/Source.py b/dipper/sources/Source.py
index 95b13142..572bae5f 100644
--- a/dipper/sources/Source.py
+++ b/dipper/sources/Source.py
@@ -1,4 +1,4 @@
-import psycopg2
+#import psycopg2
 
 __author__ = 'nicole'
 
@@ -230,7 +230,7 @@ def checkIfRemoteIsNewer(self, remote, local, headers):
             resp_header = response.getheaders()
             size = resp_header.get('Content-length')
             last_modified = resp_header.get('last-modified')  # check me
-        except :
+        except:
             resp_header = None
             size = 0
             last_modified = None
@@ -326,122 +326,6 @@ def fetch_from_url(self, remotefile, localfile, is_dl_forced, headers=None):
         logger.info("file created: %s", time.asctime(time.localtime(st[ST_CTIME])))
         return
 
-    def fetch_from_pgdb(self, tables, cxn, limit=None, force=False):
-        """
-        Will fetch all Postgres tables from the specified database in the cxn connection parameters.
-        This will save them to a local file named the same as the table, in tab-delimited format, including a header.
-        :param tables: Names of tables to fetch
-        :param cxn: database connection details
-        :param limit: A max row count to fetch for each table
-        :return: None
-        """
-
-        con = None
-        try:
-            con = psycopg2.connect(host=cxn['host'], database=cxn['database'], port=cxn['port'],
-                                   user=cxn['user'], password=cxn['password'])
-            cur = con.cursor()
-            for t in tables:
-                logger.info("Fetching data from table %s", t)
-                self._getcols(cur, t)
-                query = ' '.join(("SELECT * FROM", t))
-                countquery = ' '.join(("SELECT COUNT(*) FROM", t))
-                if limit is not None:
-                    query = ' '.join((query, "LIMIT", str(limit)))
-                    countquery = ' '.join((countquery, "LIMIT", str(limit)))
-
-                outfile = '/'.join((self.rawdir,t))
-
-                filerowcount = -1
-                tablerowcount = -1
-                if not force:
-                    # check local copy.  assume that if the # rows are the same, that the table is the same
-                    # TODO may want to fix this assumption
-                    if os.path.exists(outfile):
-                        # get rows in the file
-                        filerowcount = self.file_len(outfile)
-                        logger.info("rows in local file: %s", filerowcount)
-
-                    # get rows in the table
-                    # tablerowcount=cur.rowcount
-                    cur.execute(countquery)
-                    tablerowcount = cur.fetchone()[0]
-
-                if force or filerowcount < 0 or (filerowcount-1) != tablerowcount:  # rowcount-1 because there's a header
-                    if force:
-                        logger.info("Forcing download of %s", t)
-                    else:
-                        logger.info("%s local (%d) different from remote (%d); fetching.", t, filerowcount, tablerowcount)
-                    # download the file
-                    logger.info("COMMAND:%s", query)
-                    outputquery = "COPY ({0}) TO STDOUT WITH DELIMITER AS '\t' CSV HEADER".format(query)
-                    with open(outfile, 'w') as f:
-                        cur.copy_expert(outputquery, f)
-                else:
-                    logger.info("local data same as remote; reusing.")
-
-        finally:
-            if con:
-                con.close()
-        return
-
-    def fetch_query_from_pgdb(self, qname, query, con, cxn, limit=None, force=False):
-        """
-        Supply either an already established connection, or connection parameters.
-        The supplied connection will override any separate cxn parameter
-        :param qname:  The name of the query to save the output to
-        :param query:  The SQL query itself
-        :param con:  The already-established connection
-        :param cxn: The postgres connection information
-        :param limit: If you only want a subset of rows from the query
-        :return:
-        """
-        if con is None and cxn is None:
-            logger.error("ERROR: you need to supply connection information")
-            return
-        if con is None and cxn is not None:
-            con = psycopg2.connect(host=cxn['host'], database=cxn['database'], port=cxn['port'],
-                                   user=cxn['user'], password=cxn['password'])
-
-        outfile = '/'.join((self.rawdir, qname))
-        cur = con.cursor()
-        countquery = ' '.join(("SELECT COUNT(*) FROM (", query, ") x"))  # wrap the query to get the count
-        if limit is not None:
-            countquery = ' '.join((countquery, "LIMIT", str(limit)))
-
-        # check local copy.  assume that if the # rows are the same, that the table is the same
-        filerowcount = -1
-        tablerowcount = -1
-        if not force:
-            if os.path.exists(outfile):
-                # get rows in the file
-                filerowcount = self.file_len(outfile)
-                logger.info("INFO: rows in local file: %s", filerowcount)
-
-            # get rows in the table
-            # tablerowcount=cur.rowcount
-            cur.execute(countquery)
-            tablerowcount = cur.fetchone()[0]
-
-        if force or filerowcount < 0 or (filerowcount-1) != tablerowcount:  # rowcount-1 because there's a header
-            if force:
-                logger.info("Forcing download of %s", qname)
-            else:
-                logger.info("%s local (%s) different from remote (%s); fetching.", qname, filerowcount, tablerowcount)
-            # download the file
-            logger.debug("COMMAND:%s", query)
-            outputquery = "COPY ({0}) TO STDOUT WITH DELIMITER AS '\t' CSV HEADER".format(query)
-            with open(outfile, 'w') as f:
-                cur.copy_expert(outputquery, f)
-            # Regenerate row count to check integrity
-            filerowcount = self.file_len(outfile)
-            if (filerowcount-1) != tablerowcount:
-                raise Exception("Download from MGI failed, %s != %s", (filerowcount-1), tablerowcount)
-        else:
-            logger.info("local data same as remote; reusing.")
-
-        return
-
     def process_xml_table(self, elem, table_name, processing_function, limit):
         """
         This is a convenience function to process the elements of an xml document, when the xml is used
@@ -545,22 +429,6 @@ def compare_local_remote_bytes(self, remotefile, localfile):
                           local_size, remotefile, remote_size)
         return is_equal
 
-    # TODO generalize this to a set of utils
-    def _getcols(self, cur, table):
-        """
-        Will execute a pg query to get the column names for the given table.
-        :param cur:
-        :param table:
-        :return:
-        """
-        query = ' '.join(("SELECT * FROM", table, "LIMIT 0"))  # for testing
-
-        cur.execute(query)
-        colnames = [desc[0] for desc in cur.description]
-        logger.info("COLS (%s): %s", table, colnames)
-
-        return
-
     def file_len(self, fname):
         with open(fname) as f:
             l = sum(1 for line in f)
diff --git a/requirements.txt b/requirements.txt
index d0850162..c308b9df 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,8 +5,6 @@ roman
 python-docx
 pyyaml
 pysftp
-biopython
-docx
 beautifulsoup4
 GitPython
 intermine
diff --git a/requirements/core.txt b/requirements/core.txt
new file mode 100644
index 00000000..f96fef0f
--- /dev/null
+++ b/requirements/core.txt
@@ -0,0 +1,6 @@
+rdflib
+isodate
+roman
+python-docx
+pyyaml
+biopython
\ No newline at end of file
diff --git a/requirements/coriell.txt b/requirements/coriell.txt
new file mode 100644
index 00000000..89c07e1d
--- /dev/null
+++ b/requirements/coriell.txt
@@ -0,0 +1 @@
+pysftp
\ No newline at end of file
diff --git a/requirements/genereviews.txt b/requirements/genereviews.txt
new file mode 100644
index 00000000..041f722c
--- /dev/null
+++ b/requirements/genereviews.txt
@@ -0,0 +1 @@
+beautifulsoup4
\ No newline at end of file
diff --git a/requirements/hpoannotations.txt b/requirements/hpoannotations.txt
new file mode 100644
index 00000000..5779f396
--- /dev/null
+++ b/requirements/hpoannotations.txt
@@ -0,0 +1 @@
+GitPython
\ No newline at end of file
diff --git a/requirements/mgi.txt b/requirements/mgi.txt
new file mode 100644
index 00000000..ddb37e19
--- /dev/null
+++ b/requirements/mgi.txt
@@ -0,0 +1 @@
+psycopg2
\ No newline at end of file
diff --git a/requirements/zfin.txt b/requirements/zfin.txt
new file mode 100644
index 00000000..c44287fd
--- /dev/null
+++ b/requirements/zfin.txt
@@ -0,0 +1 @@
+intermine
\ No newline at end of file
diff --git a/setup.py b/setup.py
index d4242249..442a11c0 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
       version='0.0.1',
       description='Data Ingest Pipeline',
       packages=find_packages(),
-      install_requires=['psycopg2', 'rdflib', 'isodate', 'roman', 'python-docx', 'pyyaml', 'pysftp', 'biopython',
-                        'docx', 'beautifulsoup4', 'GitPython', 'intermine'],
+      install_requires=['psycopg2', 'rdflib', 'isodate', 'roman', 'python-docx', 'pyyaml', 'pysftp',
+                        'beautifulsoup4', 'GitPython', 'intermine'],
       include_package_data=True
       )