Merge pull request #9 from mhalagan-nmdp/update-docs

Updated documentation and tests.
nmdp-bioinformatics · Apr 16, 2018 · 04042d5 · 04042d5
2 parents cd0c4b0 + 128656a
commit 04042d5
Show file tree

Hide file tree

Showing 17 changed files with 690 additions and 623 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -2,15 +2,10 @@
 # This file will be regenerated if you run travis_pypi_setup.py
 
 language: python
-python: 3.5
+python: 3.6
 
 env:
-  - TOXENV=py35
-  - TOXENV=py34
-  - TOXENV=py33
-  - TOXENV=py27
-  - TOXENV=py26
-  - TOXENV=pypy
+  - TOXENV=py36
 
 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
 install: pip install -U tox
@@ -23,10 +18,10 @@ script: tox -e ${TOXENV}
 deploy:
   provider: pypi
   distributions: sdist bdist_wheel
-  user: mhalagan-nmdp
+  user: mhalagan
   password:
     secure: PLEASE_REPLACE_ME
   on:
     tags: true
-    repo: mhalagan-nmdp/pyhml
-    condition: $TOXENV == py27
+    repo: nmdp-bioinformatics/pyhml
+    condition: $TOXENV == py36
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -11,3 +11,4 @@ Contributors
 ------------
 
 * Mike Halagan <mhalagan@nmdp.org>
+* Bob Milius <bmilius@nmdp.org>
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -2,6 +2,27 @@
 History
 =======
 
+0.0.4 (2018-04-15)
+------------------
+
+* Fixed dependency issues.
+* Moved tobiotype to HML object.
+* Moved toDF to the HML object and renamed it to toPandas().
+* Added tests and linked the repository to Travis CI.
+
+0.0.3 (2018-04-14)
+------------------
+
+* Added the ability to parse .gz files
+* Added the ability to parse HML files with bad tags.
+
+
+0.0.2 (2017-11-14)
+------------------
+
+* Fixed issues with parsing HML files with missing data
+
+
 0.0.1 (2017-10-19)
 ------------------
 

diff --git a/README.rst b/README.rst
@@ -6,23 +6,23 @@ pyHML
 .. image:: https://img.shields.io/pypi/v/pyhml.svg
         :target: https://pypi.python.org/pypi/pyhml
 
-.. image:: https://img.shields.io/travis/mhalagan-nmdp/pyhml.svg
-        :target: https://travis-ci.org/mhalagan-nmdp/pyhml
+.. image:: https://img.shields.io/travis/nmdp-bioinformatics/pyhml.svg
+        :target: https://travis-ci.org/nmdp-bioinformatics/pyhml
 
 .. image:: https://readthedocs.org/projects/pyhml/badge/?version=latest
         :target: https://pyhml.readthedocs.io/en/latest/?badge=latest
         :alt: Documentation Status
 
-.. image:: https://pyup.io/repos/github/mhalagan-nmdp/pyhml/shield.svg
-     :target: https://pyup.io/repos/github/mhalagan-nmdp/pyhml/
+.. image:: https://pyup.io/repos/github/nmdp-bioinformatics/pyHML/shield.svg
+     :target: https://pyup.io/repos/github/nmdp-bioinformatics/pyHML/
      :alt: Updates
 
 
 Python HML parser
 
 * Free software: LGPL 3.0
 * Documentation: https://pyhml.readthedocs.io.
-
+* `Jupyter Notebook`_
 
 Features
 --------
@@ -36,13 +36,13 @@ Features
     outdir = 'output/directory'
 
     # Print out each subject in fasta format
-    pyhml.tobiotype(hml, outdir, dtype='fasta', by='subject')
+    hml.tobiotype(outdir, dtype='fasta', by='subject')
 
     # Print out the full HML file in IMGT dat file format
-    pyhml.tobiotype(hml, outdir, dtype='imgt', by='hml')
+    hml.tobiotype(outdir, dtype='imgt', by='hml')
 
     # Get pandas DF from HML object
-    pandasdf = pyhml.toDF(hml)
+    pandasdf = hml.toPandas()
     print(pandasdf)
 
              ID     Locus                             glstring dbversion  \
@@ -97,4 +97,4 @@ This package was created with Cookiecutter_ and the `audreyr/cookiecutter-pypack
 
 .. _Cookiecutter: https://github.com/audreyr/cookiecutter
 .. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage
-
+.. _`Jupyter Notebook`: https://github.com/nmdp-bioinformatics/pyHML/blob/master/notebook/Examples.ipynb
diff --git a/docs/usage.rst b/docs/usage.rst
@@ -7,6 +7,14 @@ To use pyHML in a project::
     import pyhml 
     hmlparser = pyhml.HmlParser()
     hml = hmlparser.parse("hml_example.xml")
-    pandasdf = pyhml.toDF(hml)
+    pandasdf = hml.toPandas()
 
+    # Output the HML data as an IPD-IMGT/HLA .dat file for each subject
+    hml.tobiotype("output/directory", dtype='imgt', by='subject')
+
+    # Output the whole HML file as one fasta file
+    hml.tobiotype("output/directory", dtype='fasta', by='file')
+
+    # Defaults to dtype='fasta' and by='file'
+    hml.tobiotype("output/directory")
 
diff --git a/notebook/Examples.ipynb b/notebook/Examples.ipynb
diff --git a/pyhml/__init__.py b/pyhml/__init__.py
@@ -23,9 +23,7 @@
 #
 from __future__ import absolute_import
 from .pyhml import HmlParser
-from .pyhml import tobiotype
-from .pyhml import toDF
 
 __author__ = """Mike Halagan"""
 __email__ = 'mhalagan@nmdp.org'
-__version__ = '0.0.3'
+__version__ = '0.0.4'
diff --git a/pyhml/models/hml.py b/pyhml/models/hml.py
@@ -9,6 +9,7 @@
 from ..util import deserialize_model
 import pandas as pd
 from Bio import SeqIO
+from pandas import DataFrame
 
 
 class HML(Model):
@@ -170,12 +171,19 @@ def sample(self, sample: List[Sample]):
 
         self._sample = sample
 
-    def toPandas(self):
+    def toPandas(self) -> DataFrame:
         """
-        Sets the typing of this Sample.
+        Returns all the HML data as a pandas DataFrame.
 
-        :param typing: The typing of this Sample.
-        :type typing: List[Typing]
+        Examples:
+
+            >>> import pyhml
+            >>> hmlparser = pyhml.HmlParser(verbose=True)
+            >>> hml = hmlparser.parse(hml_file)
+            >>> hml_df = hml.toPandas()
+
+        :return: Pandas dataframe
+        :rtype: DataFrame
         """
         data = []
         for sample in self.sample:
@@ -193,8 +201,19 @@ def tobiotype(self, outdir, dtype='fasta', by='file'):
         """
         Converts an HML object to a BioPython data format
 
-        :param typing: The typing of this Sample.
-        :type typing: List[Typing]
+        Examples:
+
+            >>> import pyhml
+            >>> hmlparser = pyhml.HmlParser(verbose=True)
+            >>> hml = hmlparser.parse(hml_file)
+            >>> hml.tobiotype("output/directory",dtype='imgt', by='subject')
+
+        :param outdir: The output directory
+        :type outdir: str
+        :param dtype: The BioPython output type
+        :type dtype: str
+        :param by: What to print out the HML file by
+        :type by: str
         """
         if by == 'subject':
             for sample in self.sample:

diff --git a/pyhml/pyhml.py b/pyhml/pyhml.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 #    pyhml pyHML.
-#    Copyright (c) 2017 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
+#    Copyright (c) 2018 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
 #
 #    This library is free software; you can redistribute it and/or modify it
 #    under the terms of the GNU Lesser General Public License as published
@@ -22,31 +22,24 @@
 #
 import os
 import re
+import logging
 import xmlschema
 import xmltodict
 
-from Bio import SeqIO
-from Bio.SeqFeature import SeqFeature, FeatureLocation, ExactPosition
+from sh import gunzip
 from Bio.Seq import Seq
-from Bio.SeqRecord import SeqRecord
-from collections import OrderedDict
 from Bio.Alphabet import IUPAC
 
 from pyhml.models.hml import HML
-from pyhml.models.reporting_center import ReportingCenter
 from pyhml.models.sample import Sample
 from pyhml.models.typing import Typing
-from pyhml.models.allele_assignment import AlleleAssignment
+from pyhml.models.haploid import Haploid
 from pyhml.models.consensus import Consensus
-from pyhml.models.typing_method import TypingMethod
-from pyhml.models.consensus_seq_block import ConsensusSeqBlock
 from pyhml.models.ref_database import RefDatabase
 from pyhml.models.ref_sequence import RefSequence
-from pyhml.models.haploid import Haploid
-
-from sh import gunzip
-
-import logging
+from pyhml.models.reporting_center import ReportingCenter
+from pyhml.models.allele_assignment import AlleleAssignment
+from pyhml.models.consensus_seq_block import ConsensusSeqBlock
 
 logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                     datefmt='%m/%d/%Y %I:%M:%S %p',
@@ -55,13 +48,26 @@
 
 class HmlParser(object):
     """
-    import pyhml
-    hmlparser = pyhml.HmlParser()
-    hml_file = "hml_test.xml"
-    hml = hmlparser.parse(hml_file)
-    hml_df = pyhml.toDF(hml)
+    A python HML parser that converts any valid HML
+    file into a python ``object``. Allows users to easily
+    interact with HML data as python objects. Users can
+    also easily convert the HML data to a pandas DataFrame. If
+    no ``hmlversion`` is provided, then the schemas for all
+    HML versions are loaded.
+
+    Examples:
+
+        >>> import pyhml
+        >>> hmlparser = pyhml.HmlParser(verbose=True)
+        >>> hml = hmlparser.parse(hml_file)
+        >>> hml_df = hml.toPandas()
+
+    :param hmlversion: A specific HML version to load.
+    :type hmlversion: str
+    :param verbose: Flag for running in verbose.
+    :type verbose: bool
     """
-    def __init__(self, hmlversion=None, verbose=False):
+    def __init__(self, hmlversion: str=None, verbose: bool=False):
         """
         HmlParser - a model
         """
@@ -70,6 +76,8 @@ def __init__(self, hmlversion=None, verbose=False):
         self.hmlversion = hmlversion
         data_dir = os.path.dirname(__file__)
         self.logger = logging.getLogger("Logger." + __name__)
+
+        # TODO: get schemas from hml.b12x.org
         self.versions = ['1.0.1', '1.0', '0.9.4', '0.9.5',
                          '0.9.6', '0.9.7', '1.0.2']
         if not hmlversion:
@@ -82,12 +90,17 @@ def __init__(self, hmlversion=None, verbose=False):
             xsd_file = data_dir + '/data/hml-' + hmlversion + '.xsd'
             self.schemas.update({hmlversion: xmlschema.XMLSchema(xsd_file)})
 
-    def parse(self, hml_file):
+    def parse(self, hml_file: str) -> HML:
         """
-        Sets the typing of this Sample.
+        Parses an HML file into a python object.
 
-        :param typing: The typing of this Sample.
-        :type typing: List[Typing]
+            >>> hml = hmlparser.parse(hml_file)
+            >>> hml_df = hml.toPandas()
+
+        :param hml_file: A valid HML file
+        :type hml_file: str
+        :return: Object containing HML data
+        :rtype: HML
         """
         # Unzip HML file if it has a .gz extension
         if re.search("\.gz", hml_file):
@@ -285,7 +298,7 @@ def _unzip_clean(self, hmlfile):
         :type typing: List[Typing]
         """
         gunzip(hmlfile)
-        hml_unzipped = ".".join(hmlfile.split(".")[0:3])
+        hml_unzipped = ".".join(hmlfile.split(".")[0:len(hmlfile.split("."))-1])
         cmd4 = "perl -p -i -e 's/<\?xml.+\?>//g' " + hml_unzipped
         os.system(cmd4)
         cmd1 = "perl -p -i -e 's/\?//g' " + hml_unzipped

diff --git a/requirements.txt b/requirements.txt
@@ -1,9 +1,9 @@
-biopython==1.70
-bson==0.5.2
+biopython==1.71
 numpy==1.14.2
 pandas==0.20.3
 python-dateutil==2.7.2
 pytz==2018.3
 six==1.11.0
 xmlschema==0.9.13
 xmltodict==0.11.0
+sh==1.12.14
diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.0.3
+current_version = 0.0.4
 commit = True
 tag = True
 

diff --git a/setup.py b/setup.py
@@ -36,11 +36,11 @@
     'biopython==1.70',
     'pandas==0.20.3',
     'numpy==1.14.2',
-    'bson==0.5.2',
     'six==1.11.0',
     'xmlschema==0.9.13',
     'xmltodict==0.11.0',
     'pytz==2018.3',
+    'sh==1.12.14',
     'python-dateutil==2.7.2'
 ]
 
@@ -50,7 +50,7 @@
 
 setup(
     name='pyhml',
-    version='0.0.3',
+    version='0.0.4',
     description="Python HML parser",
     long_description=readme + '\n\n' + history,
     author="Mike Halagan",

diff --git a/tests/resources/2609.hml101.xml.gz b/tests/resources/2609.hml101.xml.gz
diff --git a/tests/resources/3054.hml101.xml.gz b/tests/resources/3054.hml101.xml.gz
diff --git a/tests/test_pyhml.py b/tests/test_pyhml.py
@@ -32,14 +32,14 @@
 Tests for `pyhml` module.
 """
 
-
+import os
 import sys
 import unittest
 
 from pyhml.pyhml import HmlParser
 from pyhml.models.hml import HML
+
 from Bio import SeqIO
-import os
 from pandas import DataFrame
 
 

diff --git a/tox.ini b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py26, py27, py33, py34, py35, flake8
+envlist = py36, flake8
 
 [testenv:flake8]
 basepython=python
@@ -10,9 +10,9 @@ commands=flake8 pyhml
 setenv =
     PYTHONPATH = {toxinidir}:{toxinidir}/pyhml
 
-commands = python setup.py test
+commands = python -m unittest
 
 ; If you want to make tox run the tests with the same versions, create a
 ; requirements.txt with the pinned versions and uncomment the following lines:
 ; deps =
-;     -r{toxinidir}/requirements.txt
+;     -r {toxinidir}/requirements.txt