Merge branch 'master' of github.com:okfn/bibserver

rufuspollock-okfn · Jan 19, 2012 · f41d25e · f41d25e
2 parents 362987e + e650b3c
commit f41d25e
Show file tree

Hide file tree

Showing 14 changed files with 4,268 additions and 31 deletions.
diff --git a/bibserver/parser.py b/bibserver/parser.py
@@ -3,6 +3,7 @@
 from parsers.BibTexParser import BibTexParser
 from parsers.JSONParser import JSONParser
 from parsers.CSVParser import CSVParser
+from parsers.RISParser import RISParser
 
 class Parser(object):
 
@@ -13,17 +14,16 @@ def parse(self, fileobj, format):
         :return: a python dict json-i-fiable to bibjson.
         '''
         if format == "bibtex" or format == "bib":
-            parser = BibTexParser()
-            data, metadata = parser.parse(fileobj)
+            parser = BibTexParser(fileobj)
         elif format == "json":
-            parser = JSONParser()
-            data, metadata = parser.parse(fileobj)        
+            parser = JSONParser(fileobj)
         elif format == "csv" or format == "google":
-            parser = CSVParser()
-            data, metadata = parser.parse(fileobj)
+            parser = CSVParser(fileobj)
+        elif format == "ris":
+            parser = RISParser(fileobj)
         else:
             raise Exception('Unable to convert from format: %s' % format)
-
+        data, metadata = parser.parse()
         return data, metadata
 
 

diff --git a/bibserver/parsers/BibTexParser.py b/bibserver/parsers/BibTexParser.py
@@ -4,6 +4,8 @@
 import unicodedata
 import re
 
+from bibserver.parsers import BaseParser
+
 '''this file can be called as a module or called directly from the command line like so:
 
 python BibTexParser.py /path/to/file.bib
@@ -23,9 +25,11 @@
 Returns a record dict
 '''
 
-class BibTexParser(object):
+class BibTexParser(BaseParser):
 
-    def __init__(self):
+    def __init__(self, fileobj):
+        super(BibTexParser, self).__init__(fileobj)
+
         # set which bibjson schema this parser parses to
         self.schema = "v0.82"
         self.has_metadata = False
@@ -46,13 +50,13 @@ def __init__(self):
         }
         self.identifier_types = ["doi","isbn","issn"]
 
-    def parse(self, fileobj):
+    def parse(self):
         '''given a fileobject, parse it for bibtex records,
         and pass them to the record parser'''
         records = []
         record = ""
         # read each line, bundle them up until they form an object, then send for parsing
-        for line in fileobj:
+        for line in self.fileobj:
             if '--BREAK--' in line:
                 break
             else:
@@ -2663,11 +2667,10 @@ def getnames(self,names):
 
 # in case file is run directly
 if __name__ == "__main__":
-    import sys
-    parser = BibTexParser()
+    import sys
     try:
-        fileobj = open(sys.argv[1])
-        print parser.parse(fileobj)
+        parser = BibTexParser(open(sys.argv[1]))
+        print parser.parse()
     except:
+        # The argument could not be read/parsed as a file, so fall back to
+        # treating it as a literal BibTeX record.  When open() itself failed
+        # `parser` was never bound, so build one without a file object
+        # (BaseParser only touches objects that have a `seek` attribute).
+        parser = BibTexParser(None)
         print parser.parse_record(sys.argv[1])
 
diff --git a/bibserver/parsers/CSVParser.py b/bibserver/parsers/CSVParser.py
@@ -1,13 +1,11 @@
 import csv
+from bibserver.parsers import BaseParser
 
-class CSVParser(object):
+class CSVParser(BaseParser):
 
-    def __init__(self):
-        pass
-
-    def parse(self, fileobj):
+    def parse(self):
         #dialect = csv.Sniffer().sniff(fileobj.read(1024))
-        d = csv.DictReader(fileobj)
+        d = csv.DictReader(self.fileobj)
         data = []
 
         # do any required conversions

diff --git a/bibserver/parsers/JSONParser.py b/bibserver/parsers/JSONParser.py
@@ -1,12 +1,10 @@
 import json
+from bibserver.parsers import BaseParser
 
-class JSONParser(object):
+class JSONParser(BaseParser):
 
-    def __init__(self):
-        pass
-
-    def parse(self, fileobj):
-        incoming = json.load(fileobj)
+    def parse(self):
+        incoming = json.load(self.fileobj)
 
         if 'records' in incoming:
             # if the incoming is bibjson, get records and metadata

diff --git a/bibserver/parsers/RISParser.py b/bibserver/parsers/RISParser.py
@@ -0,0 +1,109 @@
+'''this file can be called as a module or called directly from the command line like so:
+
+python RISParser.py /path/to/file.txt
+Returns a list of record dicts
+
+Details of the RIS format
+http://en.wikipedia.org/wiki/RIS_%28file_format%29
+'''
+
+# Maps a two-letter RIS tag to the field name used in the output record.
+# Tags that are not listed here are looked up with FIELD_MAP.get(tag, tag)
+# by RISParser.add_chunk, i.e. they keep the raw tag as their field name.
+#
+# NOTE(review): several tags share one output name ("AU"/"A2" -> "author",
+# "M1"/"IS" -> "number", "N1"/"RN" -> "notes", "C1"/"AB" -> "note"), so a
+# record containing both tags of a pair has two sources for one key.
+# NOTE(review): "TY" maps to "type " with a trailing space, which keeps it
+# distinct from "M3" -> "type"; presumably a typo -- confirm before changing.
+FIELD_MAP = {
+    "DO": "doi", 
+    "SP": "pages", 
+    "M2": "start page", 
+    "DB": "name of database", 
+    "DA": "date", 
+    "M1": "number", 
+    "M3": "type", 
+    "N1": "notes", 
+    "ST": "short title", 
+    "DP": "database provider", 
+    "CN": "call number", 
+    "IS": "number", 
+    "LB": "label", 
+    "TA": "translated author", 
+    "TY": "type ", 
+    "UR": "url", 
+    "TT": "translated title", 
+    "PY": "year", 
+    "PB": "publisher", 
+    "A3": "tertiary author", 
+    "C8": "custom 8", 
+    "A4": "subsidiary author", 
+    "TI": "title", 
+    "C3": "custom 3", 
+    "C2": "pmcid", 
+    "C1": "note", 
+    "C7": "custom 7", 
+    "C6": "nihmsid", 
+    "C5": "custom 5", 
+    "C4": "custom 4", 
+    "AB": "note", 
+    "AD": "institution", 
+    "VL": "volume", 
+    "CA": "caption", 
+    "T2": "secondary title", 
+    "T3": "tertiary title", 
+    "AN": "accession number", 
+    "L4": "figure", 
+    "NV": "number of volumes", 
+    "AU": "author", 
+    "RP": "reprint edition", 
+    "L1": "file attachments", 
+    "ET": "epub date", 
+    "A2": "author", 
+    "RN": "notes", 
+    "LA": "language", 
+    "CY": "place published", 
+    "J2": "alternate title", 
+    "RI": "reviewed item", 
+    "KW": "keywords", 
+    "SN": "issn", 
+    "Y2": "access date", 
+    "SE": "section", 
+    "OP": "original publication"
+}
+
+# Per-tag converters.  Each function receives the list of raw values that
+# were collected for one tag and returns the value stored in the record.
+# "AU" becomes a list of {u'name': ...} dicts, one per value, with every
+# value decoded from UTF-8.
+VALUE_MAP = {
+    'AU' : lambda v: [{u'name':vv.decode('utf8')} for vv in v]
+}
+# Converter for every tag without a VALUE_MAP entry: decode each value from
+# UTF-8 and join them into one unicode string separated by single spaces.
+DEFAULT_VALUE_FUNC = lambda v: u' '.join(vv.decode('utf8') for vv in v)
+
+from bibserver.parsers import BaseParser
+
+class RISParser(BaseParser):
+    '''Parser for RIS-tagged bibliographic files.
+
+    Reads lines of the form ``XX  - value`` from the file object handed to
+    the constructor and builds one record dict per ``TY`` entry.
+    '''
+
+    def __init__(self, fileobj):
+        '''Store ``fileobj`` through BaseParser and start with no records.'''
+        super(RISParser, self).__init__(fileobj)
+        self.data = []
+
+    def add_chunk(self, chunk):
+        '''Convert one raw record and append it to ``self.data``.
+
+        :param chunk: dict mapping an RIS tag to the list of raw values
+            collected for that tag; an empty chunk is ignored.
+
+        Tags are renamed through FIELD_MAP (unknown tags keep their tag)
+        and their values converted with the VALUE_MAP function for the tag,
+        or DEFAULT_VALUE_FUNC when the tag has none.
+        '''
+        if not chunk:
+            return
+        # Several tags share one output name in FIELD_MAP (e.g. AU/A2 ->
+        # "author", M1/IS -> "number").  Collect the raw values per output
+        # name first so that no tag silently overwrites another one.  Tags
+        # with a dedicated converter are visited first so that their values
+        # lead and their converter is the one applied to the merged list.
+        grouped = {}
+        for tag in sorted(chunk, key=lambda t: (t not in VALUE_MAP, t)):
+            name = FIELD_MAP.get(tag, tag)
+            entry = grouped.setdefault(name, [None, []])
+            if entry[0] is None:
+                entry[0] = VALUE_MAP.get(tag)
+            entry[1].extend(chunk[tag])
+        tmp = {}
+        for name, (convert, values) in grouped.items():
+            tmp[name] = (convert or DEFAULT_VALUE_FUNC)(values)
+        self.data.append(tmp)
+
+    def parse(self):
+        '''Parse the whole file object into records.
+
+        A ``TY`` line starts a new record; every other tagged line adds its
+        value to the record currently being collected.  Blank lines, lines
+        without the ``'  - '`` separator and tags with an empty value are
+        skipped.
+
+        :return: tuple ``(records, metadata)`` where records is
+            ``self.data`` (a list of dicts) and metadata is an empty dict.
+        '''
+        chunk = {}
+        for line in self.fileobj:
+            line = line.strip()
+            if not line:
+                continue
+            # split on the first separator only; the value itself may
+            # contain further '  - ' sequences
+            field, separator, value = line.partition('  - ')
+            if not separator:
+                continue
+            if field == 'TY':
+                # a new record begins: flush the one collected so far
+                self.add_chunk(chunk)
+                chunk = {}
+            if value:
+                chunk.setdefault(field, []).append(value)
+        # flush the final record, which has no following TY line
+        self.add_chunk(chunk)
+        return self.data, {}
+
+# in case file is run directly
+if __name__ == "__main__":
+    # Command-line use: parse the RIS file named by the first argument and
+    # write the resulting records to stdout as indented JSON.
+    import sys
+    import json
+    # the with-block closes the input file as soon as parsing is finished
+    with open(sys.argv[1]) as fileobj:
+        parser = RISParser(fileobj)
+        data, metadata = parser.parse()
+    sys.stdout.write(json.dumps(data, indent=2))
diff --git a/bibserver/parsers/__init__.py b/bibserver/parsers/__init__.py
@@ -0,0 +1,8 @@
+class BaseParser(object):
+    '''Common base of the format parsers; keeps the input file object.'''
+
+    def __init__(self, fileobj):
+        '''Remember ``fileobj`` as ``self.fileobj``.
+
+        For objects that offer ``seek`` the first three bytes are read; if
+        they are a UTF-8 byte-order mark the position is left just after
+        it, otherwise the object is rewound to offset 0.
+        '''
+        seekable = hasattr(fileobj, 'seek')
+        if seekable:
+            head = fileobj.read(3)
+            starts_with_bom = (head == '\xef\xbb\xbf')
+            if not starts_with_bom:
+                # nothing to skip: hand the parser the file from its start
+                fileobj.seek(0)
+        self.fileobj = fileobj
diff --git a/doc/Makefile b/doc/Makefile
@@ -0,0 +1,153 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+# Every target below is a command, not a file; texinfo and info are listed
+# too so that a file or directory of that name cannot shadow them.
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man texinfo info changes linkcheck doctest gettext
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+	-rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/BibServer.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/BibServer.qhc"
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/BibServer"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/BibServer"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+# Build the Texinfo sources, then run makeinfo through the sub-make.
+# $(MAKE) (as in latexpdf) passes flags and the jobserver on to the sub-make.
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	$(MAKE) -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."