Actual modifications that belong with commit 4300aa1

Fixes Unicode handling in BibTexParser, and charset detection and BOM removal in BaseParser. NOTE: A limitation of this change is that input files must fit in memory, because the entire input file is read up front and then provided to subclasses via a cStringIO buffer.
rufuspollock-okfn · Jan 31, 2012 · f4e001e · f4e001e
1 parent 4300aa1
commit f4e001e
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 12 deletions.
diff --git a/bibserver/parsers/BibTexParser.py b/bibserver/parsers/BibTexParser.py
@@ -1,6 +1,5 @@
 import string
 import json
-import chardet
 import unicodedata
 import re
 
@@ -242,10 +241,7 @@ def string_subst(self, val):
             if val == k:
                 val = self.replace_dict[k]
         if not isinstance(val, unicode):
-            encoding = chardet.detect(val)["encoding"]
-            if not encoding:
-                encoding = 'ascii'
-            val = unicode(val,encoding,'ignore')
+            val = unicode(val,self.encoding,'ignore')
         if '\\' in val or '{' in val:
             for k, v in self.unicode_to_latex.iteritems():
                 if v in val:
@@ -267,7 +263,7 @@ def add_val(self, val):
         val = self.strip_braces(val)
         val = self.string_subst(val)
         """alter based on particular key types"""
-        return unicodedata.normalize('NFKD', val).replace(u'\x00', '').replace(u'\x1A', '').encode('utf-8','ignore')
+        return unicodedata.normalize('NFKD', val).replace(u'\x00', '').replace(u'\x1A', '')
 
 
     def add_key(self, key):

diff --git a/bibserver/parsers/__init__.py b/bibserver/parsers/__init__.py
@@ -1,8 +1,12 @@
+import cStringIO
+import chardet
+
 class BaseParser(object):
     def __init__(self, fileobj):
-        if hasattr(fileobj, 'seek'):
-            # Some files have Byte-order marks inserted at the start
-            possible_BOM = fileobj.read(3)
-            if possible_BOM != '\xef\xbb\xbf':
-                fileobj.seek(0)
-        self.fileobj = fileobj
+        data = fileobj.read()
+        self.encoding = chardet.detect(data).get('encoding', 'ascii')
+
+        # Some files have Byte-order marks inserted at the start
+        if data[:3] == '\xef\xbb\xbf':
+            data = data[3:]
+        self.fileobj = cStringIO.StringIO(data)
diff --git a/test/test_bibtexparser.py b/test/test_bibtexparser.py
@@ -9,3 +9,11 @@ def test_01(self):
         print data
         assert data[0]['title'] == 'Visibility to infinity in the hyperbolic plane, despite obstacles'
 
+    def test_empty_name(self):
+        collection = 'testing'
+        sample = open('test/data/sampleutf8.bibtex')
+        parser = BibTexParser(sample)
+        data, metadata = parser.parse()
+        print data[0]['title']
+        assert data[0]['title'] == u'\u201cBibliotheken fu\u0308r o\u0308ffnen\u201d'
+