Skip to content

Commit

Permalink
Actual modifications that belong with commit 4300aa1
Browse files Browse the repository at this point in the history
Fixes Unicode handling in BibTexParser, and fixes charset detection and BOM removal in BaseParser.
NOTE: This has the limitation that input files need to fit in memory, because the whole input file is read up front and then provided to subclasses via a StringIO object.
  • Loading branch information
epoz committed Jan 31, 2012
1 parent 4300aa1 commit f4e001e
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 12 deletions.
8 changes: 2 additions & 6 deletions bibserver/parsers/BibTexParser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import string
import json
import chardet
import unicodedata
import re

Expand Down Expand Up @@ -242,10 +241,7 @@ def string_subst(self, val):
if val == k:
val = self.replace_dict[k]
if not isinstance(val, unicode):
encoding = chardet.detect(val)["encoding"]
if not encoding:
encoding = 'ascii'
val = unicode(val,encoding,'ignore')
val = unicode(val,self.encoding,'ignore')
if '\\' in val or '{' in val:
for k, v in self.unicode_to_latex.iteritems():
if v in val:
Expand All @@ -267,7 +263,7 @@ def add_val(self, val):
val = self.strip_braces(val)
val = self.string_subst(val)
"""alter based on particular key types"""
return unicodedata.normalize('NFKD', val).replace(u'\x00', '').replace(u'\x1A', '').encode('utf-8','ignore')
return unicodedata.normalize('NFKD', val).replace(u'\x00', '').replace(u'\x1A', '')


def add_key(self, key):
Expand Down
16 changes: 10 additions & 6 deletions bibserver/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import cStringIO
import chardet

class BaseParser(object):
    """Base class for parsers: normalises the raw input before parsing.

    The whole input is read into memory, its character encoding is guessed
    with chardet, a leading UTF-8 byte-order mark is dropped, and the
    remaining bytes are exposed as an in-memory file-like object.

    Attributes set by ``__init__``:
        encoding: name of the detected encoding; ``'ascii'`` when chardet
            cannot determine one.
        fileobj: ``cStringIO.StringIO`` wrapping the BOM-free data.
    """

    # UTF-8 encoded byte-order mark that some files have at the start.
    _UTF8_BOM = '\xef\xbb\xbf'

    def __init__(self, fileobj):
        """Read ``fileobj`` completely and prepare it for parsing.

        ``fileobj`` only needs a ``read()`` method. Because the input is
        read in one go, it has to fit in memory.
        """
        data = fileobj.read()

        # chardet.detect() always returns an 'encoding' key and sets it to
        # None when it cannot guess, so a .get() default would never apply.
        # Fall back explicitly; a None encoding would break later decoding.
        self.encoding = chardet.detect(data).get('encoding') or 'ascii'

        # Strip the byte-order mark so it does not end up in the parsed data.
        if data.startswith(self._UTF8_BOM):
            data = data[len(self._UTF8_BOM):]
        self.fileobj = cStringIO.StringIO(data)
8 changes: 8 additions & 0 deletions test/test_bibtexparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,11 @@ def test_01(self):
print data
assert data[0]['title'] == 'Visibility to infinity in the hyperbolic plane, despite obstacles'

def test_empty_name(self):
collection = 'testing'
sample = open('test/data/sampleutf8.bibtex')
parser = BibTexParser(sample)
data, metadata = parser.parse()
print data[0]['title']
assert data[0]['title'] == u'\u201cBibliotheken fu\u0308r o\u0308ffnen\u201d'

0 comments on commit f4e001e

Please sign in to comment.