Skip to content

Commit

Permalink
force unicode strings on field input
Browse files Browse the repository at this point in the history
Adds util._unicode(string) (in the new util module).
This gets the preferred encoding and tries to decode the string.
It is safe to pass in numbers or unicode objects.
The result will still be unicode.

Decoding errors are ignored; the corresponding characters are skipped.
Hopefully Lucene will give some results when some chars are missing.

Since we have all strings in unicode now, we don't need the unicode
literals u'...' anymore in _do_mb_search. (tested)
This might help supporting Python3.

Signed-off-by: Johannes Dewender <github@JonnyJD.net>
  • Loading branch information
JonnyJD committed Mar 28, 2012
1 parent e791ea1 commit 97c535d
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 5 deletions.
15 changes: 10 additions & 5 deletions musicbrainzngs/musicbrainz.py
Expand Up @@ -6,7 +6,6 @@
import urlparse
import urllib2
import urllib
import mbxml
import re
import threading
import time
Expand All @@ -16,6 +15,9 @@
import xml.etree.ElementTree as etree
from xml.parsers import expat

import mbxml
import util

_version = "0.3dev"
_log = logging.getLogger("musicbrainzngs")

Expand Down Expand Up @@ -519,7 +521,10 @@ def _do_mb_search(entity, query='', fields={}, limit=None, offset=None):
for the given entity type.
"""
# Encode the query terms as a Lucene query string.
query_parts = [query.replace('\x00', '').strip()]
query_parts = []
if query:
clean_query = util._unicode(query)
query_parts.append(clean_query)
for key, value in fields.iteritems():
# Ensure this is a valid search field.
if key not in VALID_SEARCH_FIELDS[entity]:
Expand All @@ -528,12 +533,12 @@ def _do_mb_search(entity, query='', fields={}, limit=None, offset=None):
)

# Escape Lucene's special characters.
value = util._unicode(value)
value = re.sub(r'([+\-&|!(){}\[\]\^"~*?:\\])', r'\\\1', value)
value = value.replace('\x00', '').strip()
value = value.lower() # Avoid binary operators like OR.
if value:
query_parts.append(u'%s:(%s)' % (key, value))
full_query = u' '.join(query_parts).strip()
query_parts.append('%s:(%s)' % (key, value))
full_query = ' '.join(query_parts).strip()
if not full_query:
raise ValueError('at least one query term is required')

Expand Down
23 changes: 23 additions & 0 deletions musicbrainzngs/util.py
@@ -0,0 +1,23 @@
# This file is part of the musicbrainzngs library
# Copyright (C) Alastair Porter, Adrian Sampson, and others
# This file is distributed under a BSD-2-Clause type license.
# See the COPYING file for more information.

import sys
import locale

def _unicode(string, encoding=None):
"""Try to decode byte strings to unicode.
This can only be a guess, but this might be better than failing.
It is safe to use this on numbers or strings that are already unicode.
"""
if isinstance(string, str):
# use given encoding, stdin, preferred until something != None is found
if encoding is None:
encoding = sys.stdin.encoding
if encoding is None:
encoding = locale.getpreferredencoding()
unicode_string = unicode(string, encoding, "ignore")
else:
unicode_string = unicode(string)
return unicode_string.replace('\x00', '').strip()

0 comments on commit 97c535d

Please sign in to comment.