From 97c535d41113eb18ea086cca0b2b091538e664f1 Mon Sep 17 00:00:00 2001
From: Johannes Dewender <github@JonnyJD.net>
Date: Wed, 21 Mar 2012 23:43:38 +0100
Subject: [PATCH] force unicode strings on field input

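Adds util._unicode(string) (in the new util module).
It decodes byte strings using the given encoding, falling back to the
stdin encoding and then to the locale's preferred encoding.
It is safe to pass in numbers or unicode objects;
the result will still be unicode.
NUL characters and surrounding whitespace are stripped from the result.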

Decoding errors are ignored; the corresponding characters are skipped.
Hopefully Lucene will still give some results when some chars are missing.

Since all strings are unicode now, we no longer need the unicode
literals u'...' in _do_mb_search (tested).
This might help with supporting Python 3.

Signed-off-by: Johannes Dewender <github@JonnyJD.net>
---
 musicbrainzngs/musicbrainz.py | 15 ++++++++++-----
 musicbrainzngs/util.py        | 23 +++++++++++++++++++++++
 2 files changed, 33 insertions(+), 5 deletions(-)
 create mode 100644 musicbrainzngs/util.py

diff --git a/musicbrainzngs/musicbrainz.py b/musicbrainzngs/musicbrainz.py
index 65127b5..ebf2175 100644
--- a/musicbrainzngs/musicbrainz.py
+++ b/musicbrainzngs/musicbrainz.py
@@ -6,7 +6,6 @@
 import urlparse
 import urllib2
 import urllib
-import mbxml
 import re
 import threading
 import time
@@ -16,6 +15,9 @@
 import xml.etree.ElementTree as etree
 from xml.parsers import expat
 
+import mbxml
+import util
+
 _version = "0.3dev"
 _log = logging.getLogger("musicbrainzngs")
 
@@ -519,7 +521,10 @@ def _do_mb_search(entity, query='', fields={}, limit=None, offset=None):
 	for the given entity type.
 	"""
 	# Encode the query terms as a Lucene query string.
-	query_parts = [query.replace('\x00', '').strip()]
+	query_parts = []
+	if query:
+		clean_query = util._unicode(query)
+		query_parts.append(clean_query)
 	for key, value in fields.iteritems():
 		# Ensure this is a valid search field.
 		if key not in VALID_SEARCH_FIELDS[entity]:
@@ -528,12 +533,12 @@ def _do_mb_search(entity, query='', fields={}, limit=None, offset=None):
 			)
 
 		# Escape Lucene's special characters.
+		value = util._unicode(value)
 		value = re.sub(r'([+\-&|!(){}\[\]\^"~*?:\\])', r'\\\1', value)
-		value = value.replace('\x00', '').strip()
 		value = value.lower() # Avoid binary operators like OR.
 		if value:
-			query_parts.append(u'%s:(%s)' % (key, value))
-	full_query = u' '.join(query_parts).strip()
+			query_parts.append('%s:(%s)' % (key, value))
+	full_query = ' '.join(query_parts).strip()
 	if not full_query:
 		raise ValueError('at least one query term is required')
 
diff --git a/musicbrainzngs/util.py b/musicbrainzngs/util.py
new file mode 100644
index 0000000..e5c081b
--- /dev/null
+++ b/musicbrainzngs/util.py
@@ -0,0 +1,23 @@
+# This file is part of the musicbrainzngs library
+# Copyright (C) Alastair Porter, Adrian Sampson, and others
+# This file is distributed under a BSD-2-Clause type license.
+# See the COPYING file for more information.
+
+import sys
+import locale
+
+def _unicode(string, encoding=None):
+    """Return `string` as unicode, without NUL characters or outer whitespace.
+    Byte strings are decoded (given encoding, else stdin's, else the locale's),
+    dropping undecodable bytes; numbers and unicode objects go via unicode().
+    """
+    if isinstance(string, str):
+        # sys.stdin can be None or a replacement object lacking `.encoding`
+        if encoding is None:
+            encoding = getattr(sys.stdin, "encoding", None)
+        if encoding is None:
+            encoding = locale.getpreferredencoding()
+        unicode_string = unicode(string, encoding, "ignore")
+    else:
+        unicode_string = unicode(string)
+    return unicode_string.replace('\x00', '').strip()