Skip to content
298 changes: 298 additions & 0 deletions pandas/core/encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,298 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Mutable sequences handling? specifically tuples.
# generators - via wrapper?

import unittest

from pandas.util import py3compat
from pandas.core.common import PandasError
import numpy as np
import sys


# Backfill the builtin next() for Python < 2.6, where it does not exist;
# the fallback simply calls the iterator's .next() method.
try:
    next
except NameError:  # pragma: no cover
    # Python < 2.6
    def next(x):
        return x.next()

# this should live in some package-wide conf object
# Module-wide settings controlling automatic byte-string -> unicode decoding.
input_encoding='utf-8'             # default encoding passed to str.decode()
perform_conversion=True            # master on/off switch for _should_process()
guess_enc_on_decode_failure=True   # try chardet when decoding fails
guess_enc_min_char_count=100       # min chars fed to chardet before trusting it
guess_enc_max_iter=5000            # cap on elements scanned when extracting text
guess_enc_min_confidence=0.8       # min chardet confidence to report a guess

def set_input_encoding(encoding):
    """
    Set the module-wide default encoding used when decoding byte-strings.

    Parameters
    ----------
    encoding : str
        Any encoding name acceptable to str.decode().
    """
    global input_encoding
    input_encoding=encoding

def _should_process(obj):
    """
    Decide whether `obj` should be processed for byte-string conversion,
    based on its type.

    Parameters
    ----------
    obj : any object

    Returns
    -------
    bool
        True if the object should be processed.
    """
    if not perform_conversion:
        return False

    # pd.Index* are isinstance(np.ndarray) but must be excluded, because
    # their constructors call decode directly - hence the exact type()
    # checks rather than isinstance() for the numpy types.
    return isinstance(obj, (list, dict)) or type(obj) in (np.ndarray, np.void)

def _can_import(name):
"""
Returns True if the named module/package can be imported"

Parameters
----------
`name` - package / module name.

Returns
-------
bool - True if `name` can be imported.

"""
try:
__import__(name)
return True
except ImportError:
return False

def _decode_obj(obj, encoding):
    """
    Receive an object and decode any non-ascii byte-strings found in it
    to unicode, using the given encoding.

    Use `decode_catch_errors` instead to get friendly error messages
    when decoding fails.

    Supports str/unicode and mutable sequences (list, dict, ndarray)
    as input; all other objects - including generators, for now - are
    returned as-is.  Handles arbitrarily nested sequences without
    recursion, via an explicit generator "trampoline".

    Parameters
    ----------
    obj : any object
    encoding : str
        Encoding passed to str.decode().

    Returns
    -------
    result
        `obj` with all non-ascii byte-strings decoded to unicode.
        Mutable containers are modified in place.

    Raises
    ------
    UnicodeDecodeError
        If decoding with the given encoding fails.
    """

    import types

    def _dec_str(s, encoding=encoding):
        # Leave pure-ascii byte-strings alone, decode everything else.
        try:
            s.encode('ascii')  # py2: raises if s holds non-ascii bytes
        except UnicodeDecodeError:
            s = s.decode(encoding)
            # might raise another UnicodeDecodeError - handled by the caller
        return s

    def _dec_seq(seq):
        # A generator used as a resumable "frame" by the trampoline below.
        # Nested containers are yielded as sub-generators instead of
        # recursing, so deep nesting can't blow the recursion limit.
        if isinstance(seq, dict):
            for k in seq.keys():  # grab the key list before we mutate seq
                v = seq[k]
                orig_key = k
                if isinstance(k, str):
                    k = _dec_str(k)
                elif _should_process(k):  # keys are immutable, need this?
                    k = (yield _dec_seq(k))

                if isinstance(v, str):
                    v = _dec_str(v)
                elif _should_process(v):
                    v = (yield _dec_seq(v))

                # BUG FIX: remove the *original* key, not the possibly
                # rebound one.  The old code did seq.pop(k) after k was
                # re-decoded, which raised KeyError whenever decoding
                # produced a new (unicode) key, and left the stale
                # byte-string key in place.
                del seq[orig_key]
                seq[k] = v

        else:
            for i, e in enumerate(seq):
                if isinstance(e, str):
                    seq[i] = _dec_str(e)
                elif _should_process(e):
                    (yield _dec_seq(e))

        yield seq

    if py3compat.PY3:  # all strings are unicode already
        return obj

    if isinstance(obj, basestring):  # strings are simple
        if isinstance(obj, str):
            obj = _dec_str(obj)
        return obj

    if not _should_process(obj):  # misc. objects are too
        return obj

    # Trampoline: drive the _dec_seq generators iteratively.  `s` is an
    # explicit stack of live generators; `values` carries a finished
    # sub-result back into its parent generator via send().
    s = [_dec_seq(obj)]
    values = []
    while True:
        g = s.pop()
        if values:
            e = g.send(values.pop())
        else:
            e = next(g)
        if type(e) == types.GeneratorType:
            s.extend([g, e])  # suspend parent, descend into child
        else:
            if s:
                values.append(e)  # hand the sub-result to the parent
            else:
                return e

def _extract_txt_from_obj(obj, max_iter=sys.maxint):
    """
    A generator which walks `obj`, yielding any byte-string found.

    Handles nested sequences, and looks at both dict keys and values.
    Will stop after at most `max_iter` elements have been examined.

    Parameters
    ----------
    obj : any iterable, byte-string, or None
    max_iter : int
        Upper bound on the number of elements examined.

    Yields
    ------
    str
        Byte-strings found in `obj`.

    Raises
    ------
    StopIteration
        When there are no more byte-strings in the sequence.
    """

    # Trivial cases first: plain strings and None.
    if obj is None or isinstance(obj, basestring):
        # BUG FIX: only byte-strings carry encoded text.  The old code
        # yielded `obj` even when it was None, which made consumers
        # calling len(txt) blow up with a TypeError.
        if isinstance(obj, str):
            yield obj
        return

    # Explicit stack of iterators, to walk nested containers without
    # recursion.
    s = [iter(obj)]
    cnt = 0
    while s:
        g = s.pop()
        for e in g:
            cnt += 1
            if isinstance(e, str):
                yield e
            elif isinstance(e, dict):
                s.extend([g, e.iterkeys(), e.itervalues()])
            elif _should_process(e):
                # BUG FIX: list.append() takes a single argument; the
                # original s.append(g, iter(e)) raised TypeError.  Use
                # extend(), matching the dict branch above.
                s.extend([g, iter(e)])

            if cnt >= max_iter:
                return

def _detect_encoding(obj, min_cnt=guess_enc_min_char_count, max_iter=guess_enc_max_iter):
    """
    Guess the encoding of the byte-strings contained in `obj`.

    Byte-strings are pulled out of (possibly nested) `obj` with
    `_extract_txt_from_obj` and fed to the `chardet` detector.  Dict
    keys and values are both examined.

    Parameters
    ----------
    obj : input string or sequence
    min_cnt : int
        Minimum number of characters that must be fed to the detector
        before a decision is allowed.
    max_iter : int
        Upper bound on the number of elements examined while looking
        for text.  Guards against the corner-case of a huge list with
        a decoding error only near its end.

    Returns
    -------
    dict
        {'encoding': str, 'confidence': float}, or {} if no encoding
        was detected (or chardet is unavailable).
    """
    if not _can_import("chardet"):
        return {}

    from chardet.universaldetector import UniversalDetector

    detector = UniversalDetector()
    nchars = 0  # number of characters fed to the detector so far
    for txt in _extract_txt_from_obj(obj, max_iter):
        nchars += len(txt)
        detector.feed(txt)
        if nchars > min_cnt and detector.done:
            break
    detector.close()

    guess = detector.result
    confident = (guess and
                 guess['confidence'] > guess_enc_min_confidence and
                 nchars > min_cnt)
    return guess if confident else {}

def decode_catch_errors(obj, encoding=None):
    """
    Decode byte-strings inside `obj` into unicode via `_decode_obj`,
    converting decode failures into a friendly error.

    If a decode error occurs and the chardet library is available,
    text is extracted from `obj` and a guess at the actual encoding is
    appended to the error message.

    Parameters
    ----------
    obj : anything
    encoding : str or None
        An acceptable encoding to be passed to str.decode(); falls
        back to the module-wide `input_encoding` when falsy.

    Returns
    -------
    result
        `obj` with byte-strings decoded into unicode strings.

    Raises
    ------
    PandasError
        Carrying a friendly error message, when decoding fails.
    """

    encoding = encoding or input_encoding
    try:
        return _decode_obj(obj, encoding)
    except UnicodeDecodeError:
        from textwrap import dedent

        msg = """
        The input Data contains strings that cannot be decoded with `%s`.
        You should specify a correct encoding to the object constructor,
        or set the value of the default input encoding in XXX.
        """
        s = dedent(msg) % encoding

        if guess_enc_on_decode_failure:
            if _can_import("chardet"):
                guess = _detect_encoding(obj)
                if guess:
                    s += ('You might try "%s" as the encoding (Confidence: %2.1f)'
                          % (guess['encoding'], guess['confidence']))
            else:
                s += ('The "chardet" package is not installed - '
                      "can't suggest an encoding.")

        raise PandasError(s)
17 changes: 4 additions & 13 deletions pandas/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,23 +237,12 @@ def _to_str_columns(self, force_unicode=False):

if not py3compat.PY3:
if force_unicode:
def make_unicode(x):
if isinstance(x, unicode):
return x
return x.decode('utf-8')
strcols = map(lambda col: map(make_unicode, col), strcols)
strcols = map(lambda col: map(unicode, col), strcols)
else:
# generally everything is plain strings, which has ascii
# encoding. problem is when there is a char with value over 127
# - everything then gets converted to unicode.
try:
map(lambda col: map(str, col), strcols)
except UnicodeError:
def make_unicode(x):
if isinstance(x, unicode):
return x
return x.decode('utf-8')
strcols = map(lambda col: map(make_unicode, col), strcols)
strcols = map(lambda col: map(unicode, col), strcols)

return strcols

Expand Down Expand Up @@ -1121,6 +1110,8 @@ def reset(self):


def _put_lines(buf, lines):
    # Write `lines` to `buf`, joined by newlines.
    # Handles #891, where ascii and unicode fields are mixed: if any line
    # is already unicode, promote every line to unicode so '\n'.join()
    # does not trip over implicit ascii coercion.
    # NOTE: will still fail if encoded (non-ascii) byte-strings and
    # unicode fields are mixed, since unicode(x) decodes with ascii.
    if any(isinstance(x, unicode) for x in lines):
        lines = [unicode(x) for x in lines]
    buf.write('\n'.join(lines))
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from pandas.util.decorators import deprecate, Appender, Substitution

from pandas.tseries.period import PeriodIndex
import pandas.core.encoding as en

import pandas.core.algorithms as algos
import pandas.core.datetools as datetools
Expand Down Expand Up @@ -366,6 +367,10 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
if data is None:
data = {}

columns= en.decode_catch_errors(columns)
index= en.decode_catch_errors(index)
data= en.decode_catch_errors(data)

if isinstance(data, DataFrame):
data = data._data

Expand Down Expand Up @@ -950,7 +955,7 @@ def from_items(cls, items, columns=None, orient='columns'):

@classmethod
def from_csv(cls, path, header=0, sep=',', index_col=0,
parse_dates=True, encoding=None):
parse_dates=True, encoding='utf-8'):
"""
Read delimited file into DataFrame

Expand Down Expand Up @@ -1675,6 +1680,8 @@ def iget_value(self, i, j):

def __getitem__(self, key):
# slice rows
key=en.decode_catch_errors(key)

if isinstance(key, slice):
from pandas.core.indexing import _is_index_slice
idx_type = self.index.inferred_type
Expand Down Expand Up @@ -1793,6 +1800,9 @@ def __setattr__(self, name, value):
def __setitem__(self, key, value):
# support boolean setting with DataFrame input, e.g.
# df[df > df2] = 0
value=en.decode_catch_errors(value)
key=en.decode_catch_errors(key)

if isinstance(key, DataFrame):
if not (key.index.equals(self.index) and
key.columns.equals(self.columns)):
Expand Down Expand Up @@ -1972,6 +1982,9 @@ def xs(self, key, axis=0, level=None, copy=True):
-------
xs : Series or DataFrame
"""

key=en.decode_catch_errors(key)

labels = self._get_axis(axis)
if level is not None:
loc, new_ax = labels.get_loc_level(key, level=level)
Expand Down
Loading