Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

Unicode: change df.to_string() and friends to always return unicode objects #2224

Merged
merged 9 commits into from Nov 27, 2012
View
@@ -36,7 +36,7 @@
string representation of NAN to use, default 'NaN'
formatters : list or dict of one-parameter functions, optional
formatter functions to apply to columns' elements by position or name,
- default None
+ default None; if the result is a string, it must be a unicode string.
float_format : one-parameter function, optional
formatter function to apply to columns' elements if they are floats
default None
@@ -62,7 +62,7 @@ class SeriesFormatter(object):
def __init__(self, series, buf=None, header=True, length=True,
na_rep='NaN', name=False, float_format=None):
self.series = series
- self.buf = buf if buf is not None else StringIO()
+ self.buf = buf if buf is not None else StringIO(u"")
self.name = name
self.na_rep = na_rep
self.length = length
@@ -112,7 +112,7 @@ def to_string(self):
series = self.series
if len(series) == 0:
- return ''
+ return u''
fmt_index, have_header = self._get_formatted_index()
fmt_values = self._get_formatted_values()
@@ -135,9 +135,7 @@ def to_string(self):
if footer:
result.append(footer)
- if py3compat.PY3:
- return unicode(u'\n'.join(result))
- return com.console_encode(u'\n'.join(result))
+ return unicode(u'\n'.join(result))
if py3compat.PY3: # pragma: no cover
_encode_diff = lambda x: 0
@@ -200,10 +198,15 @@ def __init__(self, frame, buf=None, columns=None, col_space=None,
else:
self.columns = frame.columns
- def _to_str_columns(self, force_unicode=False):
+ def _to_str_columns(self, force_unicode=None):
"""
Render a DataFrame to a list of columns (as lists of strings).
"""
+ import warnings
+ if force_unicode is not None: # pragma: no cover
+ warnings.warn("force_unicode is deprecated, it will have no effect",
+ FutureWarning)
+
# may include levels names also
str_index = self._get_formatted_index()
str_columns = self._get_formatted_column_labels()
@@ -237,32 +240,17 @@ def _to_str_columns(self, force_unicode=False):
if self.index:
strcols.insert(0, str_index)
- if not py3compat.PY3:
- if force_unicode:
- def make_unicode(x):
- if isinstance(x, unicode):
- return x
- return x.decode('utf-8')
- strcols = map(lambda col: map(make_unicode, col), strcols)
- else:
- # Generally everything is plain strings, which has ascii
- # encoding. Problem is when there is a char with value over
- # 127. Everything then gets converted to unicode.
- try:
- map(lambda col: map(str, col), strcols)
- except UnicodeError:
- def make_unicode(x):
- if isinstance(x, unicode):
- return x
- return x.decode('utf-8')
- strcols = map(lambda col: map(make_unicode, col), strcols)
-
return strcols
- def to_string(self, force_unicode=False):
+ def to_string(self, force_unicode=None):
"""
Render a DataFrame to a console-friendly tabular output.
"""
+ import warnings
+ if force_unicode is not None: # pragma: no cover
+ warnings.warn("force_unicode is deprecated, it will have no effect",
+ FutureWarning)
+
frame = self.frame
if len(frame.columns) == 0 or len(frame.index) == 0:
@@ -272,15 +260,20 @@ def to_string(self, force_unicode=False):
com.pprint_thing(frame.index)))
text = info_line
else:
- strcols = self._to_str_columns(force_unicode)
+ strcols = self._to_str_columns()
text = adjoin(1, *strcols)
self.buf.writelines(text)
- def to_latex(self, force_unicode=False, column_format=None):
+ def to_latex(self, force_unicode=None, column_format=None):
"""
Render a DataFrame to a LaTeX tabular environment output.
"""
+ import warnings
+ if force_unicode is not None: # pragma: no cover
+ warnings.warn("force_unicode is deprecated, it will have no effect",
+ FutureWarning)
+
frame = self.frame
if len(frame.columns) == 0 or len(frame.index) == 0:
@@ -289,7 +282,7 @@ def to_latex(self, force_unicode=False, column_format=None):
frame.columns, frame.index))
strcols = [[info_line]]
else:
- strcols = self._to_str_columns(force_unicode)
+ strcols = self._to_str_columns()
if column_format is None:
column_format = '|l|%s|' % '|'.join('c' for _ in strcols)
@@ -726,18 +719,10 @@ def __init__(self, values, digits=7, formatter=None, na_rep='NaN',
self.justify = justify
def get_result(self):
- if self._have_unicode():
- fmt_values = self._format_strings(use_unicode=True)
- else:
- fmt_values = self._format_strings(use_unicode=False)
-
+ fmt_values = self._format_strings()
return _make_fixed_width(fmt_values, self.justify)
- def _have_unicode(self):
- mask = lib.map_infer(self.values, lambda x: isinstance(x, unicode))
- return mask.any()
-
- def _format_strings(self, use_unicode=False):
+ def _format_strings(self):
if self.float_format is None:
float_format = print_config.float_format
if float_format is None:
View
@@ -612,20 +612,51 @@ def _need_info_repr_(self):
else:
return False
- def __repr__(self):
+ def __str__(self):
+ """
+ Return a string representation for a particular DataFrame
+
+ Invoked by str(df) in both py2/py3.
+ Yields Bytestring in Py2, Unicode String in py3.
+ """
+
+ if py3compat.PY3:
+ return self.__unicode__()
+ return self.__bytes__()
+
+ def __bytes__(self):
"""
Return a string representation for a particular DataFrame
+
+ Invoked by bytes(df) in py3 only.
+ Yields a bytestring in both py2/py3.
+ """
+ return com.console_encode(self.__unicode__())
+
+ def __unicode__(self):
+ """
+ Return a string representation for a particular DataFrame
+
+ Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3.
"""
- buf = StringIO()
+ buf = StringIO(u"")
if self._need_info_repr_():
self.info(buf=buf, verbose=self._verbose_info)
else:
self.to_string(buf=buf)
+
value = buf.getvalue()
+ assert type(value) == unicode
- if py3compat.PY3:
- return unicode(value)
- return com.console_encode(value)
+ return value
+
+ def __repr__(self):
+ """
+ Return a string representation for a particular DataFrame
+
+ Yields Bytestring in Py2, Unicode String in py3.
+ """
+ return str(self)
def _repr_html_(self):
"""
@@ -1379,19 +1410,21 @@ def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='',
def to_string(self, buf=None, columns=None, col_space=None, colSpace=None,
header=True, index=True, na_rep='NaN', formatters=None,
float_format=None, sparsify=None, nanRep=None,
- index_names=True, justify=None, force_unicode=False):
+ index_names=True, justify=None, force_unicode=None):
"""
Render a DataFrame to a console-friendly tabular output.
"""
+ import warnings
+ if force_unicode is not None: # pragma: no cover
+ warnings.warn("force_unicode is deprecated, it will have no effect",
+ FutureWarning)
if nanRep is not None: # pragma: no cover
- import warnings
warnings.warn("nanRep is deprecated, use na_rep",
FutureWarning)
na_rep = nanRep
if colSpace is not None: # pragma: no cover
- import warnings
warnings.warn("colSpace is deprecated, use col_space",
FutureWarning)
col_space = colSpace
@@ -1404,15 +1437,10 @@ def to_string(self, buf=None, columns=None, col_space=None, colSpace=None,
justify=justify,
index_names=index_names,
header=header, index=index)
- formatter.to_string(force_unicode=force_unicode)
+ formatter.to_string()
if buf is None:
result = formatter.buf.getvalue()
- if not force_unicode:
- try:
- result = str(result)
- except ValueError:
- pass
return result
@Appender(fmt.docstring_to_string, indents=1)
View
@@ -132,12 +132,48 @@ def __array_finalize__(self, obj):
def _shallow_copy(self):
return self.view()
- def __repr__(self):
+ def __str__(self):
+ """
+ Return a string representation for a particular Index
+
+ Invoked by str(index) in both py2/py3.
+ Yields Bytestring in Py2, Unicode String in py3.
+ """
+
if py3compat.PY3:
- prepr = com.pprint_thing(self)
+ return self.__unicode__()
+ return self.__bytes__()
+
+ def __bytes__(self):
+ """
+ Return a string representation for a particular Index
+
+ Invoked by bytes(index) in py3 only.
+ Yields a bytestring in both py2/py3.
+ """
+ return com.console_encode(self.__unicode__())
+
+ def __unicode__(self):
+ """
+ Return a string representation for a particular Index
+
+ Invoked by unicode(index) in py2 only. Yields a Unicode String in both py2/py3.
+ """
+ if len(self) > 6 and len(self) > np.get_printoptions()['threshold']:
+ data = self[:3].tolist() + ["..."] + self[-3:].tolist()
else:
- prepr = com.pprint_thing_encoded(self)
- return 'Index(%s, dtype=%s)' % (prepr, self.dtype)
+ data = self
+
+ prepr = com.pprint_thing(data)
+ return '%s(%s, dtype=%s)' % (type(self).__name__, prepr, self.dtype)
+
+ def __repr__(self):
+ """
+ Return a string representation for a particular Index
+
+ Yields Bytestring in Py2, Unicode String in py3.
+ """
+ return str(self)
def astype(self, dtype):
return Index(self.values.astype(dtype), name=self.name,
@@ -207,15 +243,6 @@ def summary(self, name=None):
name = type(self).__name__
return '%s: %s entries%s' % (name, len(self), index_summary)
- def __str__(self):
- try:
- return np.array_repr(self.values)
- except UnicodeError:
- converted = u','.join(com.pprint_thing(x) for x in self.values)
- result = u'%s([%s], dtype=''%s'')' % (type(self).__name__, converted,
- str(self.values.dtype))
- return com.console_encode(result)
-
def _mpl_repr(self):
# how to represent ourselves to matplotlib
return self.values
@@ -394,8 +421,8 @@ def format(self, name=False):
result = []
for dt in self:
if dt.time() != zero_time or dt.tzinfo is not None:
- return header + ['%s' % x for x in self]
- result.append('%d-%.2d-%.2d' % (dt.year, dt.month, dt.day))
+ return header + [u'%s' % x for x in self]
+ result.append(u'%d-%.2d-%.2d' % (dt.year, dt.month, dt.day))
return header + result
values = self.values
@@ -1319,7 +1346,33 @@ def _array_values(self):
def dtype(self):
return np.dtype('O')
- def __repr__(self):
+ def __str__(self):
+ """
+ Return a string representation for a particular Index
+
+ Invoked by str(index) in both py2/py3.
+ Yields Bytestring in Py2, Unicode String in py3.
+ """
+
+ if py3compat.PY3:
+ return self.__unicode__()
+ return self.__bytes__()
+
+ def __bytes__(self):
+ """
+ Return a string representation for a particular Index
+
+ Invoked by bytes(index) in py3 only.
+ Yields a bytestring in both py2/py3.
+ """
+ return com.console_encode(self.__unicode__())
+
+ def __unicode__(self):
+ """
+ Return a string representation for a particular Index
+
+ Invoked by unicode(index) in py2 only. Yields a Unicode String in both py2/py3.
+ """
output = 'MultiIndex\n%s'
options = np.get_printoptions()
@@ -1335,10 +1388,15 @@ def __repr__(self):
np.set_printoptions(threshold=options['threshold'])
- if py3compat.PY3:
- return output % summary
- else:
- return com.console_encode(output % summary)
+ return output % summary
+
+ def __repr__(self):
+ """
+ Return a string representation for a particular Index
+
+ Yields Bytestring in Py2, Unicode String in py3.
+ """
+ return str(self)
def __len__(self):
return len(self.labels[0])
@@ -1496,7 +1554,7 @@ def format(self, space=2, sparsify=None, adjoin=True, names=False,
formatted = lev.take(lab).format()
else:
# weird all NA case
- formatted = [str(x) for x in com.take_1d(lev.values, lab)]
+ formatted = [com.pprint_thing(x) for x in com.take_1d(lev.values, lab)]
stringified_levels.append(formatted)
result_levels = []
Oops, something went wrong.