Merge commit 'v0.8.0b2-68-g7240b87' into debian-0.8

* commit 'v0.8.0b2-68-g7240b87': (68 commits)
  TST: additional coverage and cruft removal for ts plotting pandas-dev#1245
  BUG: test coverage, groupby bug fixes
  BUG: fix NumPy 1.7 argmin workaround, test coverage
  BUG: out of bounds on buffer access if time doesn't exist in TimeSeries.at_time
  BUG: revert mpl hackaround
  TST: resample test coverage etc. pandas-dev#1245
  BUG: test coverage and misc bug fixes, cruft deletion in period.py pandas-dev#1245
  TST: finish test coverage of pandas.tseries.index pandas-dev#1245
  BUG: fix closed='left' resample bug. test coverage pandas-dev#1245
  TST: test coverage pandas-dev#1245
  BUG: raise exception in DataFrame.fillna when axis=1 and pass dict/Series. close pandas-dev#1485
  BUG: fillna called with Series should be analogous to with dict close pandas-dev#1486
  BUG: fix MS/BMS range generation / onOffset bugs causing pandas-dev#1483
  ENH: at_time/between_time work with tz-localized time series. refactoring and cleanup close pandas-dev#1481
  BUG: label slicing with duplicate values, close pandas-dev#1480
  TST: remove rogue print statement
  BUG: fixed broken imports
  BUG: do not convert bday freq in ts plots pandas-dev#1482
  BUG: mask NaNs in non-ts plots
  TST: test case for tseries plots with data gaps
  ...
neurodebian · Jun 21, 2012 · 8562adc · 8562adc
2 parents a1d7688 + 7240b87
commit 8562adc
Show file tree

Hide file tree

Showing 60 changed files with 2,425 additions and 1,605 deletions.
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -139,6 +139,9 @@ pandas 0.8.0
   - Series.append and DataFrame.append no longer check for duplicate indexes
     by default, add verify_integrity parameter (#1394)
   - Refactor Factor class, old constructor moved to Factor.from_array
+  - Modified internals of MultiIndex to use less memory (no longer represented
+    as array of tuples) internally, speed up construction time and many methods
+    which construct intermediate hierarchical indexes (#1467)
 
 **Bug fixes**
 
@@ -186,6 +189,7 @@ pandas 0.8.0
   - Reset index mapping when grouping Series in Cython (#1423)
   - Fix outer/inner DataFrame.join with non-unique indexes (#1421)
   - Fix MultiIndex groupby bugs with empty lower levels (#1401)
+  - Calling fillna with a Series will have same behavior as with dict (#1486)
 
 pandas 0.7.3
 ============

diff --git a/doc/make.py b/doc/make.py
@@ -79,7 +79,7 @@ def latex():
 
         os.chdir('../..')
     else:
-        print 'latex build has not been tested on windows'
+        print('latex build has not been tested on windows')
 
 def check_build():
     build_dirs = [

diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst
@@ -44,7 +44,7 @@ The ``plot`` method on Series and DataFrame is just a simple wrapper around
    @savefig series_plot_basic.png width=4.5in
    ts.plot()
 
-If the index consists of dates, it calls ``gca().autofmt_xdate()`` to try to
+If the index consists of dates, it calls ``gcf().autofmt_xdate()`` to try to
 format the x-axis nicely as per above. The method takes a number of arguments
 for controlling the look of the plot:
 

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -78,21 +78,21 @@ def _count_generic(values, table_type, type_caster):
     from pandas.core.series import Series
 
     values = type_caster(values)
-    table = table_type(len(values))
+    table = table_type(min(len(values), 1000000))
     uniques, labels, counts = table.factorize(values)
 
     return Series(counts, index=uniques)
 
 def _match_generic(values, index, table_type, type_caster):
     values = type_caster(values)
     index = type_caster(index)
-    table = table_type(len(index))
+    table = table_type(min(len(index), 1000000))
     table.map_locations(index)
     return table.lookup(values)
 
 def _unique_generic(values, table_type, type_caster):
     values = type_caster(values)
-    table = table_type(len(values))
+    table = table_type(min(len(values), 1000000))
     uniques = table.unique(values)
     return type_caster(uniques)
 
@@ -223,17 +223,25 @@ def quantile(x, q, interpolation_method='fraction'):
     score : float
         Score at percentile.
 
-    Examples
+    Examples
     --------
     >>> from scipy import stats
     >>> a = np.arange(100)
     >>> stats.scoreatpercentile(a, 50)
     49.5
 
     """
-    values = np.sort(np.asarray(x))
+    x = np.asarray(x)
+    mask = com.isnull(x)
+
+    x = x[-mask]
+
+    values = np.sort(x)
 
     def _get_score(at):
+        if len(values) == 0:
+            return np.nan
+
         idx = at * (len(values) - 1)
         if (idx % 1 == 0):
             score = values[idx]

diff --git a/pandas/core/api.py b/pandas/core/api.py
@@ -6,7 +6,7 @@
 from pandas.core.algorithms import factorize, match, unique, value_counts
 
 from pandas.core.common import isnull, notnull, save, load
-from pandas.core.factor import Factor
+from pandas.core.categorical import Categorical, Factor
 from pandas.core.format import (set_printoptions, reset_printoptions,
                                 set_eng_float_format)
 from pandas.core.index import Index, Int64Index, MultiIndex
@@ -15,17 +15,16 @@
 from pandas.core.frame import DataFrame
 from pandas.core.panel import Panel
 from pandas.core.groupby import groupby
-from pandas.core.reshape import pivot_simple as pivot
+from pandas.core.reshape import pivot_simple as pivot, get_dummies
 
 WidePanel = Panel
 
-from pandas.core.daterange import DateRange # deprecated
-
 from pandas.tseries.offsets import DateOffset
 from pandas.tseries.tools import to_datetime
 from pandas.tseries.index import (DatetimeIndex, Timestamp,
                                   date_range, bdate_range)
 from pandas.tseries.period import Period, PeriodIndex
 
 # legacy
+from pandas.core.daterange import DateRange # deprecated
 import pandas.core.datetools as datetools
diff --git a/pandas/core/factor.py b/pandas/core/categorical.py
@@ -6,9 +6,9 @@
 import pandas.core.common as com
 
 
-def _factor_compare_op(op):
+def _cat_compare_op(op):
     def f(self, other):
-        if isinstance(other, (Factor, np.ndarray)):
+        if isinstance(other, (Categorical, np.ndarray)):
             values = np.asarray(self)
             f = getattr(values, op)
             return f(np.asarray(other))
@@ -23,7 +23,7 @@ def f(self, other):
 
     return f
 
-class Factor(object):
+class Categorical(object):
     """
     Represents a categorical variable in classic R / S-plus fashion
 
@@ -41,12 +41,6 @@ class Factor(object):
       * levels : ndarray
     """
     def __init__(self, labels, levels, name=None):
-        from pandas.core.index import _ensure_index
-
-        levels = _ensure_index(levels)
-        if not levels.is_unique:
-            raise ValueError('Factor levels must be unique')
-
         self.labels = labels
         self.levels = levels
         self.name = name
@@ -58,28 +52,49 @@ def from_array(cls, data):
         except TypeError:
             labels, levels, _ = factorize(data, sort=False)
 
-        return Factor(labels, levels)
+        return Categorical(labels, levels,
+                           name=getattr(data, 'name', None))
+
+    _levels = None
+    def _set_levels(self, levels):
+        from pandas.core.index import _ensure_index
+
+        levels = _ensure_index(levels)
+        if not levels.is_unique:
+            raise ValueError('Categorical levels must be unique')
+        self._levels = levels
+
+    def _get_levels(self):
+        return self._levels
 
-    levels = None
+    levels = property(fget=_get_levels, fset=_set_levels)
 
-    __eq__ = _factor_compare_op('__eq__')
-    __ne__ = _factor_compare_op('__ne__')
-    __lt__ = _factor_compare_op('__lt__')
-    __gt__ = _factor_compare_op('__gt__')
-    __le__ = _factor_compare_op('__le__')
-    __ge__ = _factor_compare_op('__ge__')
+    __eq__ = _cat_compare_op('__eq__')
+    __ne__ = _cat_compare_op('__ne__')
+    __lt__ = _cat_compare_op('__lt__')
+    __gt__ = _cat_compare_op('__gt__')
+    __le__ = _cat_compare_op('__le__')
+    __ge__ = _cat_compare_op('__ge__')
 
     def __array__(self, dtype=None):
-        return com.take_1d(self.levels, self.labels)
+        return com.take_1d(self.levels.values, self.labels)
 
     def __len__(self):
         return len(self.labels)
 
     def __repr__(self):
-        temp = 'Factor:%s\n%s\nLevels (%d): %s'
+        temp = 'Categorical: %s\n%s\n%s'
         values = np.asarray(self)
+        levheader = 'Levels (%d): ' % len(self.levels)
+        levstring = np.array_repr(self.levels,
+                                  max_line_width=60)
+
+        indent = ' ' * (levstring.find('[') + len(levheader) + 1)
+        lines = levstring.split('\n')
+        levstring = '\n'.join([lines[0]] + [indent + x.lstrip() for x in lines[1:]])
+
         return temp % ('' if self.name is None else self.name,
-                       repr(values), len(self.levels), self.levels)
+                       repr(values), levheader + levstring)
 
     def __getitem__(self, key):
         if isinstance(key, (int, np.integer)):
@@ -89,22 +104,24 @@ def __getitem__(self, key):
             else:
                 return self.levels[i]
         else:
-            return Factor(self.labels[key], self.levels)
+            return Categorical(self.labels[key], self.levels)
 
     def equals(self, other):
         """
-        Returns True if factors are equal
+        Returns True if categorical arrays are equal
 
         Parameters
         ----------
-        other : Factor
+        other : Categorical
 
         Returns
         -------
         are_equal : boolean
         """
-        if not isinstance(other, Factor):
+        if not isinstance(other, Categorical):
             return False
 
         return (self.levels.equals(other.levels) and
                 np.array_equal(self.labels, other.labels))
+
+Factor = Categorical
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -56,29 +56,36 @@ def isnull(obj):
         return lib.checknull(obj)
 
     from pandas.core.generic import PandasObject
-    from pandas import Series
     if isinstance(obj, np.ndarray):
-        if obj.dtype.kind in ('O', 'S'):
-            # Working around NumPy ticket 1542
-            shape = obj.shape
-            result = np.empty(shape, dtype=bool)
-            vec = lib.isnullobj(obj.ravel())
-            result[:] = vec.reshape(shape)
-
-            if isinstance(obj, Series):
-                result = Series(result, index=obj.index, copy=False)
-        elif obj.dtype == np.dtype('M8[ns]'):
-            # this is the NaT pattern
-            result = np.array(obj).view('i8') == lib.iNaT
-        else:
-            result = -np.isfinite(obj)
-        return result
+        return _isnull_ndarraylike(obj)
     elif isinstance(obj, PandasObject):
         # TODO: optimize for DataFrame, etc.
         return obj.apply(isnull)
+    elif hasattr(obj, '__array__'):
+        return _isnull_ndarraylike(obj)
     else:
         return obj is None
 
+def _isnull_ndarraylike(obj):
+    from pandas import Series
+    values = np.asarray(obj)
+
+    if values.dtype.kind in ('O', 'S'):
+        # Working around NumPy ticket 1542
+        shape = values.shape
+        result = np.empty(shape, dtype=bool)
+        vec = lib.isnullobj(values.ravel())
+        result[:] = vec.reshape(shape)
+
+        if isinstance(obj, Series):
+            result = Series(result, index=obj.index, copy=False)
+    elif values.dtype == np.dtype('M8[ns]'):
+        # this is the NaT pattern
+        result = values.view('i8') == lib.iNaT
+    else:
+        result = -np.isfinite(obj)
+    return result
+
 def notnull(obj):
     '''
     Replacement for numpy.isfinite / -numpy.isnan which is suitable
@@ -482,6 +489,8 @@ def _possibly_cast_item(obj, item, dtype):
 
 def _is_bool_indexer(key):
     if isinstance(key, np.ndarray) and key.dtype == np.object_:
+        key = np.asarray(key)
+
         if not lib.is_bool_array(key):
             if isnull(key).any():
                 raise ValueError('cannot index with vector containing '

diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -672,8 +672,8 @@ def _has_names(index):
 # Global formatting options
 
 def set_printoptions(precision=None, column_space=None, max_rows=None,
-                     max_columns=None, colheader_justify='right',
-                     max_colwidth=50, notebook_repr_html=None,
+                     max_columns=None, colheader_justify=None,
+                     max_colwidth=None, notebook_repr_html=None,
                      date_dayfirst=None, date_yearfirst=None):
     """
     Alter default behavior of DataFrame.toString