Merge tag 'v0.8.0b2' into debian-0.8
Version 0.8.0 beta 2

* tag 'v0.8.0b2': (37 commits)
  RLS: 0.8.0 beta 2
  BUG: bytes_to_str for read_csv
  BUG: import BytesIO for py3compat
  BUG: fix compat errors for yahoo data reader
  ENH: convert datetime.datetime ourselves, 15x speedup
  Make tox work across versions of Python from 2.5 to 3.2
  Reenable py31 and py32 in .travis.yml
  TST: test coverage
  TST: oops, delete stray line
  REF: factor out ujson extension into pandasjson for now
  TST: eliminate copies in datetime64 serialization; don't copy data in DatetimeIndex, close pandas-dev#1320
  DOC: refresh time zone docs close pandas-dev#1447
  BUG: always raise exception when concat keys aren't found in passed levels, close pandas-dev#1406
  ENH: implement passed quantile array to qcut and document that plus factors, close pandas-dev#1407
  ENH: clearer out of bounds error message in cut/qcut, close pandas-dev#1409
  ENH: allow renaming of index levels when concatenating, close pandas-dev#1419
  BUG: fix MultiIndex bugs described in pandas-dev#1401
  DOC: release notes
  BUG: implement multiple DataFrame.join / merge on non-unique indexes by multiple merges, close pandas-dev#1421
  REF: remove offset names from pandas namespace
  ...
yarikoptic committed Jun 12, 2012
2 parents c94ecc8 + fde270b commit 5b8661b
Showing 47 changed files with 703 additions and 6,016 deletions.
6 changes: 3 additions & 3 deletions .travis.yml
@@ -4,13 +4,13 @@ python:
- 2.5
- 2.6
- 2.7
- 3.1
- 3.2

install:
- "if [[ $TRAVIS_PYTHON_VERSION == '2.5' ]]; then pip install --use-mirrors simplejson; fi"
- pip install --use-mirrors cython numpy nose pytz

script:
- python setup.py build_ext --inplace
- python setup.py install
- nosetests pandas
- python setup.py build_ext install
- nosetests --exe -w /tmp pandas.tests
9 changes: 9 additions & 0 deletions RELEASE.rst
@@ -83,6 +83,8 @@ pandas 0.8.0
- Add new ``qcut`` for cutting with quantiles (#1378)
- Add ``value_counts`` top level array method (#1392)
- Added Andrews curves plot type (#1325)
- Add lag plot (#1440)
- Add autocorrelation_plot (#1425)
- Add support for tox and Travis CI (#1382)
- Add support for ordered factors and use in GroupBy (#292)

@@ -119,6 +121,9 @@ pandas 0.8.0
- Exclude "nuisance" columns automatically in GroupBy.transform (#1364)
- Support functions-as-strings in GroupBy.transform (#1362)
- Use index name as xlabel/ylabel in plots (#1415)
- Add ``convert_dtype`` option to Series.apply to be able to leave data as
dtype=object (#1414)
- Can specify all index level names in concat (#1419)

**API Changes**

@@ -177,6 +182,10 @@ pandas 0.8.0
- Treat dict return values as Series in GroupBy.apply (#823)
- Respect column selection for DataFrame in GroupBy.transform (#1365)
- Fix MultiIndex partial indexing bug (#1352)
- Enable assignment of rows in mixed-type DataFrame via .ix (#1432)
- Reset index mapping when grouping Series in Cython (#1423)
- Fix outer/inner DataFrame.join with non-unique indexes (#1421)
- Fix MultiIndex groupby bugs with empty lower levels (#1401)

pandas 0.7.3
============
41 changes: 41 additions & 0 deletions doc/source/basics.rst
@@ -369,6 +369,47 @@ index labels with the minimum and maximum corresponding values:
df1.idxmin(axis=0)
df1.idxmax(axis=1)

Value counts (histogramming)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The ``value_counts`` Series method and the top-level function of the same name
compute a histogram of a 1D array of values; the function form also works on
regular arrays:

.. ipython:: python

   data = np.random.randint(0, 7, size=50)
   data
   s = Series(data)
   s.value_counts()
   value_counts(data)
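
Conceptually, ``value_counts`` is a sorted histogram. A minimal sketch of the
same computation using only the standard library (``value_counts_sketch`` is a
hypothetical name for illustration, not part of pandas):

```python
from collections import Counter

def value_counts_sketch(values):
    # Count occurrences of each value, then order by descending
    # frequency, as pandas' value_counts does by default.
    counts = Counter(values)
    return sorted(counts.items(), key=lambda kv: kv[1], reverse=True)

data = [2, 3, 3, 0, 3, 2, 6]
print(value_counts_sketch(data))  # 3 appears most often
```

The real implementation uses hash-table code in Cython for speed, but the
result is the same ordered value-to-count mapping.
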

Discretization and quantiling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Continuous values can be discretized using the ``cut`` (bins based on values)
and ``qcut`` (bins based on sample quantiles) functions:

.. ipython:: python

   arr = np.random.randn(20)
   factor = cut(arr, 4)
   factor
   factor = cut(arr, [-5, -1, 0, 1, 5])
   factor

``qcut`` computes sample quantiles. For example, we could slice up some
normally distributed data into equal-size quartiles like so:

.. ipython:: python

   arr = np.random.randn(30)
   factor = qcut(arr, [0, .25, .5, .75, 1])
   factor
   value_counts(factor)
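
Under the hood, ``qcut`` derives its bin edges from sample quantiles of the
data. A rough sketch of that edge computation with NumPy
(``quantile_edges`` is an illustrative helper, not the pandas implementation):

```python
import numpy as np

def quantile_edges(arr, quantiles):
    # qcut-style bin edges: the sample quantiles of the data at the
    # requested probabilities (each q in [0, 1]).
    return np.percentile(arr, [100 * q for q in quantiles])

arr = np.arange(1, 101)  # the integers 1..100
edges = quantile_edges(arr, [0, .25, .5, .75, 1])
print(edges)  # minimum, quartiles, maximum
```

With equal-probability quantiles like these, each resulting bin holds roughly
the same number of observations, which is the defining property of ``qcut``.
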

.. _basics.apply:

Function application
14 changes: 14 additions & 0 deletions doc/source/groupby.rst
@@ -590,3 +590,17 @@ If there are any NaN values in the grouping key, these will be automatically
excluded. So there will never be an "NA group". This was not the case in older
versions of pandas, but users were generally discarding the NA group anyway
(and supporting it was an implementation headache).

Grouping with ordered factors
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Categorical variables represented as instances of pandas' ``Factor`` class can
be used as group keys. If so, the order of the levels will be preserved:

.. ipython:: python

   data = Series(np.random.randn(100))
   factor = qcut(data, [0, .25, .5, .75, 1.])
   data.groupby(factor).mean()
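
The grouping itself amounts to bucketing values by their bin label and then
averaging each bucket. A standard-library sketch of that reduction
(``group_means`` is a hypothetical helper, not pandas API):

```python
from collections import defaultdict

def group_means(values, keys):
    # Collect values under their group key, then compute
    # the mean of each group, as groupby(...).mean() does.
    groups = defaultdict(list)
    for k, v in zip(keys, values):
        groups[k].append(v)
    return {k: sum(vs) / len(vs) for k, vs in groups.items()}

values = [1.0, 2.0, 3.0, 4.0]
keys = ['low', 'low', 'high', 'high']
print(group_means(values, keys))
```

When the keys come from an ordered ``Factor``, pandas additionally keeps the
result in level order rather than in whatever order the keys were first seen.
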
31 changes: 28 additions & 3 deletions doc/source/timeseries.rst
@@ -852,16 +852,22 @@ other functions:
rng_utc = date_range('3/6/2012 00:00', periods=10, freq='D', tz='UTC')
print(rng_utc.tz)
Timestamps, like Python's ``datetime.datetime`` objects, can be either time
zone naive or time zone aware. Naive time series and DatetimeIndex objects
can be *localized* using ``tz_localize``:

.. ipython:: python

   ts = Series(randn(len(rng)), rng)
   ts_utc = ts.tz_localize('UTC')
   ts_utc

You can use the ``tz_convert`` method to convert tz-aware data to another
time zone:

.. ipython:: python

   ts_utc.tz_convert('US/Eastern')
Under the hood, all timestamps are stored in UTC. Scalar values from a
@@ -886,3 +892,22 @@ time zones using ``tz_convert``:
rng_eastern[5]
rng_berlin[5]
rng_eastern[5].tz_convert('Europe/Berlin')

Localization of Timestamps functions just like DatetimeIndex and TimeSeries:

.. ipython:: python

   rng[5]
   rng[5].tz_localize('Asia/Shanghai')

Operations between TimeSeries in different time zones will yield UTC
TimeSeries, aligning the data on the UTC timestamps:

.. ipython:: python

   eastern = ts_utc.tz_convert('US/Eastern')
   berlin = ts_utc.tz_convert('Europe/Berlin')
   result = eastern + berlin
   result
   result.index
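
The alignment works because aware timestamps that denote the same UTC instant
compare equal. That property can be seen with the standard library alone,
using Python 3's ``datetime.timezone`` and fixed-offset stand-ins for the
named zones (a sketch, not pandas internals):

```python
from datetime import datetime, timedelta, timezone

eastern = timezone(timedelta(hours=-5))  # fixed-offset stand-in for US/Eastern
berlin = timezone(timedelta(hours=1))    # fixed-offset stand-in for Europe/Berlin

# The same instant expressed in two different zones:
t_eastern = datetime(2012, 6, 12, 7, 0, tzinfo=eastern)
t_berlin = datetime(2012, 6, 12, 13, 0, tzinfo=berlin)

# Both convert to the same UTC timestamp, so the two align.
assert t_eastern.astimezone(timezone.utc) == t_berlin.astimezone(timezone.utc)
print(t_eastern.astimezone(timezone.utc))
```

pandas exploits this by storing all timestamps in UTC internally, so adding
two tz-aware series is just an index-aligned operation on UTC values.
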
42 changes: 42 additions & 0 deletions doc/source/visualization.rst
@@ -283,3 +283,45 @@ of the same class will usually be closer together and form larger structures.
@savefig andrews_curves.png width=6in
andrews_curves(data, 'Name')

Lag Plot
~~~~~~~~

Lag plots are used to check if a data set or time series is random. Random
data should not exhibit any structure in the lag plot. Non-random structure
implies that the underlying data are not random.

.. ipython:: python

   from pandas.tools.plotting import lag_plot
   plt.figure()
   data = Series(0.1 * np.random.random(1000) +
       0.9 * np.sin(np.linspace(-99 * np.pi, 99 * np.pi, num=1000)))
   @savefig lag_plot.png width=6in
   lag_plot(data)
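
What ``lag_plot`` draws is simply the scatter of ``(y_t, y_{t+lag})`` pairs.
A small NumPy sketch of how such pairs can be built (``lag_pairs`` is an
illustrative helper, not the pandas implementation):

```python
import numpy as np

def lag_pairs(series, lag=1):
    # Pair each value with the value `lag` steps later (lag must be >= 1).
    y = np.asarray(series)
    return y[:-lag], y[lag:]

data = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
x, y = lag_pairs(data)
print(list(zip(x, y)))  # pairs (1,2), (2,3), (3,4), (4,5)
```

For random data these pairs fill the plane without pattern; for structured
data (such as the sine wave above) they trace out a visible curve.
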

Autocorrelation Plot
~~~~~~~~~~~~~~~~~~~~

Autocorrelation plots are often used for checking randomness in time series.
This is done by computing autocorrelations for data values at varying time lags.
If the time series is random, such autocorrelations should be near zero for any
and all time-lag separations. If the time series is non-random, then one or more
of the autocorrelations will be significantly non-zero. The horizontal lines
displayed in the plot correspond to the 95% and 99% confidence bands. The
dashed line is the 99% confidence band.

.. ipython:: python

   from pandas.tools.plotting import autocorrelation_plot
   plt.figure()
   data = Series(0.7 * np.random.random(1000) +
       0.3 * np.sin(np.linspace(-9 * np.pi, 9 * np.pi, num=1000)))
   @savefig autocorrelation_plot.png width=6in
   autocorrelation_plot(data)
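
The quantity plotted at each lag is the sample autocorrelation of the series
with a shifted copy of itself. A compact NumPy sketch (``autocorr`` is an
illustrative helper, not the pandas implementation):

```python
import numpy as np

def autocorr(series, lag):
    # Sample autocorrelation at `lag`: covariance of the demeaned
    # series with its lagged self, normalized by the variance.
    y = np.asarray(series, dtype=float)
    y = y - y.mean()
    return np.dot(y[:-lag], y[lag:]) / np.dot(y, y)

rng = np.random.RandomState(0)
noise = rng.randn(1000)
print(autocorr(noise, 1))  # near zero for random data
```

For a strongly periodic series the lag-1 autocorrelation is instead close to
one, which is exactly the contrast the confidence bands in the plot make
visible.
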
4 changes: 4 additions & 0 deletions pandas/core/algorithms.py
@@ -49,6 +49,7 @@ def unique(values):
-------
uniques
"""
values = com._asarray_tuplesafe(values)
f = lambda htype, caster: _unique_generic(values, htype, caster)
return _hashtable_algo(f, values.dtype)

@@ -155,6 +156,9 @@ def value_counts(values, sort=True, ascending=False):
"""
from pandas.core.series import Series
from collections import defaultdict

values = np.asarray(values)

if com.is_integer_dtype(values.dtype):
values = com._ensure_int64(values)
keys, counts = lib.value_count_int64(values)
4 changes: 2 additions & 2 deletions pandas/core/factor.py
@@ -1,6 +1,8 @@
# pylint: disable=E1101,W0232

import numpy as np

from pandas.core.algorithms import factorize
import pandas.core.common as com


@@ -51,8 +53,6 @@ def __init__(self, labels, levels, name=None):

@classmethod
def from_array(cls, data):
from pandas.core.algorithms import factorize

try:
labels, levels, _ = factorize(data, sort=True)
except TypeError: