diff --git a/.gitignore b/.gitignore index 3782509c5c048..7699d72823d22 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.pyc +*.swp build dist MANIFEST @@ -8,6 +9,7 @@ pandas/src/tseries.c pandas/src/sparse.c pandas/version.py doc/source/generated +doc/source/_static *flymake* scikits .coverage \ No newline at end of file diff --git a/Makefile b/Makefile index a4a700a81959e..a4861c1477d8e 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,6 @@ clean: -rm -rf build dist tseries: pandas/src/tseries.pyx - touch pandas/src/tseries.pyx python setup.py build_ext --inplace sparse: pandas/src/sparse.pyx diff --git a/RELEASE.rst b/RELEASE.rst index 97fade2c1b7ca..e65ad6fef3455 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -5,10 +5,247 @@ Release Notes This is the list of changes to pandas between each release. For full details, see the commit logs at http://github.com/wesm/pandas +What is it +---------- + +pandas is a Python package providing fast, flexible, and expressive data +structures designed to make working with “relational” or “labeled” data both +easy and intuitive. It aims to be the fundamental high-level building block for +doing practical, real world data analysis in Python. Additionally, it has the +broader goal of becoming the most powerful and flexible open source data +analysis / manipulation tool available in any language. + +Where to get it +--------------- + +* Source code: http://github.com/wesm/pandas +* Binary installers on PyPI: http://pypi.python.org/pypi/pandas +* Documentation: http://pandas.sourceforge.net + +pandas 0.5.1 +============ + +**Release date:** Not yet released + +**New features / modules** + + - Add `melt` function to `pandas.core.reshape` + +**Improvements to existing features** + + - Sped up `DataFrame.apply` performance in most cases + +**Bug fixes** + + - Fix bug in `DataFrame.to_csv` when writing a DataFrame with an index + name (GH #290) + - DataFrame should clear its Series caches on consolidation, was causing + "stale" Series to be returned in some corner cases (GH #304) + +Thanks +------ + +- Kieran O'Mahony + +pandas 0.5.0 +============ + +**Release date:** 10/24/2011 + +This release of pandas includes a number of API changes (see below) and cleanup +of deprecated APIs from pre-0.4.0 releases. There are also bug fixes, new +features, numerous significant performance enhancements, and includes a new +IPython completer hook to enable tab completion of DataFrame columns accesses +as attributes (a new feature). + +In addition to the changes listed here from 0.4.3 to 0.5.0, the minor releases +0.4.1, 0.4.2, and 0.4.3 brought some significant new functionality and +performance improvements that are worth taking a look at. + +Thanks to all for bug reports, contributed patches and generally providing +feedback on the library. + +**API Changes** + + - `read_table`, `read_csv`, and `ExcelFile.parse` default arguments for + `index_col` is now None. To use one or more of the columns as the resulting + DataFrame's index, these must be explicitly specified now + - Parsing functions like `read_csv` no longer parse dates by default (GH + #225) + - Removed `weights` option in panel regression which was not doing anything + principled (GH #155) + - Changed `buffer` argument name in `Series.to_string` to `buf` + - `Series.to_string` and `DataFrame.to_string` now return strings by default + instead of printing to sys.stdout + - Deprecated `nanRep` argument in various `to_string` and `to_csv` functions + in favor of `na_rep`. 
Will be removed in 0.6 (GH #275) + - Renamed `delimiter` to `sep` in `DataFrame.from_csv` for consistency + - Changed order of `Series.clip` arguments to match those of `numpy.clip` and + added (unimplemented) `out` argument so `numpy.clip` can be called on a + Series (GH #272) + - Series functions renamed (and thus deprecated) in 0.4 series have been + removed: + + * `asOf`, use `asof` + * `toDict`, use `to_dict` + * `toString`, use `to_string` + * `toCSV`, use `to_csv` + * `merge`, use `map` + * `applymap`, use `apply` + * `combineFirst`, use `combine_first` + * `_firstTimeWithValue` use `first_valid_index` + * `_lastTimeWithValue` use `last_valid_index` + + - DataFrame functions renamed / deprecated in 0.4 series have been removed: + + * `asMatrix` method, use `as_matrix` or `values` attribute + * `combineFirst`, use `combine_first` + * `getXS`, use `xs` + * `merge`, use `join` + * `fromRecords`, use `from_records` + * `fromcsv`, use `from_csv` + * `toRecords`, use `to_records` + * `toDict`, use `to_dict` + * `toString`, use `to_string` + * `toCSV`, use `to_csv` + * `_firstTimeWithValue` use `first_valid_index` + * `_lastTimeWithValue` use `last_valid_index` + * `toDataMatrix` is no longer needed + * `rows()` method, use `index` attribute + * `cols()` method, use `columns` attribute + * `dropEmptyRows()`, use `dropna(how='all')` + * `dropIncompleteRows()`, use `dropna()` + * `tapply(f)`, use `apply(f, axis=1)` + * `tgroupby(keyfunc, aggfunc)`, use `groupby` with `axis=1` + + - Other outstanding deprecations have been removed: + + * `indexField` argument in `DataFrame.from_records` + * `missingAtEnd` argument in `Series.order`. Use `na_last` instead + * `Series.fromValue` classmethod, use regular `Series` constructor instead + * Functions `parseCSV`, `parseText`, and `parseExcel` methods in + `pandas.io.parsers` have been removed + * `Index.asOfDate` function + * `Panel.getMinorXS` (use `minor_xs`) and `Panel.getMajorXS` (use + `major_xs`) + * `Panel.toWide`, use `Panel.to_wide` instead + +**New features / modules** + + - Added `DataFrame.align` method with standard join options + - Added `parse_dates` option to `read_csv` and `read_table` methods to + optionally try to parse dates in the index columns + - Add `nrows`, `chunksize`, and `iterator` arguments to `read_csv` and + `read_table`. The last two return a new `TextParser` class capable of + lazily iterating through chunks of a flat file (GH #242) + - Added ability to join on multiple columns in `DataFrame.join` (GH #214) + - Added private `_get_duplicates` function to `Index` for identifying + duplicate values more easily + - Added column attribute access to DataFrame, e.g. df.A equivalent to df['A'] + if 'A' is a column in the DataFrame (PR #213) + - Added IPython tab completion hook for DataFrame columns. 
(PR #233, GH #230) + - Implement `Series.describe` for Series containing objects (PR #241) + - Add inner join option to `DataFrame.join` when joining on key(s) (GH #248) + - Can select set of DataFrame columns by passing a list to `__getitem__` (GH + #253) + - Can use & and | to intersection / union Index objects, respectively (GH + #261) + - Added `pivot_table` convenience function to pandas namespace (GH #234) + - Implemented `Panel.rename_axis` function (GH #243) + - DataFrame will show index level names in console output + - Implemented `Panel.take` + - Add `set_eng_float_format` function for setting alternate DataFrame + floating point string formatting + - Add convenience `set_index` function for creating a DataFrame index from + its existing columns + +**Improvements to existing features** + + - Major performance improvements in file parsing functions `read_csv` and + `read_table` + - Added Cython function for converting tuples to ndarray very fast. Speeds up + many MultiIndex-related operations + - File parsing functions like `read_csv` and `read_table` will explicitly + check if a parsed index has duplicates and raise a more helpful exception + rather than deferring the check until later + - Refactored merging / joining code into a tidy class and disabled unnecessary + computations in the float/object case, thus getting about 10% better + performance (GH #211) + - Improved speed of `DataFrame.xs` on mixed-type DataFrame objects by about + 5x, regression from 0.3.0 (GH #215) + - With new `DataFrame.align` method, speeding up binary operations between + differently-indexed DataFrame objects by 10-25%. + - Significantly sped up conversion of nested dict into DataFrame (GH #212) + - Can pass hierarchical index level name to `groupby` instead of the level + number if desired (GH #223) + - Add support for different delimiters in `DataFrame.to_csv` (PR #244) + - Add more helpful error message when importing pandas post-installation from + the source directory (GH #250) + - Significantly speed up DataFrame `__repr__` and `count` on large mixed-type + DataFrame objects + - Better handling of pyx file dependencies in Cython module build (GH #271) + +**Bug fixes** + + - `read_csv` / `read_table` fixes + - Be less aggressive about converting float->int in cases of floating point + representations of integers like 1.0, 2.0, etc. + - "True"/"False" will not get correctly converted to boolean + - Index name attribute will get set when specifying an index column + - Passing column names should force `header=None` (GH #257) + - Don't modify passed column names when `index_col` is not + None (GH #258) + - Can sniff CSV separator in zip file (since seek is not supported, was + failing before) + - Worked around matplotlib "bug" in which series[:, np.newaxis] fails. Should + be reported upstream to matplotlib (GH #224) + - DataFrame.iteritems was not returning Series with the name attribute + set. 
Also neither was DataFrame._series + - Can store datetime.date objects in HDFStore (GH #231) + - Index and Series names are now stored in HDFStore + - Fixed problem in which data would get upcasted to object dtype in + GroupBy.apply operations (GH #237) + - Fixed outer join bug with empty DataFrame (GH #238) + - Can create empty Panel (GH #239) + - Fix join on single key when passing list with 1 entry (GH #246) + - Don't raise Exception on plotting DataFrame with an all-NA column (GH #251, + PR #254) + - Bug min/max errors when called on integer DataFrames (PR #241) + - `DataFrame.iteritems` and `DataFrame._series` not assigning name attribute + - Panel.__repr__ raised exception on length-0 major/minor axes + - `DataFrame.join` on key with empty DataFrame produced incorrect columns + - Implemented `MultiIndex.diff` (GH #260) + - `Int64Index.take` and `MultiIndex.take` lost name field, fix downstream + issue GH #262 + - Can pass list of tuples to `Series` (GH #270) + - Can pass level name to `DataFrame.stack` + - Support set operations between MultiIndex and Index + - Fix many corner cases in MultiIndex set operations + - Fix MultiIndex-handling bug with GroupBy.apply when returned groups are not + indexed the same + - Fix corner case bugs in DataFrame.apply + - Setting DataFrame index did not cause Series cache to get cleared + - Various int32 -> int64 platform-specific issues + - Don't be too aggressive converting to integer when parsing file with + MultiIndex (GH #285) + - Fix bug when slicing Series with negative indices before beginning + +Thanks +------ + +- Thomas Kluyver +- Daniel Fortunov +- Aman Thakral +- Luca Beltrame +- Wouter Overmeire + pandas 0.4.3 ============ -**Release date:** not yet released +Release notes +------------- + +**Release date:** 10/9/2011 This is largely a bugfix release from 0.4.2 but also includes a handful of new and enhanced features. Also, pandas can now be installed and used on Python 3 @@ -69,6 +306,9 @@ Thanks pandas 0.4.2 ============ +Release notes +------------- + **Release date:** 10/3/2011 This is a performance optimization release with several bug fixes. The new @@ -144,6 +384,9 @@ Thanks pandas 0.4.1 ============ +Release notes +------------- + **Release date:** 9/25/2011 This is primarily a bug fix release but includes some new features and @@ -214,23 +457,6 @@ Thanks pandas 0.4 ========== -What is it ----------- - -**pandas** is a library of powerful labeled-axis data structures, statistical -tools, and general code for working with relational data sets, including time -series and cross-sectional data. It was designed with the practical needs of -statistical modeling and large, inhomogeneous data sets in mind. It is -particularly well suited for, among other things, financial data analysis -applications. - -Where to get it ---------------- - -Source code: http://github.com/wesm/pandas -Binary installers on PyPI: http://pypi.python.org/pypi/pandas -Documentation: http://pandas.sourceforge.net - Release notes ------------- @@ -491,8 +717,8 @@ Thanks - Skipper Seabold - Chris Jordan-Squire -pandas 0.3 -========== +pandas 0.3.0 +============ This major release of pandas represents approximately 1 year of continuous development work and brings with it many new features, bug fixes, speed @@ -500,21 +726,6 @@ enhancements, and general quality-of-life improvements. The most significant change from the 0.2 release has been the completion of a rigorous unit test suite covering all of the core functionality. 
-What is it ----------- - -**pandas** is a library of labeled data structures, statistical models, and -general code for working with time series and cross-sectional data. It was -designed with the practical needs of statistical modeling and large, -inhomogeneous data sets in mind. - -Where to get it ---------------- - -Source code: http://github.com/wesm/pandas -Binary installers on PyPI: http://pypi.python.org/pypi/pandas -Documentation: http://pandas.sourceforge.net - Release notes ------------- diff --git a/TODO.rst b/TODO.rst index 836c2791e971d..be67694659e98 100644 --- a/TODO.rst +++ b/TODO.rst @@ -1,7 +1,50 @@ +DONE +---- - SparseSeries name integration + tests - Refactor Series.repr -- .name pickling / unpicking / HDFStore handling -- Is there a way to write hierarchical columns to csv? -- Possible to blow away existing name when creating MultiIndex? -- prettytable output with index names -- Add load/save functions to top level pandas namespace + +TODO +---- +- _consolidate, does it always copy? +- Series.align with fill method. Will have to generate more Cython code + +TODO docs +--------- + +- DONE read_csv / read_table + - auto-sniff delimiter + - MultiIndex + - generally more documentation +- DONE pivot_table +- DONE Set mixed-type values with .ix +- DONE get_dtype_counts / dtypes +- DONE save / load functions +- DONE isnull/notnull as instance methods +- DONE DataFrame.to_string +- DONE IPython tab complete hook +- DONE ignore_index in DataFrame.append +- DONE describe for Series with dtype=object +- DONE as_index=False in groupby +- DONOTWANT is_monotonic +- DONE DataFrame.to_csv: different delimiters +- DONE combine_first +- DONE groupby with level name +- DONE MultiIndex get_level_values +- DONE & and | for intersection / union +- DONE Update to reflect Python 3 support in intro +- DONE Index / MultiIndex names +- DONE Unstack / stack by level name +- DONE name attribute on Series +- DONE Multi-key joining +- DONE Inner join on key +- DONE align functions +- DONE df[col_list] +- DONE Panel.rename_axis + +Performance blog +---------------- +- Series / Time series data alignment +- DataFrame alignment +- Groupby +- joining +- Take diff --git a/doc/source/api.rst b/doc/source/api.rst index 8760df4608e38..610afb99141b7 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -5,6 +5,88 @@ API Reference ************* +.. _api.functions: + +General functions +----------------- + +Data manipulations +~~~~~~~~~~~~~~~~~~ +.. currentmodule:: pandas.tools.pivot + +.. autosummary:: + :toctree: generated/ + + pivot_table + +Pickling +~~~~~~~~ + +.. currentmodule:: pandas.core.common + +.. autosummary:: + :toctree: generated/ + + load + save + +File IO +~~~~~~~ + +.. currentmodule:: pandas.io.parsers + +.. autosummary:: + :toctree: generated/ + + read_table + read_csv + ExcelFile.parse + +HDFStore: PyTables (HDF5) +~~~~~~~~~~~~~~~~~~~~~~~~~ +.. currentmodule:: pandas.io.pytables + +.. autosummary:: + :toctree: generated/ + + HDFStore.put + HDFStore.get + +Standard moving window functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. currentmodule:: pandas.stats.moments + +.. autosummary:: + :toctree: generated/ + + rolling_count + rolling_sum + rolling_mean + rolling_median + rolling_var + rolling_std + rolling_corr + rolling_cov + rolling_skew + rolling_kurt + rolling_apply + rolling_quantile + +Exponentially-weighted moving window functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + ewma + ewmstd + ewmvar + ewmcorr + ewmcov + +.. 
currentmodule:: pandas + .. _api.series: Series @@ -20,6 +102,8 @@ Attributes and underlying data Series.values Series.dtype + Series.isnull + Series.notnull Conversion / Constructors ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -89,12 +173,14 @@ Computations / Descriptive Stats Series.std Series.sum Series.var + Series.value_counts Reindexing / Selection / Label manipulation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autosummary:: :toctree: generated/ + Series.align Series.drop Series.reindex Series.reindex_like @@ -178,6 +264,8 @@ Attributes and underlying data :toctree: generated/ DataFrame.as_matrix + DataFrame.dtypes + DataFrame.get_dtype_counts DataFrame.values DataFrame.axes DataFrame.ndim @@ -267,6 +355,7 @@ Reindexing / Selection / Label manipulation DataFrame.add_prefix DataFrame.add_suffix + DataFrame.align DataFrame.drop DataFrame.filter DataFrame.reindex @@ -355,75 +444,3 @@ Panel Computations / Descriptive Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Input / Output --------------- - -.. currentmodule:: pandas.io.parsers - -File IO -~~~~~~~ - -.. autosummary:: - :toctree: generated/ - - read_table - read_csv - ExcelFile.parse - -HDFStore: PyTables (HDF5) -~~~~~~~~~~~~~~~~~~~~~~~~~ -.. currentmodule:: pandas.io.pytables - -.. autosummary:: - :toctree: generated/ - - HDFStore.put - HDFStore.get - -GroupBy -------- - -.. currentmodule:: pandas.core.groupby - -.. autosummary:: - :toctree: generated/ - - groupby - -Moving window statistics ------------------------- - -Standard moving window functions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. currentmodule:: pandas.stats.moments - -.. autosummary:: - :toctree: generated/ - - rolling_count - rolling_sum - rolling_mean - rolling_median - rolling_var - rolling_std - rolling_corr - rolling_cov - rolling_skew - rolling_kurt - rolling_apply - rolling_quantile - -Exponentially-weighted moving window functions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autosummary:: - :toctree: generated/ - - ewma - ewmstd - ewmvar - ewmcorr - ewmcov - diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 0b652e8eacf48..03c663565dc34 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -157,6 +157,29 @@ replace NaN with some other value using ``fillna`` if you wish). df + df2 df.add(df2, fill_value=0) +Combining overlapping data sets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A problem occasionally arising is the combination of two similar data sets +where values in one are preferred over the other. An example would be two data +series representing a particular economic indicator where one is considered to +be of "higher quality". However, the lower quality series might extend further +back in history or have more complete data coverage. As such, we would like to +combine two DataFrame objects where missing values in one DataFrame are +conditionally filled with like-labeled values from the other DataFrame. The +function implementing this operation is ``combine_first``, which we illustrate: + +.. ipython:: python + + df1 = DataFrame({'A' : [1., np.nan, 3., 5., np.nan], + 'B' : [np.nan, 2., 3., np.nan, 6.]}) + df2 = DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.], + 'B' : [np.nan, np.nan, 3., 4., 6., 8.]}) + df1 + df2 + df1.combine_first(df2) + + .. 
_basics.stats: Descriptive statistics @@ -242,9 +265,9 @@ will exclude NAs on Series input by default: Summarizing data: describe ~~~~~~~~~~~~~~~~~~~~~~~~~~ -For floating point data, there is a convenient ``describe`` function which -computes a variety of summary statistics about a Series or the columns of a -DataFrame (excluding NAs of course): +There is a convenient ``describe`` function which computes a variety of summary +statistics about a Series or the columns of a DataFrame (excluding NAs of +course): .. ipython:: python @@ -255,6 +278,16 @@ DataFrame (excluding NAs of course): frame.ix[::2] = np.nan frame.describe() +For a non-numerical Series object, `describe` will give a simple summary of the +number of unique values and most frequently occurring values: + + +.. ipython:: python + + s = Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) + s.describe() + + Correlations between objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -434,7 +467,7 @@ Reindexing to align with another object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ You may wish to take an object and reindex its axes to be labeled the same as -another object. While the syntax for this is straightforwad albeit verbose, it +another object. While the syntax for this is straightforward albeit verbose, it is a common enough operation that the ``reindex_like`` method is available to make this simpler: @@ -451,6 +484,36 @@ make this simpler: df2 df.reindex_like(df2) +.. _basics.align: + +Aligning objects with each other with ``align`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``align`` method is the fastest way to simultaneously align two objects. It +supports a ``join`` argument (related to :ref:`joining and merging `): + + - ``join='outer'``: take the union of the indexes + - ``join='left'``: use the calling object's index + - ``join='right'``: use the passed object's index + - ``join='inner'``: intersect the indexes + +It returns a tuple with both of the reindexed Series: + +.. ipython:: python + + s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + s1 = s[:4] + s2 = s[1:] + s1.align(s2) + s1.align(s2, join='inner') + s1.align(s2, join='left') + +For DataFrames, the join method will be applied to both the + +.. ipython:: python + + df.align(df2, join='inner') + .. _basics.reindex_fill: Filling while reindexing @@ -540,6 +603,9 @@ Series, it need only contain a subset of the labels as keys: df.rename(columns={'one' : 'foo', 'two' : 'bar'}, index={'a' : 'apple', 'b' : 'banana', 'd' : 'durian'}) +The Panel class has an a related ``rename_axis`` class which can rename any of +its three axes. + Iteration --------- @@ -657,15 +723,28 @@ alternately passing the ``dtype`` keyword argument to the object constructor. Pickling and serialization -------------------------- -All pandas objects are equipped with ``save`` and ``load`` methods which use -Python's ``cPickle`` module to save and load data structures to disk using the -pickle format. +All pandas objects are equipped with ``save`` methods which use Python's +``cPickle`` module to save data structures to disk using the pickle format. .. ipython:: python df df.save('foo.pickle') - DataFrame.load('foo.pickle') + +The ``load`` function in the ``pandas`` namespace can be used to load any +pickled pandas object (or any other pickled object) from file: + + +.. ipython:: python + + load('foo.pickle') + +There is also a ``save`` function which takes any object as its first argument: + +.. ipython:: python + + save(df, 'foo.pickle') + load('foo.pickle') .. 
ipython:: python :suppress: diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index eca3e2ccde4c6..c0e5cf073d40e 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -181,6 +181,20 @@ tools for working with labeled data. of course have the option of dropping labels with missing data via the **dropna** function. +Name attribute +~~~~~~~~~~~~~~ + +Series can also have a ``name`` attribute: + +.. ipython:: python + + s = Series(np.random.randn(5), name='something') + s + s.name + +The Series ``name`` will be assigned automatically in many cases, in particular +when taking 1D slices of DataFrame as you will see below. + .. _basics.dataframe: DataFrame @@ -439,12 +453,51 @@ R package): baseball = read_csv('data/baseball.csv') baseball -However, using ``to_string`` will display any DataFrame in tabular form, though -it won't always fit the console width: +However, using ``to_string`` will return a string representation of the +DataFrame in tabular form, though it won't always fit the console width: + +.. ipython:: python + + print baseball.ix[-20:, :12].to_string() + +DataFrame column types +~~~~~~~~~~~~~~~~~~~~~~ + +The four main types stored in pandas objects are float, int, boolean, and +object. A convenient ``dtypes`` attribute return a Series with the data type of +each column: .. ipython:: python - baseball.ix[-20:, :12].to_string() + baseball.dtypes + +The related method ``get_dtype_counts`` will return the number of columns of +each type: + +.. ipython:: python + + baseball.get_dtype_counts() + +DataFrame column attribute access and IPython completion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If a DataFrame column label is a valid Python variable name, the column can be +accessed like attributes: + +.. ipython:: python + + df = DataFrame({'foo1' : np.random.randn(5), + 'foo2' : np.random.randn(5)}) + df + df.foo1 + +The columns are also connected to the `IPython `__ +completion mechanism so they can be tab-completed: + +.. code-block:: ipython + + In [5]: df.fo + df.foo1 df.foo2 .. _basics.panel: diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index bc2e2c10e9419..57aafef866bed 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -159,7 +159,7 @@ natural to group by one of the levels of the hierarchy. ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] tuples = zip(*arrays) tuples - index = MultiIndex.from_tuples(tuples) + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) s = Series(randn(8), index=index) .. ipython:: python @@ -168,6 +168,13 @@ natural to group by one of the levels of the hierarchy. grouped = s.groupby(level=0) grouped.sum() +If the MultiIndex has names specified, these can be passed instead of the level +number: + +.. ipython:: python + + s.groupby(level='second').sum() + More on the ``sum`` function and aggregation later. Grouping with multiple levels (as opposed to a single level) is not yet supported, though implementing it is not difficult. 
@@ -250,6 +257,8 @@ changed by using the ``as_index`` option: grouped = df.groupby(['A', 'B'], as_index=False) grouped.aggregate(np.sum) + df.groupby('A', as_index=False).sum() + Note that you could use the ``delevel`` DataFrame function to achieve the same result as the column names are stored in the resulting ``MultiIndex``: diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 905f004e955f1..bdab2ce154df6 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -187,6 +187,17 @@ As we will see later on, the same operation could be accomplished by reindexing. However, the syntax would be more verbose; hence, the inclusion of this indexing method. +Selecting DataFrame columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can pass a list of columns to ``[]`` to select columns in that order: + +.. ipython:: python + + df[['C', 'A', 'B']] + +If a column is not contained in the DataFrame, an exception will be raised: + .. _indexing.advanced: Advanced indexing with labels @@ -302,6 +313,58 @@ values, though setting arbitrary vectors is not yet supported: df2.ix[3] = np.nan df2 +.. _indexing.class: + +Index objects +------------- + +The pandas Index class and its subclasses can be viewed as implementing an +*ordered set* in addition to providing the support infrastructure necessary for +lookups, data alignment, and reindexing. The easiest way to create one directly +is to pass a list or other sequence to ``Index``: + +.. ipython:: python + + index = Index(['e', 'd', 'a', 'b']) + index + 'd' in index + +You can also pass a ``name`` to be stored in the index: + + +.. ipython:: python + + index = Index(['e', 'd', 'a', 'b'], name='something') + index.name + +Starting with pandas 0.5, the name, if set, will be shown in the console +display: + +.. ipython:: python + + index = Index(range(5), name='rows') + columns = Index(['A', 'B', 'C'], name='cols') + df = DataFrame(np.random.randn(5, 3), index=index, columns=columns) + df + df['A'] + + +Set operations on Index objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The three main operations are ``union (|)``, ``intersection (&)``, and ``diff +(-)``. These can be directly called as instance methods or used via overloaded +operators: + +.. ipython:: python + + a = Index(['c', 'b', 'a']) + b = Index(['c', 'e', 'd']) + a.union(b) + a | b + a & b + a - b + .. _indexing.hierarchical: Hierarchical indexing (MultiIndex) @@ -346,10 +409,18 @@ can think of ``MultiIndex`` an array of tuples where each tuple is unique. A ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] tuples = zip(*arrays) tuples - index = MultiIndex.from_tuples(tuples) + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) s = Series(randn(8), index=index) s +All of the ``MultiIndex`` constructors accept a ``names`` argument which stores +string names for the levels themselves. If no names are provided, some +arbitrary ones will be assigned: + +.. ipython:: python + + index.names + This index can back any axis of a pandas object, and the number of **levels** of the index is up to you: @@ -376,17 +447,17 @@ can find yourself working with hierarchically-indexed data without creating a ``MultiIndex`` explicitly yourself. However, when loading data from a file, you may wish to generate your own ``MultiIndex`` when preparing the data set. -Level names -~~~~~~~~~~~ +Reconstructing the level labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -All of the ``MultiIndex`` constructors accept a ``names`` argument which stores -string names for the levels themselves. 
This will get increasingly integrated -in to groupby and reshaping routines. If no names are provided, some arbitrary -ones will be assigned: +The method ``get_level_values`` will return a vector of the labels for each +location at a particular level: .. ipython:: python - index.names + index.get_level_values(0) + index.get_level_values(1) + Basic indexing on axis with MultiIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -550,14 +621,15 @@ attribute. These will get automatically assigned in various places where Some gory internal details ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Internally, the ``MultiIndex`` consists of two things: the **levels** and the -**labels**: +Internally, the ``MultiIndex`` consists of a few things: the **levels**, the +integer **labels**, and the level **names**: .. ipython:: python index index.levels index.labels + index.names You can probably guess that the labels determine which unique element is identified with that location at each layer of the index. It's important to @@ -585,13 +657,15 @@ Indexing internal details codebase. And the source code is still the best place to look at the specifics of how things are implemented. -In pandas there are 3 distinct objects which can serve as valid containers for -the axis labels: +In pandas there are a few objects implemented which can serve as valid +containers for the axis labels: - ``Index``: the generic "ordered set" object, an ndarray of object dtype assuming nothing about its contents. The labels must be hashable (and likely immutable) and unique. Populates a dict of label to location in Cython to do :math:`O(1)` lookups. + - ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer + data, such as time stamps - ``MultiIndex``: the standard hierarchical index object - ``DateRange``: fixed frequency date range generated from a time rule or DateOffset. An ndarray of Python datetime objects diff --git a/doc/source/install.rst b/doc/source/install.rst index cd1c814c4d8c2..444d8d5c3cfd2 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -16,9 +16,9 @@ compiler (MinGW or Visual Studio) installed. `How-to install MinGW on Windows Python version support ~~~~~~~~~~~~~~~~~~~~~~ -Officially Python 2.5 to 2.7. I will aim for Python 3.x support in the next -release. Python 2.4 support is being phased out since the userbase has shrunk -significantly. Continuing Python 2.4 support will require either monetary +Officially Python 2.5 to 2.7 and Python 3.1+, although Python 3 support is less +well tested. Python 2.4 support is being phased out since the userbase has +shrunk significantly. Continuing Python 2.4 support will require either monetary development support or someone contributing to the project to maintain compatibility. diff --git a/doc/source/io.rst b/doc/source/io.rst index 6e92a5ec166d9..ff5896bdcff7a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -6,6 +6,7 @@ :suppress: import numpy as np + import os np.random.seed(123456) from pandas import * from StringIO import StringIO @@ -19,83 +20,117 @@ IO Tools (Text, CSV, HDF5, ...) ******************************* -Text files ----------- - -The two workhorse functions for reading text (a.k.a. flat) files are -``read_csv`` and ``read_table``. They both utilize the same parsing code for -intelligently converting tabular data into a DataFrame object. They take a -number of different arguments: - - - ``path_or_buffer``: Either a string path to a file or any object (such as - an open ``file`` or ``StringIO``) with a ``read`` method. 
- - ``delimiter``: For ``read_table`` only, a regular expression to split - fields on. ``read_csv`` uses the ``csv`` module to do this and hence only - supports comma-separated values - - ``skiprows``: Rows in the file to skip - - ``header``: row number to use as the columns, defaults to 0 (first row) - - ``index_col``: integer, defaulting to 0 (the first column), instructing the - parser to use a particular column as the ``index`` (row labels) of the - resulting DataFrame - - ``na_values``: optional list of strings to recognize as NA/NaN +CSV & Text files +---------------- + +The two workhorse functions for reading text files (a.k.a. flat files) are +:func:`~pandas.io.parsers.read_csv` and :func:`~pandas.io.parsers.read_table`. +They both use the same parsing code to intelligently convert tabular +data into a DataFrame object. They can take a number of arguments: + + - ``path_or_buffer``: Either a string path to a file, or any object with a + ``read`` method (such as an open file or ``StringIO``). + - ``sep``: A delimiter / separator to split fields on. `read_csv` is capable + of inferring automatically "sniffing" the delimiter in some cases + - ``header``: row number to use as the column names, and the start of the data. + Defaults to 0 (first row); specify None if there is no header row. + - ``names``: List of column names to use. If passed, header will be + implicitly set to None. + - ``skiprows``: A collection of numbers for rows in the file to skip. + - ``index_col``: column number, or list of column numbers, to use as the + ``index`` (row labels) of the resulting DataFrame. By default, it will number + the rows without using any column, unless there is one more data column than + there are headers, in which case the first column is taken as the index. + - ``parse_dates``: If True, attempt to parse the index column as dates. False + by default. - ``date_parser``: function to use to parse strings into datetime - objects. Defaults to the very robust ``dateutil.parser`` - - ``names``: optional list of column names for the data. Otherwise will be - read from the file + objects. If ``parse_dates`` is True, it defaults to the very robust + ``dateutil.parser``. Specifying this implicitly sets ``parse_dates`` as True. + - ``na_values``: optional list of strings to recognize as NaN (missing values), + in addition to a default set. + - ``nrows``: Number of rows to read out of the file. Useful to only read a + small portion of a large file + - ``chunksize``: An number of rows to be used to "chunk" a file into + pieces. Will cause an ``TextParser`` object to be returned. More on this + below in the section on :ref:`iterating and chunking ` + - ``iterator``: If True, return a ``TextParser`` to enable reading a file + into memory piece by piece + +.. ipython:: python + :suppress: -.. code-block:: ipython + f = open('foo.csv', 'w') + f.write('date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') + f.close() - In [2]: print open('foo.csv').read() - A,B,C - 20090101,a,1,2 - 20090102,b,3,4 - 20090103,c,4,5 +Consider a typical CSV file containing, in this case, some time series data: - In [3]: df = read_csv('foo.csv') +.. 
ipython:: python - In [4]: df - Out[4]: - A B C - 2009-01-01 a 1 2 - 2009-01-02 b 3 4 - 2009-01-03 c 4 5 + print open('foo.csv').read() - # dates parsed to datetime - In [16]: df.index - Out[16]: Index([2009-01-01 00:00:00, 2009-01-02 00:00:00, - 2009-01-03 00:00:00], dtype=object) +The default for `read_csv` is to create a DataFrame with simple numbered rows: -If ``index_col=None``, the index will be a generic ``0...nrows-1``: +.. ipython:: python -.. code-block:: ipython + read_csv('foo.csv') - In [1]: print open('foo.csv').read() - index,A,B,C - 20090101,a,1,2 - 20090102,b,3,4 - 20090103,c,4,5 +In the case of indexed data, you can pass the column number (or a list of +column numbers, for a hierarchical index) you wish to use as the index. If the +index values are dates and you want them to be converted to ``datetime`` +objects, pass ``parse_dates=True``: - In [2]: read_csv('foo.csv') - Out[2]: - A B C - 2009-01-01 a 1 2 - 2009-01-02 b 3 4 - 2009-01-03 c 4 5 +.. ipython:: python + # Use a column as an index, and parse it as dates. + df = read_csv('foo.csv', index_col=0, parse_dates=True) + df + # These are python datetime objects + df.index - In [3]: read_csv('foo.csv', index_col=None) - Out[3]: - index A B C - 0 20090101 a 1 2 - 1 20090102 b 3 4 - 2 20090103 c 4 5 +.. ipython:: python + :suppress: + os.remove('foo.csv') The parsers make every attempt to "do the right thing" and not be very fragile. Type inference is a pretty big deal. So if a column can be coerced to integer dtype without altering the contents, it will do so. Any non-numeric columns will come through as object dtype as with the rest of pandas objects. +Files with an "implicit" index column +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + :suppress: + + f = open('foo.csv', 'w') + f.write('A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') + f.close() + +Consider a file with one less entry in the header than the number of data +column: + +.. ipython:: python + + print open('foo.csv').read() + +In this special case, ``read_csv`` assumes that the first column is to be used +as the index of the DataFrame: + +.. ipython:: python + + read_csv('foo.csv') + +Note that the dates weren't automatically parsed. In that case you would need +to do as before: + +.. ipython:: python + + df = read_csv('foo.csv', parse_dates=True) + df.index + + Reading DataFrame objects with ``MultiIndex`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -114,6 +149,65 @@ column numbers to turn multiple columns into a ``MultiIndex``: df df.ix[1978] +.. .. _io.sniff: + +.. Automatically "sniffing" the delimiter +.. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ``read_csv`` is capable of inferring delimited, but not necessarily +.. comma-separated, files in some cases: + +.. .. ipython:: python + +.. print open('tmp.csv').read() +.. read_csv('tmp.csv') + + + +.. _io.chunking: + +Iterating through files chunk by chunk +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Suppose you wish to iterate through a (potentially very large) file lazily +rather than reading the entire file into memory, such as the following: + +.. ipython:: python + :suppress: + + df[:7].to_csv('tmp.sv', sep='|') + +.. ipython:: python + + print open('tmp.sv').read() + table = read_table('tmp.sv', sep='|') + table + +.. ipython:: python + :suppress: + + os.remove('tmp.csv') + +By specifiying a ``chunksize`` to ``read_csv`` or ``read_table``, the return +value will be an iterable object of type ``TextParser``: + +.. 
ipython:: + + In [1]: reader = read_table('tmp.sv', sep='|', chunksize=4) + + In [1]: reader + + In [2]: for chunk in reader: + ...: print chunk + ...: + +Specifying ``iterator=True`` will also return the ``TextParser`` object: + +.. ipython:: python + + reader = read_table('tmp.sv', sep='|', iterator=True) + reader.get_chunk(5) + Excel 2003 files ---------------- @@ -142,7 +236,6 @@ performance HDF5 format using the excellent `PyTables .. ipython:: python :suppress: - import os os.remove('store.h5') .. ipython:: python diff --git a/doc/source/merging.rst b/doc/source/merging.rst index c404e53554d38..a5b639f3d18f2 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -14,8 +14,8 @@ Merging / Joining data sets *************************** -Appending disjoint objects --------------------------- +Appending DataFrame objects +--------------------------- Series and DataFrame have an ``append`` method which will glue together objects each of whose ``index`` (Series labels or DataFrame rows) is mutually @@ -40,6 +40,27 @@ In the case of DataFrame, the indexes must be disjoint but the columns do not ne df2 df1.append(df2) +Appending record-array like DataFrames +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For DataFrames which don't have a meaningful index, you may wish to append them +and ignore the fact that they may have overlapping indexes: + +.. ipython:: python + + df1 = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D']) + df2 = DataFrame(randn(3, 4), columns=['A', 'B', 'C', 'D']) + + df1 + df2 + +To do this, use the ``ignore_index`` argument: + +.. ipython:: python + + df1.append(df2, ignore_index=True) + + Joining / merging DataFrames ---------------------------- @@ -68,8 +89,9 @@ Joining on a key ~~~~~~~~~~~~~~~~ ``join`` takes an optional ``on`` argument which should be a column name in the -calling DataFrame which will be used to "align" the passed DataFrame. This is -best illustrated by example: +calling DataFrame which will be used to "align" the passed DataFrame. The +joining currently aligns the calling DataFrame's column (or columns) on the +passed DataFrame's index. This is best illustrated by example: .. ipython:: python @@ -80,6 +102,44 @@ best illustrated by example: to_join df.join(to_join, on='key') +To join on multiple keys, the passed DataFrame must have a ``MultiIndex``: + +.. ipython:: python + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + to_join = DataFrame(np.random.randn(10, 3), index=index, + columns=['j_one', 'j_two', 'j_three']) + + # a little relevant example with NAs + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + data = DataFrame({'key1' : key1, 'key2' : key2, + 'data' : data}) + data + to_join + + +.. ipython:: python + + data.join(to_join, on=['key1', 'key2']) + +This is by default a "many-to-one" or "VLOOKUP"-style left join operation. An +inner join is also supported: + +.. ipython:: python + + data.join(to_join, on=['key1', 'key2'], how='inner') + +This drops any rows where there was no match. 
+ Merging ordered records ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index c4df50b169f2b..272ad9ec64397 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -64,13 +64,15 @@ not ``NaN``, I think you will find this is a worthwhile trade-off (Zen of Python: "practicality beats purity"). To make detecting missing values easier (and across different array dtypes), -pandas provides the ``isnull`` and ``notnull`` functions: +pandas provides the :func:`~pandas.core.common.isnull` and +:func:`~pandas.core.common.notnull` functions, which are also methods on +``Series`` objects: .. ipython:: python df2['one'] isnull(df2['one']) - notnull(df2['four']) + df2['four'].notnull() **Summary:** ``NaN``, ``inf``, ``-inf``, and ``None`` (in object arrays) are all considered missing by the ``isnull`` and ``notnull`` functions. @@ -125,8 +127,6 @@ Cleaning / filling missing data pandas objects are equipped with various data manipulation methods for dealing with missing data. -dropna: - .. _missing_data.fillna: Filling missing values: fillna @@ -165,6 +165,8 @@ To remind you, these are the available filling methods: With time series data, using pad/ffill is extremely common so that the "last known value" is available at every time point. +.. _missing_data.dropna: + Dropping axis labels with missing data: dropna ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index c1388edc789e2..c2227511d9d40 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -11,15 +11,9 @@ randn = np.random.randn np.set_printoptions(precision=4, suppress=True) -*************************** -Pivoting and reshaping data -*************************** - -.. note:: - - Since some of the functionality documented in this section is very new, the - user should keep an eye on any changes to the API or behavior which may - occur by the next release. +************************** +Reshaping and Pivot Tables +************************** Reshaping by pivoting DataFrame objects --------------------------------------- @@ -119,7 +113,7 @@ take a prior example data set from the hierarchical indexing section: 'foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]) - index = MultiIndex.from_tuples(tuples) + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) df = DataFrame(randn(8, 2), index=index, columns=['A', 'B']) df2 = df[:4] df2 @@ -148,6 +142,13 @@ unstacks the **last level**: stacked.unstack(1) stacked.unstack(0) +If the indexes have names, you can use the level names instead of specifying +the level numbers: + +.. ipython:: python + + stacked.unstack('second') + These functions are very intelligent about handling missing data and do not expect each subgroup within the hierarchical index to have the same set of labels. They also can handle the index being unsorted (but you can make it @@ -156,7 +157,8 @@ sorted by calling ``sortlevel``, of course). Here is a more complex example: .. ipython:: python columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), - ('B', 'cat'), ('A', 'dog')]) + ('B', 'cat'), ('A', 'dog')], + names=['exp', 'animal']) df = DataFrame(randn(8, 4), index=index, columns=columns) df2 = df.ix[[0, 1, 2, 4, 5, 7]] df2 @@ -166,8 +168,8 @@ which level in the columns to stack: .. 
ipython:: python - df2.stack(1) - df2.stack(0) + df2.stack('exp') + df2.stack('animal') Unstacking when the columns are a ``MultiIndex`` is also careful about doing the right thing: @@ -195,3 +197,52 @@ some very expressive and fast data manipulations. df.stack().groupby(level=1).mean() df.mean().unstack(0) + + +********************************** +Pivot tables and cross-tabulations +********************************** + +The function ``pandas.pivot_table`` can be used to create spreadsheet-style pivot +tables. It takes a number of arguments + +- ``data``: A DataFrame object +- ``values``: column to aggregate +- ``rows``: list of columns to group by on the table rows +- ``cols``: list of columns to group by on the table columns +- ``aggfunc``: function to use for aggregation, defaulting to ``numpy.mean`` + +Consider a data set like this: + +.. ipython:: python + + df = DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3, + 'B' : ['A', 'B', 'C'] * 4, + 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2, + 'D' : np.random.randn(12), + 'E' : np.random.randn(12)}) + df + +We can produce pivot tables from this data very easily: + +.. ipython:: python + + pivot_table(df, values='D', rows=['A', 'B'], cols=['C']) + pivot_table(df, values='D', rows=['B'], cols=['A', 'C'], aggfunc=np.sum) + +The result object is a DataFrame having potentially hierarchical indexes on the +rows and columns. If the ``values`` column name is not given, the pivot table +will include all of the data that can be aggregated in an additional level of +hierarchy in the columns: + +.. ipython:: python + + pivot_table(df, rows=['A', 'B'], cols=['C']) + +You can render a nice output of the table omitting the missing values by +calling ``to_string`` if you wish: + +.. ipython:: python + + table = pivot_table(df, rows=['A', 'B'], cols=['C']) + print table.to_string(na_rep='') diff --git a/doc/source/stats.rst b/doc/source/stats.rst index 037bd9734ca3c..8ed0de04f740f 100755 --- a/doc/source/stats.rst +++ b/doc/source/stats.rst @@ -200,7 +200,7 @@ Let's pull in some sample data: data = dict((sym, DataReader(sym, "yahoo")) for sym in symbols) panel = Panel(data).swapaxes('items', 'minor') - close_px = panel['close'] + close_px = panel['Close'] # convert closing prices to returns rets = close_px / close_px.shift(1) - 1 @@ -289,7 +289,7 @@ actually quite easy: .. ipython:: python # make the units somewhat comparable - volume = panel['volume'] / 1e8 + volume = panel['Volume'] / 1e8 model = ols(y=volume, x={'return' : np.abs(rets)}) model diff --git a/doc/sphinxext/autosummary.py b/doc/sphinxext/autosummary.py deleted file mode 100755 index 2f8a00a3035c5..0000000000000 --- a/doc/sphinxext/autosummary.py +++ /dev/null @@ -1,349 +0,0 @@ -""" -=========== -autosummary -=========== - -Sphinx extension that adds an autosummary:: directive, which can be -used to generate function/method/attribute/etc. summary lists, similar -to those output eg. by Epydoc and other API doc generation tools. - -An :autolink: role is also provided. - -autosummary directive ---------------------- - -The autosummary directive has the form:: - - .. autosummary:: - :nosignatures: - :toctree: generated/ - - module.function_1 - module.function_2 - ... - -and it generates an output table (containing signatures, optionally) - - ======================== ============================================= - module.function_1(args) Summary line from the docstring of function_1 - module.function_2(args) Summary line from the docstring - ... 
- ======================== ============================================= - -If the :toctree: option is specified, files matching the function names -are inserted to the toctree with the given prefix: - - generated/module.function_1 - generated/module.function_2 - ... - -Note: The file names contain the module:: or currentmodule:: prefixes. - -.. seealso:: autosummary_generate.py - - -autolink role -------------- - -The autolink role functions as ``:obj:`` when the name referred can be -resolved to a Python object, and otherwise it becomes simple emphasis. -This can be used as the default role to make links 'smart'. - -""" -import sys, os, posixpath, re - -from docutils.parsers.rst import directives -from docutils.statemachine import ViewList -from docutils import nodes - -import sphinx.addnodes, sphinx.roles -from sphinx.util import patfilter - -from docscrape_sphinx import get_doc_object - -import warnings -warnings.warn( - "The numpydoc.autosummary extension can also be found as " - "sphinx.ext.autosummary in Sphinx >= 0.6, and the version in " - "Sphinx >= 0.7 is superior to the one in numpydoc. This numpydoc " - "version of autosummary is no longer maintained.", - DeprecationWarning, stacklevel=2) - -def setup(app): - app.add_directive('autosummary', autosummary_directive, True, (0, 0, False), - toctree=directives.unchanged, - nosignatures=directives.flag) - app.add_role('autolink', autolink_role) - - app.add_node(autosummary_toc, - html=(autosummary_toc_visit_html, autosummary_toc_depart_noop), - latex=(autosummary_toc_visit_latex, autosummary_toc_depart_noop)) - app.connect('doctree-read', process_autosummary_toc) - -#------------------------------------------------------------------------------ -# autosummary_toc node -#------------------------------------------------------------------------------ - -class autosummary_toc(nodes.comment): - pass - -def process_autosummary_toc(app, doctree): - """ - Insert items described in autosummary:: to the TOC tree, but do - not generate the toctree:: list. - - """ - env = app.builder.env - crawled = {} - def crawl_toc(node, depth=1): - crawled[node] = True - for j, subnode in enumerate(node): - try: - if (isinstance(subnode, autosummary_toc) - and isinstance(subnode[0], sphinx.addnodes.toctree)): - env.note_toctree(env.docname, subnode[0]) - continue - except IndexError: - continue - if not isinstance(subnode, nodes.section): - continue - if subnode not in crawled: - crawl_toc(subnode, depth+1) - crawl_toc(doctree) - -def autosummary_toc_visit_html(self, node): - """Hide autosummary toctree list in HTML output""" - raise nodes.SkipNode - -def autosummary_toc_visit_latex(self, node): - """Show autosummary toctree (= put the referenced pages here) in Latex""" - pass - -def autosummary_toc_depart_noop(self, node): - pass - -#------------------------------------------------------------------------------ -# .. autosummary:: -#------------------------------------------------------------------------------ - -def autosummary_directive(dirname, arguments, options, content, lineno, - content_offset, block_text, state, state_machine): - """ - Pretty table containing short signatures and summaries of functions etc. - - autosummary also generates a (hidden) toctree:: node. 
- - """ - - names = [] - names += [x.strip().split()[0] for x in content - if x.strip() and re.search(r'^[a-zA-Z_]', x.strip()[0])] - - table, warnings, real_names = get_autosummary(names, state, - 'nosignatures' in options) - node = table - - env = state.document.settings.env - suffix = env.config.source_suffix - all_docnames = env.found_docs.copy() - dirname = posixpath.dirname(env.docname) - - if 'toctree' in options: - tree_prefix = options['toctree'].strip() - docnames = [] - for name in names: - name = real_names.get(name, name) - - docname = tree_prefix + name - if docname.endswith(suffix): - docname = docname[:-len(suffix)] - docname = posixpath.normpath(posixpath.join(dirname, docname)) - if docname not in env.found_docs: - warnings.append(state.document.reporter.warning( - 'toctree references unknown document %r' % docname, - line=lineno)) - docnames.append(docname) - - tocnode = sphinx.addnodes.toctree() - tocnode['includefiles'] = docnames - tocnode['maxdepth'] = -1 - tocnode['glob'] = None - tocnode['entries'] = [(None, docname) for docname in docnames] - - tocnode = autosummary_toc('', '', tocnode) - return warnings + [node] + [tocnode] - else: - return warnings + [node] - -def get_autosummary(names, state, no_signatures=False): - """ - Generate a proper table node for autosummary:: directive. - - Parameters - ---------- - names : list of str - Names of Python objects to be imported and added to the table. - document : document - Docutils document object - - """ - document = state.document - - real_names = {} - warnings = [] - - prefixes = [''] - prefixes.insert(0, document.settings.env.currmodule) - - table = nodes.table('') - group = nodes.tgroup('', cols=2) - table.append(group) - group.append(nodes.colspec('', colwidth=10)) - group.append(nodes.colspec('', colwidth=90)) - body = nodes.tbody('') - group.append(body) - - def append_row(*column_texts): - row = nodes.row('') - for text in column_texts: - node = nodes.paragraph('') - vl = ViewList() - vl.append(text, '') - state.nested_parse(vl, 0, node) - try: - if isinstance(node[0], nodes.paragraph): - node = node[0] - except IndexError: - pass - row.append(nodes.entry('', node)) - body.append(row) - - for name in names: - try: - obj, real_name = import_by_name(name, prefixes=prefixes) - except ImportError: - warnings.append(document.reporter.warning( - 'failed to import %s' % name)) - append_row(":obj:`%s`" % name, "") - continue - - real_names[name] = real_name - - doc = get_doc_object(obj) - - if doc['Summary']: - title = " ".join(doc['Summary']) - else: - title = "" - - col1 = u":obj:`%s <%s>`" % (name, real_name) - if doc['Signature']: - sig = re.sub('^[^(\[]*', '', doc['Signature'].strip()) - if '=' in sig: - # abbreviate optional arguments - sig = re.sub(r', ([a-zA-Z0-9_]+)=', r'[, \1=', sig, count=1) - sig = re.sub(r'\(([a-zA-Z0-9_]+)=', r'([\1=', sig, count=1) - sig = re.sub(r'=[^,)]+,', ',', sig) - sig = re.sub(r'=[^,)]+\)$', '])', sig) - # shorten long strings - sig = re.sub(r'(\[.{16,16}[^,]*?),.*?\]\)', r'\1, ...])', sig) - else: - sig = re.sub(r'(\(.{16,16}[^,]*?),.*?\)', r'\1, ...)', sig) - # make signature contain non-breaking spaces - col1 += u"\\ \u00a0" + unicode(sig).replace(u" ", u"\u00a0") - col2 = title - append_row(col1, col2) - - return table, warnings, real_names - -def import_by_name(name, prefixes=[None]): - """ - Import a Python object that has the given name, under one of the prefixes. - - Parameters - ---------- - name : str - Name of a Python object, eg. 
'numpy.ndarray.view' - prefixes : list of (str or None), optional - Prefixes to prepend to the name (None implies no prefix). - The first prefixed name that results to successful import is used. - - Returns - ------- - obj - The imported object - name - Name of the imported object (useful if `prefixes` was used) - - """ - for prefix in prefixes: - try: - if prefix: - prefixed_name = '.'.join([prefix, name]) - else: - prefixed_name = name - return _import_by_name(prefixed_name), prefixed_name - except ImportError: - pass - raise ImportError - -def _import_by_name(name): - """Import a Python object given its full name""" - try: - # try first interpret `name` as MODNAME.OBJ - name_parts = name.split('.') - try: - modname = '.'.join(name_parts[:-1]) - __import__(modname) - return getattr(sys.modules[modname], name_parts[-1]) - except (ImportError, IndexError, AttributeError): - pass - - # ... then as MODNAME, MODNAME.OBJ1, MODNAME.OBJ1.OBJ2, ... - last_j = 0 - modname = None - for j in reversed(range(1, len(name_parts)+1)): - last_j = j - modname = '.'.join(name_parts[:j]) - try: - __import__(modname) - except ImportError: - continue - if modname in sys.modules: - break - - if last_j < len(name_parts): - obj = sys.modules[modname] - for obj_name in name_parts[last_j:]: - obj = getattr(obj, obj_name) - return obj - else: - return sys.modules[modname] - except (ValueError, ImportError, AttributeError, KeyError), e: - raise ImportError(e) - -#------------------------------------------------------------------------------ -# :autolink: (smart default role) -#------------------------------------------------------------------------------ - -def autolink_role(typ, rawtext, etext, lineno, inliner, - options={}, content=[]): - """ - Smart linking role. - - Expands to ":obj:`text`" if `text` is an object that can be imported; - otherwise expands to "*text*". - """ - r = sphinx.roles.xfileref_role('obj', rawtext, etext, lineno, inliner, - options, content) - pnode = r[0][0] - - prefixes = [None] - #prefixes.insert(0, inliner.document.settings.env.currmodule) - try: - obj, name = import_by_name(pnode['reftarget'], prefixes) - except ImportError: - content = pnode[0] - r[0][0] = nodes.emphasis(rawtext, content[0].astext(), - classes=content['classes']) - return r diff --git a/doc/sphinxext/autosummary_generate.py b/doc/sphinxext/autosummary_generate.py deleted file mode 100755 index a327067488a7c..0000000000000 --- a/doc/sphinxext/autosummary_generate.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env python -r""" -autosummary_generate.py OPTIONS FILES - -Generate automatic RST source files for items referred to in -autosummary:: directives. - -Each generated RST file contains a single auto*:: directive which -extracts the docstring of the referred item. 
- -Example Makefile rule:: - - generate: - ./ext/autosummary_generate.py -o source/generated source/*.rst - -""" -import glob, re, inspect, os, optparse, pydoc -from autosummary import import_by_name - -try: - from phantom_import import import_phantom_module -except ImportError: - import_phantom_module = lambda x: x - -def main(): - p = optparse.OptionParser(__doc__.strip()) - p.add_option("-p", "--phantom", action="store", type="string", - dest="phantom", default=None, - help="Phantom import modules from a file") - p.add_option("-o", "--output-dir", action="store", type="string", - dest="output_dir", default=None, - help=("Write all output files to the given directory (instead " - "of writing them as specified in the autosummary:: " - "directives)")) - options, args = p.parse_args() - - if len(args) == 0: - p.error("wrong number of arguments") - - if options.phantom and os.path.isfile(options.phantom): - import_phantom_module(options.phantom) - - # read - names = {} - for name, loc in get_documented(args).items(): - for (filename, sec_title, keyword, toctree) in loc: - if toctree is not None: - path = os.path.join(os.path.dirname(filename), toctree) - names[name] = os.path.abspath(path) - - # write - for name, path in sorted(names.items()): - if options.output_dir is not None: - path = options.output_dir - - if not os.path.isdir(path): - os.makedirs(path) - - try: - obj, name = import_by_name(name) - except ImportError, e: - print "Failed to import '%s': %s" % (name, e) - continue - - fn = os.path.join(path, '%s.rst' % name) - - if os.path.exists(fn): - # skip - continue - - f = open(fn, 'w') - - try: - f.write('%s\n%s\n\n' % (name, '='*len(name))) - - if inspect.isclass(obj): - if issubclass(obj, Exception): - f.write(format_modulemember(name, 'autoexception')) - else: - f.write(format_modulemember(name, 'autoclass')) - elif inspect.ismodule(obj): - f.write(format_modulemember(name, 'automodule')) - elif inspect.ismethod(obj) or inspect.ismethoddescriptor(obj): - f.write(format_classmember(name, 'automethod')) - elif callable(obj): - f.write(format_modulemember(name, 'autofunction')) - elif hasattr(obj, '__get__'): - f.write(format_classmember(name, 'autoattribute')) - else: - f.write(format_modulemember(name, 'autofunction')) - finally: - f.close() - -def format_modulemember(name, directive): - parts = name.split('.') - mod, name = '.'.join(parts[:-1]), parts[-1] - return ".. currentmodule:: %s\n\n.. %s:: %s\n" % (mod, directive, name) - -def format_classmember(name, directive): - parts = name.split('.') - mod, name = '.'.join(parts[:-2]), '.'.join(parts[-2:]) - return ".. currentmodule:: %s\n\n.. %s:: %s\n" % (mod, directive, name) - -def get_documented(filenames): - """ - Find out what items are documented in source/*.rst - See `get_documented_in_lines`. - - """ - documented = {} - for filename in filenames: - f = open(filename, 'r') - lines = f.read().splitlines() - documented.update(get_documented_in_lines(lines, filename=filename)) - f.close() - return documented - -def get_documented_in_docstring(name, module=None, filename=None): - """ - Find out what items are documented in the given object's docstring. - See `get_documented_in_lines`. 
- - """ - try: - obj, real_name = import_by_name(name) - lines = pydoc.getdoc(obj).splitlines() - return get_documented_in_lines(lines, module=name, filename=filename) - except AttributeError: - pass - except ImportError, e: - print "Failed to import '%s': %s" % (name, e) - return {} - -def get_documented_in_lines(lines, module=None, filename=None): - """ - Find out what items are documented in the given lines - - Returns - ------- - documented : dict of list of (filename, title, keyword, toctree) - Dictionary whose keys are documented names of objects. - The value is a list of locations where the object was documented. - Each location is a tuple of filename, the current section title, - the name of the directive, and the value of the :toctree: argument - (if present) of the directive. - - """ - title_underline_re = re.compile("^[-=*_^#]{3,}\s*$") - autodoc_re = re.compile(".. auto(function|method|attribute|class|exception|module)::\s*([A-Za-z0-9_.]+)\s*$") - autosummary_re = re.compile(r'^\.\.\s+autosummary::\s*') - module_re = re.compile(r'^\.\.\s+(current)?module::\s*([a-zA-Z0-9_.]+)\s*$') - autosummary_item_re = re.compile(r'^\s+([_a-zA-Z][a-zA-Z0-9_.]*)\s*.*?') - toctree_arg_re = re.compile(r'^\s+:toctree:\s*(.*?)\s*$') - - documented = {} - - current_title = [] - last_line = None - toctree = None - current_module = module - in_autosummary = False - - for line in lines: - try: - if in_autosummary: - m = toctree_arg_re.match(line) - if m: - toctree = m.group(1) - continue - - if line.strip().startswith(':'): - continue # skip options - - m = autosummary_item_re.match(line) - if m: - name = m.group(1).strip() - if current_module and not name.startswith(current_module + '.'): - name = "%s.%s" % (current_module, name) - documented.setdefault(name, []).append( - (filename, current_title, 'autosummary', toctree)) - continue - if line.strip() == '': - continue - in_autosummary = False - - m = autosummary_re.match(line) - if m: - in_autosummary = True - continue - - m = autodoc_re.search(line) - if m: - name = m.group(2).strip() - if m.group(1) == "module": - current_module = name - documented.update(get_documented_in_docstring( - name, filename=filename)) - elif current_module and not name.startswith(current_module+'.'): - name = "%s.%s" % (current_module, name) - documented.setdefault(name, []).append( - (filename, current_title, "auto" + m.group(1), None)) - continue - - m = title_underline_re.match(line) - if m and last_line: - current_title = last_line.strip() - continue - - m = module_re.match(line) - if m: - current_module = m.group(2) - continue - finally: - last_line = line - - return documented - -if __name__ == "__main__": - main() diff --git a/pandas/__init__.py b/pandas/__init__.py index be0388908b3cb..7d5ecf84fddf0 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -6,13 +6,25 @@ import numpy as np +try: + import pandas._tseries as lib +except Exception, e: # pragma: no cover + if 'No module named' in e.message: + raise ImportError('C extensions not built: if you installed already ' + 'verify that you are not importing from the source ' + 'directory') + else: + raise + from pandas.version import version as __version__ from pandas.info import __doc__ from pandas.core.api import * from pandas.core.common import set_printoptions +from pandas.core.common import set_eng_float_format from pandas.io.parsers import read_csv, read_table, ExcelFile from pandas.io.pytables import HDFStore from pandas.stats.api import * from pandas.util.testing import debug +from pandas.tools.pivot 
import pivot_table diff --git a/pandas/core/api.py b/pandas/core/api.py index c6552e2df3caf..3b4e2c4e50cce 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -5,7 +5,7 @@ from pandas.core.datetools import DateOffset import pandas.core.datetools as datetools -from pandas.core.common import isnull, notnull, set_printoptions +from pandas.core.common import isnull, notnull, set_printoptions, save, load from pandas.core.index import Index, Int64Index, Factor, MultiIndex from pandas.core.daterange import DateRange from pandas.core.series import Series, TimeSeries @@ -17,3 +17,4 @@ DataMatrix = DataFrame WidePanel = Panel + diff --git a/pandas/core/common.py b/pandas/core/common.py index ea2aea1ef7e6b..329648855b362 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1,20 +1,26 @@ """ Misc tools for implementing data structures """ +import cPickle try: from io import BytesIO -except ImportError: # Python < 2.6 +except ImportError: # pragma: no cover + # Python < 2.6 from cStringIO import StringIO as BytesIO import itertools from numpy.lib.format import read_array, write_array import numpy as np +import decimal +import math + import pandas._tseries as lib # XXX: HACK for NumPy 1.5.1 to suppress warnings try: np.seterr(all='ignore') + np.set_printoptions(suppress=True) except Exception: # pragma: no cover pass @@ -352,14 +358,117 @@ def set_printoptions(precision=None, column_space=None): if column_space is not None: _column_space = column_space +class EngFormatter(object): + """ + Formats float values according to engineering format. + + Based on matplotlib.ticker.EngFormatter + """ + + # The SI engineering prefixes + ENG_PREFIXES = { + -24: "y", + -21: "z", + -18: "a", + -15: "f", + -12: "p", + -9: "n", + -6: "u", + -3: "m", + 0: "", + 3: "k", + 6: "M", + 9: "G", + 12: "T", + 15: "P", + 18: "E", + 21: "Z", + 24: "Y" + } + + def __init__(self, precision=None, use_eng_prefix=False): + self.precision = precision + self.use_eng_prefix = use_eng_prefix + + def __call__(self, num): + """ Formats a number in engineering notation, appending a letter + representing the power of 1000 of the original number. 
Some examples: + + >>> format_eng(0) # for self.precision = 0 + '0' + + >>> format_eng(1000000) # for self.precision = 1, + # self.use_eng_prefix = True + '1.0M' + + >>> format_eng("-1e-6") # for self.precision = 2 + # self.use_eng_prefix = False + '-1.00E-06' + + @param num: the value to represent + @type num: either a numeric value or a string that can be converted to + a numeric value (as per decimal.Decimal constructor) + + @return: engineering formatted string + """ + + dnum = decimal.Decimal(str(num)) + + sign = 1 + + if dnum < 0: # pragma: no cover + sign = -1 + dnum = -dnum + + if dnum != 0: + pow10 = decimal.Decimal(int(math.floor(dnum.log10()/3)*3)) + else: + pow10 = decimal.Decimal(0) + + pow10 = pow10.min(max(self.ENG_PREFIXES.keys())) + pow10 = pow10.max(min(self.ENG_PREFIXES.keys())) + int_pow10 = int(pow10) + + if self.use_eng_prefix: + prefix = self.ENG_PREFIXES[int_pow10] + else: + if int_pow10 < 0: + prefix = 'E-%02d' % (-int_pow10) + else: + prefix = 'E+%02d' % int_pow10 + + mant = sign*dnum/(10**pow10) + + if self.precision is None: # pragma: no cover + format_str = u"%g%s" + elif self.precision == 0: + format_str = u"%i%s" + elif self.precision > 0: + format_str = (u"%%.%if%%s" % self.precision) + + formatted = format_str % (mant, prefix) + + return formatted.strip() + +def set_eng_float_format(precision=3, use_eng_prefix=False): + """ + Alter default behavior on how float is formatted in DataFrame. + Format float in engineering format. + + See also EngFormatter. + """ + global _float_format, _column_space + _float_format = EngFormatter(precision, use_eng_prefix) + _column_space = max(12, precision + 9) + _float_format = lambda x: '%.4g' % x _column_space = 12 -def _pfixed(s, space, nanRep=None, float_format=None): +def _pfixed(s, space, na_rep=None, float_format=None): if isinstance(s, float): - if nanRep is not None and isnull(s): + if na_rep is not None and isnull(s): if np.isnan(s): - s = nanRep + s = na_rep return (' %s' % s).ljust(space) if float_format: @@ -385,11 +494,11 @@ def _stringify(col): else: return '%s' % col -def _format(s, nanRep=None, float_format=None): +def _format(s, na_rep=None, float_format=None): if isinstance(s, float): - if nanRep is not None and isnull(s): + if na_rep is not None and isnull(s): if np.isnan(s): - s = nanRep + s = na_rep return ' %s' % s if float_format: @@ -484,7 +593,8 @@ def __init__(self, seq, key=lambda x:x): self.setdefault(k, []).append(value) try: __iter__ = dict.iteritems - except AttributeError: # Python 3 + except AttributeError: # pragma: no cover + # Python 3 def __iter__(self): return iter(dict.items(self)) @@ -514,11 +624,14 @@ def intersection(*seqs): result &= seq return type(seqs[0])(list(result)) -def _asarray_tuplesafe(values): - if not isinstance(values, (list, np.ndarray)): +def _asarray_tuplesafe(values, dtype=None): + if not isinstance(values, (list, tuple, np.ndarray)): values = list(values) - result = np.asarray(values) + if isinstance(values, list) and dtype in [np.object_, object]: + return lib.list_to_object_array(values) + + result = np.asarray(values, dtype=dtype) if issubclass(result.dtype.type, basestring): result = np.asarray(values, dtype=object) @@ -528,3 +641,41 @@ def _asarray_tuplesafe(values): result[:] = values return result + + +def save(obj, path): + """ + Pickle (serialize) object to input file path + + Parameters + ---------- + obj : any object + path : string + File path + """ + f = open(path, 'wb') + try: + cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) + finally: + 
f.close() + + +def load(path): + """ + Load pickled pandas object (or any other pickled object) from the specified + file path + + Parameters + ---------- + path : string + File path + + Returns + ------- + unpickled : type of object stored in file + """ + f = open(path, 'rb') + try: + return cPickle.load(f) + finally: + f.close() diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6338492558254..2c18da8e56428 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12,15 +12,16 @@ # pylint: disable=E1101,E1103 # pylint: disable=W0212,W0231,W0703,W0622 +from itertools import izip from StringIO import StringIO import csv import operator -import warnings +import sys from numpy import nan import numpy as np -from pandas.core.common import (isnull, notnull, PandasError, +from pandas.core.common import (isnull, notnull, PandasError, adjoin, _try_sort, _pfixed, _default_index, _infer_dtype, _stringify, _maybe_upcast) from pandas.core.daterange import DateRange @@ -29,11 +30,10 @@ from pandas.core.indexing import _NDFrameIndexer, _maybe_droplevels from pandas.core.internals import BlockManager, make_block, form_blocks from pandas.core.series import Series, _is_bool_indexer -from pandas.util.decorators import deprecate from pandas.util import py3compat import pandas.core.common as common import pandas.core.datetools as datetools -import pandas._tseries as _tseries +import pandas._tseries as lib #---------------------------------------------------------------------- # Factory helper methods @@ -90,6 +90,7 @@ def f(self, other): return f + #---------------------------------------------------------------------- # DataFrame class @@ -133,7 +134,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, >>> df = DataFrame(data=d, index=index) >>> df2 = DataFrame(np.random.randn(10, 5)) >>> df3 = DataFrame(np.random.randn(10, 5), - columns=['a', 'b', 'c', 'd', 'e']) + ... columns=['a', 'b', 'c', 'd', 'e']) """ if data is None: @@ -276,11 +277,10 @@ def __iter__(self): def iteritems(self): """Iterator over (column, series) pairs""" - series = self._series - return ((k, series[k]) for k in self.columns) + return ((k, self[k]) for k in self.columns) iterkv = iteritems - if py3compat.PY3: + if py3compat.PY3: # pragma: no cover items = iteritems def __len__(self): @@ -353,8 +353,7 @@ def to_dict(self): return dict((k, v.to_dict()) for k, v in self.iteritems()) @classmethod - def from_records(cls, data, index=None, indexField=None, - exclude=None): + def from_records(cls, data, index=None, exclude=None): """ Convert structured or record ndarray to DataFrame @@ -369,11 +368,6 @@ def from_records(cls, data, index=None, indexField=None, ------- df : DataFrame """ - if indexField is not None: # pragma: no cover - warnings.warn("indexField argument is deprecated. Use index " - "instead", FutureWarning) - index = indexField - columns, sdict = _rec_to_dict(data) if exclude is None: @@ -429,7 +423,8 @@ def to_records(self, index=True): return np.rec.fromarrays(arrays, names=names) @classmethod - def from_csv(cls, path, header=0, delimiter=',', index_col=0): + def from_csv(cls, path, header=0, sep=',', index_col=0, + parse_dates=True): """ Read delimited file into DataFrame @@ -438,24 +433,27 @@ def from_csv(cls, path, header=0, delimiter=',', index_col=0): path : string header : int, default 0 Row to use at header (skip prior rows) - delimiter : string, default ',' + sep : string, default ',' + Field delimiter index_col : int or sequence, default 0 Column to use for index. 
If a sequence is given, a MultiIndex - is used. + is used. Different default from read_table + parse_dates : boolean, default True + Parse dates. Different default from read_table Notes ----- - Will attempt to convert index to datetimes for time series - data. Use read_csv for more options + Preferable to use read_table for most general purposes but from_csv + makes for an easy roundtrip to and from file, especially with a + DataFrame of time series data Returns ------- - y : DataFrame or DataFrame + y : DataFrame """ from pandas.io.parsers import read_table - df = read_table(path, header=header, sep=delimiter, - index_col=index_col) - return df + return read_table(path, header=header, sep=sep, + parse_dates=parse_dates, index_col=index_col) def to_sparse(self, fill_value=None, kind='block'): """ @@ -475,8 +473,8 @@ def to_sparse(self, fill_value=None, kind='block'): default_kind=kind, default_fill_value=fill_value) - def to_csv(self, path, nanRep='', cols=None, header=True, - index=True, index_label=None, mode='w'): + def to_csv(self, path, sep=",", na_rep='', cols=None, header=True, + index=True, index_label=None, mode='w', nanRep=None): """ Write DataFrame to a comma-separated values (csv) file @@ -487,6 +485,7 @@ def to_csv(self, path, nanRep='', cols=None, header=True, nanRep : string, default '' Missing data rep'n cols : sequence, optional + Columns to write header : boolean, default True Write out column names index : boolean, default True @@ -496,9 +495,17 @@ def to_csv(self, path, nanRep='', cols=None, header=True, `header` and `index` are True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. mode : Python write mode, default 'w' + sep : character, default "," + Field delimiter for the output file. 
""" f = open(path, mode) - csvout = csv.writer(f, lineterminator='\n') + csvout = csv.writer(f, lineterminator='\n', delimiter=sep) + + if nanRep is not None: # pragma: no cover + import warnings + warnings.warn("nanRep is deprecated, use na_rep", + FutureWarning) + na_rep = nanRep if cols is None: cols = self.columns @@ -515,10 +522,11 @@ def to_csv(self, path, nanRep='', cols=None, header=True, name = 'level_%d' % i index_label.append(name) else: - if self.index.name is None: - index_label = self.index.name - if index_label is None: - index_label = ['index'] + index_label = self.index.name + if index_label is None: + index_label = ['index'] + else: + index_label = [index_label] elif not isinstance(index_label, (list, tuple, np.ndarray)): # given a string for a DF with Index index_label = [index_label] @@ -537,7 +545,7 @@ def to_csv(self, path, nanRep='', cols=None, header=True, for i, col in enumerate(cols): val = series[col].get(idx) if isnull(val): - val = nanRep + val = na_rep row_fields.append(val) @@ -546,77 +554,26 @@ def to_csv(self, path, nanRep='', cols=None, header=True, f.close() def to_string(self, buf=None, columns=None, colSpace=None, - nanRep='NaN', formatters=None, float_format=None, - sparsify=True): - from pandas.core.common import _format, adjoin - import sys - - if buf is None: # pragma: no cover - buf = sys.stdout - - if colSpace is None: - def _myformat(v): - return _format(v, nanRep=nanRep, - float_format=float_format) - else: - def _myformat(v): - return _pfixed(v, colSpace, nanRep=nanRep, - float_format=float_format) - - if formatters is None: - formatters = {} - - def _format_col(col): - formatter = formatters.get(col, _myformat) - return [formatter(x) for x in self[col]] + na_rep='NaN', formatters=None, float_format=None, + sparsify=True, nanRep=None, index_names=True): - if columns is None: - columns = self.columns - else: - columns = [c for c in columns if c in self] - to_write = [] + if nanRep is not None: # pragma: no cover + import warnings + warnings.warn("nanRep is deprecated, use na_rep", + FutureWarning) + na_rep = nanRep - if len(columns) == 0 or len(self.index) == 0: - to_write.append('Empty %s' % type(self).__name__) - to_write.append(repr(self.index)) - else: - (str_index, - str_columns) = self._get_formatted_labels(sparsify=sparsify) - stringified = [str_columns[i] + _format_col(c) - for i, c in enumerate(columns)] - to_write.append(adjoin(1, str_index, *stringified)) - for s in to_write: - if isinstance(s, unicode): - to_write = [unicode(s) for s in to_write] - break + formatter = _DataFrameFormatter(self, buf=buf, columns=columns, + col_space=colSpace, na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + index_names=index_names) - for s in to_write: - print >> buf, s - - def _get_formatted_labels(self, sparsify=True): - from pandas.core.index import _sparsify - - if isinstance(self.index, MultiIndex): - fmt_index = self.index.format(sparsify=sparsify) - else: - fmt_index = self.index.format() - - if isinstance(self.columns, MultiIndex): - fmt_columns = self.columns.format(sparsify=False, adjoin=False) - str_columns = zip(*[[' %s' % y for y in x] - for x in zip(*fmt_columns)]) - if sparsify: - str_columns = _sparsify(str_columns) - - str_columns = [list(x) for x in zip(*str_columns)] - str_index = [''] * self.columns.nlevels + fmt_index - else: - str_columns = [[' %s' % x] for x in self.columns.format()] - str_index = [''] + fmt_index - - return str_index, str_columns + if buf is None: + return 
formatter.buf.getvalue() def info(self, verbose=True, buf=None): """ @@ -628,7 +585,6 @@ def info(self, verbose=True, buf=None): If False, don't print column count summary buf : writable buffer, defaults to sys.stdout """ - import sys if buf is None: # pragma: no cover buf = sys.stdout @@ -686,11 +642,23 @@ def _get_columns(self): def _set_columns(self, value): self._data.set_axis(0, value) - self._series_cache.clear() + self._clear_caches() columns = property(fset=_set_columns, fget=_get_columns) - # reference underlying BlockManager - index = AxisProperty(1) + def _get_index(self): + return self._data.axes[1] + + def _set_index(self, value): + self._data.set_axis(1, value) + self._clear_caches() + index = property(fset=_set_index, fget=_get_index) + + def _clear_caches(self): + self._series_cache.clear() + + def _consolidate_inplace(self): + self._clear_caches() + NDFrame._consolidate_inplace(self) def as_matrix(self, columns=None): """ @@ -772,25 +740,6 @@ def _unpickle_matrix_compat(self, state): # pragma: no cover self._data = dm._data - #---------------------------------------------------------------------- - # Private helper methods - - def _intersect_index(self, other): - common_index = self.index - - if not common_index.equals(other.index): - common_index = common_index.intersection(other.index) - - return common_index - - def _intersect_columns(self, other): - common_cols = self.columns - - if not common_cols.equals(other.columns): - common_cols = common_cols.intersection(other.columns) - - return common_cols - #---------------------------------------------------------------------- # Array interface @@ -810,22 +759,34 @@ def __getitem__(self, key): new_data = self._data.get_slice(key, axis=1) return self._constructor(new_data) # either boolean or fancy integer index - elif isinstance(key, np.ndarray): - if len(key) != len(self.index): - raise ValueError('Item wrong length %d instead of %d!' % - (len(key), len(self.index))) + elif isinstance(key, (np.ndarray, list)): + if isinstance(key, list): + key = np.array(key, dtype=object) # also raises Exception if object array with NA values if _is_bool_indexer(key): key = np.asarray(key, dtype=bool) - - new_index = self.index[key] - return self.reindex(new_index) + return self._getitem_array(key) elif isinstance(self.columns, MultiIndex): return self._getitem_multilevel(key) else: return self._getitem_single(key) + def _getitem_array(self, key): + if key.dtype == np.bool_: + if len(key) != len(self.index): + raise ValueError('Item wrong length %d instead of %d!' % + (len(key), len(self.index))) + + new_index = self.index[key] + return self.reindex(new_index) + else: + indexer = self.columns.get_indexer(key) + mask = indexer == -1 + if mask.any(): + raise Exception("No column(s) named: %s" % str(key[mask])) + return self.reindex(columns=key) + def _slice(self, slobj, axis=0): if axis == 0: mgr_axis = 1 @@ -861,6 +822,14 @@ def _getitem_single(self, key): self._series_cache[key] = res return res + def __getattr__(self, name): + """After regular attribute access, try looking up the name of a column. + This allows simpler access to columns for interactive use.""" + if name in self.columns: + return self[name] + raise AttributeError("'%s' object has no attribute '%s'" % \ + (type(self).__name__, name)) + def __setitem__(self, key, value): # support boolean setting with DataFrame input, e.g. 
# df[df > df2] = 0 @@ -1001,16 +970,66 @@ def xs(self, key, axis=0, copy=True): return data self._consolidate_inplace() - new_data = self._data.xs(key, axis=1, copy=copy) - if new_data.ndim == 1: - return Series(new_data.as_matrix(), index=self.columns, name=key) + loc = self.index.get_loc(key) + if np.isscalar(loc): + new_values = self._data.fast_2d_xs(loc, copy=copy) + return Series(new_values, index=self.columns, name=key) else: + new_data = self._data.xs(key, axis=1, copy=copy) result = DataFrame(new_data) result.index = _maybe_droplevels(result.index, key) return result #---------------------------------------------------------------------- - # Reindexing + # Reindexing and alignment + + def align(self, other, join='outer', copy=True): + """ + Align two DataFrame object on their index and columns with the specified + join method for each axis Index + + Parameters + ---------- + other : DataFrame + join : {'outer', 'inner', 'left', 'right'}, default 'outer' + + Returns + ------- + (left, right) : (Series, Series) + Aligned Series + """ + if self.index.equals(other.index): + join_index = self.index + ilidx, iridx = None, None + else: + join_index, ilidx, iridx = self.index.join(other.index, how=join, + return_indexers=True) + + if self.columns.equals(other.columns): + join_columns = self.columns + clidx, cridx = None, None + else: + join_columns, clidx, cridx = self.columns.join(other.columns, + how=join, + return_indexers=True) + + def _align_frame(frame, row_idx, col_idx): + new_data = frame._data + if row_idx is not None: + new_data = new_data.reindex_indexer(join_index, row_idx, axis=1) + + if col_idx is not None: + # TODO: speed up on homogeneous DataFrame objects + new_data = new_data.reindex_items(join_columns) + + if copy and new_data is frame._data: + new_data = new_data.copy() + + return DataFrame(new_data) + + left = _align_frame(self, ilidx, clidx) + right = _align_frame(other, iridx, cridx) + return left, right def reindex(self, index=None, columns=None, method=None, copy=True): """Conform Series to new index with optional filling logic, placing @@ -1094,6 +1113,50 @@ def reindex_like(self, other, method=None, copy=True): return self.reindex(index=other.index, columns=other.columns, method=method, copy=copy) + def set_index(self, col_or_cols, drop=True, inplace=False): + """ + Set the DataFrame index (row labels) using one or more existing + columns. By default yields a new object. 
+ + Parameters + ---------- + col_or_cols : column label or list of column labels + drop : boolean, default True + Delete columns to be used as the new index + inplace : boolean, default False + Modify the DataFrame in place (do not create a new object) + + Returns + ------- + dataframe : DataFrame + """ + cols = col_or_cols + if not isinstance(col_or_cols, (list, tuple)): + cols = [col_or_cols] + + if inplace: + frame = self + + else: + frame = self.copy() + + arrays = [] + for col in cols: + level = frame[col] + if drop: + del frame[col] + arrays.append(level) + + index = MultiIndex.from_arrays(arrays, names=cols) + + if not index._verify_integrity(): + duplicates = index._get_duplicates() + raise Exception('Index has duplicate keys: %s' % duplicates) + + frame.index = index + + return frame + def take(self, indices, axis=0): """ Analogous to ndarray.take, return DataFrame corresponding to requested @@ -1402,27 +1465,14 @@ def rename(self, index=None, columns=None, copy=True): ------- renamed : DataFrame (new object) """ - if isinstance(index, (dict, Series)): - def index_f(x): - if x in index: - return index[x] - else: - return x - else: - index_f = index - - if isinstance(columns, (dict, Series)): - def columns_f(x): - if x in columns: - return columns[x] - else: - return x - else: - columns_f = columns + from pandas.core.series import _get_rename_function if index is None and columns is None: raise Exception('must pass either index or columns') + index_f = _get_rename_function(index) + columns_f = _get_rename_function(columns) + self._consolidate_inplace() result = self.copy(deep=copy) @@ -1437,38 +1487,18 @@ def columns_f(x): def _rename_index_inplace(self, mapper): self._data = self._data.rename_axis(mapper, axis=1) - self._series_cache.clear() + self._clear_caches() def _rename_columns_inplace(self, mapper): self._data = self._data.rename_items(mapper, copydata=False) - self._series_cache.clear() + self._clear_caches() #---------------------------------------------------------------------- # Arithmetic / combination related def _combine_frame(self, other, func, fill_value=None): - new_index = self.index.union(other.index) - - # some shortcuts - if fill_value is None: - if not self and not other: - return self._constructor(index=new_index) - elif not self: - return other * nan - elif not other: - return self * nan - - need_reindex = False - new_columns = self.columns.union(other.columns) - need_reindex = (need_reindex or not new_index.equals(self.index) - or not new_index.equals(other.index)) - need_reindex = (need_reindex or not new_columns.equals(self.columns) - or not new_columns.equals(other.columns)) - - this = self - if need_reindex: - this = self.reindex(index=new_index, columns=new_columns) - other = other.reindex(index=new_index, columns=new_columns) + this, other = self.align(other, join='outer', copy=False) + new_index, new_columns = this.index, this.columns this_vals = this.values other_vals = other.values @@ -1735,6 +1765,11 @@ def stack(self, level=-1, dropna=True): Convert DataFrame to Series with multi-level Index. 
Columns become the second level of the resulting hierarchical index + Parameters + ---------- + level : int or string, default last level + Level to stack, can pass level name + Returns ------- stacked : Series @@ -1748,8 +1783,8 @@ def unstack(self, level=-1): Parameters ---------- - level : int, default last level - Level to unstack + level : int or string, default last level + Level to unstack, can pass level name Examples -------- @@ -1933,7 +1968,7 @@ def apply(self, func, axis=0, broadcast=False): Examples -------- - >>> df.apply(numpy.sqrt) --> DataFrame + >>> df.apply(numpy.sqrt) # returns DataFrame >>> df.apply(numpy.sum, axis=0) # equiv to df.sum(0) >>> df.apply(numpy.sum, axis=1) # equiv to df.sum(1) @@ -1945,7 +1980,7 @@ def apply(self, func, axis=0, broadcast=False): ------- applied : Series or DataFrame """ - if not len(self.columns): + if len(self.columns) == 0 and len(self.index) == 0: return self if isinstance(func, np.ufunc): @@ -1960,26 +1995,29 @@ def apply(self, func, axis=0, broadcast=False): def _apply_standard(self, func, axis): if axis == 0: - target = self - agg_index = self.columns + series_gen = ((c, self[c]) for c in self.columns) + res_index = self.columns + res_columns = self.index elif axis == 1: - target = self.T - agg_index = self.index + res_index = self.index + res_columns = self.columns + series_gen = ((i, Series(v, self.columns)) + for i, v in izip(self.index, self.values)) results = {} - for k in target.columns: - results[k] = func(target[k]) + for k, v in series_gen: + results[k] = func(v) if hasattr(results.values()[0], '__iter__'): - result = self._constructor(data=results, index=target.index, - columns=target.columns) + result = self._constructor(data=results, index=res_columns, + columns=res_index) if axis == 1: result = result.T return result else: - return Series(results, index=agg_index) + return Series(results, index=res_index) def _apply_broadcast(self, func, axis): if axis == 0: @@ -2081,7 +2119,7 @@ def append(self, other, ignore_index=False): new_index = None else: new_index = self.index.append(other.index) - new_index._verify_integrity() + assert(new_index._verify_integrity()) if self.columns.equals(other.columns): return self._append_same_columns(other, new_index) @@ -2132,14 +2170,15 @@ def _get_raw_column(self, col): def join(self, other, on=None, how=None, lsuffix='', rsuffix=''): """ Join columns with other DataFrame either on index or on a key - column + column. Parameters ---------- other : DataFrame Index should be similar to one of the columns in this one on : string, default None - Column name to use, otherwise join on index + Column name to use, otherwise join on index. Just like an Excel + VLOOKUP operation how : {'left', 'right', 'outer', 'inner'} How to handle indexes of the two objects. 
Default: 'left' for joining on index, None otherwise @@ -2156,26 +2195,32 @@ def join(self, other, on=None, how=None, lsuffix='', rsuffix=''): ------- joined : DataFrame """ + if how is None: + how = 'left' if on is not None: - if how is not None: - raise Exception('how parameter is not valid when ' - '*on* specified') - return self._join_on(other, on, lsuffix, rsuffix) + return self._join_on(other, on, how, lsuffix, rsuffix) else: - if how is None: - how = 'left' - return self._join_index(other, how, lsuffix, rsuffix) - def _join_on(self, other, on, lsuffix, rsuffix): + def _join_on(self, other, on, how, lsuffix, rsuffix): + if how not in ('left', 'inner'): # pragma: no cover + raise Exception('Only inner / left joins currently supported') + if isinstance(other, Series): assert(other.name is not None) other = DataFrame({other.name : other}) - if len(other.index) == 0: - return self + if isinstance(on, (list, tuple)): + if len(on) == 1: + join_key = self[on[0]].values + else: + join_key = zip(*[self[k] for k in on]) + join_key = common._asarray_tuplesafe(join_key, + dtype=np.object_) + else: + join_key = self[on].values - new_data = self._data.join_on(other._data, self[on], axis=1, + new_data = self._data.join_on(other._data, join_key, how=how, axis=1, lsuffix=lsuffix, rsuffix=rsuffix) return self._constructor(new_data) @@ -2245,21 +2290,7 @@ def corrwith(self, other, axis=0, drop=False): this = self._get_numeric_data() other = other._get_numeric_data() - com_index = this._intersect_index(other) - com_cols = this._intersect_columns(other) - - # feels hackish - if axis == 0: - result_index = com_index - if not drop: - result_index = this.columns.union(other.columns) - else: - result_index = com_cols - if not drop: - result_index = this.index.union(other.index) - - left = this.reindex(index=com_index, columns=com_cols) - right = other.reindex(index=com_index, columns=com_cols) + left, right = this.align(other, join='inner', copy=False) # mask missing values left = left + right * 0 @@ -2279,6 +2310,8 @@ def corrwith(self, other, axis=0, drop=False): correl = num / dom if not drop: + raxis = 1 if axis == 0 else 0 + result_index = this._get_axis(raxis).union(other._get_axis(raxis)) correl = correl.reindex(result_index) return correl @@ -2331,10 +2364,18 @@ def count(self, axis=0, level=None, numeric_only=False): return self._count_level(level, axis=axis, numeric_only=numeric_only) - y, axis_labels = self._get_agg_data(axis, numeric_only=numeric_only, - copy=False) - mask = notnull(y) - return Series(mask.sum(axis), index=axis_labels) + if numeric_only: + frame = self.ix[:, self._get_numeric_columns()] + else: + frame = self + + result = DataFrame.apply(frame, Series.count, axis=axis) + + # what happens with empty DataFrame + if isinstance(result, DataFrame): + result = Series({}) + + return result def _count_level(self, level, axis=0, numeric_only=False): # TODO: deal with sortedness?? 
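# A minimal, hypothetical sketch (not part of the patch) of the join-on-column
# path rewritten in the hunks above: when `on` names a column, `how` defaults
# to 'left' and each row of the caller looks up matching rows in the other
# frame by that column's values, VLOOKUP-style. The frame contents and the
# names 'key', 'value' and 'group_total' below are illustrative assumptions,
# not taken from the patch.
from pandas import DataFrame

df = DataFrame({'key': ['foo', 'bar', 'foo'], 'value': [1, 2, 3]})
lookup = DataFrame({'group_total': [10., 20.]}, index=['foo', 'bar'])

# every row of df is kept; group_total is pulled in by matching df['key']
# against lookup's index
joined = df.join(lookup, on='key')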
@@ -2426,7 +2467,7 @@ def sum(self, axis=0, numeric_only=False, skipna=True): mask = np.isfinite(y) if skipna: - if not issubclass(y.dtype.type, np.int_): + if not issubclass(y.dtype.type, np.integer): np.putmask(y, -mask, 0) the_sum = y.sum(axis) @@ -2455,7 +2496,7 @@ def min(self, axis=0, skipna=True): min : Series """ values = self.values.copy() - if skipna: + if skipna and not issubclass(values.dtype.type, np.integer): np.putmask(values, -np.isfinite(values), np.inf) return Series(values.min(axis), index=self._get_agg_axis(axis)) @@ -2476,7 +2517,7 @@ def max(self, axis=0, skipna=True): max : Series """ values = self.values.copy() - if skipna: + if skipna and not issubclass(values.dtype.type, np.integer): np.putmask(values, -np.isfinite(values), -np.inf) return Series(values.max(axis), index=self._get_agg_axis(axis)) @@ -2498,7 +2539,7 @@ def prod(self, axis=0, skipna=True): """ y = np.array(self.values, subok=True) if skipna: - if not issubclass(y.dtype.type, np.int_): + if not issubclass(y.dtype.type, np.integer): y[np.isnan(y)] = 1 result = y.prod(axis) count = self.count(axis) @@ -2815,12 +2856,14 @@ def plot(self, subplots=False, sharex=True, sharey=False, use_index=True, x = range(len(self)) for i, col in enumerate(_try_sort(self.columns)): + empty = self[col].count() == 0 + y = self[col].values if not empty else np.zeros(x.shape) if subplots: ax = axes[i] - ax.plot(x, self[col].values, 'k', label=col, **kwds) + ax.plot(x, y, 'k', label=col, **kwds) ax.legend(loc='best') else: - ax.plot(x, self[col].values, label=col, **kwds) + ax.plot(x, y, label=col, **kwds) ax.grid(grid) @@ -2892,123 +2935,144 @@ def combineMult(self, other): """ return self.mul(other, fill_value=1.) - def toDataMatrix(self): # pragma: no cover - warnings.warn("toDataMatrix will disappear in next release " - "as there is no longer a DataMatrix class", - FutureWarning) - return self.copy() - - def rows(self): # pragma: no cover - """Alias for the frame's index""" - warnings.warn("Replace usage of .rows() with .index, will be removed " - "in next release", FutureWarning) - return self.index - - def cols(self): # pragma: no cover - """Return sorted list of frame's columns""" - warnings.warn("Replace usage of .cols() with .columns, will be " - "removed in next release", FutureWarning) - return list(self.columns) - - def asMatrix(self, *args, **kwargs): # pragma: no cover - warnings.warn("asMatrix is deprecated. Use 'as_matrix' or .values " - "instead", FutureWarning) - return self.as_matrix(*args, **kwargs) - @classmethod - def fromRecords(cls, *args, **kwargs): # pragma: no cover - warnings.warn("fromRecords is deprecated. 
Use 'from_records' " - "instead", FutureWarning) - return cls.from_records(*args, **kwargs) +class _DataFrameFormatter(object): + """ + Render a console-friendly tabular output of a DataFrame + """ + def __init__(self, frame, buf=None, columns=None, col_space=None, + na_rep='NaN', formatters=None, float_format=None, + sparsify=True, index_names=True): + + self.frame = frame + self.buf = buf if buf is not None else StringIO() + self.show_index_names = index_names + self.sparsify = sparsify + self.float_format = float_format + self.formatters = formatters + self.na_rep = na_rep + self.col_space = col_space + self.column_filter = frame.columns if columns is None else set(columns) + + self._write_to_buffer() + + def _write_to_buffer(self): + frame = self.frame + format_col = self._get_column_formatter() - @classmethod - def fromcsv(cls, *args, **kwargs): # pragma: no cover - warnings.warn("fromcsv is deprecated. Use 'from_csv' " - "instead", FutureWarning) - return cls.from_csv(*args, **kwargs) - - combineFirst = deprecate('combineFirst', combine_first) - getXS = deprecate('getXS', xs) - merge = deprecate('merge', join) - toRecords = deprecate('toRecords', to_records) - toDict = deprecate('toDict', to_dict) - toString = deprecate('toString', to_string) - _firstTimeWithValue = deprecate('_firstTimeWithValue', first_valid_index) - _lastTimeWithValue = deprecate('_lastTimeWithValue', last_valid_index) - toCSV = deprecate('toCSV', to_csv) - - def dropEmptyRows(self, specificColumns=None): # pragma: no cover - """ - Return DataFrame with rows omitted containing ALL NaN values - for optionally specified set of columns. + to_write = [] - Parameters - ---------- - specificColumns : list-like, optional keyword - Columns to consider in removing NaN values. As a typical - application, you might provide the list of the columns involved in - a regression to exlude all the missing data in one shot. + if len(frame.columns) == 0 or len(frame.index) == 0: + to_write.append('Empty %s\n' % type(self.frame).__name__) + to_write.append(repr(frame.index)) + else: + # may include levels names also + str_index = self._get_formatted_index() + str_columns = self._get_formatted_column_labels() - Returns - ------- - This DataFrame with rows containing any NaN values deleted - """ - warnings.warn("dropEmptyRows is deprecated. Use dropna(how='all')", - FutureWarning) - return self.dropna(axis=0, subset=specificColumns, how='all') + stringified = [str_columns[i] + format_col(c) + for i, c in enumerate(frame.columns) + if c in self.column_filter] - def dropIncompleteRows(self, specificColumns=None, - minObs=None): # pragma: no cover - """ - Return DataFrame with rows omitted containing ANY NaN values for - optionally specified set of columns. + to_write.append(adjoin(1, str_index, *stringified)) - Parameters - ---------- - minObs : int or None (default) - Instead of requiring all the columns to have observations, require - only minObs observations - specificColumns : list-like, optional keyword - Columns to consider in removing NaN values. As a typical - application, you might provide the list of the columns involved in - a regression to exlude all the missing data in one shot. + for s in to_write: + if isinstance(s, unicode): + to_write = [unicode(s) for s in to_write] + break - Returns - ------- - This DataFrame with rows containing any NaN values deleted + self.buf.writelines(to_write) - """ - warnings.warn("dropEmptyRows is deprecated. 
Use dropna()", - FutureWarning) - if minObs is None: - return self.dropna(axis=0, subset=specificColumns, how='any') + def _get_column_formatter(self): + from pandas.core.common import _format + + col_space = self.col_space + + if col_space is None: + def _myformat(v): + return _format(v, na_rep=self.na_rep, + float_format=self.float_format) + else: + def _myformat(v): + return _pfixed(v, col_space, na_rep=self.na_rep, + float_format=self.float_format) + + formatters = {} if self.formatters is None else self.formatters + + def _format_col(col): + formatter = formatters.get(col, _myformat) + return [formatter(x) for x in self.frame[col]] + + return _format_col + + def _get_formatted_column_labels(self): + from pandas.core.index import _sparsify + + columns = self.frame.columns + + if isinstance(columns, MultiIndex): + fmt_columns = columns.format(sparsify=False, adjoin=False) + str_columns = zip(*[[' %s' % y for y in x] + for x in zip(*fmt_columns)]) + if self.sparsify: + str_columns = _sparsify(str_columns) + + str_columns = [list(x) for x in zip(*str_columns)] else: - return self.dropna(axis=0, subset=specificColumns, thresh=minObs) + str_columns = [[' %s' % x] for x in columns.format()] - def tapply(self, func): # pragma: no cover - """ - Apply func to the transposed DataFrame, results as per apply - """ - warnings.warn("tapply is deprecated. Use apply(f, axis=1)", - FutureWarning) - return self.apply(func, axis=1) + if self.show_index_names and self.has_index_names: + for x in str_columns: + x.append('') - def tgroupby(self, keyfunc, applyfunc): # pragma: no cover - """ - Aggregate columns based on passed function + return str_columns - Parameters - ---------- - keyfunc : function - applyfunc : function + @property + def has_index_names(self): + return _has_names(self.frame.index) - Returns - ------- - y : DataFrame - """ - warnings.warn("tgroupby is deprecated. 
Use groupby with axis=1", - FutureWarning) - return self.T.groupby(keyfunc).aggregate(applyfunc).T + @property + def has_column_names(self): + return _has_names(self.frame.columns) + + def _get_formatted_index(self): + index = self.frame.index + columns = self.frame.columns + + show_index_names = self.show_index_names and self.has_index_names + show_col_names = self.show_index_names and self.has_column_names + + if isinstance(index, MultiIndex): + fmt_index = index.format(sparsify=self.sparsify, adjoin=False, + names=show_index_names) + else: + fmt_index = [index.format(name=show_index_names)] + + adjoined = adjoin(1, *fmt_index).split('\n') + + # empty space for columns + if show_col_names: + col_header = [' %s' % x for x in self._get_column_name_list()] + else: + col_header = [''] * columns.nlevels + + return col_header + adjoined + + def _get_column_name_list(self): + names = [] + columns = self.frame.columns + if isinstance(columns, MultiIndex): + names.extend('' if name is None else name + for name in columns.names) + else: + names.append('' if columns.name is None else columns.name) + return names + +def _has_names(index): + if isinstance(index, MultiIndex): + return any([x is not None for x in index.names]) + else: + return index.name is not None def group_agg(values, bounds, f): """ @@ -3078,7 +3142,7 @@ def _get_index(v): if isinstance(v, Series): return v.index elif isinstance(v, dict): - return Index(_try_sort(v)) + return v.keys() index = None if len(data) == 0: @@ -3107,27 +3171,49 @@ def _get_index(v): def _union_indexes(indexes): + if len(indexes) == 0: + return Index([]) + if len(indexes) == 1: - index = indexes[0] - if _any_special_indexes(indexes): + result = indexes[0] + if isinstance(result, list): + result = Index(sorted(result)) + return result + + indexes, kind = _sanitize_and_check(indexes) + + if kind == 'special': result = indexes[0] for other in indexes[1:]: result = result.union(other) return result - else: + elif kind == 'array': index = indexes[0] for other in indexes[1:]: if not index.equals(other): - return Index(_tseries.fast_unique_multiple(indexes)) + return Index(lib.fast_unique_multiple(indexes)) return index + else: + return Index(lib.fast_unique_multiple_list(indexes)) -def _any_special_indexes(indexes): - for index in indexes: - if type(index) != Index: - return True - return False +def _sanitize_and_check(indexes): + kinds = list(set([type(index) for index in indexes])) + + if list in kinds: + if len(kinds) > 1: + indexes = [Index(_try_sort(x)) if not isinstance(x, Index) else x + for x in indexes] + kinds.remove(list) + else: + return indexes, 'list' + + + if len(kinds) > 1 or Index not in kinds: + return indexes, 'special' + else: + return indexes, 'array' def _check_data_types(data): @@ -3238,6 +3324,25 @@ def _homogenize(data, index, columns, dtype=None): def _put_str(s, space): return ('%s' % s)[:space].ljust(space) +def install_ipython_completers(): # pragma: no cover + """Register the DataFrame type with IPython's tab completion machinery, so + that it knows about accessing column names as attributes.""" + from IPython.utils.generics import complete_object + + @complete_object.when_type(DataFrame) + def complete_dataframe(obj, prev_completions): + return prev_completions + [c for c in obj.columns \ + if isinstance(c, basestring) and py3compat.isidentifier(c)] + +# Importing IPython brings in about 200 modules, so we want to avoid it unless +# we're in IPython (when those modules are loaded anyway). 
+if "IPython" in sys.modules: # pragma: no cover + try: + install_ipython_completers() + except Exception: + pass + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/core/generic.py b/pandas/core/generic.py index be61619b74424..90a3b1c2c20b2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1,6 +1,7 @@ import numpy as np import cPickle +from pandas.core.common import save, load from pandas.core.index import Index, MultiIndex, _ensure_index import pandas.core.datetools as datetools @@ -9,20 +10,12 @@ class Picklable(object): - def save(self, fileName): - f = open(fileName, 'wb') - try: - cPickle.dump(self, f, protocol=cPickle.HIGHEST_PROTOCOL) - finally: - f.close() + def save(self, path): + save(self, path) @classmethod - def load(cls, fileName): - f = open(fileName, 'rb') - try: - return cPickle.load(f) - finally: - f.close() + def load(cls, path): + return load(path) class PandasError(Exception): pass @@ -211,7 +204,7 @@ def sort_index(self, axis=0, ascending=True): def ix(self): raise NotImplementedError - def reindex(self, **kwds): + def reindex(self, *args, **kwds): raise NotImplementedError class NDFrame(PandasObject): @@ -328,7 +321,7 @@ def cumsum(self, axis=None, skipna=True): axis = self._get_axis_number(axis) y = self.values.copy() - if not issubclass(y.dtype.type, np.int_): + if not issubclass(y.dtype.type, np.integer): mask = np.isnan(self.values) if skipna: @@ -367,7 +360,7 @@ def cumprod(self, axis=None, skipna=True): axis = self._get_axis_number(axis) y = self.values.copy() - if not issubclass(y.dtype.type, np.int_): + if not issubclass(y.dtype.type, np.integer): mask = np.isnan(self.values) if skipna: @@ -458,3 +451,60 @@ def add_suffix(self, suffix): """ new_data = self._data.add_suffix(suffix) return self._constructor(new_data) + + def rename_axis(self, mapper, axis=0, copy=True): + """ + Alter index and / or columns using input function or functions. + Function / dict values must be unique (1-to-1). Labels not contained in + a dict / Series will be left as-is. 
+ + Parameters + ---------- + mapper : dict-like or function, optional + axis : int, default 0 + copy : boolean, default True + Also copy underlying data + + See also + -------- + DataFrame.rename + + Returns + ------- + renamed : type of caller + """ + # should move this at some point + from pandas.core.series import _get_rename_function + + mapper_f = _get_rename_function(mapper) + + if axis == 0: + new_data = self._data.rename_items(mapper_f, copydata=copy) + else: + new_data = self._data.rename_axis(mapper_f, axis=axis) + if copy: + new_data = new_data.copy() + + return self._constructor(new_data) + + def take(self, indices, axis=0): + """ + Analogous to ndarray.take + + Parameters + ---------- + indices : list / array of ints + axis : int, default 0 + + Returns + ------- + taken : type of caller + """ + if axis == 0: + labels = self._get_axis(axis) + new_items = labels.take(indices) + new_data = self._data.reindex_items(new_items) + else: + new_data = self._data.take(indices, axis=axis) + return self._constructor(new_data) + diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 02b572f97302c..78d2c9f4ecbfb 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -333,9 +333,10 @@ def _cython_agg_general(self, how): output = {} cannot_agg = [] for name, obj in self._iterate_slices(): - try: - obj = np.asarray(obj, dtype=float) - except ValueError: + if issubclass(obj.dtype.type, np.number): + if obj.dtype != np.float64: + obj = obj.astype('f8') + else: cannot_agg.append(name) continue @@ -411,8 +412,13 @@ def _python_apply_general(self, func, *args, **kwargs): not_indexed_same = False for key, group in self: group.name = key + + # group might be modified + group_axes = _get_axes(group) + res = func(group, *args, **kwargs) - if not _is_indexed_like(res, group): + + if not _is_indexed_like(res, group_axes): not_indexed_same = True result_keys.append(key) @@ -459,18 +465,19 @@ def groupby(obj, by, **kwds): return klass(obj, by, **kwds) groupby.__doc__ = GroupBy.__doc__ -def _is_indexed_like(obj, other): +def _get_axes(group): + if isinstance(group, Series): + return [group.index] + else: + return group.axes + +def _is_indexed_like(obj, axes): if isinstance(obj, Series): - if not isinstance(other, Series): + if len(axes) > 1: return False - return obj.index.equals(other.index) + return obj.index.equals(axes[0]) elif isinstance(obj, DataFrame): - if isinstance(other, Series): - return obj.index.equals(other.index) - - # deal with this when a case arises - assert(isinstance(other, DataFrame)) - return obj._indexed_same(other) + return obj.index.equals(axes[0]) return False @@ -503,6 +510,10 @@ def __init__(self, index, grouper=None, name=None, level=None): self.index = index if level is not None: + if not isinstance(level, int): + assert(level in index.names) + level = index.names.index(level) + inds = index.labels[level] labels = index.levels[level].take(inds) @@ -658,7 +669,7 @@ def aggregate(self, func_or_funcs, *args, **kwargs): q 3.5 0.5 7 >>> grouped.agg({'result' : lambda x: x.mean() / x.std(), - 'total' : np.sum}) + ... 
'total' : np.sum}) result total b 2.121 3 q 4.95 7 @@ -685,13 +696,7 @@ def aggregate(self, func_or_funcs, *args, **kwargs): except Exception: result = self._aggregate_named(func_or_funcs, *args, **kwargs) - if len(result) > 0: - if isinstance(result.values()[0], Series): - ret = DataFrame(result).T - else: - ret = Series(result) - else: - ret = Series({}) + ret = Series(result) if not self.as_index: # pragma: no cover print 'Warning, ignoring as_index=True' @@ -755,7 +760,10 @@ def _aggregate_simple(self, func, *args, **kwargs): values = self.obj.values result = {} for k, v in self.primary.indices.iteritems(): - result[k] = func(values.take(v), *args, **kwargs) + agged = func(values.take(v), *args, **kwargs) + if isinstance(agged, np.ndarray): + raise Exception('Must produce aggregated value') + result[k] = agged return result @@ -766,6 +774,8 @@ def _aggregate_named(self, func, *args, **kwargs): grp = self.get_group(name) grp.name = name output = func(grp, *args, **kwargs) + if isinstance(output, np.ndarray): + raise Exception('Must produce aggregated value') result[name] = output return result @@ -1065,22 +1075,31 @@ def transform(self, func, *args, **kwargs): axis=self.axis) def _concat_frames(frames, index, columns=None, axis=0): - if axis == 0: - all_index = [np.asarray(x.index) for x in frames] - new_index = Index(np.concatenate(all_index)) + if len(frames) == 1: + return frames[0] + if axis == 0: + new_index = _concat_indexes([x.index for x in frames]) if columns is None: new_columns = frames[0].columns else: new_columns = columns else: - all_columns = [np.asarray(x.columns) for x in frames] - new_columns = Index(np.concatenate(all_columns)) + new_columns = _concat_indexes([x.columns for x in frames]) new_index = index - new_values = np.concatenate([x.values for x in frames], axis=axis) - result = DataFrame(new_values, index=new_index, columns=new_columns) - return result.reindex(index=index, columns=columns) + if frames[0]._is_mixed_type: + new_data = {} + for col in new_columns: + new_data[col] = np.concatenate([x[col].values for x in frames]) + return DataFrame(new_data, index=new_index, columns=new_columns) + else: + new_values = np.concatenate([x.values for x in frames], axis=axis) + result = DataFrame(new_values, index=new_index, columns=new_columns) + return result.reindex(index=index, columns=columns) + +def _concat_indexes(indexes): + return indexes[0].append(indexes[1:]) def _concat_frames_hierarchical(frames, keys, groupings, axis=0): if axis == 0: @@ -1092,8 +1111,14 @@ def _concat_frames_hierarchical(frames, keys, groupings, axis=0): new_columns = _make_concat_multiindex(all_columns, keys, groupings) new_index = frames[0].index - new_values = np.concatenate([x.values for x in frames], axis=axis) - return DataFrame(new_values, index=new_index, columns=new_columns) + if frames[0]._is_mixed_type: + new_data = {} + for col in new_columns: + new_data[col] = np.concatenate([x[col].values for x in frames]) + return DataFrame(new_data, index=new_index, columns=new_columns) + else: + new_values = np.concatenate([x.values for x in frames], axis=axis) + return DataFrame(new_values, index=new_index, columns=new_columns) def _make_concat_multiindex(indexes, keys, groupings): if not _all_indexes_same(indexes): @@ -1112,8 +1137,14 @@ def _make_concat_multiindex(indexes, keys, groupings): to_concat.append(np.repeat(k, len(index))) label_list.append(np.concatenate(to_concat)) - # these go in the last level - label_list.append(np.concatenate(indexes)) + concat_index = 
_concat_indexes(indexes) + + # these go at the end + if isinstance(concat_index, MultiIndex): + for level in range(concat_index.nlevels): + label_list.append(concat_index.get_level_values(level)) + else: + label_list.append(concat_index.values) return MultiIndex.from_arrays(label_list) diff --git a/pandas/core/index.py b/pandas/core/index.py index daa9592c6e66e..e70d8a36b55db 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -7,7 +7,7 @@ from pandas.core.common import (adjoin as _adjoin, _stringify, _is_bool_indexer, _asarray_tuplesafe) -from pandas.util.decorators import deprecate, cache_readonly +from pandas.util.decorators import cache_readonly import pandas._tseries as lib __all__ = ['Index'] @@ -39,6 +39,12 @@ class Index(np.ndarray): ---- An Index instance can **only** contain hashable objects """ + _map_indices = lib.map_indices_object + _is_monotonic = lib.is_monotonic_object + _groupby = lib.groupby_object + _arrmap = lib.arrmap_object + + name = None def __new__(cls, data, dtype=None, copy=False, name=None): if isinstance(data, np.ndarray): if dtype is None and issubclass(data.dtype.type, np.integer): @@ -49,10 +55,7 @@ def __new__(cls, data, dtype=None, copy=False, name=None): 'of some kind, %s was passed' % repr(data)) else: # other iterable of some kind - if not isinstance(data, (list, tuple)): - data = list(data) - subarr = np.empty(len(data), dtype=object) - subarr[:] = data + subarr = _asarray_tuplesafe(data, dtype=object) subarr = subarr.view(cls) subarr.name = name @@ -65,6 +68,14 @@ def __array_finalize__(self, obj): def dtype(self): return self.values.dtype + @property + def nlevels(self): + return 1 + + @property + def _constructor(self): + return Index + def summary(self): if len(self) > 0: index_summary = ', %s to %s' % (str(self[0]), str(self[-1])) @@ -80,18 +91,43 @@ def values(self): @cache_readonly def is_monotonic(self): - return lib.is_monotonic_object(self) + return self._is_monotonic(self) _indexMap = None + _integrity = False + @property def indexMap(self): "{label -> location}" if self._indexMap is None: - self._indexMap = lib.map_indices_object(self) - self._verify_integrity() + self._indexMap = self._map_indices(self) + self._integrity = len(self._indexMap) == len(self) + if not self._integrity: + raise Exception('Index cannot contain duplicate values!') return self._indexMap + def _get_level_number(self, level): + if not isinstance(level, int): + assert(level == self.name) + level = 0 + return level + + def _verify_integrity(self): + if self._indexMap is None: + try: + self.indexMap + except Exception: + return False + return len(self.indexMap) == len(self) + + def _get_duplicates(self): + from collections import defaultdict + counter = defaultdict(lambda: 0) + for k in self.values: + counter[k] += 1 + return sorted(k for k, v in counter.iteritems() if v > 1) + _allDates = None def is_all_dates(self): """ @@ -102,10 +138,6 @@ def is_all_dates(self): return self._allDates - def _verify_integrity(self): - if len(self.indexMap) < len(self): - raise Exception('Index cannot contain duplicate values!') - def __iter__(self): return iter(self.view(np.ndarray)) @@ -163,22 +195,28 @@ def take(self, *args, **kwargs): Analogous to ndarray.take """ taken = self.view(np.ndarray).take(*args, **kwargs) - return Index(taken, name=self.name) + return self._constructor(taken, name=self.name) - def format(self, vertical=False): + def format(self, name=False): """ Render a string representation of the Index """ + result = [] + + if name: + 
result.append(self.name if self.name is not None else '') + if self.is_all_dates(): - to_join = [] zero_time = time(0, 0) for dt in self: if dt.time() != zero_time or dt.tzinfo is not None: return ['%s' % x for x in self] - to_join.append(dt.strftime("%Y-%m-%d")) - return to_join + result.append(dt.strftime("%Y-%m-%d")) + return result - return [_stringify(x) for x in self] + result.extend(_stringify(x) for x in self) + + return result def equals(self, other): """ @@ -247,6 +285,15 @@ def __add__(self, other): __le__ = _indexOp('__le__') __ge__ = _indexOp('__ge__') + def __sub__(self, other): + return self.diff(other) + + def __and__(self, other): + return self.intersection(other) + + def __or__(self, other): + return self.union(other) + def union(self, other): """ Form the union of two Index objects and sorts if possible @@ -268,7 +315,7 @@ def union(self, other): return _ensure_index(other) if self.is_monotonic and other.is_monotonic: - result = lib.outer_join_indexer_object(self, other)[0] + result = lib.outer_join_indexer_object(self, other.values)[0] else: indexer = self.get_indexer(other) indexer = (indexer == -1).nonzero()[0] @@ -319,9 +366,10 @@ def intersection(self, other): other = other.astype(object) if self.is_monotonic and other.is_monotonic: - return Index(lib.inner_join_indexer_object(self, other)[0]) + return Index(lib.inner_join_indexer_object(self, + other.values)[0]) else: - indexer = self.get_indexer(other) + indexer = self.get_indexer(other.values) indexer = indexer.take((indexer != -1).nonzero()[0]) return self.take(indexer) @@ -351,8 +399,6 @@ def diff(self, other): theDiff = sorted(set(self) - set(otherArr)) return Index(theDiff) - __sub__ = diff - def get_loc(self, key): """ Get integer location for requested label @@ -361,7 +407,6 @@ def get_loc(self, key): ------- loc : int """ - self._verify_integrity() return self.indexMap[key] def get_indexer(self, target, method=None): @@ -392,10 +437,10 @@ def get_indexer(self, target, method=None): ------- (indexer, mask) : (ndarray, ndarray) """ - target = _ensure_index(target) - method = self._get_method(method) + target = _ensure_index(target) + if self.dtype != target.dtype: target = Index(target, dtype=object) @@ -412,10 +457,10 @@ def get_indexer(self, target, method=None): return indexer def groupby(self, to_groupby): - return lib.groupby_object(self.values, to_groupby) + return self._groupby(self.values, to_groupby) def map(self, mapper): - return lib.arrmap_object(self.values, mapper) + return self._arrmap(self.values, mapper) def _get_method(self, method): if method: @@ -441,6 +486,10 @@ def reindex(self, target, method=None): return target, indexer def join(self, other, how='left', return_indexers=False): + if self.is_monotonic and other.is_monotonic: + return self._join_monotonic(other, how=how, + return_indexers=return_indexers) + if how == 'left': join_index = self elif how == 'right': @@ -465,6 +514,29 @@ def join(self, other, how='left', return_indexers=False): else: return join_index + def _join_monotonic(self, other, how='left', return_indexers=False): + if how == 'left': + join_index = self + lidx = None + ridx = lib.left_join_indexer_object(self, other) + elif how == 'right': + join_index = other + lidx = lib.left_join_indexer_object(other, self) + ridx = None + elif how == 'inner': + join_index, lidx, ridx = lib.inner_join_indexer_object(self, other) + join_index = Index(join_index) + elif how == 'outer': + join_index, lidx, ridx = lib.outer_join_indexer_object(self, other) + join_index = 
Index(join_index) + else: # pragma: no cover + raise Exception('do not recognize join method %s' % how) + + if return_indexers: + return join_index, lidx, ridx + else: + return join_index + def slice_locs(self, start=None, end=None): """ For an ordered Index, compute the slice locations for input labels @@ -557,14 +629,14 @@ def copy(self, order='C'): cp.__dict__.update(self.__dict__) return cp - #---------------------------------------------------------------------- - # deprecated stuff - - asOfDate = deprecate('asOfDate', asof) - class Int64Index(Index): + _map_indices = lib.map_indices_int64 + _is_monotonic = lib.is_monotonic_int64 + _groupby = lib.groupby_int64 + _arrmap = lib.arrmap_int64 + def __new__(cls, data, dtype=None, copy=False, name=None): if not isinstance(data, np.ndarray): if np.isscalar(data): @@ -592,6 +664,10 @@ def __new__(cls, data, dtype=None, copy=False, name=None): subarr.name = name return subarr + @property + def _constructor(self): + return Int64Index + def astype(self, dtype): return Index(self.values.astype(dtype)) @@ -599,19 +675,6 @@ def astype(self, dtype): def dtype(self): return np.dtype('int64') - @cache_readonly - def is_monotonic(self): - return lib.is_monotonic_int64(self) - - @property - def indexMap(self): - "{label -> location}" - if self._indexMap is None: - self._indexMap = lib.map_indices_int64(self) - self._verify_integrity() - - return self._indexMap - def is_all_dates(self): """ Checks that all the labels are datetime objects @@ -712,19 +775,6 @@ def union(self, other): return Int64Index(result) union.__doc__ = Index.union.__doc__ - def groupby(self, to_groupby): - return lib.groupby_int64(self, to_groupby) - - def map(self, mapper): - return lib.arrmap_int64(self, mapper) - - def take(self, *args, **kwargs): - """ - Analogous to ndarray.take - """ - taken = self.values.take(*args, **kwargs) - return Int64Index(taken) - class DateIndex(Index): pass @@ -839,6 +889,11 @@ def __iter__(self): def _get_level_number(self, level): if not isinstance(level, int): + count = self.names.count(level) + if count > 1: + raise Exception('The name %s occurs multiple times, use a ' + 'level number' % level) + level = self.names.index(level) elif level < 0: level += self.nlevels @@ -874,16 +929,21 @@ def __contains__(self, key): except Exception: return False - def format(self, space=2, sparsify=True, vertical=False, adjoin=True): + def format(self, space=2, sparsify=True, adjoin=True, names=False): if len(self) == 0: return [] stringified_levels = [lev.format() for lev in self.levels] result_levels = [] - for lab, lev in zip(self.labels, stringified_levels): - taken = np.array(lev, dtype=object).take(lab) - result_levels.append(taken) + for lab, lev, name in zip(self.labels, stringified_levels, self.names): + level = [] + + if names: + level.append(name if name is not None else '') + + level.extend(np.array(lev, dtype=object).take(lab)) + result_levels.append(level) if sparsify: result_levels = _sparsify(result_levels) @@ -956,6 +1016,8 @@ def from_tuples(cls, tuples, sortorder=None, names=None): ------- index : MultiIndex """ + if len(tuples) == 0: + raise Exception('Cannot infer number of levels from empty list') arrays = zip(*tuples) return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) @@ -965,7 +1027,10 @@ def indexMap(self): if self._indexMap is None: zipped = zip(*self.labels) self._indexMap = lib.map_indices_list(zipped) - self._verify_integrity() + self._integrity = len(self._indexMap) == len(self) + + if not self._integrity: + raise 
Exception('Index cannot contain duplicate values!') return self._indexMap @@ -1025,7 +1090,8 @@ def take(self, *args, **kwargs): Analogous to ndarray.take """ new_labels = [lab.take(*args, **kwargs) for lab in self.labels] - return MultiIndex(levels=self.levels, labels=new_labels) + return MultiIndex(levels=self.levels, labels=new_labels, + names=self.names) def append(self, other): """ @@ -1151,6 +1217,8 @@ def sortlevel(self, level=0, ascending=True): ------- sorted_index : MultiIndex """ + # TODO: check if lexsorted when level=0 + labels = list(self.labels) level = self._get_level_number(level) primary = labels.pop(level) @@ -1197,16 +1265,9 @@ def get_indexer(self, target, method=None): """ method = self._get_method(method) + target_index = target if isinstance(target, MultiIndex): target_index = target.get_tuple_index() - else: - if len(target) > 0: - val = target[0] - if not isinstance(val, tuple) or len(val) != self.nlevels: - raise ValueError('can only pass MultiIndex or ' - 'array of tuples') - - target_index = target self_index = self.get_tuple_index() @@ -1219,6 +1280,7 @@ def get_indexer(self, target, method=None): else: indexer = lib.merge_indexer_object(target_index, self_index.indexMap) + return indexer def reindex(self, target, method=None): @@ -1443,12 +1505,14 @@ def union(self, other): if len(other) == 0 or self.equals(other): return self - # TODO: optimize / make less wasteful + result_names = self.names if self.names == other.names else None + self_tuples = self.get_tuple_index() other_tuples = other.get_tuple_index() uniq_tuples = lib.fast_unique_multiple([self_tuples, other_tuples]) - return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0) + return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0, + names=result_names) def intersection(self, other): """ @@ -1467,11 +1531,45 @@ def intersection(self, other): if self.equals(other): return self - # TODO: optimize / make less wasteful + result_names = self.names if self.names == other.names else None + self_tuples = self.get_tuple_index() other_tuples = other.get_tuple_index() uniq_tuples = sorted(set(self_tuples) & set(other_tuples)) - return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0) + if len(uniq_tuples) == 0: + return MultiIndex(levels=[[]]*self.nlevels, + labels=[[]]*self.nlevels, + names=result_names) + else: + return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0, + names=result_names) + + def diff(self, other): + """ + Compute sorted set difference of two MultiIndex objects + + Returns + ------- + diff : MultiIndex + """ + self._assert_can_do_setop(other) + + result_names = self.names if self.names == other.names else None + + if self.equals(other): + return MultiIndex(levels=[[]]*self.nlevels, + labels=[[]]*self.nlevels, + names=result_names) + + difference = sorted(set(self.values) - set(other.values)) + + if len(difference) == 0: + return MultiIndex(levels=[[]]*self.nlevels, + labels=[[]]*self.nlevels, + names=result_names) + else: + return MultiIndex.from_tuples(difference, sortorder=0, + names=result_names) def _assert_can_do_setop(self, other): if not isinstance(other, MultiIndex): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index a3df3b16b3b10..cee8b51c635cc 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4,6 +4,7 @@ import numpy as np from pandas.core.index import Index, _ensure_index +from pandas.util.decorators import cache_readonly import pandas.core.common as common import pandas._tseries as lib @@ -170,22 +171,24 @@ def 
fillna(self, value): class FloatBlock(Block): - def can_store(self, value): - return issubclass(value.dtype.type, (np.integer, np.floating)) + def should_store(self, value): + # when inserting a column should not coerce integers to floats + # unnecessarily + return issubclass(value.dtype.type, np.floating) class IntBlock(Block): - def can_store(self, value): + def should_store(self, value): return issubclass(value.dtype.type, np.integer) class BoolBlock(Block): - def can_store(self, value): + def should_store(self, value): return issubclass(value.dtype.type, np.bool_) class ObjectBlock(Block): - def can_store(self, value): + def should_store(self, value): return not issubclass(value.dtype.type, (np.integer, np.floating, np.bool_)) @@ -196,6 +199,8 @@ def make_block(values, items, ref_items, do_integrity_check=False): if issubclass(vtype, np.floating): klass = FloatBlock elif issubclass(vtype, np.integer): + if vtype != np.int64: + values = values.astype('i8') klass = IntBlock elif dtype == np.bool_: klass = BoolBlock @@ -470,6 +475,33 @@ def xs(self, key, axis=1, copy=True): return BlockManager(new_blocks, new_axes) + def fast_2d_xs(self, loc, copy=False): + """ + + """ + if len(self.blocks) == 1: + result = self.blocks[0].values[:, loc] + if copy: + result = result.copy() + return result + + if not copy: + raise Exception('cannot get view of mixed-type or ' + 'non-consolidated DataFrame') + + dtype = _interleaved_dtype(self.blocks) + + items = self.items + n = len(items) + result = np.empty(n, dtype=dtype) + for blk in self.blocks: + values = blk.values + for j, item in enumerate(blk.items): + i = items.get_loc(item) + result[i] = values[j, loc] + + return result + def consolidate(self): """ Join together blocks having same dtype @@ -506,7 +538,7 @@ def set(self, item, value): assert(value.shape[1:] == self.shape[1:]) if item in self.items: i, block = self._find_block(item) - if not block.can_store(value): + if not block.should_store(value): # delete from block, create and append new block self._delete_from_block(i, item) self._add_new_block(item, value) @@ -579,6 +611,24 @@ def reindex_axis(self, new_axis, method=None, axis=0): new_axes[axis] = new_axis return BlockManager(new_blocks, new_axes) + def reindex_indexer(self, new_axis, indexer, axis=1): + """ + pandas-indexer with -1's only + """ + if axis == 0: + raise NotImplementedError + + new_axes = list(self.axes) + new_axes[axis] = new_axis + new_blocks = [] + for blk in self.blocks: + new_values = common.take_fast(blk.values, indexer, None, + False, axis=axis) + newb = make_block(new_values, blk.items, self.items) + new_blocks.append(newb) + + return BlockManager(new_blocks, new_axes) + def reindex_items(self, new_items): """ @@ -616,16 +666,24 @@ def reindex_items(self, new_items): return BlockManager(new_blocks, new_axes) - def take(self, indices, axis=1): + def take(self, indexer, axis=1): if axis == 0: raise NotImplementedError + indexer = np.asarray(indexer, dtype='i4') + + n = len(self.axes[axis]) + if ((indexer == -1) | (indexer >= n)).any(): + raise Exception('Indices must be nonzero and less than ' + 'the axis length') + new_axes = list(self.axes) - new_axes[axis] = self.axes[axis].take(indices) + new_axes[axis] = self.axes[axis].take(indexer) new_blocks = [] for blk in self.blocks: - newb = make_block(blk.values.take(indices, axis=axis), blk.items, - self.items) + new_values = common.take_fast(blk.values, indexer, + None, False, axis=axis) + newb = make_block(new_values, blk.items, self.items) new_blocks.append(newb) return 
BlockManager(new_blocks, new_axes) @@ -678,16 +736,23 @@ def _is_indexed_like(self, other): return False return True - def join_on(self, other, on, axis=1, lsuffix=None, rsuffix=None): + def join_on(self, other, on, how='left', axis=1, lsuffix=None, + rsuffix=None): this, other = self._maybe_rename_join(other, lsuffix, rsuffix) other_axis = other.axes[axis] - indexer = lib.merge_indexer_object(on.astype(object), - other_axis.indexMap) + indexer = other_axis.get_indexer(on) + + if how == 'left': + mask = indexer == -1 + needs_masking = len(on) > 0 and mask.any() + else: + mask = indexer != -1 + this = this.take(mask.nonzero()[0], axis=axis) + indexer = indexer[mask] + mask = None + needs_masking = False - # TODO: deal with length-0 case? or does it fall out? - mask = indexer == -1 - needs_masking = len(on) > 0 and mask.any() other_blocks = [] for block in other.blocks: newb = block.reindex_axis(indexer, mask, needs_masking, axis=axis) @@ -750,7 +815,6 @@ def block_id_vector(self): assert((result >= 0).all()) return result -_data_types = [np.float_, np.int_] def form_blocks(data, axes): # pre-filter out items if we passed it items = axes[0] @@ -782,7 +846,7 @@ def form_blocks(data, axes): blocks.append(float_block) if len(int_dict): - int_block = _simple_blockify(int_dict, items, np.int_) + int_block = _simple_blockify(int_dict, items, np.int64) blocks.append(int_block) if len(bool_dict): @@ -825,7 +889,7 @@ def _blocks_to_series_dict(blocks, index=None): for block in blocks: for item, vec in zip(block.items, block.values): - series_dict[item] = Series(vec, index=index) + series_dict[item] = Series(vec, index=index, name=item) return series_dict def _interleaved_dtype(blocks): @@ -855,7 +919,7 @@ def _interleaved_dtype(blocks): elif have_bool: return np.bool_ elif have_int and not have_float: - return np.int_ + return np.int64 else: return np.float64 @@ -915,143 +979,206 @@ def _union_items_slow(all_items): seen = seen.union(items) return seen +def join_managers(left, right, axis=1, how='left', copy=True): + op = _JoinOperation(left, right, axis=axis, how=how) + return op.get_result(copy=copy) -def join_managers(left, right, axis=1, how='left'): +class _JoinOperation(object): """ - Parameters - ---------- - other - lindexer - lmask - rindexer - rmask - - Returns - ------- - merged : BlockManager + Object responsible for orchestrating efficient join operation between two + BlockManager data structures """ - assert(left.is_consolidated()) - assert(right.is_consolidated()) + def __init__(self, left, right, axis=1, how='left'): + self.left = left + self.right = right + self.axis = axis + self.how = how - laxis = left.axes[axis] - raxis = right.axes[axis] + assert(left.is_consolidated()) + assert(right.is_consolidated()) - join_index, lindexer, rindexer = laxis.join(raxis, how=how, - return_indexers=True) + laxis = left.axes[axis] + raxis = right.axes[axis] - N = len(join_index) + (self.join_index, + self.lindexer, + self.rindexer) = laxis.join(raxis, how=how, return_indexers=True) - if lindexer is None: - lmask = None - lneed_masking = None - else: - lmask = lindexer == -1 - lneed_masking = lmask.any() + # do NOT sort + self.result_items = left.items.append(right.items) + self.result_axes = list(left.axes) + self.result_axes[0] = self.result_items + self.result_axes[axis] = self.join_index - if rindexer is None: - rmask = None - rneed_masking = None - else: - rmask = rindexer == -1 - rneed_masking = rmask.any() - - lblocks = _maybe_upcast_blocks(left.blocks, lneed_masking) - rblocks = 
_maybe_upcast_blocks(right.blocks, rneed_masking) - - left_blockmap = dict((type(blk), blk) for blk in lblocks) - right_blockmap = dict((type(blk), blk) for blk in rblocks) - - # do NOT sort - result_items = left.items.append(right.items) - - result_axes = list(left.axes) - result_axes[0] = result_items - result_axes[axis] = join_index - - result_blocks = [] - - # copies all data by definition - - kinds = set(left_blockmap) | set(right_blockmap) - for klass in kinds: - if klass in left_blockmap and klass in right_blockmap: - # true merge, do not produce intermediate copy - lblk = left_blockmap[klass] - rblk = right_blockmap[klass] - new_values = _merge_blocks_fast(lblk, rblk, - lindexer, lmask, lneed_masking, - rindexer, rmask, rneed_masking, - axis=axis) - new_items = lblk.items.append(rblk.items) - res_blk = make_block(new_values, new_items, result_items) - elif klass in left_blockmap: - # only take necessary - blk = left_blockmap[klass] - if lindexer is None: - res_blk = blk.copy() - else: - res_blk = blk.reindex_axis(lindexer, lmask, lneed_masking, - axis=axis) - res_blk.ref_items = result_items - elif klass in right_blockmap: - # only take necessary - blk = right_blockmap[klass] - if rindexer is None: - res_blk = blk.copy() + def get_result(self, copy=False): + """ + Parameters + ---------- + other + lindexer + lmask + rindexer + rmask + + Returns + ------- + merged : BlockManager + """ + left_blockmap, right_blockmap = self._prepare_blocks() + + result_blocks = [] + + # maybe want to enable flexible copying + + kinds = set(left_blockmap) | set(right_blockmap) + for klass in kinds: + lblk = left_blockmap.get(klass) + rblk = right_blockmap.get(klass) + + if lblk and rblk: + # true merge, do not produce intermediate copy + res_blk = self._merge_blocks(lblk, rblk) + elif lblk: + res_blk = self._reindex_block(lblk, side='left') else: - res_blk = blk.reindex_axis(rindexer, rmask, rneed_masking, - axis=axis) - res_blk.ref_items = result_items + res_blk = self._reindex_block(rblk, side='right') - result_blocks.append(res_blk) + result_blocks.append(res_blk) - return BlockManager(result_blocks, result_axes) + return BlockManager(result_blocks, self.result_axes) -def _maybe_upcast_blocks(blocks, needs_masking): - """ - Upcast and consolidate if necessary - """ - if not needs_masking: - return blocks - new_blocks = [] - for block in blocks: - if isinstance(block, IntBlock): - newb = make_block(block.values.astype(float), block.items, - block.ref_items) - elif isinstance(block, BoolBlock): - newb = make_block(block.values.astype(object), block.items, - block.ref_items) + def _prepare_blocks(self): + lblocks = self.left.blocks + rblocks = self.right.blocks + + # will short-circuit and not compute lneed_masking + if self.lneed_masking: + lblocks = self._upcast_blocks(lblocks) + + if self.rneed_masking: + rblocks = self._upcast_blocks(rblocks) + + left_blockmap = dict((type(blk), blk) for blk in lblocks) + right_blockmap = dict((type(blk), blk) for blk in rblocks) + + return left_blockmap, right_blockmap + + def _reindex_block(self, block, side='left', copy=True): + if side == 'left': + indexer = self.lindexer + mask, need_masking = self.lmask_info else: - newb = block - new_blocks.append(newb) + indexer = self.rindexer + mask, need_masking = self.rmask_info - # use any ref_items - return _consolidate(new_blocks, newb.ref_items) + # still some inefficiency here for bool/int64 because in the case where + # no masking is needed, take_fast will recompute the mask -def _merge_blocks_fast(left, right, 
lindexer, lmask, lneed_masking, - rindexer, rmask, rneed_masking, axis=1): + if indexer is None and copy: + result = block.copy() + else: + result = block.reindex_axis(indexer, mask, need_masking, + axis=self.axis) - n = left.values.shape[axis] if lindexer is None else len(lindexer) - lk = len(left.items) - rk = len(right.items) + result.ref_items = self.result_items + return result - out_shape = list(left.shape) - out_shape[0] = lk + rk - out_shape[axis] = n + @cache_readonly + def lmask_info(self): + if (self.lindexer is None or + not self._may_need_upcasting(self.left.blocks)): + lmask = None + lneed_masking = False + else: + lmask = self.lindexer == -1 + lneed_masking = lmask.any() - out = np.empty(out_shape, dtype=left.values.dtype) + return lmask, lneed_masking - if lindexer is None: - common.take_fast(left.values, np.arange(n, dtype=np.int32), - None, False, axis=axis, out=out[:lk]) - else: - common.take_fast(left.values, lindexer, lmask, lneed_masking, - axis=axis, out=out[:lk]) + @cache_readonly + def rmask_info(self): + if (self.rindexer is None or + not self._may_need_upcasting(self.right.blocks)): + rmask = None + rneed_masking = False + else: + rmask = self.rindexer == -1 + rneed_masking = rmask.any() + + return rmask, rneed_masking + + @property + def lneed_masking(self): + return self.lmask_info[1] + + @property + def rneed_masking(self): + return self.rmask_info[1] + + @staticmethod + def _may_need_upcasting(blocks): + for block in blocks: + if isinstance(block, (IntBlock, BoolBlock)): + return True + return False + + def _merge_blocks(self, lblk, rblk): + lidx = self.lindexer + ridx = self.rindexer + + n = lblk.values.shape[self.axis] if lidx is None else len(lidx) + lk = len(lblk.items) + rk = len(rblk.items) + + out_shape = list(lblk.shape) + out_shape[0] = lk + rk + out_shape[self.axis] = n + + out = np.empty(out_shape, dtype=lblk.values.dtype) + + # is this really faster than assigning to arr.flat? 
+ if lidx is None: + # out[:lk] = lblk.values + common.take_fast(lblk.values, np.arange(n, dtype='i4'), + None, False, + axis=self.axis, out=out[:lk]) + else: + # write out the values to the result array + common.take_fast(lblk.values, lidx, None, False, + axis=self.axis, out=out[:lk]) + if ridx is None: + # out[lk:] = lblk.values + common.take_fast(rblk.values, np.arange(n, dtype='i4'), + None, False, + axis=self.axis, out=out[lk:]) + else: + common.take_fast(rblk.values, ridx, None, False, + axis=self.axis, out=out[lk:]) + + # does not sort + new_items = lblk.items.append(rblk.items) + return make_block(out, new_items, self.result_items) + + @staticmethod + def _upcast_blocks(blocks): + """ + Upcast and consolidate if necessary + """ + # if not need_masking: + # return blocks + + new_blocks = [] + for block in blocks: + if isinstance(block, IntBlock): + newb = make_block(block.values.astype(float), block.items, + block.ref_items) + elif isinstance(block, BoolBlock): + newb = make_block(block.values.astype(object), block.items, + block.ref_items) + else: + newb = block + new_blocks.append(newb) + + # use any ref_items + return _consolidate(new_blocks, newb.ref_items) - if rindexer is None: - common.take_fast(right.values, np.arange(n, dtype=np.int32), - None, False, axis=axis, out=out[lk:]) - else: - common.take_fast(right.values, rindexer, rmask, rneed_masking, - axis=axis, out=out[lk:]) - return out diff --git a/pandas/core/panel.py b/pandas/core/panel.py index d0378821cac14..95bba59e77cd3 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -14,7 +14,6 @@ from pandas.core.frame import DataFrame, _union_indexes from pandas.core.generic import AxisProperty, NDFrame from pandas.core.series import Series -from pandas.util.decorators import deprecate from pandas.util import py3compat import pandas.core.common as common import pandas._tseries as _tseries @@ -187,7 +186,7 @@ class Panel(NDFrame): __div__ = _arith_method(operator.div, '__div__') __rdiv__ = _arith_method(lambda x, y: y / x, '__rdiv__') - def __init__(self, data, items=None, major_axis=None, minor_axis=None, + def __init__(self, data=None, items=None, major_axis=None, minor_axis=None, copy=False, dtype=None): """ Represents wide format panel data, stored as 3-dimensional array @@ -206,6 +205,9 @@ def __init__(self, data, items=None, major_axis=None, minor_axis=None, copy : boolean, default False Copy data from inputs. 
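A small sketch, with made-up frames, of the upcasting behaviour the join machinery above is dealing with (the frame contents here are illustrative only)::

    from pandas import DataFrame

    left = DataFrame({'x': [1, 2, 3]}, index=['a', 'b', 'c'])
    right = DataFrame({'flag': [True, False]}, index=['a', 'b'])

    # 'c' has no match in right, so the joined 'flag' column needs a NaN;
    # boolean blocks are upcast to object (integer blocks would become float)
    joined = left.join(right, how='left')
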
Only affects DataFrame / 2d ndarray input """ + if data is None: + data = {} + passed_axes = [items, major_axis, minor_axis] if isinstance(data, BlockManager): mgr = data @@ -315,11 +317,17 @@ def __repr__(self): dims = 'Dimensions: %d (items) x %d (major) x %d (minor)' % (I, N, K) - major = 'Major axis: %s to %s' % (self.major_axis[0], - self.major_axis[-1]) + if len(self.major_axis) > 0: + major = 'Major axis: %s to %s' % (self.major_axis[0], + self.major_axis[-1]) + else: + major = 'Major axis: None' - minor = 'Minor axis: %s to %s' % (self.minor_axis[0], - self.minor_axis[-1]) + if len(self.minor_axis) > 0: + minor = 'Minor axis: %s to %s' % (self.minor_axis[0], + self.minor_axis[-1]) + else: + minor = 'Minor axis: None' if len(self.items) > 0: items = 'Items: %s to %s' % (self.items[0], self.items[-1]) @@ -657,7 +665,8 @@ def fillna(self, value=None, method='pad'): try: divide = div = _panel_arith_method(operator.div, 'divide') - except AttributeError: # Python 3 + except AttributeError: # pragma: no cover + # Python 3 divide = div = _panel_arith_method(operator.truediv, 'divide') def major_xs(self, key, copy=True): @@ -1080,12 +1089,6 @@ def _get_join_index(self, other, how): join_minor = self.minor_axis.union(other.minor_axis) return join_major, join_minor - #---------------------------------------------------------------------- - # Deprecated stuff - - getMinorXS = deprecate('getMinorXS', minor_xs) - getMajorXS = deprecate('getMajorXS', major_xs) - WidePanel = Panel #------------------------------------------------------------------------------- @@ -1233,7 +1236,8 @@ def _combine_panel_frame(self, other, func, axis='items'): try: divide = div = _panel_arith_method(operator.div, 'divide') - except AttributeError: # Python 3 + except AttributeError: # pragma: no cover + # Python 3 divide = div = _panel_arith_method(operator.truediv, 'divide') def to_wide(self): @@ -1277,8 +1281,6 @@ def _to_wide_mixed(self, mask): columns=self.minor_axis) return Panel.from_dict(data) - toWide = deprecate('toWide', to_wide) - def toCSV(self, path): def format_cols(items): cols = ['Major', 'Minor'] + list(items) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 3fe653819f690..4e13737a76f60 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -61,8 +61,8 @@ def __init__(self, values, index, level=-1, value_columns=None): self.new_index_levels = list(index.levels) self.new_index_names = list(index.names) - self.removed_name = self.new_index_names.pop(level) - self.removed_level = self.new_index_levels.pop(level) + self.removed_name = self.new_index_names.pop(self.level) + self.removed_level = self.new_index_levels.pop(self.level) v = self.level lshape = self.index.levshape @@ -287,6 +287,11 @@ def stack(frame, level=-1, dropna=True): stacked : Series """ N, K = frame.shape + if isinstance(level, int) and level < 0: + level += frame.columns.nlevels + + level = frame.columns._get_level_number(level) + if isinstance(frame.columns, MultiIndex): return _stack_multi_columns(frame, level=level, dropna=True) elif isinstance(frame.index, MultiIndex): @@ -316,8 +321,6 @@ def stack(frame, level=-1, dropna=True): def _stack_multi_columns(frame, level=-1, dropna=True): this = frame.copy() - if level < 0: - level += frame.columns.nlevels # this makes life much simpler if level != frame.columns.nlevels - 1: @@ -388,3 +391,52 @@ def _stack_multi_columns(frame, level=-1, dropna=True): return result + +def melt(frame, id_vars=None, value_vars=None): + """ + "Unpivots" a DataFrame from wide 
format to long format, optionally leaving + id variables set + + Parameters + ---------- + frame : DataFrame + id_vars : + value_vars : + + Examples + -------- + >>> df + A B C + a 1 2 + b 3 4 + c 5 6 + + >>> melt(df, ['A']) + A variable value + a B 1 + b B 3 + c B 5 + a C 2 + b C 4 + c C 6 + """ + # TODO: what about the existing index? + + N, K = frame.shape + + mdata = {} + + if id_vars is not None: + idvars = list(idvars) + frame = frame.copy() + K -= len(idvars) + for col in idvars: + mdata[col] = np.tile(frame.pop(col).values, K) + else: + idvars = [] + + mcolumns = idvars + ['variable', 'value'] + + mdata['value'] = frame.values.ravel('F') + mdata['variable'] = np.asarray(frame.columns).repeat(N) + return DataFrame(mdata, columns=mcolumns) diff --git a/pandas/core/series.py b/pandas/core/series.py index 184a6d894c6fb..295fc126e7618 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -8,19 +8,17 @@ import csv import itertools import operator -import sys -import warnings from numpy import nan, ndarray import numpy as np from pandas.core.common import (isnull, notnull, _is_bool_indexer, - _default_index, _maybe_upcast) + _default_index, _maybe_upcast, + _asarray_tuplesafe) from pandas.core.daterange import DateRange from pandas.core.generic import PandasObject from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import _SeriesIndexer, _maybe_droplevels -from pandas.util.decorators import deprecate from pandas.util import py3compat import pandas.core.common as common import pandas.core.datetools as datetools @@ -106,7 +104,6 @@ def __new__(cls, data, index=None, dtype=None, name=None, copy=False): index = Index(sorted(data.keys())) data = [data.get(idx, np.nan) for idx in index] - # Create array, do *not* copy data by default, infer type try: subarr = np.array(data, dtype=dtype, copy=copy) except ValueError: @@ -136,9 +133,11 @@ def __new__(cls, data, index=None, dtype=None, name=None, copy=False): subarr.fill(value) else: return subarr.item() - elif subarr.ndim > 1: - raise Exception('Data must be 1-dimensional') + if isinstance(data, np.ndarray): + raise Exception('Data must be 1-dimensional') + else: + subarr = _asarray_tuplesafe(data, dtype=dtype) if index is None: index = _default_index(len(subarr)) @@ -258,6 +257,7 @@ def __getitem__(self, key): if isinstance(self.index, MultiIndex): return self._multilevel_index(key) else: + hash(key) values = self.values try: return values[self.index.get_loc(key)] @@ -288,7 +288,10 @@ def _index_with(indexer): # [slice(0, 5, None)] will break if you convert to ndarray, # e.g. 
as requested by np.median - return _index_with(key) + try: + return _index_with(key) + except Exception: + return self.values[key] def _multilevel_index(self, key): values = self.values @@ -310,8 +313,12 @@ def _multilevel_index(self, key): _get_val_at = ndarray.__getitem__ def __getslice__(self, i, j): - return self._constructor(self.values[i:j], index=self.index[i:j], - name=self.name) + if i < 0: + i -= len(self) + if j < 0: + j -= len(self) + slobj = slice(i, j) + return self.__getitem__(slobj) def __setitem__(self, key, value): values = self.values @@ -361,11 +368,16 @@ def __setslice__(self, i, j, value): def __repr__(self): """Clean string representation of a Series""" if len(self.index) > 500: - return self._tidy_repr(30) + result = self._tidy_repr(30) elif len(self.index) > 0: - return self._get_repr(name=True) + result = self._get_repr(name=True) else: - return '%s' % ndarray.__repr__(self) + result = '%s' % ndarray.__repr__(self) + + if self.index.name is not None: + result = '%s\n%s' % (self.index.name, result) + + return result def _tidy_repr(self, max_vals=20): num = max_vals // 2 @@ -375,10 +387,20 @@ def _tidy_repr(self, max_vals=20): result = '%s\nName: %s, Length: %d' % (result, self.name, len(self)) return result - def to_string(self, buffer=sys.stdout, nanRep='NaN'): - print >> buffer, self._get_repr(nanRep=nanRep) + def to_string(self, buf=None, na_rep='NaN', nanRep=None): + if nanRep is not None: # pragma: no cover + import warnings + warnings.warn("nanRep is deprecated, use na_rep", + FutureWarning) + na_rep = nanRep + + the_repr = self._get_repr(na_rep=na_rep) + if buf is None: + return the_repr + else: + print >> buf, the_repr - def _get_repr(self, name=False, nanRep='NaN'): + def _get_repr(self, name=False, na_rep='NaN'): vals = self.values index = self.index @@ -388,7 +410,7 @@ def _get_repr(self, name=False, nanRep='NaN'): def _format_float(k, v): if np.isnan(v): - v = nanRep + v = na_rep else: v = str(v) return '%s %s' % (str(k).ljust(padSpace), v) @@ -421,7 +443,7 @@ def iteritems(self): return itertools.izip(iter(self.index), iter(self)) iterkv = iteritems - if py3compat.PY3: + if py3compat.PY3: # pragma: no cover items = iteritems #---------------------------------------------------------------------- @@ -551,36 +573,33 @@ def count(self, level=None): def _count_level(self, level): # TODO: GENERALIZE CODE OVERLAP WITH DATAFRAME - # TODO: deal with sortedness?? - obj = self.sortlevel(level) - mask = notnull(obj.values) - - level_index = obj.index.levels[level] + mask = notnull(self.values) + level_index = self.index.levels[level] if len(self) == 0: return Series(0, index=level_index) - n = len(level_index) - locs = obj.index.labels[level].searchsorted(np.arange(n)) + # call cython function + max_bin = len(level_index) + counts = lib.count_level_1d(mask.view(np.uint8), + self.index.labels[level], max_bin) + return Series(counts, index=level_index) - # WORKAROUND: reduceat fusses about the endpoints. should file ticket? - start = locs.searchsorted(0, side='right') - 1 - end = locs.searchsorted(len(mask), side='left') - - result = np.zeros((n), dtype=int) - out = result[start:end] - np.add.reduceat(mask, locs[start:end], out=out) - - # WORKAROUND: to see why, try this - # arr = np.ones((10, 4), dtype=bool) - # np.add.reduceat(arr, [0, 3, 3, 7, 9], axis=0) - - # this stinks - if len(locs) > 1: - workaround_mask = locs[:-1] == locs[1:] - result[:-1][workaround_mask] = 0 + def value_counts(self): + """ + Returns Series containing counts of unique values. 
The resulting Series + will be in descending order so that the first element is the most + frequently-occurring element. Excludes NA values - return Series(result, index=level_index) + Returns + ------- + counts : Series + """ + from collections import defaultdict + counter = defaultdict(lambda: 0) + for value in self.dropna().values: + counter[value] += 1 + return Series(counter).order(ascending=False) def sum(self, axis=0, dtype=None, out=None, skipna=True): """ @@ -676,7 +695,7 @@ def min(self, axis=None, out=None, skipna=True): """ arr = self.values.copy() if skipna: - if not issubclass(arr.dtype.type, np.int_): + if not issubclass(arr.dtype.type, np.integer): np.putmask(arr, isnull(arr), np.inf) return arr.min() @@ -695,7 +714,7 @@ def max(self, axis=None, out=None, skipna=True): """ arr = self.values.copy() if skipna: - if not issubclass(arr.dtype.type, np.int_): + if not issubclass(arr.dtype.type, np.integer): np.putmask(arr, isnull(arr), -np.inf) return arr.max() @@ -789,7 +808,7 @@ def cumsum(self, axis=0, dtype=None, out=None, skipna=True): """ arr = self.values.copy() - do_mask = skipna and not issubclass(self.dtype.type, np.int_) + do_mask = skipna and not issubclass(self.dtype.type, np.integer) if do_mask: mask = isnull(arr) np.putmask(arr, mask, 0.) @@ -818,7 +837,7 @@ def cumprod(self, axis=0, dtype=None, out=None, skipna=True): """ arr = self.values.copy() - do_mask = skipna and not issubclass(self.dtype.type, np.int_) + do_mask = skipna and not issubclass(self.dtype.type, np.integer) if do_mask: mask = isnull(arr) np.putmask(arr, mask, 1.) @@ -869,12 +888,26 @@ def describe(self): ------- desc : Series """ - names = ['count', 'mean', 'std', 'min', - '25%', '50%', '75%', 'max'] + try: + from collections import Counter + except ImportError: # pragma: no cover + # For Python < 2.7, we include a local copy of this: + from pandas.util.counter import Counter + + if self.dtype == object: + names = ['count', 'unique', 'top', 'freq'] + + objcounts = Counter(self.dropna().values) + top, freq = objcounts.most_common(1)[0] + data = [self.count(), len(objcounts), top, freq] + + else: + names = ['count', 'mean', 'std', 'min', + '25%', '50%', '75%', 'max'] - data = [self.count(), self.mean(), self.std(), self.min(), - self.quantile(.25), self.median(), self.quantile(.75), - self.max()] + data = [self.count(), self.mean(), self.std(), self.min(), + self.quantile(.25), self.median(), self.quantile(.75), + self.max()] return Series(data, index=names) @@ -925,7 +958,7 @@ def autocorr(self): """ return self.corr(self.shift(1)) - def clip(self, upper=None, lower=None): + def clip(self, lower=None, upper=None, out=None): """ Trim values at input threshold(s) @@ -938,6 +971,9 @@ def clip(self, upper=None, lower=None): ------- clipped : Series """ + if out is not None: # pragma: no cover + raise Exception('out argument is not supported yet') + result = self if lower is not None: result = result.clip_lower(lower) @@ -990,7 +1026,7 @@ def append(self, other): y : Series """ new_index = self.index.append(other.index) - new_index._verify_integrity() + assert(new_index._verify_integrity()) new_values = np.concatenate((self.values, other.values)) name = _maybe_match_name(self, other) @@ -1044,7 +1080,8 @@ def _binop(self, other, func, fill_value=None): mul = _flex_method(operator.mul, 'multiply') try: div = _flex_method(operator.div, 'divide') - except AttributeError: # Python 3 + except AttributeError: # pragma: no cover + # Python 3 div = _flex_method(operator.truediv, 'divide') def combine(self, 
other, func, fill_value=nan): @@ -1149,7 +1186,7 @@ def argsort(self, axis=0, kind='quicksort', order=None): else: return Series(np.argsort(values), index=self.index, name=self.name) - def order(self, na_last=True, ascending=True, **kwds): + def order(self, na_last=True, ascending=True): """ Sorts Series object, by value, maintaining index-value link @@ -1172,11 +1209,6 @@ def _try_mergesort(arr): # stable sort not available for object dtype return arr.argsort() - if 'missingAtEnd' in kwds: # pragma: no cover - warnings.warn("missingAtEnd is deprecated, use na_last", - FutureWarning) - na_last = kwds['missingAtEnd'] - arr = self.values sortedIdx = np.empty(len(self), dtype=np.int32) @@ -1350,29 +1382,29 @@ def align(self, other, join='outer', copy=True): (left, right) : (Series, Series) Aligned Series """ + if self.index.equals(other.index): + left, right = self, other + if copy: + left = left.copy() + right = right.copy() + return left, right + join_index, lidx, ridx = self.index.join(other.index, how=join, return_indexers=True) - if lidx is not None: - left = Series(common.take_1d(self.values, lidx), join_index, - name=self.name) - else: - if copy: - new_values = self.values.copy() + def _align_series(series, indexer): + if indexer is not None: + new_values = common.take_1d(series.values, indexer) else: - new_values = self.values - left = Series(new_values, join_index, name=self.name) - - if ridx is not None: - right = Series(common.take_1d(other.values, ridx), join_index, - name=other.name) - else: - if copy: - new_values = other.values.copy() - else: - new_values = other.values - right = Series(new_values, join_index, name=other.name) + if copy: + new_values = series.values.copy() + else: + new_values = series.values + result = Series(new_values, join_index, name=series.name) + return result + left = _align_series(self, lidx) + right = _align_series(other, ridx) return left, right def reindex(self, index=None, method=None, copy=True): @@ -1827,15 +1859,7 @@ def rename(self, mapper): ------- renamed : Series (new object) """ - if isinstance(mapper, (dict, Series)): - def mapper_f(x): - if x in mapper: - return mapper[x] - else: - return x - else: - mapper_f = mapper - + mapper_f = _get_rename_function(mapper) result = self.copy() result.index = [mapper_f(x) for x in self.index] @@ -1845,24 +1869,6 @@ def mapper_f(x): def weekday(self): return Series([d.weekday() for d in self.index], index=self.index) - #---------------------------------------------------------------------- - # Deprecated stuff - - @classmethod - def fromValue(cls, value=nan, index=None, dtype=None): # pragma: no cover - warnings.warn("'fromValue', can call Series(value, index=index) now", - FutureWarning) - return Series(value, index=index, dtype=dtype) - - asOf = deprecate('asOf', asof) - toDict = deprecate('toDict', to_dict) - toString = deprecate('toString', to_string) - merge = deprecate('merge', map) - applymap = deprecate('applymap', apply) - combineFirst = deprecate('combineFirst', combine_first) - _firstTimeWithValue = deprecate('_firstTimeWithValue', first_valid_index) - _lastTimeWithValue = deprecate('_lastTimeWithValue', last_valid_index) - toCSV = deprecate('toCSV', to_csv) class TimeSeries(Series): pass @@ -1875,3 +1881,16 @@ def remove_na(arr): Return array containing only true/non-NaN values, possibly empty. 
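A brief sketch of the Series additions and changed signatures above, using a throwaway Series::

    from pandas import Series

    s = Series([3, 1, 2, 3, 3, 1])

    # counts of unique values, most frequent first: 3 -> 3, 1 -> 2, 2 -> 1
    counts = s.value_counts()

    # clip arguments now follow numpy.clip order: (lower, upper)
    clipped = s.clip(lower=1, upper=2)

    # to_string returns the rendered text when no buffer is passed
    text = s.to_string(na_rep='missing')

    # describe on a non-numeric Series reports count/unique/top/freq
    info = Series(['a', 'b', 'a', 'a']).describe()
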
""" return arr[notnull(arr)] + + +def _get_rename_function(mapper): + if isinstance(mapper, (dict, Series)): + def f(x): + if x in mapper: + return mapper[x] + else: + return x + else: + f = mapper + + return f diff --git a/pandas/core/sparse.py b/pandas/core/sparse.py index c02b0cee22011..78f894b476ac3 100644 --- a/pandas/core/sparse.py +++ b/pandas/core/sparse.py @@ -1104,7 +1104,7 @@ def add_suffix(self, suffix): f = ('%s' + ('%s' % suffix)).__mod__ return self.rename(columns=f) - def _join_on(self, other, on): + def _join_on(self, other, on, how, lsuffix, rsuffix): # need to implement? raise NotImplementedError @@ -1162,7 +1162,7 @@ def transpose(self): T = property(transpose) def count(self, axis=0, **kwds): - return self.apply(SparseSeries.count, axis=axis) + return self.apply(lambda x: x.count(), axis=axis) count.__doc__ = DataFrame.count.__doc__ def cumsum(self, axis=0): @@ -1178,7 +1178,7 @@ def cumsum(self, axis=0): ------- y : SparseDataFrame """ - return self.apply(SparseSeries.cumsum, axis=axis) + return self.apply(lambda x: x.cumsum(), axis=axis) def shift(self, periods, offset=None, timeRule=None): """ diff --git a/pandas/io/data.py b/pandas/io/data.py index 3ecd3caf3ef4f..fa7d8aa519f2d 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -11,7 +11,7 @@ from zipfile import ZipFile from StringIO import StringIO -from pandas import DataFrame, Index +from pandas import DataFrame, read_csv class DataReader(list): @@ -19,7 +19,7 @@ class DataReader(list): Imports data from a number of online sources. Currently supports Yahoo! finance, St. Louis FED (FRED), and Kenneth French's data library. - + Parameters ---------- name : str @@ -52,8 +52,8 @@ def __new__(cls, name, data_source=None, start=None, end=None, **kwds): start = dt.datetime(2010, 1, 1) if(end is None): end = dt.datetime.today() - - self = super(DataReader, cls).__new__(cls) + + self = super(DataReader, cls).__new__(cls) if(data_source == "yahoo"): return self.get_data_yahoo(name=name, start=start, end=end) @@ -86,20 +86,11 @@ def get_data_yahoo(self, name=None, start=None, end=None): '&g=d' + \ '&ignore=.csv' - days = urllib.urlopen(url).readlines() - - data = np.array([day[:-2].split(',') for day in days]) - header = [str.lower(name) for name in data[0]] - index = Index([dt.datetime.strptime(row[0], "%Y-%m-%d") for row in data[1:]]) - data = np.array([[row[1], row[2], row[3], row[4], int(row[5]), row[6]] for row in data[1:]], dtype=float) - - data = DataFrame(data, index, columns=header[1:]).sort() - - return data - - + lines = urllib.urlopen(url).read() + return read_csv(StringIO(lines), index_col=0, parse_dates=True) - def get_data_fred(self, name=None, start=dt.datetime(2010, 1, 1), end=dt.datetime.today()): + def get_data_fred(self, name=None, start=dt.datetime(2010, 1, 1), + end=dt.datetime.today()): """ Get data for the given name from the St. Louis FED (FRED). 
Date format is datetime @@ -144,7 +135,7 @@ def get_data_famafrench(self, name): dataset = [d.split() for d in data[(file_edges[i] + 1):file_edges[i+1]]] if(len(dataset) > 10): ncol = np.median(np.array([len(d) for d in dataset])) - header_index = np.where(np.array([len(d) for d in dataset]) == (ncol-1))[0][-1] + header_index = np.where(np.array([len(d) for d in dataset]) == (ncol-1))[0][-1] header = dataset[header_index] # to ensure the header is unique header = [str(j + 1) + " " + header[j] for j in range(len(header))] diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 1ec552555036f..d685724398200 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2,43 +2,18 @@ Module contains tools for processing files into DataFrames or other objects """ -from datetime import datetime -from itertools import izip +from StringIO import StringIO +import zipfile + import numpy as np from pandas.core.index import Index, MultiIndex from pandas.core.frame import DataFrame +import pandas._tseries as lib -def read_csv(filepath_or_buffer, sep=None, header=0, skiprows=None, index_col=0, - na_values=None, date_parser=None, names=None): - """ - Read CSV file into DataFrame - - Parameters - ---------- - filepath_or_buffer : string or file handle / StringIO - sep : string, default None - Delimiter to use. By default will try to automatically determine - this - header : int, default 0 - Row to use for the column labels of the parsed DataFrame - skiprows : list-like - Row numbers to skip (0-indexed) - index_col : int or sequence., default 0 - Column to use as the row labels of the DataFrame. Pass None if there is - no such column. If a sequence is given, a MultiIndex is used. - na_values : list-like, default None - List of additional strings to recognize as NA/NaN - date_parser : function - Function to use for converting dates to strings. 
Defaults to - dateutil.parser - names : array-like - List of column names - - Returns - ------- - parsed : DataFrame - """ +def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None, + skiprows=None, na_values=None, parse_dates=False, + date_parser=None, nrows=None, iterator=False, chunksize=None): import csv if hasattr(filepath_or_buffer, 'read'): @@ -58,223 +33,402 @@ def read_csv(filepath_or_buffer, sep=None, header=0, skiprows=None, index_col=0, dia.delimiter = sep # attempt to sniff the delimiter if sniff_sep: - sample = f.readline() - sniffed = csv.Sniffer().sniff(sample) + line = f.readline() + sniffed = csv.Sniffer().sniff(line) dia.delimiter = sniffed.delimiter - f.seek(0) + buf = list(csv.reader(StringIO(line), dialect=dia)) + else: + buf = [] reader = csv.reader(f, dialect=dia) - if skiprows is not None: - skiprows = set(skiprows) - lines = [l for i, l in enumerate(reader) if i not in skiprows] - else: - lines = [l for l in reader] - f.close() - return _simple_parser(lines, header=header, indexCol=index_col, - colNames=names, na_values=na_values, - date_parser=date_parser) - -def read_table(filepath_or_buffer, sep='\t', header=0, skiprows=None, - index_col=0, na_values=None, date_parser=None, names=None): - """ - Read delimited file into DataFrame + if date_parser is not None: + parse_dates = True + + parser = TextParser(reader, header=header, index_col=index_col, + names=names, na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + skiprows=skiprows, + chunksize=chunksize, buf=buf) + + if nrows is not None: + return parser.get_chunk(nrows) + elif chunksize or iterator: + return parser + + return parser.get_chunk() + + +def read_table(filepath_or_buffer, sep='\t', header=0, index_col=None, + names=None, skiprows=None, na_values=None, parse_dates=False, + date_parser=None, nrows=None, iterator=False, chunksize=None): + return read_csv(filepath_or_buffer, sep=sep, header=header, + skiprows=skiprows, index_col=index_col, + na_values=na_values, date_parser=date_parser, + names=names, parse_dates=parse_dates, + nrows=nrows, iterator=iterator, chunksize=chunksize) + +_parser_params = """Also supports optionally iterating or breaking of the file +into chunks. + +Parameters +---------- +filepath_or_buffer : string or file handle / StringIO +%s +header : int, default 0 + Row to use for the column labels of the parsed DataFrame +skiprows : list-like + Row numbers to skip (0-indexed) +index_col : int or sequence, default None + Column to use as the row labels of the DataFrame. If a sequence is + given, a MultiIndex is used. +names : array-like + List of column names +na_values : list-like, default None + List of additional strings to recognize as NA/NaN +parse_dates : boolean, default False + Attempt to parse dates in the index column(s) +date_parser : function + Function to use for converting dates to strings. Defaults to + dateutil.parser +nrows : int, default None + Number of rows of file to read. 
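A minimal sketch of the reworked parser arguments documented above; 'prices.csv' and its contents are hypothetical::

    from pandas import read_csv

    # index_col is no longer assumed to be column 0, so name it explicitly
    # and ask for the dates in it to be parsed
    frame = read_csv('prices.csv', index_col=0, parse_dates=True)

    # read only the first 100 rows of a large file
    head = read_csv('prices.csv', index_col=0, nrows=100)
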
Useful for reading pieces of large files +iterator : boolean, default False + Return TextParser object +chunksize : int, default None + Return TextParser object for iteration + +Returns +------- +result : DataFrame or TextParser +""" - Parameters - ---------- - filepath_or_buffer : string or file handle - sep : string, default '\t' - Delimiter to use - header : int, default 0 - Row to use for the column labels of the parsed DataFrame - skiprows : list-like - Row numbers to skip (0-indexed) - index_col : int or sequence, default 0 - Column to use as the row labels of the DataFrame. Pass None if there is - no such column. If a sequence is given, a MultiIndex is used. - na_values : list-like, default None - List of additional strings to recognize as NA/NaN - date_parser : function - Function to use for converting dates to strings. Defaults to - dateutil.parser - names : array-like - List of column names - - Returns - ------- - parsed : DataFrame - """ - return read_csv(filepath_or_buffer, sep, header, skiprows, - index_col, na_values, date_parser, names) +_csv_sep = """sep : string, default None + Delimiter to use. By default will try to automatically determine + this""" -def _simple_parser(lines, colNames=None, header=0, indexCol=0, - na_values=None, date_parser=None, parse_dates=True): - """ - Workhorse function for processing nested list into DataFrame +_table_sep = """sep : string, default \\t (tab-stop) + Delimiter to use""" - Should be replaced by np.genfromtxt eventually? - """ - if header is not None: - columns = [] - for i, c in enumerate(lines[header]): - if c == '': - columns.append('Unnamed: %d' % i) - else: - columns.append(c) +read_csv.__doc__ = """ +Read CSV (comma-separated) file into DataFrame - content = lines[header+1:] +%s - counts = {} - for i, col in enumerate(columns): - cur_count = counts.get(col, 0) - if cur_count > 0: - columns[i] = '%s.%d' % (col, cur_count) - counts[col] = cur_count + 1 - else: - ncols = len(lines[0]) - if not colNames: - columns = ['X.%d' % (i + 1) for i in range(ncols)] - else: - assert(len(colNames) == ncols) - columns = colNames - content = lines +Returns +------- +parsed : DataFrame +""" % (_parser_params % _csv_sep) - zipped_content = zip(*content) +read_table.__doc__ = """ +Read delimited file into DataFrame - if len(content) == 0: # pragma: no cover - raise Exception('No content to parse') +%s - # no index column specified, so infer that's what is wanted - if indexCol is not None: - if np.isscalar(indexCol): - if indexCol == 0 and len(content[0]) == len(columns) + 1: - index = zipped_content[0] - zipped_content = zipped_content[1:] - else: - index = zipped_content.pop(indexCol) - columns.pop(indexCol) - else: # given a list of index - idx_names = [] - index = [] - for idx in indexCol: - idx_names.append(columns[idx]) - index.append(zipped_content[idx]) - #remove index items from content and columns, don't pop in loop - for i in range(len(indexCol)): - columns.remove(idx_names[i]) - zipped_content.remove(index[i]) - - - if np.isscalar(indexCol): - if parse_dates: - index = _try_parse_dates(index, parser=date_parser) - index = Index(_maybe_convert_int(np.array(index, dtype=object))) - else: - index = MultiIndex.from_arrays(_maybe_convert_int_mindex(index, - parse_dates, date_parser), - names=idx_names) - else: - index = Index(np.arange(len(content))) +Returns +------- +parsed : DataFrame +""" % (_parser_params % _table_sep) - if len(columns) != len(zipped_content): - raise Exception('wrong number of columns') - data = dict(izip(columns, 
zipped_content)) - data = _floatify(data, na_values=na_values) - data = _convert_to_ndarrays(data) - return DataFrame(data=data, columns=columns, index=index) +class BufferedReader(object): + """ + For handling different kinds of files, e.g. zip files where reading out a + chunk of lines is faster than reading out one line at a time. + """ + + def __init__(self, fh, delimiter=','): + pass + +class BufferedCSVReader(BufferedReader): + pass -def _floatify(data_dict, na_values=None): +class TextParser(object): """ + Converts lists of lists/tuples into DataFrames with proper type inference + and optional (e.g. string to datetime) conversion. Also enables iterating + lazily over chunks of large files + Parameters + ---------- + data : list or csv reader-like object + names : sequence, default + header : int, default 0 + Row to use to parse column labels. Defaults to the first row. Prior + rows will be discarded + index_col : int or list, default None + Column or columns to use as the (possibly hierarchical) index + na_values : iterable, defualt None + Custom NA values + parse_dates : boolean, default False + date_parser : function, default None + skiprows """ + # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', 'NA', '#NA', 'NULL', 'NaN', 'nan', '']) - if na_values is None: - na_values = NA_VALUES - else: - na_values = set(list(na_values)) | NA_VALUES - def _convert_float(val): - if val in na_values: - return np.nan + def __init__(self, data, names=None, header=0, index_col=None, + na_values=None, parse_dates=False, date_parser=None, + chunksize=None, skiprows=None, buf=None): + """ + Workhorse function for processing nested list into DataFrame + + Should be replaced by np.genfromtxt eventually? 
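A sketch of the lazy, chunked reading that the TextParser enables, again against a hypothetical large file::

    from pandas import read_csv

    # process a large file 1000 rows at a time
    total = 0
    for chunk in read_csv('big.csv', index_col=0, chunksize=1000):
        total += len(chunk)

    # or pull chunks on demand from the returned parser
    parser = read_csv('big.csv', index_col=0, iterator=True)
    first_500 = parser.get_chunk(500)
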
+ """ + self.data = data + + # can pass rows read so far + self.buf = [] if buf is None else buf + self.pos = len(self.buf) + + self.names = list(names) if names is not None else names + self.header = header + self.index_col = index_col + self.parse_dates = parse_dates + self.date_parser = date_parser + self.chunksize = chunksize + self.passed_names = names is not None + self.skiprows = set() if skiprows is None else set(skiprows) + + if na_values is None: + self.na_values = self.NA_VALUES else: - try: - return np.float64(val) - except Exception: - return val + self.na_values = set(list(na_values)) | self.NA_VALUES - result = {} - for col, values in data_dict.iteritems(): - result[col] = [_convert_float(val) for val in values] + self.columns = self._infer_columns() + self.index_name = self._get_index_name() + self._first_chunk = True - return result + def _infer_columns(self): + names = self.names + passed_names = self.names is not None + if passed_names: + self.header = None + + if self.header is not None: + if len(self.buf) > 0: + line = self.buf[0] + else: + line = self._next_line() + + while self.pos <= self.header: + line = self._next_line() + + columns = [] + for i, c in enumerate(line): + if c == '': + columns.append('Unnamed: %d' % i) + else: + columns.append(c) + + counts = {} + for i, col in enumerate(columns): + cur_count = counts.get(col, 0) + if cur_count > 0: + columns[i] = '%s.%d' % (col, cur_count) + counts[col] = cur_count + 1 + self._clear_buffer() + else: + line = self._next_line() -def _maybe_convert_int(arr): - if len(arr) == 0: # pragma: no cover - return arr + ncols = len(line) + if not names: + columns = ['X.%d' % (i + 1) for i in range(ncols)] + else: + columns = names - try: - if arr.dtype == np.object_: - return arr.astype(int) - - if abs(arr[0] - int(arr[0])) < 1e-10: - casted = arr.astype(int) - if (np.abs(casted - arr) < 1e-10).all(): - return casted - except (TypeError, ValueError): - pass + return columns - return arr + def _next_line(self): + if isinstance(self.data, list): + if self.pos in self.skiprows: + self.pos += 1 -def _maybe_convert_int_mindex(index, parse_dates, date_parser): - if len(index) == 0: - return index + line = self.data[self.pos] + else: + if self.pos in self.skiprows: + self.data.next() + self.pos += 1 + line = self.data.next() + self.pos += 1 + self.buf.append(line) + return line + + def _clear_buffer(self): + self.buf = [] + + def __iter__(self): + try: + while True: + yield self.get_chunk(self.chunksize) + except StopIteration: + pass + + def _get_index_name(self): + columns = self.columns + + try: + line = self._next_line() + except StopIteration: + line = None + + # implicitly index_col=0 b/c 1 fewer column names + if line is not None: + implicit_first_cols = len(line) - len(columns) + else: + implicit_first_cols = 0 + + index_name = None + if implicit_first_cols > 0: + if self.index_col is None: + if implicit_first_cols == 1: + self.index_col = 0 + else: + self.index_col = range(implicit_first_cols) + index_name = None + elif np.isscalar(self.index_col): + index_name = columns.pop(self.index_col) + elif self.index_col is not None: + cp_cols = list(columns) + index_name = [] + for i in self.index_col: + name = cp_cols[i] + columns.remove(name) + index_name.append(name) + + return index_name + + def get_chunk(self, rows=None): + try: + content = self._get_lines(rows) + except StopIteration: + if self._first_chunk: + content = [] + else: + raise + + # done with first read, next time raise StopIteration + self._first_chunk = False + + 
if len(content) == 0: # pragma: no cover + if self.index_col is not None: + if np.isscalar(self.index_col): + index = Index([], name=self.index_name) + else: + index = MultiIndex.from_arrays([[]] * len(self.index_col), + names=self.index_name) + else: + index = Index([]) + + return DataFrame(index=index, columns=self.columns) + + zipped_content = list(lib.to_object_array(content).T) + + # no index column specified, so infer that's what is wanted + if self.index_col is not None: + if np.isscalar(self.index_col): + index = zipped_content.pop(self.index_col) + else: # given a list of index + index = [] + for idx in self.index_col: + index.append(zipped_content[idx]) + #remove index items from content and columns, don't pop in loop + for i in range(len(self.index_col)): + zipped_content.remove(index[i]) + + if np.isscalar(self.index_col): + if self.parse_dates: + index = lib.try_parse_dates(index, parser=self.date_parser) + index = Index(_convert_types(index, self.na_values), + name=self.index_name) + else: + arrays = [] + for arr in index: + if self.parse_dates: + arr = lib.try_parse_dates(arr, parser=self.date_parser) + arrays.append(_convert_types(arr, self.na_values)) + index = MultiIndex.from_arrays(arrays, names=self.index_name) + else: + index = Index(np.arange(len(content))) + + if not index._verify_integrity(): + dups = index._get_duplicates() + raise Exception('Index has duplicates: %s' % str(dups)) + + if len(self.columns) != len(zipped_content): + raise Exception('wrong number of columns') + + data = dict((k, v) for k, v in zip(self.columns, zipped_content)) + data = _convert_to_ndarrays(data, self.na_values) + return DataFrame(data=data, columns=self.columns, index=index) + + def _get_lines(self, rows=None): + source = self.data + lines = self.buf + + # already fetched some number + if rows is not None: + rows -= len(self.buf) + + if isinstance(source, list): + if self.pos >= len(source): + raise StopIteration + if rows is None: + lines.extend(source[self.pos:]) + self.pos = len(source) + else: + lines.extend(source[self.pos:self.pos+rows]) + self.pos += rows + else: + try: + if rows is not None: + for _ in xrange(rows): + lines.append(source.next()) + else: + while True: + lines.append(source.next()) + except StopIteration: + if len(lines) == 0: + raise + self.pos += len(lines) + + self.buf = [] + + return lines + +def _maybe_convert_int_mindex(index, parse_dates, date_parser): for i in range(len(index)): try: int(index[i][0]) index[i] = map(int, index[i]) except ValueError: if parse_dates: - index[i] = _try_parse_dates(index[i], date_parser) + index[i] = lib.try_parse_dates(index[i], date_parser) return index -def _convert_to_ndarrays(dct): +def _convert_to_ndarrays(dct, na_values): result = {} for c, values in dct.iteritems(): - try: - values = np.array(values, dtype=float) - except Exception: - values = np.array(values, dtype=object) - result[c] = _maybe_convert_int(values) + result[c] = _convert_types(values, na_values) return result -def _try_parse_dates(values, parser=None): - if parser is None: - try: - from dateutil import parser - parse_date = parser.parse - except ImportError: # pragma: no cover - def parse_date(s): - try: - return datetime.strptime(s, '%m/%d/%Y') - except Exception: - return s - else: - parse_date = parser - - # EAFP +def _convert_types(values, na_values): try: - return [parse_date(val) for val in values] + values = lib.maybe_convert_numeric(values, na_values) except Exception: - # failed - return values + lib.sanitize_objects(values) + + if 
values.dtype == np.object_: + return lib.maybe_convert_bool(values) + return values #------------------------------------------------------------------------------- # ExcelFile class @@ -298,8 +452,9 @@ def __init__(self, path): def __repr__(self): return object.__repr__(self) - def parse(self, sheetname, header=0, skiprows=None, index_col=0, - na_values=None): + def parse(self, sheetname, header=0, skiprows=None, index_col=None, + parse_dates=False, date_parser=None, na_values=None, + chunksize=None): """ Read Excel table into DataFrame @@ -327,16 +482,8 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=0, datemode = self.book.datemode sheet = self.book.sheet_by_name(sheetname) - if skiprows is None: - skiprows = set() - else: - skiprows = set(skiprows) - data = [] for i in range(sheet.nrows): - if i in skiprows: - continue - row = [] for value, typ in zip(sheet.row_values(i), sheet.row_types(i)): if typ == XL_CELL_DATE: @@ -348,43 +495,12 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=0, value = datetime(*dt) row.append(value) data.append(row) - return _simple_parser(data, header=header, indexCol=index_col, - na_values=na_values) - -#------------------------------------------------------------------------------- -# Deprecated stuff - -import warnings - -def parseCSV(filepath, header=0, skiprows=None, indexCol=0, - na_values=None): # pragma: no cover - """ - Parse CSV file into a DataFrame object. Try to parse dates if possible. - """ - warnings.warn("parseCSV is deprecated. Use read_csv instead", FutureWarning) - return read_csv(filepath, header=header, skiprows=skiprows, - index_col=indexCol, na_values=na_values) - -def parseText(filepath, sep='\t', header=0, - indexCol=0, colNames=None): # pragma: no cover - """ - Parse whitespace separated file into a DataFrame object. - Try to parse dates if possible. - """ - warnings.warn("parseText is deprecated. Use read_table instead", - FutureWarning) - return read_table(filepath, sep=sep, header=header, index_col=indexCol, - names=colNames) - - -def parseExcel(filepath, header=None, indexCol=0, - sheetname=None, **kwds): # pragma: no cover - """ - - """ - warnings.warn("parseExcel is deprecated. 
Use the ExcelFile class instead", - FutureWarning) - excel_file = ExcelFile(filepath) - return excel_file.parse(sheetname, header=header, index_col=indexCol) + parser = TextParser(data, header=header, index_col=index_col, + na_values=na_values, + parse_dates=parse_dates, + date_parser=date_parser, + skiprows=skiprows, + chunksize=chunksize) + return parser.get_chunk() diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 436e61d70b949..8b8ac500c775a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -5,12 +5,12 @@ # pylint: disable-msg=E1101,W0613,W0603 -from datetime import datetime +from datetime import datetime, date import time import numpy as np from pandas import (Series, TimeSeries, DataFrame, Panel, LongPanel, - MultiIndex) + Index, MultiIndex) from pandas.core.common import adjoin import pandas._tseries as lib @@ -331,6 +331,7 @@ def _write_to_group(self, key, value, table=False, append=False, def _write_series(self, group, series): self._write_index(group, 'index', series.index) self._write_array(group, 'values', series.values) + group._v_attrs.name = series.name def _write_frame(self, group, df): self._write_block_manager(group, df._data) @@ -440,6 +441,7 @@ def _write_index(self, group, key, index): self._write_array(group, key, converted) node = getattr(group, key) node._v_attrs.kind = kind + node._v_attrs.name = index.name def _read_index(self, group, key): try: @@ -503,7 +505,10 @@ def _read_index_node(self, node): except Exception: name = None - return name, _unconvert_index(data, kind) + index = Index(_unconvert_index(data, kind)) + index.name = name + + return name, index def _write_array(self, group, key, value): if key in group: @@ -617,7 +622,8 @@ def _read_group(self, group, where=None): def _read_series(self, group, where=None): index = self._read_index(group, 'index') values = _read_array(group, 'values') - return Series(values, index=index) + name = getattr(group._v_attrs, 'name', None) + return Series(values, index=index, name=name) def _read_legacy_series(self, group, where=None): index = self._read_index_legacy(group, 'index') @@ -706,11 +712,14 @@ def _convert_index(index): # Let's assume the index is homogeneous values = np.asarray(index) - import time - if isinstance(values[0], datetime): - converted = np.array([time.mktime(v.timetuple()) - for v in values], dtype=np.int64) - return converted, 'datetime', _tables().Time64Col() + if isinstance(values[0], (datetime, date)): + if isinstance(values[0], datetime): + kind = 'datetime' + else: + kind = 'date' + converted = np.array([time.mktime(v.timetuple()) for v in values], + dtype=np.int64) + return converted, kind, _tables().Time64Col() elif isinstance(values[0], basestring): converted = np.array(list(values), dtype=np.str_) itemsize = converted.dtype.itemsize @@ -722,7 +731,6 @@ def _convert_index(index): else: # pragma: no cover raise ValueError('unrecognized index type %s' % type(values[0])) - def _read_array(group, key): import tables node = getattr(group, key) @@ -737,6 +745,10 @@ def _unconvert_index(data, kind): if kind == 'datetime': index = np.array([datetime.fromtimestamp(v) for v in data], dtype=object) + elif kind == 'date': + index = np.array([date.fromtimestamp(v) for v in data], + dtype=object) + elif kind in ('string', 'integer'): index = np.array(data, dtype=object) else: # pragma: no cover diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index e6cfce6f32cb3..1ae0876512ab9 100644 --- a/pandas/io/tests/test_parsers.py +++ 
b/pandas/io/tests/test_parsers.py @@ -1,5 +1,6 @@ from cStringIO import StringIO from datetime import datetime +import csv import os import unittest @@ -8,11 +9,20 @@ from numpy import nan import numpy as np -from pandas import DataFrame -from pandas.io.parsers import read_csv, read_table, ExcelFile +from pandas import DataFrame, Index +from pandas.io.parsers import read_csv, read_table, ExcelFile, TextParser from pandas.util.testing import assert_almost_equal, assert_frame_equal +import pandas._tseries as lib class TestParsers(unittest.TestCase): + data1 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" def setUp(self): self.dirpath = curpath() @@ -34,12 +44,11 @@ def test_custom_na_values(self): [nan, 5, nan], [7, 8, nan]] - df = read_csv(StringIO(data), index_col=None, na_values=['baz'], - skiprows=[1]) + df = read_csv(StringIO(data), na_values=['baz'], skiprows=[1]) assert_almost_equal(df.values, expected) - df2 = read_table(StringIO(data), sep=',', index_col=None, - na_values=['baz'], skiprows=[1]) + df2 = read_table(StringIO(data), sep=',', na_values=['baz'], + skiprows=[1]) assert_almost_equal(df2.values, expected) def test_unnamed_columns(self): @@ -51,19 +60,33 @@ def test_unnamed_columns(self): expected = [[1,2,3,4,5.], [6,7,8,9,10], [11,12,13,14,15]] - df = read_table(StringIO(data), sep=',', index_col=None) + df = read_table(StringIO(data), sep=',') assert_almost_equal(df.values, expected) self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'Unnamed: 3', 'Unnamed: 4'])) + def test_string_nas(self): + data = """A,B,C +a,b,c +d,,f +,g,h +""" + result = read_csv(StringIO(data)) + expected = DataFrame([['a', 'b', 'c'], + ['d', np.nan, 'f'], + [np.nan, 'g', 'h']], + columns=['A', 'B', 'C']) + + assert_frame_equal(result, expected) + def test_duplicate_columns(self): data = """A,A,B,B,B 1,2,3,4,5 6,7,8,9,10 11,12,13,14,15 """ - df = read_table(StringIO(data), sep=',', index_col=None) + df = read_table(StringIO(data), sep=',') self.assert_(np.array_equal(df.columns, ['A', 'A.1', 'B', 'B.1', 'B.2'])) @@ -73,7 +96,7 @@ def test_csv_mixed_type(self): b,3,4 c,4,5 """ - df = read_csv(StringIO(data), index_col=None) + df = read_csv(StringIO(data)) # TODO def test_csv_custom_parser(self): @@ -84,7 +107,18 @@ def test_csv_custom_parser(self): """ df = read_csv(StringIO(data), date_parser=lambda x: datetime.strptime(x, '%Y%m%d')) - expected = read_csv(StringIO(data)) + expected = read_csv(StringIO(data), parse_dates=True) + assert_frame_equal(df, expected) + + def test_parse_dates_implicit_first_col(self): + data = """A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + df = read_csv(StringIO(data), parse_dates=True) + expected = read_csv(StringIO(data), index_col=0, parse_dates=True) + self.assert_(isinstance(df.index[0], datetime)) assert_frame_equal(df, expected) def test_no_header(self): @@ -92,30 +126,45 @@ def test_no_header(self): 6,7,8,9,10 11,12,13,14,15 """ - df = read_table(StringIO(data), sep=',', index_col=None, - header=None) + df = read_table(StringIO(data), sep=',', header=None) names = ['foo', 'bar', 'baz', 'quux', 'panda'] - df2 = read_table(StringIO(data), sep=',', index_col=None, - header=None, names=names) + df2 = read_table(StringIO(data), sep=',', header=None, names=names) expected = [[1,2,3,4,5.], [6,7,8,9,10], [11,12,13,14,15]] assert_almost_equal(df.values, expected) + assert_almost_equal(df.values, df2.values) self.assert_(np.array_equal(df.columns, ['X.1', 'X.2', 'X.3', 'X.4', 
'X.5'])) self.assert_(np.array_equal(df2.columns, names)) + def test_header_with_index_col(self): + data = """foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + names = ['A', 'B', 'C'] + df = read_csv(StringIO(data), names=names) + + self.assertEqual(names, ['A', 'B', 'C']) + + values = [[1,2,3],[4,5,6],[7,8,9]] + expected = DataFrame(values, index=['foo','bar','baz'], + columns=['A','B','C']) + assert_frame_equal(df, expected) + def test_read_csv_dataframe(self): - df = read_csv(self.csv1) - df2 = read_table(self.csv1, sep=',') + df = read_csv(self.csv1, index_col=0, parse_dates=True) + df2 = read_table(self.csv1, sep=',', index_col=0, parse_dates=True) self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D'])) + self.assert_(df.index.name == 'index') self.assert_(isinstance(df.index[0], datetime)) self.assert_(df.values.dtype == np.float64) assert_frame_equal(df, df2) def test_read_csv_no_index_name(self): - df = read_csv(self.csv2) - df2 = read_table(self.csv2, sep=',') + df = read_csv(self.csv2, index_col=0, parse_dates=True) + df2 = read_table(self.csv2, sep=',', index_col=0, parse_dates=True) self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D', 'E'])) self.assert_(isinstance(df.index[0], datetime)) self.assert_(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype == np.float64) @@ -129,12 +178,264 @@ def test_excel_table(self): pth = os.path.join(self.dirpath, 'test.xls') xls = ExcelFile(pth) - df = xls.parse('Sheet1') - df2 = read_csv(self.csv1) - df3 = xls.parse('Sheet2', skiprows=[1]) + df = xls.parse('Sheet1', index_col=0, parse_dates=True) + df2 = read_csv(self.csv1, index_col=0, parse_dates=True) + df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, parse_dates=True) assert_frame_equal(df, df2) assert_frame_equal(df3, df2) + def test_read_table_wrong_num_columns(self): + data = """A,B,C,D,E,F +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + self.assertRaises(Exception, read_csv, StringIO(data)) + + def test_read_table_duplicate_index(self): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + + self.assertRaises(Exception, read_csv, StringIO(data), + index_col=0) + + def test_parse_bools(self): + data = """A,B +True,1 +False,2 +True,3 +""" + data = read_csv(StringIO(data)) + self.assert_(data['A'].dtype == np.bool_) + + def test_int_conversion(self): + data = """A,B +1.0,1 +2.0,2 +3.0,3 +""" + data = read_csv(StringIO(data)) + self.assert_(data['A'].dtype == np.float64) + self.assert_(data['B'].dtype == np.int64) + + def test_infer_index_col(self): + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + data = read_csv(StringIO(data)) + self.assert_(data.index.equals(Index(['foo', 'bar', 'baz']))) + + def test_sniff_delimiter(self): + data = """index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""" + data = read_csv(StringIO(data), index_col=0) + self.assert_(data.index.equals(Index(['foo', 'bar', 'baz']))) + + def test_read_nrows(self): + df = read_csv(StringIO(self.data1), nrows=3) + expected = read_csv(StringIO(self.data1))[:3] + assert_frame_equal(df, expected) + + def test_read_chunksize(self): + reader = read_csv(StringIO(self.data1), index_col=0, chunksize=2) + df = read_csv(StringIO(self.data1), index_col=0) + + chunks = list(reader) + + assert_frame_equal(chunks[0], df[:2]) + assert_frame_equal(chunks[1], df[2:4]) + assert_frame_equal(chunks[2], df[4:]) + + def test_iterator(self): + reader = read_csv(StringIO(self.data1), index_col=0, iterator=True) + + df = read_csv(StringIO(self.data1), index_col=0) + + 
chunk = reader.get_chunk(3) + assert_frame_equal(chunk, df[:3]) + + last_chunk = reader.get_chunk(5) + assert_frame_equal(last_chunk, df[3:]) + + # pass list + lines = list(csv.reader(StringIO(self.data1))) + parser = TextParser(lines, index_col=0, chunksize=2) + + df = read_csv(StringIO(self.data1), index_col=0) + + chunks = list(parser) + assert_frame_equal(chunks[0], df[:2]) + assert_frame_equal(chunks[1], df[2:4]) + assert_frame_equal(chunks[2], df[4:]) + + treader = read_table(StringIO(self.data1), sep=',', index_col=0, + iterator=True) + self.assert_(isinstance(treader, TextParser)) + + def test_header_not_first_line(self): + data = """got,to,ignore,this,line +got,to,ignore,this,line +index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + data2 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + + df = read_csv(StringIO(data), header=2, index_col=0) + expected = read_csv(StringIO(data2), header=0, index_col=0) + assert_frame_equal(df, expected) + + def test_pass_names_with_index(self): + lines = self.data1.split('\n') + no_header = '\n'.join(lines[1:]) + + # regular index + names = ['index', 'A', 'B', 'C', 'D'] + df = read_csv(StringIO(no_header), index_col=0, names=names) + expected = read_csv(StringIO(self.data1), index_col=0) + assert_frame_equal(df, expected) + + # multi index + data = """index1,index2,A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + lines = data.split('\n') + no_header = '\n'.join(lines[1:]) + names = ['index1', 'index2', 'A', 'B', 'C', 'D'] + df = read_csv(StringIO(no_header), index_col=[0, 1], names=names) + expected = read_csv(StringIO(data), index_col=[0, 1]) + assert_frame_equal(df, expected) + + def test_multi_index_no_level_names(self): + data = """index1,index2,A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + + data2 = """A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + + lines = data.split('\n') + no_header = '\n'.join(lines[1:]) + names = ['A', 'B', 'C', 'D'] + df = read_csv(StringIO(no_header), index_col=[0, 1], names=names) + expected = read_csv(StringIO(data), index_col=[0, 1]) + assert_frame_equal(df, expected) + + # 2 implicit first cols + df2 = read_csv(StringIO(data2)) + assert_frame_equal(df2, df) + + def test_multi_index_parse_dates(self): + data = """index1,index2,A,B,C +20090101,one,a,1,2 +20090101,two,b,3,4 +20090101,three,c,4,5 +20090102,one,a,1,2 +20090102,two,b,3,4 +20090102,three,c,4,5 +20090103,one,a,1,2 +20090103,two,b,3,4 +20090103,three,c,4,5 +""" + df = read_csv(StringIO(data), index_col=[0, 1], parse_dates=True) + self.assert_(isinstance(df.index.levels[0][0], datetime)) + +class TestParseSQL(unittest.TestCase): + + def test_convert_sql_column_floats(self): + arr = np.array([1.5, None, 3, 4.2], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_strings(self): + arr = np.array(['1.5', None, '3', '4.2'], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_unicode(self): + arr = np.array([u'1.5', None, u'3', u'4.2'], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([u'1.5', np.nan, u'3', u'4.2'], 
dtype=object) + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_ints(self): + arr = np.array([1, 2, 3, 4], dtype='O') + arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') + result = lib.convert_sql_column(arr) + result2 = lib.convert_sql_column(arr2) + expected = np.array([1, 2, 3, 4], dtype='i8') + assert_same_values_and_dtype(result, expected) + assert_same_values_and_dtype(result2, expected) + + arr = np.array([1, 2, 3, None, 4], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_longs(self): + arr = np.array([1L, 2L, 3L, 4L], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, 4], dtype='i8') + assert_same_values_and_dtype(result, expected) + + arr = np.array([1L, 2L, 3L, None, 4L], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_bools(self): + arr = np.array([True, False, True, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, True, False], dtype=bool) + assert_same_values_and_dtype(result, expected) + + arr = np.array([True, False, None, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, np.nan, False], dtype=object) + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_decimals(self): + from decimal import Decimal + arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + assert_same_values_and_dtype(result, expected) + +def assert_same_values_and_dtype(res, exp): + assert(res.dtype == exp.dtype) + assert_almost_equal(res, exp) + def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) return pth diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index ee470d593733b..6d980d8b13e70 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -214,6 +214,11 @@ def test_frame(self): self.assertRaises(ValueError, self._check_roundtrip, df[:0], tm.assert_frame_equal) + def test_can_serialize_dates(self): + rng = [x.date() for x in DateRange('1/1/2000', '1/30/2000')] + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + self._check_roundtrip(frame, tm.assert_frame_equal) + def test_store_hierarchical(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], @@ -227,7 +232,7 @@ def test_store_hierarchical(self): self._check_roundtrip(frame.T, tm.assert_frame_equal) self._check_roundtrip(frame['A'], tm.assert_series_equal) - # check that the + # check that the names are stored try: store = HDFStore(self.scratchpath) store['frame'] = frame @@ -237,6 +242,31 @@ def test_store_hierarchical(self): store.close() os.remove(self.scratchpath) + def test_store_index_name(self): + df = tm.makeDataFrame() + df.index.name = 'foo' + try: + store = HDFStore(self.scratchpath) + store['frame'] = df + recons = store['frame'] + assert(recons.index.name == 'foo') + finally: + store.close() + os.remove(self.scratchpath) + + def test_store_series_name(self): + df = tm.makeDataFrame() + series = df['A'] + + try: + store = HDFStore(self.scratchpath) + store['series'] = series + recons = store['series'] + assert(recons.name == 'A') + finally: + store.close() 
+ os.remove(self.scratchpath) + def test_store_mixed(self): def _make_one(): df = tm.makeDataFrame() diff --git a/pandas/rpy/common.py b/pandas/rpy/common.py index 534d8f58cf969..afd1f57306b54 100644 --- a/pandas/rpy/common.py +++ b/pandas/rpy/common.py @@ -74,6 +74,9 @@ def _convert_array(obj): def _convert_vector(obj): if isinstance(obj, robj.IntVector): return _convert_int_vector(obj) + elif isinstance(obj, robj.StrVector): + return _convert_str_vector(obj) + return list(obj) NA_INTEGER = -2147483648 @@ -86,6 +89,13 @@ def _convert_int_vector(obj): arr[mask] = np.nan return arr +def _convert_str_vector(obj): + arr = np.asarray(obj, dtype=object) + mask = arr == robj.NA_Character + if mask.any(): + arr[mask] = np.nan + return arr + def _convert_DataFrame(rdf): columns = list(rdf.colnames) rows = np.array(rdf.rownames) diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py index 381a040e12070..30624bdff52a6 100644 --- a/pandas/src/generate_code.py +++ b/pandas/src/generate_code.py @@ -312,13 +312,13 @@ def is_monotonic_%(name)s(ndarray[%(c_type)s] arr): @cython.boundscheck(False) def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray[object] labels): cdef dict result = {} - cdef ndarray[int8_t] mask + cdef ndarray[uint8_t] mask cdef int i, length cdef list members cdef object idx, key length = len(index) - mask = isnullobj(labels) + mask = isnullobj(labels).view(np.uint8) for i from 0 <= i < length: if mask[i]: @@ -558,6 +558,45 @@ def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, """ +#---------------------------------------------------------------------- +# Fast "put" logic for speeding up interleaving logic + +put2d_template = """ +def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values, + ndarray[int32_t] indexer, Py_ssize_t loc, + ndarray[%(dest_type2)s] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] +""" + +def generate_put_functions(): + function_list = [ + ('float64', 'float64_t', 'object'), + ('float64', 'float64_t', 'float64_t'), + ('object', 'object', 'object'), + ('int32', 'int32_t', 'int64_t'), + ('int32', 'int32_t', 'float64_t'), + ('int32', 'int32_t', 'object'), + ('int64', 'int64_t', 'int64_t'), + ('int64', 'int64_t', 'float64_t'), + ('int64', 'int64_t', 'object'), + ('bool', 'uint8_t', 'uint8_t'), + ('bool', 'uint8_t', 'object') + ] + + output = StringIO() + for name, c_type, dest_type in function_list: + func = put2d_template % {'name' : name, 'c_type' : c_type, + 'dest_type' : dest_type.replace('_t', ''), + 'dest_type2' : dest_type} + output.write(func) + return output.getvalue() + # name, ctype, capable of holding NA function_list = [ ('float64', 'float64_t', 'np.float64', True), @@ -567,10 +606,10 @@ def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, ('bool', 'uint8_t', 'np.bool', False) ] -def generate_from_template(template, ndim=1, subset=None): +def generate_from_template(template, ndim=1, exclude=None): output = StringIO() for name, c_type, dtype, can_hold_na in function_list: - if subset is not None and name not in subset: + if exclude is not None and name in exclude: continue if ndim == 1: @@ -582,25 +621,34 @@ def generate_from_template(template, ndim=1, subset=None): output.write(func) return output.getvalue() +templates_1d = [map_indices_template, + merge_indexer_template, + pad_template, + backfill_template, + take_1d_template, + is_monotonic_template, + groupby_template, + arrmap_template] + +nobool_1d_templates = 
[left_join_template, + outer_join_template, + inner_join_template] + +templates_2d = [take_2d_axis0_template, + take_2d_axis1_template] + def generate_take_cython_file(path='generated.pyx'): with open(path, 'w') as f: - print >> f, generate_from_template(map_indices_template) - print >> f, generate_from_template(merge_indexer_template) - print >> f, generate_from_template(pad_template) - print >> f, generate_from_template(backfill_template) - print >> f, generate_from_template(take_1d_template) - print >> f, generate_from_template(take_2d_axis0_template, ndim=2) - print >> f, generate_from_template(take_2d_axis1_template, ndim=2) - print >> f, generate_from_template(is_monotonic_template) - print >> f, generate_from_template(groupby_template) - print >> f, generate_from_template(arrmap_template) - - print >> f, generate_from_template(left_join_template, - subset=['object', 'int64']) - print >> f, generate_from_template(outer_join_template, - subset=['object', 'int64']) - print >> f, generate_from_template(inner_join_template, - subset=['object', 'int64']) + for template in templates_1d: + print >> f, generate_from_template(template) + + for template in templates_2d: + print >> f, generate_from_template(template, ndim=2) + + for template in nobool_1d_templates: + print >> f, generate_from_template(template, exclude=['bool']) + + # print >> f, generate_put_functions() if __name__ == '__main__': generate_take_cython_file() diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx index 8608064100338..f851d9c1ddd8f 100644 --- a/pandas/src/generated.pyx +++ b/pandas/src/generated.pyx @@ -833,442 +833,170 @@ def take_1d_bool(ndarray[uint8_t] values, ndarray[int32_t] indexer, outbuf[i] = values[idx] -@cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_float64(ndarray[float64_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): +@cython.wraparound(False) +def is_monotonic_float64(ndarray[float64_t] arr): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[float64_t, ndim=2] outbuf - - n = len(indexer) - k = values.shape[1] + Py_ssize_t i, n + float64_t prev, cur - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + n = len(arr) - for i from 0 <= i < n: - idx = indexer[i] + if n < 2: + return True - if idx == -1: - for j from 0 <= j < k: - outbuf[i, j] = NaN - else: - for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + prev = arr[0] + for i from 1 <= i < n: + cur = arr[i] + if cur < prev: + return False + prev = cur + return True -@cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_object(ndarray[object, ndim=2] values, - ndarray[int32_t] indexer, - out=None): +@cython.wraparound(False) +def is_monotonic_object(ndarray[object] arr): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[object, ndim=2] outbuf - - n = len(indexer) - k = values.shape[1] + Py_ssize_t i, n + object prev, cur - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + n = len(arr) - for i from 0 <= i < n: - idx = indexer[i] + if n < 2: + return True - if idx == -1: - for j from 0 <= j < k: - outbuf[i, j] = NaN - else: - for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + prev = arr[0] + for i from 1 <= i < n: + cur = arr[i] + if cur < prev: + return False + prev = cur + return True -@cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int32(ndarray[int32_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): +@cython.wraparound(False) +def is_monotonic_int32(ndarray[int32_t] arr): 
cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int32_t, ndim=2] outbuf - - n = len(indexer) - k = values.shape[1] + Py_ssize_t i, n + int32_t prev, cur - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + n = len(arr) - for i from 0 <= i < n: - idx = indexer[i] + if n < 2: + return True - if idx == -1: - for j from 0 <= j < k: - raise ValueError('No NA values allowed') - else: - for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + prev = arr[0] + for i from 1 <= i < n: + cur = arr[i] + if cur < prev: + return False + prev = cur + return True -@cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_int64(ndarray[int64_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): +@cython.wraparound(False) +def is_monotonic_int64(ndarray[int64_t] arr): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t, ndim=2] outbuf - - n = len(indexer) - k = values.shape[1] + Py_ssize_t i, n + int64_t prev, cur - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + n = len(arr) - for i from 0 <= i < n: - idx = indexer[i] + if n < 2: + return True - if idx == -1: - for j from 0 <= j < k: - raise ValueError('No NA values allowed') - else: - for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + prev = arr[0] + for i from 1 <= i < n: + cur = arr[i] + if cur < prev: + return False + prev = cur + return True -@cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis0_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): +@cython.wraparound(False) +def is_monotonic_bool(ndarray[uint8_t] arr): cdef: - Py_ssize_t i, j, k, n, idx - ndarray[uint8_t, ndim=2] outbuf - - n = len(indexer) - k = values.shape[1] + Py_ssize_t i, n + uint8_t prev, cur - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + n = len(arr) - for i from 0 <= i < n: - idx = indexer[i] + if n < 2: + return True - if idx == -1: - for j from 0 <= j < k: - raise ValueError('No NA values allowed') - else: - for j from 0 <= j < k: - outbuf[i, j] = values[idx, j] + prev = arr[0] + for i from 1 <= i < n: + cur = arr[i] + if cur < prev: + return False + prev = cur + return True @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_float64(ndarray[float64_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[float64_t, ndim=2] outbuf - - n = len(values) - k = len(indexer) +def groupby_float64(ndarray[float64_t] index, ndarray[object] labels): + cdef dict result = {} + cdef ndarray[uint8_t] mask + cdef int i, length + cdef list members + cdef object idx, key - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + length = len(index) + mask = isnullobj(labels).view(np.uint8) - for j from 0 <= j < k: - idx = indexer[j] + for i from 0 <= i < length: + if mask[i]: + continue - if idx == -1: - for i from 0 <= i < n: - outbuf[i, j] = NaN + key = labels[i] + idx = index[i] + if key in result: + members = result[key] + members.append(idx) else: - for i from 0 <= i < n: - outbuf[i, j] = values[i, idx] + result[key] = [idx] + + return result @cython.wraparound(False) @cython.boundscheck(False) -def take_2d_axis1_object(ndarray[object, ndim=2] values, - ndarray[int32_t] indexer, - out=None): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[object, ndim=2] outbuf - - n = len(values) - k = len(indexer) +def groupby_object(ndarray[object] index, ndarray[object] labels): + cdef dict result = {} + cdef 
ndarray[uint8_t] mask + cdef int i, length + cdef list members + cdef object idx, key - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out + length = len(index) + mask = isnullobj(labels).view(np.uint8) - for j from 0 <= j < k: - idx = indexer[j] + for i from 0 <= i < length: + if mask[i]: + continue - if idx == -1: - for i from 0 <= i < n: - outbuf[i, j] = NaN + key = labels[i] + idx = index[i] + if key in result: + members = result[key] + members.append(idx) else: - for i from 0 <= i < n: - outbuf[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int32(ndarray[int32_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int32_t, ndim=2] outbuf - - n = len(values) - k = len(indexer) - - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - - for j from 0 <= j < k: - idx = indexer[j] - - if idx == -1: - for i from 0 <= i < n: - raise ValueError('No NA values allowed') - else: - for i from 0 <= i < n: - outbuf[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int64(ndarray[int64_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t, ndim=2] outbuf - - n = len(values) - k = len(indexer) - - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - - for j from 0 <= j < k: - idx = indexer[j] - - if idx == -1: - for i from 0 <= i < n: - raise ValueError('No NA values allowed') - else: - for i from 0 <= i < n: - outbuf[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int32_t] indexer, - out=None): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[uint8_t, ndim=2] outbuf - - n = len(values) - k = len(indexer) - - if out is None: - outbuf = np.empty((n, k), dtype=values.dtype) - else: - outbuf = out - - for j from 0 <= j < k: - idx = indexer[j] - - if idx == -1: - for i from 0 <= i < n: - raise ValueError('No NA values allowed') - else: - for i from 0 <= i < n: - outbuf[i, j] = values[i, idx] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_float64(ndarray[float64_t] arr): - cdef: - Py_ssize_t i, n - float64_t prev, cur - - n = len(arr) - - if n < 2: - return True - - prev = arr[0] - for i from 1 <= i < n: - cur = arr[i] - if cur < prev: - return False - prev = cur - return True - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_object(ndarray[object] arr): - cdef: - Py_ssize_t i, n - object prev, cur - - n = len(arr) - - if n < 2: - return True - - prev = arr[0] - for i from 1 <= i < n: - cur = arr[i] - if cur < prev: - return False - prev = cur - return True - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_int32(ndarray[int32_t] arr): - cdef: - Py_ssize_t i, n - int32_t prev, cur - - n = len(arr) - - if n < 2: - return True - - prev = arr[0] - for i from 1 <= i < n: - cur = arr[i] - if cur < prev: - return False - prev = cur - return True - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_int64(ndarray[int64_t] arr): - cdef: - Py_ssize_t i, n - int64_t prev, cur - - n = len(arr) - - if n < 2: - return True - - prev = arr[0] - for i from 1 <= i < n: - cur = arr[i] - if cur < prev: - return False - prev = cur - return True - -@cython.boundscheck(False) -@cython.wraparound(False) -def 
is_monotonic_bool(ndarray[uint8_t] arr): - cdef: - Py_ssize_t i, n - uint8_t prev, cur - - n = len(arr) - - if n < 2: - return True - - prev = arr[0] - for i from 1 <= i < n: - cur = arr[i] - if cur < prev: - return False - prev = cur - return True - - -@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_float64(ndarray[float64_t] index, ndarray[object] labels): - cdef dict result = {} - cdef ndarray[int8_t] mask - cdef int i, length - cdef list members - cdef object idx, key - - length = len(index) - mask = isnullobj(labels) - - for i from 0 <= i < length: - if mask[i]: - continue - - key = labels[i] - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_object(ndarray[object] index, ndarray[object] labels): - cdef dict result = {} - cdef ndarray[int8_t] mask - cdef int i, length - cdef list members - cdef object idx, key - - length = len(index) - mask = isnullobj(labels) - - for i from 0 <= i < length: - if mask[i]: - continue - - key = labels[i] - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result + result[key] = [idx] + + return result @cython.wraparound(False) @cython.boundscheck(False) def groupby_int32(ndarray[int32_t] index, ndarray[object] labels): cdef dict result = {} - cdef ndarray[int8_t] mask + cdef ndarray[uint8_t] mask cdef int i, length cdef list members cdef object idx, key length = len(index) - mask = isnullobj(labels) + mask = isnullobj(labels).view(np.uint8) for i from 0 <= i < length: if mask[i]: @@ -1288,13 +1016,13 @@ def groupby_int32(ndarray[int32_t] index, ndarray[object] labels): @cython.boundscheck(False) def groupby_int64(ndarray[int64_t] index, ndarray[object] labels): cdef dict result = {} - cdef ndarray[int8_t] mask + cdef ndarray[uint8_t] mask cdef int i, length cdef list members cdef object idx, key length = len(index) - mask = isnullobj(labels) + mask = isnullobj(labels).view(np.uint8) for i from 0 <= i < length: if mask[i]: @@ -1314,13 +1042,13 @@ def groupby_int64(ndarray[int64_t] index, ndarray[object] labels): @cython.boundscheck(False) def groupby_bool(ndarray[uint8_t] index, ndarray[object] labels): cdef dict result = {} - cdef ndarray[int8_t] mask + cdef ndarray[uint8_t] mask cdef int i, length cdef list members cdef object idx, key length = len(index) - mask = isnullobj(labels) + mask = isnullobj(labels).view(np.uint8) for i from 0 <= i < length: if mask[i]: @@ -1400,97 +1128,639 @@ def arrmap_bool(ndarray[uint8_t] index, object func): for i from 0 <= i < length: result[i] = func(index[i]) - return result + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float64(ndarray[float64_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[float64_t, ndim=2] outbuf + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for i from 0 <= i < n: + idx = indexer[i] + + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = NaN + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_object(ndarray[object, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[object, ndim=2] outbuf + + n = len(indexer) + k = 
values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for i from 0 <= i < n: + idx = indexer[i] + + if idx == -1: + for j from 0 <= j < k: + outbuf[i, j] = NaN + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32(ndarray[int32_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int32_t, ndim=2] outbuf + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for i from 0 <= i < n: + idx = indexer[i] + + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int64(ndarray[int64_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t, ndim=2] outbuf + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for i from 0 <= i < n: + idx = indexer[i] + + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[uint8_t, ndim=2] outbuf + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for i from 0 <= i < n: + idx = indexer[i] + + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_float64(ndarray[float64_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[float64_t, ndim=2] outbuf + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for j from 0 <= j < k: + idx = indexer[j] + + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = NaN + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_object(ndarray[object, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[object, ndim=2] outbuf + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for j from 0 <= j < k: + idx = indexer[j] + + if idx == -1: + for i from 0 <= i < n: + outbuf[i, j] = NaN + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int32(ndarray[int32_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int32_t, ndim=2] outbuf + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for j from 0 <= j < k: + idx = indexer[j] + + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') + else: + for i from 0 <= i < n: + 
outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int64(ndarray[int64_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t, ndim=2] outbuf + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for j from 0 <= j < k: + idx = indexer[j] + + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int32_t] indexer, + out=None): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[uint8_t, ndim=2] outbuf + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + for j from 0 <= j < k: + idx = indexer[j] + + if idx == -1: + for i from 0 <= i < n: + raise ValueError('No NA values allowed') + else: + for i from 0 <= i < n: + outbuf[i, j] = values[i, idx] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int32_t] indexer + float64_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int32) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + lval = left[i] + rval = right[j] + + if lval == right[j]: + indexer[i] = j + i += 1 + j += 1 + elif lval > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_object(ndarray[object] left, + ndarray[object] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int32_t] indexer + object lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int32) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + lval = left[i] + rval = right[j] + + if lval == right[j]: + indexer[i] = j + i += 1 + j += 1 + elif lval > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int32_t] indexer + int32_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int32) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + lval = left[i] + rval = right[j] + + if lval == right[j]: + indexer[i] = j + i += 1 + j += 1 + elif lval > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int32_t] indexer + int64_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int32) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + lval = left[i] + rval = right[j] + + if lval == right[j]: + indexer[i] = j + i += 1 + j += 1 + elif lval 
> rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + float64_t lval, rval + ndarray[int32_t] lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft: + if j == nright: + # we are done + break + else: + while j < nright: + j += 1 + count += 1 + break + elif j == nright: + while i < nleft: + i += 1 + count += 1 + break + else: + if left[i] == right[j]: + i += 1 + j += 1 + elif left[i] < right[j]: + i += 1 + else: + j += 1 + + count += 1 + + lindexer = np.empty(count, dtype=np.int32) + rindexer = np.empty(count, dtype=np.int32) + result = np.empty(count, dtype=np.float64) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft: + if j == nright: + # we are done + break + else: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + j += 1 + count += 1 + break + elif j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + else: + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + i += 1 + j += 1 + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + j += 1 + + count += 1 + return result, lindexer, rindexer @cython.wraparound(False) @cython.boundscheck(False) -def left_join_indexer_object(ndarray[object] left, - ndarray[object] right): +def outer_join_indexer_object(ndarray[object] left, + ndarray[object] right): cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int32_t] indexer + Py_ssize_t i, j, nright, nleft, count object lval, rval + ndarray[int32_t] lindexer, rindexer + ndarray[object] result - i = 0 - j = 0 nleft = len(left) nright = len(right) - indexer = np.empty(nleft, dtype=np.int32) + i = 0 + j = 0 + count = 0 while True: if i == nleft: + if j == nright: + # we are done + break + else: + while j < nright: + j += 1 + count += 1 + break + elif j == nright: + while i < nleft: + i += 1 + count += 1 break + else: + if left[i] == right[j]: + i += 1 + j += 1 + elif left[i] < right[j]: + i += 1 + else: + j += 1 - if j == nright: - indexer[i] = -1 - i += 1 - continue - - lval = left[i] - rval = right[j] + count += 1 - if lval == right[j]: - indexer[i] = j - i += 1 - j += 1 - elif lval > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer + lindexer = np.empty(count, dtype=np.int32) + rindexer = np.empty(count, dtype=np.int32) + result = np.empty(count, dtype=object) -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int32_t] indexer - int64_t lval, rval + # do it again, but populate the indexers / result i = 0 j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int32) + count = 0 while True: if i == nleft: + if j == nright: + # we are done + break + else: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + j += 1 + count += 1 + break + elif j == nright: + while i < nleft: + 
lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - lval = left[i] - rval = right[j] - - if lval == right[j]: - indexer[i] = j - i += 1 - j += 1 - elif lval > rval: - indexer[i] = -1 - j += 1 else: - indexer[i] = -1 - i += 1 - return indexer + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + i += 1 + j += 1 + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + j += 1 + count += 1 + + return result, lindexer, rindexer @cython.wraparound(False) @cython.boundscheck(False) -def outer_join_indexer_object(ndarray[object] left, - ndarray[object] right): +def outer_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): cdef: Py_ssize_t i, j, nright, nleft, count - object lval, rval + int32_t lval, rval ndarray[int32_t] lindexer, rindexer - ndarray[object] result + ndarray[int32_t] result nleft = len(left) nright = len(right) @@ -1526,7 +1796,7 @@ def outer_join_indexer_object(ndarray[object] left, lindexer = np.empty(count, dtype=np.int32) rindexer = np.empty(count, dtype=np.int32) - result = np.empty(count, dtype=object) + result = np.empty(count, dtype=np.int32) # do it again, but populate the indexers / result @@ -1675,6 +1945,69 @@ def outer_join_indexer_int64(ndarray[int64_t] left, return result, lindexer, rindexer +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + ''' + Two-pass algorithm? + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float64_t lval, rval + ndarray[int32_t] lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft or j == nright: + break + else: + lval = left[i] + rval = right[j] + if lval == rval: + i += 1 + j += 1 + count += 1 + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int32) + rindexer = np.empty(count, dtype=np.int32) + result = np.empty(count, dtype=np.float64) + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft or j == nright: + break + else: + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + i += 1 + j += 1 + count += 1 + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + @cython.wraparound(False) @cython.boundscheck(False) def inner_join_indexer_object(ndarray[object] left, @@ -1738,6 +2071,69 @@ def inner_join_indexer_object(ndarray[object] left, return result, lindexer, rindexer +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + ''' + Two-pass algorithm? 
+ ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int32_t lval, rval + ndarray[int32_t] lindexer, rindexer + ndarray[int32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft or j == nright: + break + else: + lval = left[i] + rval = right[j] + if lval == rval: + i += 1 + j += 1 + count += 1 + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int32) + rindexer = np.empty(count, dtype=np.int32) + result = np.empty(count, dtype=np.int32) + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft or j == nright: + break + else: + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + i += 1 + j += 1 + count += 1 + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + @cython.wraparound(False) @cython.boundscheck(False) def inner_join_indexer_int64(ndarray[int64_t] left, diff --git a/pandas/src/groupby.pyx b/pandas/src/groupby.pyx index cffac35edc48c..b89a18e0f8c42 100644 --- a/pandas/src/groupby.pyx +++ b/pandas/src/groupby.pyx @@ -213,27 +213,6 @@ def group_labels2(ndarray[object] values): return reverse, labels -@cython.wraparound(False) -@cython.boundscheck(False) -def fast_unique(ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - list uniques = [] - dict table = {} - object val, stub = 0 - - for i from 0 <= i < n: - val = values[i] - if val not in table: - table[val] = stub - uniques.append(val) - try: - uniques.sort() - except Exception: - pass - - return uniques - @cython.wraparound(False) @cython.boundscheck(False) def get_unique_labels(ndarray[object] values, dict idMap): @@ -248,32 +227,6 @@ def get_unique_labels(ndarray[object] values, dict idMap): return fillVec -@cython.wraparound(False) -@cython.boundscheck(False) -def fast_unique_multiple(list arrays): - cdef: - ndarray[object] buf - Py_ssize_t k = len(arrays) - Py_ssize_t i, j, n - list uniques = [] - dict table = {} - object val, stub = 0 - - for i from 0 <= i < k: - buf = arrays[i] - n = len(buf) - for j from 0 <= j < n: - val = buf[j] - if val not in table: - table[val] = stub - uniques.append(val) - try: - uniques.sort() - except Exception: - pass - - return uniques - # from libcpp.set cimport set as stlset # cdef fast_unique_int32(ndarray arr): @@ -500,6 +453,23 @@ def _bucket_locs(index, buckets, inclusive=False): return locs +def count_level_1d(ndarray[uint8_t, cast=True] mask, + ndarray[int32_t] labels, Py_ssize_t max_bin): + cdef: + Py_ssize_t i, n + ndarray[int64_t] counts + + counts = np.zeros(max_bin, dtype='i8') + + n = len(mask) + + for i from 0 <= i < n: + if mask[i]: + counts[labels[i]] += 1 + + return counts + + ''' def ts_upsample_mean(ndarray[object] indices, diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx new file mode 100644 index 0000000000000..cfc81b1a30b23 --- /dev/null +++ b/pandas/src/parsing.pyx @@ -0,0 +1,225 @@ +cimport cpython + +cdef extern from "math.h": + double fabs(double) + +def to_object_array(list rows): + cdef: + Py_ssize_t i, j, n, k, tmp + ndarray[object, ndim=2] result + list row + + n = len(rows) + + k = 0 + for i from 0 <= i < n: + tmp = len(rows[i]) + if tmp > k: + k = tmp + + result = np.empty((n, k), dtype=object) + + for i from 0 <= i < n: + row = rows[i] + + for j from 0 <= j < len(row): + result[i, j] = row[j] + + return result + +def to_object_array_tuples(list rows): + cdef: + Py_ssize_t i, j, n, k, tmp 
+ ndarray[object, ndim=2] result + tuple row + + n = len(rows) + + k = 0 + for i from 0 <= i < n: + tmp = len(rows[i]) + if tmp > k: + k = tmp + + result = np.empty((n, k), dtype=object) + + for i from 0 <= i < n: + row = rows[i] + + for j from 0 <= j < len(row): + result[i, j] = row[j] + + return result + +def maybe_convert_numeric(ndarray[object] values, set na_values): + cdef: + Py_ssize_t i, n + ndarray[float64_t] floats + ndarray[int64_t] ints + bint seen_float = 0 + object val + float64_t fval + + n = len(values) + + floats = np.empty(n, dtype='f8') + ints = np.empty(n, dtype='i8') + + for i from 0 <= i < n: + val = values[i] + + if cpython.PyFloat_Check(val): + floats[i] = val + seen_float = 1 + elif val in na_values: + floats[i] = nan + seen_float = 1 + elif val is None: + floats[i] = nan + seen_float = 1 + elif len(val) == 0: + floats[i] = nan + seen_float = 1 + else: + fval = float(val) + floats[i] = fval + if not seen_float: + if '.' in val: + seen_float = 1 + else: + ints[i] = fval + + if seen_float: + return floats + else: + return ints + +def convert_sql_column(ndarray[object] objects): + cdef: + Py_ssize_t i, n + ndarray[float64_t] floats + ndarray[int64_t] ints + ndarray[uint8_t] bools + bint seen_float = 0 + bint seen_int = 0 + bint seen_bool = 0 + bint seen_null = 0 + object val, onan + float64_t fval, fnan + + n = len(objects) + + floats = np.empty(n, dtype='f8') + ints = np.empty(n, dtype='i8') + bools = np.empty(n, dtype=np.uint8) + + onan = np.nan + fnan = np.nan + + for i from 0 <= i < n: + val = objects[i] + + if val is None: + seen_null = 1 + objects[i] = onan + floats[i] = fnan + elif cpython.PyBool_Check(val): + seen_bool = 1 + bools[i] = val + elif cpython.PyInt_Check(val) or cpython.PyLong_Check(val): + seen_int = 1 + floats[i] = val + if not seen_null: + ints[i] = val + elif cpython.PyFloat_Check(val): + floats[i] = val + seen_float = 1 + elif not (cpython.PyString_Check(val) or cpython.PyUnicode_Check(val)): + # this will convert Decimal objects + try: + floats[i] = float(val) + seen_float = 1 + except Exception: + pass + + if seen_null: + if seen_float or seen_int: + return floats + else: + return objects + else: + if seen_int: + return ints + elif seen_float: + return floats + elif seen_bool: + return bools.view(np.bool_) + else: + return objects + +def try_parse_dates(ndarray[object] values, parser=None): + cdef: + Py_ssize_t i, n + ndarray[object] result + + from datetime import datetime + + n = len(values) + result = np.empty(n, dtype='O') + + if parser is None: + try: + from dateutil import parser + parse_date = parser.parse + except ImportError: # pragma: no cover + def parse_date(s): + try: + return datetime.strptime(s, '%m/%d/%Y') + except Exception: + return s + else: + parse_date = parser + + # EAFP + try: + for i from 0 <= i < n: + result[i] = parse_date(values[i]) + except Exception: + # failed + return values + + return result + +def sanitize_objects(ndarray[object] values): + cdef: + Py_ssize_t i, n + object val, onan + + n = len(values) + onan = np.nan + + for i from 0 <= i < n: + val = values[i] + if val == '': + values[i] = onan + +def maybe_convert_bool(ndarray[object] arr): + cdef: + Py_ssize_t i, n + ndarray[uint8_t] result + object val + + n = len(arr) + result = np.empty(n, dtype=np.uint8) + + for i from 0 <= i < n: + val = arr[i] + + if val == 'True': + result[i] = 1 + elif val == 'False': + result[i] = 0 + else: + return arr + + return result.view(np.bool_) diff --git a/pandas/src/reindex.pyx b/pandas/src/reindex.pyx index 
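A hedged plain-Python sketch of the type-inference rule maybe_convert_numeric above applies: every value is recorded as a float, and the parallel integer array is only returned when no missing value, empty string, NA marker, or decimal point was seen along the way. The helper name is illustrative; inputs are assumed to be strings, floats, or None, as in the Cython version.

import numpy as np

def maybe_convert_numeric_sketch(values, na_values):
    n = len(values)
    floats = np.empty(n, dtype='f8')
    ints = np.empty(n, dtype='i8')
    seen_float = False

    for i, val in enumerate(values):
        if isinstance(val, float):
            floats[i] = val
            seen_float = True
        elif val is None or val in na_values or len(val) == 0:
            floats[i] = np.nan
            seen_float = True
        else:
            fval = float(val)
            floats[i] = fval
            if not seen_float:
                if '.' in val:
                    seen_float = True
                else:
                    ints[i] = fval

    return floats if seen_float else ints

# maybe_convert_numeric_sketch(['1', '2', '3'], set())    -> integer result
# maybe_convert_numeric_sketch(['1', '2.5', ''], set())   -> float result with NaN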
f1b5a91c2336e..4ca19f77f5c50 100644 --- a/pandas/src/reindex.pyx +++ b/pandas/src/reindex.pyx @@ -91,7 +91,7 @@ def ordered_left_join(ndarray[object] left, ndarray[object] right): cdef: Py_ssize_t i, j, k, n ndarray[int32_t] indexer - ndarray[uint8_t, cast=True] mask + ndarray[uint8_t] mask object val i = 0 @@ -100,7 +100,7 @@ def ordered_left_join(ndarray[object] left, ndarray[object] right): k = len(right) indexer = np.zeros(n, dtype=np.int32) - mask = np.ones(n, dtype=np.bool) + mask = np.ones(n, dtype=np.uint8) for i from 0 <= i < n: val = left[i] @@ -115,7 +115,7 @@ def ordered_left_join(ndarray[object] left, ndarray[object] right): indexer[i] = j mask[i] = 0 - return indexer, mask + return indexer, mask.view(np.bool_) @cython.wraparound(False) @cython.boundscheck(False) @@ -224,3 +224,21 @@ def take_join_contiguous(ndarray[float64_t, ndim=2] lvalues, for j from 0 <= j < rk: outbuf[0] = rvalues[ridx, j] outbuf = outbuf + 1 + +@cython.wraparound(False) +@cython.boundscheck(False) +def merge_indexer_list(list values, dict oldMap): + cdef int i, j, length, newLength + cdef object idx + cdef ndarray[int32_t] fill_vec + + newLength = len(values) + fill_vec = np.empty(newLength, dtype=np.int32) + for i from 0 <= i < newLength: + idx = values[i] + if idx in oldMap: + fill_vec[i] = oldMap[idx] + else: + fill_vec[i] = -1 + + return fill_vec diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx index ed65a9b39e660..3caab8eb413bc 100644 --- a/pandas/src/tseries.pyx +++ b/pandas/src/tseries.pyx @@ -49,9 +49,6 @@ cdef double_t *get_double_ptr(ndarray arr): cdef extern from "math.h": double sqrt(double x) -#cdef extern from "cobject.h": -# pass # for datetime API - cdef extern from "datetime.h": ctypedef class datetime.datetime [object PyDateTime_DateTime]: @@ -223,42 +220,122 @@ def array_to_datetime(ndarray[int64_t, ndim=1] arr): cdef double INF = np.inf cdef double NEGINF = -INF -cdef inline _isnan(object o): - return o != o - cdef inline _checknull(object val): + return val is None or val != val + +cpdef checknull(object val): if isinstance(val, (float, np.floating)): return val != val or val == INF or val == NEGINF else: - return val is None + return _checknull(val) -cpdef checknull(object val): - return _checknull(val) - -def isnullobj(ndarray input): - cdef int i, length +def isnullobj(ndarray[object] arr): + cdef Py_ssize_t i, n cdef object val - cdef ndarray[npy_int8, ndim=1] result - cdef flatiter iter + cdef ndarray[uint8_t] result - length = PyArray_SIZE(input) + n = len(arr) + result = np.zeros(n, dtype=np.uint8) + for i from 0 <= i < n: + val = arr[i] + if _checknull(val): + result[i] = 1 + return result.view(np.bool_) - result = np.zeros(length, dtype=np.int8) +def list_to_object_array(list obj): + ''' + Convert list to object ndarray. 
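A minimal sketch of the null test the checknull/isnullobj helpers above rely on: NaN is the only common value that is not equal to itself, so None and NaN can both be caught without special imports (the Cython checknull additionally treats +/-inf as null for floats). Names here are illustrative.

import numpy as np

def checknull_sketch(val):
    # True for None and for any NaN-like value (NaN != NaN)
    return val is None or val != val

def isnullobj_sketch(arr):
    # boolean mask over an object array, built one element at a time
    return np.fromiter((checknull_sketch(v) for v in arr),
                       dtype=bool, count=len(arr))

# isnullobj_sketch(np.array(['a', None, np.nan, 1.5], dtype=object))
# -> array([False,  True,  True, False])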
Seriously can't believe I had to write this + function + ''' + cdef: + Py_ssize_t i, n + ndarray[object] arr - iter= PyArray_IterNew(input) + n = len(obj) + arr = np.empty(n, dtype=object) - for i from 0 <= i < length: - val = PyArray_GETITEM(input, PyArray_ITER_DATA(iter)) + for i from 0 <= i < n: + arr[i] = obj[i] - if _checknull(val): - result[i] = 1 + return arr - PyArray_ITER_NEXT(iter) - return result +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique(ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + list uniques = [] + dict table = {} + object val, stub = 0 + + for i from 0 <= i < n: + val = values[i] + if val not in table: + table[val] = stub + uniques.append(val) + try: + uniques.sort() + except Exception: + pass + + return uniques + +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique_multiple(list arrays): + cdef: + ndarray[object] buf + Py_ssize_t k = len(arrays) + Py_ssize_t i, j, n + list uniques = [] + dict table = {} + object val, stub = 0 + + for i from 0 <= i < k: + buf = arrays[i] + n = len(buf) + for j from 0 <= j < n: + val = buf[j] + if val not in table: + table[val] = stub + uniques.append(val) + try: + uniques.sort() + except Exception: + pass + + return uniques + +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique_multiple_list(list lists): + cdef: + list buf + Py_ssize_t k = len(lists) + Py_ssize_t i, j, n + list uniques = [] + dict table = {} + object val, stub = 0 + + for i from 0 <= i < k: + buf = lists[i] + n = len(buf) + for j from 0 <= j < n: + val = buf[j] + if val not in table: + table[val] = stub + uniques.append(val) + try: + uniques.sort() + except Exception: + pass + + return uniques include "skiplist.pyx" include "groupby.pyx" include "moments.pyx" include "reindex.pyx" include "generated.pyx" +include "parsing.pyx" diff --git a/pandas/stats/fama_macbeth.py b/pandas/stats/fama_macbeth.py index 1e1839e24f92c..e632b9b001b07 100644 --- a/pandas/stats/fama_macbeth.py +++ b/pandas/stats/fama_macbeth.py @@ -25,7 +25,7 @@ def fama_macbeth(**kwargs): return klass(**kwargs) class FamaMacBeth(object): - def __init__(self, y, x, weights=None, intercept=True, nw_lags=None, + def __init__(self, y, x, intercept=True, nw_lags=None, nw_lags_beta=None, entity_effects=False, time_effects=False, x_effects=None, cluster=None, dropped_dummies={}, verbose=False): @@ -33,7 +33,7 @@ def __init__(self, y, x, weights=None, intercept=True, nw_lags=None, from pandas.stats.plm import MovingPanelOLS self._ols_result = MovingPanelOLS( - y=y, x=x, weights=weights, window_type='rolling', window=1, + y=y, x=x, window_type='rolling', window=1, intercept=intercept, nw_lags=nw_lags, entity_effects=entity_effects, time_effects=time_effects, x_effects=x_effects, cluster=cluster, @@ -141,7 +141,7 @@ def summary(self): return template % params class MovingFamaMacBeth(FamaMacBeth): - def __init__(self, y, x, weights=None, window_type='rolling', window=10, + def __init__(self, y, x, window_type='rolling', window=10, intercept=True, nw_lags=None, nw_lags_beta=None, entity_effects=False, time_effects=False, x_effects=None, cluster=None, dropped_dummies={}, verbose=False): @@ -149,7 +149,7 @@ def __init__(self, y, x, weights=None, window_type='rolling', window=10, self._window = window FamaMacBeth.__init__( - self, y=y, x=x, weights=weights, intercept=intercept, + self, y=y, x=x, intercept=intercept, nw_lags=nw_lags, nw_lags_beta=nw_lags_beta, entity_effects=entity_effects, time_effects=time_effects, x_effects=x_effects, 
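A plain-Python sketch of the fast_unique* helpers moved into tseries.pyx above: a dict doubles as a seen-set so first-occurrence order is preserved, and the result is sorted only when the values are mutually comparable. The function name is illustrative.

def fast_unique_sketch(values):
    table = {}
    uniques = []
    for val in values:
        if val not in table:
            table[val] = None
            uniques.append(val)
    try:
        uniques.sort()
    except Exception:
        # mixed, unorderable values stay in first-seen order
        pass
    return uniques

# fast_unique_sketch(['b', 'a', 'b', 'c']) -> ['a', 'b', 'c']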
cluster=cluster, diff --git a/pandas/stats/interface.py b/pandas/stats/interface.py index b3cc9eb41ed54..89da36c98eea7 100644 --- a/pandas/stats/interface.py +++ b/pandas/stats/interface.py @@ -39,9 +39,6 @@ def ols(**kwargs): Panel OLS options: pool: bool Whether to run pooled panel regression. Defaults to true. - weights: DataFrame - Weight for each observation. The weights are not normalized; - they're multiplied directly by each observation. entity_effects: bool Whether to account for entity fixed effects. Defaults to false. time_effects: bool diff --git a/pandas/stats/plm.py b/pandas/stats/plm.py index 43d6322f807d2..f477d8aeb95df 100644 --- a/pandas/stats/plm.py +++ b/pandas/stats/plm.py @@ -36,9 +36,6 @@ class PanelOLS(OLS): FULL_SAMPLE, ROLLING, EXPANDING. FULL_SAMPLE by default. window : int size of window (for rolling/expanding OLS) - weights : DataFrame - Weight for each observation. The weights are not normalized; - they're multiplied directly by each observation. pool : bool, default True Whether to run pooled panel regression entity_effects : bool, deafult False @@ -65,14 +62,12 @@ class PanelOLS(OLS): 2. There is autocorrelation - use 'entity' """ - def __init__(self, y, x, weights=None, - intercept=True, nw_lags=None, entity_effects=False, + def __init__(self, y, x, intercept=True, nw_lags=None, entity_effects=False, time_effects=False, x_effects=None, cluster=None, dropped_dummies=None, verbose=False, nw_overlap=False): self._x_orig = x self._y_orig = y - self._weights = weights self._intercept = intercept self._nw_lags = nw_lags self._nw_overlap = nw_overlap @@ -110,8 +105,7 @@ def _prepare_data(self): The categorical variables will get dropped from x. """ - (x, x_filtered, y, weights, - weights_filt, cat_mapping) = self._filter_data() + (x, x_filtered, y, cat_mapping) = self._filter_data() self.log('Adding dummies to X variables') x = self._add_dummies(x, cat_mapping) @@ -141,12 +135,6 @@ def _prepare_data(self): x_regressor = x y_regressor = y - if weights is not None: - assert(y_regressor.index is weights.index) - assert(x_regressor.index is weights.index) - y_regressor = y_regressor * weights - x_regressor = x_regressor.mul(weights, axis=0) - return x, x_regressor, x_filtered, y, y_regressor def _filter_data(self): @@ -170,9 +158,6 @@ def _filter_data(self): x_names = data.items - if self._weights is not None: - data['__weights__'] = self._weights - # Filter x's without y (so we can make a prediction) filtered = data.to_long() @@ -187,21 +172,10 @@ def _filter_data(self): data_long = data.to_long() x_filt = filtered.filter(x_names) - - if self._weights: - weights_filt = filtered['__weights__'] - else: - weights_filt = None - x = data_long.filter(x_names) y = data_long['__y__'] - if self._weights: - weights = data_long['__weights__'] - else: - weights = None - - return x, x_filt, y, weights, weights_filt, cat_mapping + return x, x_filt, y, cat_mapping def _convert_x(self, x): # Converts non-numeric data in x to floats. x_converted is the @@ -527,9 +501,6 @@ class MovingPanelOLS(MovingOLS, PanelOLS): Minimum number of total observations to require. Default is rank(X matrix) + 1. In some cases we might want to be able to relax this number. - weights : DataFrame - Weight for each observation. The weights are not normalized; - they're multiplied directly by each observation. pool : bool Whether to run pooled panel regression. Defaults to true. entity_effects : bool @@ -554,7 +525,7 @@ class MovingPanelOLS(MovingOLS, PanelOLS): 1. Countries are correlated - use 'time' 2. 
There is autocorrelation - use 'entity' """ - def __init__(self, y, x, weights=None, + def __init__(self, y, x, window_type='expanding', window=None, min_periods=None, min_obs=None, @@ -567,8 +538,7 @@ def __init__(self, y, x, weights=None, dropped_dummies=None, verbose=False): - self._args = dict(weights=weights, - intercept=intercept, + self._args = dict(intercept=intercept, nw_lags=nw_lags, nw_overlap=nw_overlap, entity_effects=entity_effects, diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py index dd53183e38b9a..79286c66f9283 100644 --- a/pandas/stats/tests/test_ols.py +++ b/pandas/stats/tests/test_ols.py @@ -381,41 +381,6 @@ def testFiltering(self): self.assertTrue(result._x_filtered.major_axis.equals( result.y_fitted.index)) - def testWithWeights(self): - data = np.arange(10).reshape((5, 2)) - index = [datetime(2000, 1, 1), - datetime(2000, 1, 2), - datetime(2000, 1, 3), - datetime(2000, 1, 4), - datetime(2000, 1, 5)] - cols = ['A', 'B'] - weights = DataFrame(data, index=index, columns=cols) - - result = ols(y=self.panel_y2, x=self.panel_x2, weights=weights) - - assert_almost_equal(result._y_trans.values.flat, [0, 16, 25]) - - exp_x = [[0, 0, 0], - [36, 68, 4], - [150, 240, 5]] - assert_almost_equal(result._x_trans.values, exp_x) - - - exp_x_filtered = [[6, 14, 1], - [9, 17, 1], - [30, 48, 1], - [11, 20, 1], - [12, 21, 1]] -# exp_x_filtered = [[0, 0, 0], -# [36, 68, 4], -# [150, 240, 5], -# [66, 120, 6], -# [84, 147, 7]] - - assert_almost_equal(result._x_filtered.values, exp_x_filtered) - - # _check_non_raw_results(result) - def testWithTimeEffects(self): result = ols(y=self.panel_y2, x=self.panel_x2, time_effects=True) @@ -513,15 +478,6 @@ def testForSeries(self): self.series_x, self.series_y, nw_lags=1, nw_overlap=True) - def testRollingWithWeights(self): - idx = self.panel_y.index - cols = self.panel_y.columns - - - weights = DataFrame(np.random.standard_normal((len(idx), len(cols))), - index=idx, columns=cols) - self.checkMovingOLS(self.panel_x, - self.panel_y, weights=weights) def testRolling(self): self.checkMovingOLS(self.panel_x, self.panel_y) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 06795d5cafd04..dc5bca73e5749 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -11,6 +11,7 @@ from numpy.random import randn import numpy as np +import pandas.core.common as common import pandas.core.datetools as datetools from pandas.core.index import NULL_INDEX from pandas.core.api import (DataFrame, Index, Series, notnull, isnull, @@ -53,6 +54,19 @@ def test_getitem_iterator(self): expected = self.frame.ix[:, ['A', 'B', 'C']] assert_frame_equal(result, expected) + def test_getitem_list(self): + result = self.frame[['B', 'A']] + result2 = self.frame[Index(['B', 'A'])] + + expected = self.frame.ix[:, ['B', 'A']] + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + self.assertRaises(Exception, self.frame.__getitem__, + ['B', 'A', 'foo']) + self.assertRaises(Exception, self.frame.__getitem__, + Index(['B', 'A', 'foo'])) + def test_getitem_boolean(self): # boolean indexing d = self.tsframe.index[10] @@ -68,6 +82,11 @@ def test_getitem_boolean(self): subframe_obj = self.tsframe[indexer_obj] assert_frame_equal(subframe_obj, subframe) + def test_getattr(self): + tm.assert_series_equal(self.frame.A, self.frame['A']) + self.assertRaises(AttributeError, getattr, self.frame, + 'NONEXISTENT_NAME') + def test_setitem(self): # not sure what else to do here series = self.frame['A'][::2] @@ 
-142,6 +161,10 @@ def test_setitem_boolean(self): np.putmask(expected.values, mask.values, df.values * 2) assert_frame_equal(df, expected) + def test_setitem_cast(self): + self.frame['D'] = self.frame['D'].astype('i8') + self.assert_(self.frame['D'].dtype == np.int64) + def test_setitem_boolean_column(self): expected = self.frame.copy() mask = self.frame['A'] > 0 @@ -175,7 +198,7 @@ def test_setitem_corner(self): self.assertEqual(dm.values.dtype, np.object_) dm['C'] = 1 - self.assertEqual(dm['C'].dtype, np.int_) + self.assertEqual(dm['C'].dtype, np.int64) # set existing column dm['A'] = 'bar' @@ -215,6 +238,21 @@ def test_setitem_ambig(self): # self.assert_(dm.objects is not None) self.assert_(dm[2].dtype == np.object_) + def test_setitem_clear_caches(self): + # GH #304 + df = DataFrame({'x': [1.1, 2.1, 3.1, 4.1], 'y': [5.1, 6.1, 7.1, 8.1]}, + index=[0,1,2,3]) + df.insert(2, 'z', np.nan) + + # cache it + foo = df['z'] + + df.ix[2:, 'z'] = 42 + + expected = Series([np.nan, np.nan, 42, 42], index=df.index) + self.assert_(df['z'] is not foo) + assert_series_equal(df['z'], expected) + def test_delitem_corner(self): f = self.frame.copy() del f['D'] @@ -266,8 +304,8 @@ def test_setitem_fancy_2d(self): frame = self.frame.copy() expected = frame.copy() frame.ix[:, ['B', 'A']] = 1 - expected['B'] = 1 - expected['A'] = 1 + expected['B'] = 1. + expected['A'] = 1. assert_frame_equal(frame, expected) # case 2 @@ -348,11 +386,11 @@ def test_setitem_fancy_2d(self): def test_fancy_getitem_slice_mixed(self): sliced = self.mixed_frame.ix[:, -3:] - self.assert_(sliced['D'].dtype == np.float_) + self.assert_(sliced['D'].dtype == np.float64) # get view with single block sliced = self.frame.ix[:, -3:] - sliced['C'] = 4 + sliced['C'] = 4. self.assert_((self.frame['C'] == 4).all()) def test_fancy_setitem_int_labels(self): @@ -700,6 +738,23 @@ def test_join_index(self): self.assertRaises(Exception, f.join, f2, how='foo') + def test_join_index_more(self): + af = self.frame.ix[:, ['A', 'B']] + bf = self.frame.ix[::2, ['C', 'D']] + + expected = af.copy() + expected['C'] = self.frame['C'][::2] + expected['D'] = self.frame['D'][::2] + + result = af.join(bf) + assert_frame_equal(result, expected) + + result = af.join(bf, how='right') + assert_frame_equal(result, expected[::2]) + + result = bf.join(af, how='right') + assert_frame_equal(result, expected.ix[:, result.columns]) + def test_join_index_series(self): df = self.frame.copy() s = df.pop(self.frame.columns[-1]) @@ -895,13 +950,13 @@ def test_constructor_dict_cast(self): } frame = DataFrame(test_data, dtype=float) self.assertEqual(len(frame), 3) - self.assert_(frame['B'].dtype == np.float_) - self.assert_(frame['A'].dtype == np.float_) + self.assert_(frame['B'].dtype == np.float64) + self.assert_(frame['A'].dtype == np.float64) frame = DataFrame(test_data) self.assertEqual(len(frame), 3) self.assert_(frame['B'].dtype == np.object_) - self.assert_(frame['A'].dtype == np.float_) + self.assert_(frame['A'].dtype == np.float64) # can't cast to float test_data = { @@ -911,7 +966,7 @@ def test_constructor_dict_cast(self): frame = DataFrame(test_data, dtype=float) self.assertEqual(len(frame), 20) self.assert_(frame['A'].dtype == np.object_) - self.assert_(frame['B'].dtype == np.float_) + self.assert_(frame['B'].dtype == np.float64) def test_constructor_dict_dont_upcast(self): d = {'Col1': {'Row1': 'A String', 'Row2': np.nan}} @@ -933,7 +988,7 @@ def test_constructor_ndarray(self): # cast type frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2], dtype=int) - 
self.assert_(frame.values.dtype == np.int_) + self.assert_(frame.values.dtype == np.int64) # 1-D input frame = DataFrame(np.zeros(3), columns=['A'], index=[1, 2, 3]) @@ -982,16 +1037,16 @@ def test_constructor_corner(self): # does not error but ends up float df = DataFrame(index=range(10), columns=['a','b'], dtype=int) - self.assert_(df.values.dtype == np.float_) + self.assert_(df.values.dtype == np.float64) def test_constructor_scalar_inference(self): data = {'int' : 1, 'bool' : True, 'float' : 3., 'object' : 'foo'} df = DataFrame(data, index=np.arange(10)) - self.assert_(df['int'].dtype == np.int_) + self.assert_(df['int'].dtype == np.int64) self.assert_(df['bool'].dtype == np.bool_) - self.assert_(df['float'].dtype == np.float_) + self.assert_(df['float'].dtype == np.float64) self.assert_(df['object'].dtype == np.object_) def test_constructor_DataFrame(self): @@ -999,7 +1054,7 @@ def test_constructor_DataFrame(self): assert_frame_equal(df, self.frame) df_casted = DataFrame(self.frame, dtype=int) - self.assert_(df_casted.values.dtype == np.int_) + self.assert_(df_casted.values.dtype == np.int64) def test_constructor_more(self): # used to be in test_matrix.py @@ -1039,7 +1094,7 @@ def test_constructor_more(self): index=np.arange(10)) self.assertEqual(len(dm.columns), 2) - self.assert_(dm.values.dtype == np.float_) + self.assert_(dm.values.dtype == np.float64) def test_constructor_ragged(self): data = {'A' : randn(10), @@ -1056,6 +1111,14 @@ def test_constructor_Series_copy_bug(self): df = DataFrame(self.frame['A'], index=self.frame.index, columns=['A']) df.copy() + def test_constructor_mixed_dict_and_Series(self): + data = {} + data['A'] = {'foo' : 1, 'bar' : 2, 'baz' : 3} + data['B'] = Series([4, 3, 2, 1], index=['bar', 'qux', 'baz', 'foo']) + + result = DataFrame(data) + self.assert_(result.index.is_monotonic) + def test_astype(self): casted = self.frame.astype(int) expected = DataFrame(self.frame.values.astype(int), @@ -1198,10 +1261,26 @@ def test_repr(self): index=np.arange(50)) foo = repr(unsortable) - import pandas.core.common as common common.set_printoptions(precision=3, column_space=10) repr(self.frame) + def test_eng_float_formatter(self): + self.frame.ix[5] = 0 + + common.set_eng_float_format() + + repr(self.frame) + + common.set_eng_float_format(use_eng_prefix=True) + + repr(self.frame) + + common.set_eng_float_format(precision=0) + + repr(self.frame) + + common.set_printoptions(precision=4) + def test_repr_tuples(self): buf = StringIO() @@ -1247,20 +1326,25 @@ def test_to_string(self): biggie['A'][:20] = nan biggie['B'][:20] = nan + s = biggie.to_string() + buf = StringIO() - biggie.to_string(buf=buf) + retval = biggie.to_string(buf=buf) + self.assert_(retval is None) + self.assertEqual(buf.getvalue(), s) + + self.assert_(isinstance(s, basestring)) - biggie.to_string(buf=buf, columns=['B', 'A'], colSpace=17) - biggie.to_string(buf=buf, columns=['B', 'A'], - formatters={'A' : lambda x: '%.1f' % x}) + biggie.to_string(columns=['B', 'A'], colSpace=17) + biggie.to_string(columns=['B', 'A'], + formatters={'A' : lambda x: '%.1f' % x}) - biggie.to_string(buf=buf, columns=['B', 'A'], - float_format=str) - biggie.to_string(buf=buf, columns=['B', 'A'], colSpace=12, - float_format=str) + biggie.to_string(columns=['B', 'A'], float_format=str) + biggie.to_string(columns=['B', 'A'], colSpace=12, + float_format=str) frame = DataFrame(index=np.arange(1000)) - frame.to_string(buf=buf) + frame.to_string() def test_insert(self): df = DataFrame(np.random.randn(5, 3), index=np.arange(5), @@ 
-1354,6 +1438,13 @@ def test_arith_flex_frame(self): const_add = self.frame.add(1) assert_frame_equal(const_add, self.frame + 1) + # corner cases + result = self.frame.add(self.frame[:0]) + assert_frame_equal(result, self.frame * np.nan) + + result = self.frame[:0].add(self.frame) + assert_frame_equal(result, self.frame * np.nan) + def test_arith_flex_series(self): df = self.simple @@ -1543,7 +1634,7 @@ def test_to_csv_multiindex(self): # round trip frame.to_csv(path) - df = DataFrame.from_csv(path, index_col=[0,1]) + df = DataFrame.from_csv(path, index_col=[0,1], parse_dates=False) assert_frame_equal(frame, df) self.assertEqual(frame.index.names, df.index.names) @@ -1572,25 +1663,45 @@ def test_to_csv_multiindex(self): os.remove(path) + # empty + tsframe[:0].to_csv(path) + recons = DataFrame.from_csv(path) + assert_frame_equal(recons, tsframe[:0]) + def test_to_csv_float32_nanrep(self): df = DataFrame(np.random.randn(1, 4).astype(np.float32)) df[1] = np.nan pth = '__tmp__.csv' - df.to_csv(pth, nanRep=999) + df.to_csv(pth, na_rep=999) lines = open(pth).readlines() self.assert_(lines[1].split(',')[2] == '999') os.remove(pth) - + def test_to_csv_withcommas(self): - "Commas inside fields should be correctly escaped when saving as CSV." + path = '__tmp__' + # Commas inside fields should be correctly escaped when saving as CSV. + df = DataFrame({'A':[1,2,3], 'B':['5,6','7,8','9,0']}) df.to_csv(path) df2 = DataFrame.from_csv(path) assert_frame_equal(df2, df) - + + os.remove(path) + + def test_to_csv_bug(self): + from pandas import read_csv + path = '__tmp__.csv' + f1 = StringIO('a,1.0\nb,2.0') + df = DataFrame.from_csv(f1,header=None) + newdf = DataFrame({'t': df[df.columns[0]]}) + newdf.to_csv(path) + + recons = read_csv(path, index_col=0) + assert_frame_equal(recons, newdf) + os.remove(path) def test_info(self): @@ -1661,6 +1772,19 @@ def test_append_records(self): expected = DataFrame(np.concatenate((arr1, arr2))) assert_frame_equal(result, expected) + def test_append_different_columns(self): + df = DataFrame({'bools' : np.random.randn(10) > 0, + 'ints' : np.random.randint(0, 10, 10), + 'floats' : np.random.randn(10), + 'strings' : ['foo', 'bar'] * 5}) + + a = df[:5].ix[:, ['bools', 'ints', 'floats']] + b = df[5:].ix[:, ['strings', 'ints', 'floats']] + + appended = a.append(b) + self.assert_(isnull(appended['strings'][:5]).all()) + self.assert_(isnull(appended['bools'][5:]).all()) + def test_asfreq(self): offset_monthly = self.tsframe.asfreq(datetools.bmonthEnd) rule_monthly = self.tsframe.asfreq('EOM') @@ -2005,11 +2129,6 @@ def test_pivot_duplicates(self): data = DataFrame({'a' : ['bar', 'bar', 'foo', 'foo', 'foo'], 'b' : ['one', 'two', 'one', 'one', 'two'], 'c' : [1., 2., 3., 3., 4.]}) - # expected = DataFrame([[1., 2.], [3., 4.]], index=['bar', 'foo'], - # columns=['one', 'two']) - # result = data.pivot('a', 'b', 'c') - # assert_frame_equal(result, expected) - self.assertRaises(Exception, data.pivot, 'a', 'b', 'c') def test_reindex(self): @@ -2071,13 +2190,13 @@ def test_reindex(self): def test_reindex_int(self): smaller = self.intframe.reindex(self.intframe.index[::2]) - self.assert_(smaller['A'].dtype == np.int_) + self.assert_(smaller['A'].dtype == np.int64) bigger = smaller.reindex(self.intframe.index) - self.assert_(bigger['A'].dtype == np.float_) + self.assert_(bigger['A'].dtype == np.float64) smaller = self.intframe.reindex(columns=['A', 'B']) - self.assert_(smaller['A'].dtype == np.int_) + self.assert_(smaller['A'].dtype == np.int64) def test_reindex_like(self): other = 
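A small usage sketch of the dtype behaviour the test_reindex_int assertions above pin down: reindexing an integer column onto labels that all exist keeps the integer dtype, while reindexing onto labels that introduce missing rows upcasts to float64. The frame built here is illustrative and assumes the default 64-bit integer inference checked elsewhere in this patch.

import numpy as np
from pandas import DataFrame

df = DataFrame({'A': [1, 2, 3, 4]})      # inferred as int64
smaller = df.reindex([0, 2])             # subset of existing labels: stays integer
bigger = smaller.reindex([0, 1, 2, 3])   # labels 1 and 3 are missing -> NaN -> float64
assert bigger['A'].dtype == np.float64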
self.frame.reindex(index=self.frame.index[:10], @@ -2096,8 +2215,73 @@ def test_reindex_columns(self): newFrame = self.frame.reindex(columns=[]) self.assert_(not newFrame) - def test_reindex_mixed(self): - pass + def test_add_index(self): + df = DataFrame({'A' : ['foo', 'foo', 'foo', 'bar', 'bar'], + 'B' : ['one', 'two', 'three', 'one', 'two'], + 'C' : ['a', 'b', 'c', 'd', 'e'], + 'D' : np.random.randn(5), + 'E' : np.random.randn(5)}) + + # new object, single-column + result = df.set_index('C') + result_nodrop = df.set_index('C', drop=False) + + index = Index(df['C'], name='C') + + expected = df.ix[:, ['A', 'B', 'D', 'E']] + expected.index = index + + expected_nodrop = df.copy() + expected_nodrop.index = index + + assert_frame_equal(result, expected) + assert_frame_equal(result_nodrop, expected_nodrop) + self.assertEqual(result.index.name, index.name) + + # inplace, single + df2 = df.copy() + df2.set_index('C', inplace=True) + assert_frame_equal(df2, expected) + + df3 = df.copy() + df3.set_index('C', drop=False, inplace=True) + assert_frame_equal(df3, expected_nodrop) + + # create new object, multi-column + result = df.set_index(['A', 'B']) + result_nodrop = df.set_index(['A', 'B'], drop=False) + + index = MultiIndex.from_arrays([df['A'], df['B']], names=['A', 'B']) + + expected = df.ix[:, ['C', 'D', 'E']] + expected.index = index + + expected_nodrop = df.copy() + expected_nodrop.index = index + + assert_frame_equal(result, expected) + assert_frame_equal(result_nodrop, expected_nodrop) + self.assertEqual(result.index.names, index.names) + + # inplace + df2 = df.copy() + df2.set_index(['A', 'B'], inplace=True) + assert_frame_equal(df2, expected) + + df3 = df.copy() + df3.set_index(['A', 'B'], drop=False, inplace=True) + assert_frame_equal(df3, expected_nodrop) + + # corner case + self.assertRaises(Exception, df.set_index, 'A') + + def test_align(self): + + af, bf = self.frame.align(self.frame) + self.assert_(af._data is not self.frame._data) + + af, bf = self.frame.align(self.frame, copy=False) + self.assert_(af._data is self.frame._data) #---------------------------------------------------------------------- # Transposing @@ -2246,6 +2430,15 @@ def test_apply(self): applied = self.empty.apply(np.mean) self.assert_(not applied) + no_rows = self.frame[:0] + result = no_rows.apply(lambda x: x.mean()) + expected = Series(np.nan, index=self.frame.columns) + assert_series_equal(result, expected) + + no_cols = self.frame.ix[:, []] + result = no_cols.apply(lambda x: x.mean(), axis=1) + expected = Series(np.nan, index=self.frame.index) + assert_series_equal(result, expected) def test_apply_broadcast(self): broadcasted = self.frame.apply(np.mean, broadcast=True) @@ -2486,96 +2679,6 @@ def test_combineMult(self): comb = self.empty.combineMult(self.frame) assert_frame_equal(comb, self.frame) - def test_join_on(self): - index, data = tm.getMixedTypeDict() - target = DataFrame(data, index=index) - - # Join on string value - source = DataFrame({'MergedA' : data['A'], 'MergedD' : data['D']}, - index=data['C']) - merged = target.join(source, on='C') - self.assert_(np.array_equal(merged['MergedA'], target['A'])) - self.assert_(np.array_equal(merged['MergedD'], target['D'])) - - # join with duplicates (fix regression from DataFrame/Matrix merge) - df = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c']}) - df2 = DataFrame({'value' : [0, 1, 2]}, index=['a', 'b', 'c']) - joined = df.join(df2, on='key') - expected = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c'], - 'value' : [0, 0, 1, 1, 2]}) - 
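For orientation, a minimal usage sketch of the set_index behaviour exercised by test_add_index above: a single column or a list of columns becomes the (Multi)Index, drop=False keeps the columns in place, and inplace=True mutates the frame. Data here is illustrative.

from pandas import DataFrame

df = DataFrame({'A': ['foo', 'foo', 'bar'],
                'B': ['one', 'two', 'one'],
                'C': [1.0, 2.0, 3.0]})

single = df.set_index('A')             # 'A' becomes the index and is dropped
multi = df.set_index(['A', 'B'])       # hierarchical index from two columns
kept = df.set_index('A', drop=False)   # index set, column 'A' retained

df2 = df.copy()
df2.set_index('C', inplace=True)       # in-place variant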
assert_frame_equal(joined, expected) - - # Test when some are missing - df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], - columns=['one']) - df_b = DataFrame([['foo'], ['bar']], index=[1, 2], - columns=['two']) - df_c = DataFrame([[1], [2]], index=[1, 2], - columns=['three']) - joined = df_a.join(df_b, on='one') - joined = joined.join(df_c, on='one') - self.assert_(np.isnan(joined['two']['c'])) - self.assert_(np.isnan(joined['three']['c'])) - - # merge column not p resent - self.assertRaises(Exception, target.join, source, on='E') - - # corner cases - - # nothing to merge - merged = target.join(source.reindex([]), on='C') - - # overlap - source_copy = source.copy() - source_copy['A'] = 0 - self.assertRaises(Exception, target.join, source_copy, on='A') - - # can't specify how - self.assertRaises(Exception, target.join, source, on='C', - how='left') - - def test_join_index_mixed(self): - - df1 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True}, - index=np.arange(10), - columns=['A', 'B', 'C', 'D']) - self.assert_(df1['B'].dtype == np.int_) - self.assert_(df1['D'].dtype == np.bool_) - - df2 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True}, - index=np.arange(0, 10, 2), - columns=['A', 'B', 'C', 'D']) - - # overlap - joined = df1.join(df2, lsuffix='_one', rsuffix='_two') - expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', - 'A_two', 'B_two', 'C_two', 'D_two'] - df1.columns = expected_columns[:4] - df2.columns = expected_columns[4:] - expected = _join_by_hand(df1, df2) - assert_frame_equal(joined, expected) - - # no overlapping blocks - df1 = DataFrame(index=np.arange(10)) - df1['bool'] = True - df1['string'] = 'foo' - - df2 = DataFrame(index=np.arange(5, 15)) - df2['int'] = 1 - df2['float'] = 1. - - for kind in JOIN_TYPES: - joined = df1.join(df2, how=kind) - expected = _join_by_hand(df1, df2, how=kind) - assert_frame_equal(joined, expected) - - joined = df2.join(df1, how=kind) - expected = _join_by_hand(df2, df1, how=kind) - assert_frame_equal(joined, expected) - - def test_join_on_series(self): - pass - def test_clip(self): median = self.frame.median().median() @@ -2652,9 +2755,11 @@ def wrapper(x): def test_min(self): self._check_stat_op('min', np.min) + self._check_stat_op('min', np.min, frame=self.intframe) def test_max(self): self._check_stat_op('max', np.max) + self._check_stat_op('max', np.max, frame=self.intframe) def test_mad(self): f = lambda x: np.abs(x - x.mean()).mean() @@ -2889,7 +2994,7 @@ def test_reindex_corner(self): # ints are weird smaller = self.intframe.reindex(columns=['A', 'B', 'E']) - self.assert_(smaller['E'].dtype == np.float_) + self.assert_(smaller['E'].dtype == np.float64) def test_rename_objects(self): renamed = self.mixed_frame.rename(columns=str.upper) @@ -3098,6 +3203,176 @@ def test_take(self): expected = self.mixed_frame.ix[:, ['foo', 'B', 'C', 'A', 'D']] assert_frame_equal(result, expected) + def test_iterkv_names(self): + for k, v in self.mixed_frame.iterkv(): + self.assertEqual(v.name, k) + + def test_series_put_names(self): + series = self.mixed_frame._series + for k, v in series.iteritems(): + self.assertEqual(v.name, k) + + + +class TestDataFrameJoin(unittest.TestCase): + + def setUp(self): + index, data = tm.getMixedTypeDict() + self.target = DataFrame(data, index=index) + + # Join on string value + self.source = DataFrame({'MergedA' : data['A'], 'MergedD' : data['D']}, + index=data['C']) + + def test_join_on(self): + target = self.target + source = self.source + + merged = target.join(source, on='C') + 
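A short usage sketch of the join-on-column behaviour covered by test_join_on above: the values of df['key'] are looked up in the other frame's index, duplicates included, and keys missing from the other frame come back as NaN. The frames are illustrative.

from pandas import DataFrame

df = DataFrame({'key': ['a', 'a', 'b', 'c']})
df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

joined = df.join(df2, on='key')
# joined['value'] -> [0, 0, 1, 2]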
self.assert_(np.array_equal(merged['MergedA'], target['A'])) + self.assert_(np.array_equal(merged['MergedD'], target['D'])) + + # join with duplicates (fix regression from DataFrame/Matrix merge) + df = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c']}) + df2 = DataFrame({'value' : [0, 1, 2]}, index=['a', 'b', 'c']) + joined = df.join(df2, on='key') + expected = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c'], + 'value' : [0, 0, 1, 1, 2]}) + assert_frame_equal(joined, expected) + + # Test when some are missing + df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], + columns=['one']) + df_b = DataFrame([['foo'], ['bar']], index=[1, 2], + columns=['two']) + df_c = DataFrame([[1], [2]], index=[1, 2], + columns=['three']) + joined = df_a.join(df_b, on='one') + joined = joined.join(df_c, on='one') + self.assert_(np.isnan(joined['two']['c'])) + self.assert_(np.isnan(joined['three']['c'])) + + # merge column not p resent + self.assertRaises(Exception, target.join, source, on='E') + + # overlap + source_copy = source.copy() + source_copy['A'] = 0 + self.assertRaises(Exception, target.join, source_copy, on='A') + + def test_join_with_len0(self): + # nothing to merge + merged = self.target.join(self.source.reindex([]), on='C') + for col in self.source: + self.assert_(col in merged) + self.assert_(merged[col].isnull().all()) + + merged2 = self.target.join(self.source.reindex([]), on='C', + how='inner') + self.assert_(merged2.columns.equals(merged.columns)) + self.assertEqual(len(merged2), 0) + + def test_join_on_inner(self): + df = DataFrame({'key' : ['a', 'a', 'd', 'b', 'b', 'c']}) + df2 = DataFrame({'value' : [0, 1]}, index=['a', 'b']) + + joined = df.join(df2, on='key', how='inner') + + expected = df.join(df2, on='key') + expected = expected[expected['value'].notnull()] + self.assert_(np.array_equal(joined['key'], expected['key'])) + self.assert_(np.array_equal(joined['value'], expected['value'])) + self.assert_(joined.index.equals(expected.index)) + + def test_join_on_singlekey_list(self): + df = DataFrame({'key' : ['a', 'a', 'b', 'b', 'c']}) + df2 = DataFrame({'value' : [0, 1, 2]}, index=['a', 'b', 'c']) + + # corner cases + joined = df.join(df2, on=['key']) + expected = df.join(df2, on='key') + + assert_frame_equal(joined, expected) + + def test_join_on_multikey(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + to_join = DataFrame(np.random.randn(10, 3), index=index, + columns=['j_one', 'j_two', 'j_three']) + + # a little relevant example with NAs + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + data = DataFrame({'key1' : key1, 'key2' : key2, + 'data' : data}) + + joined = data.join(to_join, on=['key1', 'key2']) + + join_key = Index(zip(key1, key2)) + indexer = to_join.index.get_indexer(join_key) + ex_values = to_join.values.take(indexer, axis=0) + ex_values[indexer == -1] = np.nan + expected = data.join(DataFrame(ex_values, columns=to_join.columns)) + + # TODO: columns aren't in the same order yet + assert_frame_equal(joined, expected.ix[:, joined.columns]) + + def test_join_on_series(self): + result = self.target.join(self.source['MergedA'], on='C') + expected = self.target.join(self.source[['MergedA']], on='C') + assert_frame_equal(result, expected) + + def test_join_index_mixed(self): + 
+ df1 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True}, + index=np.arange(10), + columns=['A', 'B', 'C', 'D']) + self.assert_(df1['B'].dtype == np.int64) + self.assert_(df1['D'].dtype == np.bool_) + + df2 = DataFrame({'A' : 1., 'B' : 2, 'C' : 'foo', 'D' : True}, + index=np.arange(0, 10, 2), + columns=['A', 'B', 'C', 'D']) + + # overlap + joined = df1.join(df2, lsuffix='_one', rsuffix='_two') + expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', + 'A_two', 'B_two', 'C_two', 'D_two'] + df1.columns = expected_columns[:4] + df2.columns = expected_columns[4:] + expected = _join_by_hand(df1, df2) + assert_frame_equal(joined, expected) + + # no overlapping blocks + df1 = DataFrame(index=np.arange(10)) + df1['bool'] = True + df1['string'] = 'foo' + + df2 = DataFrame(index=np.arange(5, 15)) + df2['int'] = 1 + df2['float'] = 1. + + for kind in JOIN_TYPES: + joined = df1.join(df2, how=kind) + expected = _join_by_hand(df1, df2, how=kind) + assert_frame_equal(joined, expected) + + joined = df2.join(df1, how=kind) + expected = _join_by_hand(df2, df1, how=kind) + assert_frame_equal(joined, expected) + + def test_join_empty_bug(self): + # generated an exception in 0.4.3 + x = DataFrame() + x.join(DataFrame([3], index=[0], columns=['A']), how='outer') + def _join_by_hand(a, b, how='left'): join_index = a.index.join(b.index, how=how) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 8caae56dd335c..d6041b8e80998 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -56,7 +56,8 @@ def setUp(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]]) + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) self.mframe = DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) @@ -110,6 +111,11 @@ def test_agg_regression1(self): expected = grouped.mean() assert_frame_equal(result, expected) + def test_agg_must_add(self): + grouped = self.df.groupby('A')['C'] + self.assertRaises(Exception, grouped.agg, lambda x: x.describe()) + self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2]) + def test_get_group(self): wp = tm.makePanel() grouped = wp.groupby(lambda x: x.month, axis='major') @@ -297,7 +303,7 @@ def test_series_describe_multikey(self): def test_series_describe_single(self): ts = tm.makeTimeSeries() grouped = ts.groupby(lambda x: x.month) - result = grouped.agg(lambda x: x.describe()) + result = grouped.apply(lambda x: x.describe()) expected = grouped.describe() assert_frame_equal(result, expected) @@ -655,7 +661,7 @@ def _testit(op): def test_grouping_attrs(self): deleveled = self.mframe.delevel() - grouped = deleveled.groupby(['level_0', 'level_1']) + grouped = deleveled.groupby(['first', 'second']) for i, ping in enumerate(grouped.groupings): the_counts = self.mframe.groupby(level=i).count()['A'] @@ -668,12 +674,20 @@ def test_groupby_level(self): result0 = frame.groupby(level=0).sum() result1 = frame.groupby(level=1).sum() - expected0 = frame.groupby(deleveled['level_0']).sum() - expected1 = frame.groupby(deleveled['level_1']).sum() + expected0 = frame.groupby(deleveled['first']).sum() + expected1 = frame.groupby(deleveled['second']).sum() + + assert_frame_equal(result0, expected0) + assert_frame_equal(result1, expected1) + # groupby level name + result0 = frame.groupby(level='first').sum() + result1 = frame.groupby(level='second').sum() assert_frame_equal(result0, expected0) assert_frame_equal(result1, 
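A brief illustration of grouping by a named index level, as the level-name cases in test_groupby_level above verify: level='first' is equivalent to level=0 once the MultiIndex carries names. The frame is illustrative.

import numpy as np
from pandas import DataFrame, MultiIndex

index = MultiIndex.from_arrays([['foo', 'foo', 'bar', 'bar'],
                                ['one', 'two', 'one', 'two']],
                               names=['first', 'second'])
frame = DataFrame({'A': np.arange(4.0)}, index=index)

by_position = frame.groupby(level=0).sum()
by_name = frame.groupby(level='first').sum()   # same grouping, by level name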
expected1) + # axis=1 + result0 = frame.T.groupby(level=0, axis=1).sum() result1 = frame.T.groupby(level=1, axis=1).sum() assert_frame_equal(result0, expected0.T) @@ -693,8 +707,8 @@ def test_groupby_level_mapper(self): result0 = frame.groupby(mapper0, level=0).sum() result1 = frame.groupby(mapper1, level=1).sum() - mapped_level0 = np.array([mapper0.get(x) for x in deleveled['level_0']]) - mapped_level1 = np.array([mapper1.get(x) for x in deleveled['level_1']]) + mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']]) + mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']]) expected0 = frame.groupby(mapped_level0).sum() expected1 = frame.groupby(mapped_level1).sum() @@ -729,7 +743,8 @@ def test_apply_frame_to_series(self): grouped = self.df.groupby(['A', 'B']) result = grouped.apply(len) expected = grouped.count()['C'] - assert_series_equal(result, expected) + self.assert_(result.index.equals(expected.index)) + self.assert_(np.array_equal(result.values, expected.values)) def test_apply_transform(self): grouped = self.ts.groupby(lambda x: x.month) @@ -863,6 +878,67 @@ def test_grouping_ndarray(self): expected = self.df.groupby('A').sum() assert_frame_equal(result, expected) + def test_apply_typecast_fail(self): + df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.], + 'c' : np.tile(['a','b','c'], 2), + 'v' : np.arange(1., 7.)}) + + def f(group): + v = group['v'] + group['v2'] = (v - v.min()) / (v.max() - v.min()) + return group + + result = df.groupby('d').apply(f) + + expected = df.copy() + expected['v2'] = np.tile([0., 0.5, 1], 2) + + assert_frame_equal(result, expected) + + def test_apply_multiindex_fail(self): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [1, 2, 3, 1, 2, 3]]) + df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.], + 'c' : np.tile(['a','b','c'], 2), + 'v' : np.arange(1., 7.)}, index=index) + + def f(group): + v = group['v'] + group['v2'] = (v - v.min()) / (v.max() - v.min()) + return group + + result = df.groupby('d').apply(f) + + expected = df.copy() + expected['v2'] = np.tile([0., 0.5, 1], 2) + + assert_frame_equal(result, expected) + + def test_apply_corner(self): + result = self.tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2) + expected = self.tsframe * 2 + assert_frame_equal(result, expected) + + def test_transform_mixed_type(self): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [1, 2, 3, 1, 2, 3]]) + df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.], + 'c' : np.tile(['a','b','c'], 2), + 'v' : np.arange(1., 7.)}, index=index) + + def f(group): + group['g'] = group['d'] * 2 + return group[:1] + + grouped = df.groupby('c') + result = grouped.apply(f) + + self.assert_(result['d'].dtype == np.float64) + + for key, group in grouped: + res = f(group) + assert_frame_equal(res, result.ix[key]) + class TestPanelGroupBy(unittest.TestCase): def setUp(self): diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 7a37780b01984..38365ddfaa1b4 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -33,7 +33,8 @@ def test_deepcopy(self): def test_duplicates(self): idx = Index([0, 0, 0]) - self.assertRaises(Exception, idx._verify_integrity) + self.assert_(not idx._verify_integrity()) + self.assertRaises(Exception, getattr, idx, 'indexMap') def test_sort(self): self.assertRaises(Exception, self.strIndex.sort) @@ -174,7 +175,7 @@ def test_add(self): secondCat = self.strIndex + self.strIndex self.assert_(tm.equalContents(np.append(self.strIndex, - self.dateIndex), firstCat)) + self.dateIndex), firstCat)) 
self.assert_(tm.equalContents(secondCat, self.strIndex)) tm.assert_contains_all(self.strIndex, firstCat.indexMap) tm.assert_contains_all(self.strIndex, secondCat.indexMap) @@ -183,6 +184,17 @@ def test_add(self): # this is valid too shifted = self.dateIndex + timedelta(1) + def test_append_multiple(self): + index = Index(['a', 'b', 'c', 'd', 'e', 'f']) + + foos = [index[:2], index[2:4], index[4:]] + result = foos[0].append(foos[1:]) + self.assert_(result.equals(index)) + + # empty + result = index.append([]) + self.assert_(result.equals(index)) + def test_add_string(self): # from bug report index = Index(['a', 'b', 'c']) @@ -556,6 +568,11 @@ def test_prevent_casting(self): result = self.index.astype('O') self.assert_(result.dtype == np.object_) + def test_take_preserve_name(self): + index = Int64Index([1,2,3,4], name='foo') + taken = index.take([3,0,1]) + self.assertEqual(index.name, taken.name) + class TestMultiIndex(unittest.TestCase): def setUp(self): @@ -577,6 +594,17 @@ def test_constructor_single_level(self): self.assert_(not isinstance(single_level, MultiIndex)) self.assert_(single_level.name == 'first') + single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], + labels=[[0, 1, 2, 3]]) + self.assert_(single_level.name is None) + + def test_constructor_no_levels(self): + self.assertRaises(Exception, MultiIndex, levels=[], labels=[]) + + def test_duplicate_names(self): + self.index.names = ['foo', 'foo'] + self.assertRaises(Exception, self.index._get_level_number, 'foo') + def test_from_arrays(self): arrays = [] for lev, lab in zip(self.index.levels, self.index.labels): @@ -589,6 +617,14 @@ def test_append(self): result = self.index[:3].append(self.index[3:]) self.assert_(result.equals(self.index)) + foos = [self.index[:1], self.index[1:3], self.index[3:]] + result = foos[0].append(foos[1:]) + self.assert_(result.equals(self.index)) + + # empty + result = self.index.append([]) + self.assert_(result.equals(self.index)) + def test_get_level_values(self): result = self.index.get_level_values(0) expected = ['foo', 'foo', 'bar', 'baz', 'qux', 'qux'] @@ -827,9 +863,17 @@ def test_equals(self): self.assert_(not self.index.equals(self.index.get_tuple_index())) # different number of levels - index = MultiIndex(levels=self.index.levels[:-1], - labels=self.index.labels[:-1]) - self.assert_(not self.index.equals(index)) + index = MultiIndex(levels=[Index(range(4)), + Index(range(4)), + Index(range(4))], + labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0])]) + + index2 = MultiIndex(levels=index.levels[:-1], + labels=index.labels[:-1]) + self.assert_(not index.equals(index2)) + self.assert_(not index.equal_levels(index2)) # levels are different major_axis = Index(range(4)) @@ -858,7 +902,7 @@ def test_union(self): piece1 = self.index[:5][::-1] piece2 = self.index[3:] - the_union = piece1.union(piece2) + the_union = piece1 | piece2 tups = sorted(self.index.get_tuple_index()) expected = MultiIndex.from_tuples(tups) @@ -872,14 +916,27 @@ def test_union(self): the_union = self.index.union(self.index[:0]) self.assert_(the_union is self.index) - self.assertRaises(TypeError, self.index.union, - self.index.get_tuple_index()) + # won't work in python 3 + # tuples = self.index.get_tuple_index() + # result = self.index[:4] | tuples[4:] + # self.assert_(result.equals(tuples)) + + # not valid for python 3 + # def test_union_with_regular_index(self): + # other = Index(['A', 'B', 'C']) + + # result = other.union(self.index) + # 
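A quick sketch of the Index.append behaviour asserted in test_append_multiple above: append accepts either a single Index or a list of them, and appending an empty list hands back an equivalent index. Data is illustrative.

from pandas import Index

index = Index(['a', 'b', 'c', 'd', 'e', 'f'])
pieces = [index[:2], index[2:4], index[4:]]

assert pieces[0].append(pieces[1:]).equals(index)
assert index.append([]).equals(index)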
self.assert_(('foo', 'one') in result) + # self.assert_('B' in result) + + # result2 = self.index.union(other) + # self.assert_(result.equals(result2)) def test_intersection(self): piece1 = self.index[:5][::-1] piece2 = self.index[3:] - the_int = piece1.intersection(piece2) + the_int = piece1 & piece2 tups = sorted(self.index[3:5].get_tuple_index()) expected = MultiIndex.from_tuples(tups) self.assert_(the_int.equals(expected)) @@ -888,8 +945,60 @@ def test_intersection(self): the_int = self.index.intersection(self.index) self.assert_(the_int is self.index) - self.assertRaises(TypeError, self.index.intersection, - self.index.get_tuple_index()) + # empty intersection: disjoint + empty = self.index[:2] & self.index[2:] + expected = self.index[:0] + self.assert_(empty.equals(expected)) + + # can't do in python 3 + # tuples = self.index.get_tuple_index() + # result = self.index & tuples + # self.assert_(result.equals(tuples)) + + def test_diff(self): + first = self.index + result = first - self.index[-3:] + expected = MultiIndex.from_tuples(sorted(self.index[:-3].values), + sortorder=0, + names=self.index.names) + + self.assert_(isinstance(result, MultiIndex)) + self.assert_(result.equals(expected)) + self.assertEqual(result.names, self.index.names) + + # empty difference: reflexive + result = self.index - self.index + expected = self.index[:0] + self.assert_(result.equals(expected)) + self.assertEqual(result.names, self.index.names) + + # empty difference: superset + result = self.index[-3:] - self.index + expected = self.index[:0] + self.assert_(result.equals(expected)) + self.assertEqual(result.names, self.index.names) + + # empty difference: degenerate + result = self.index[:0] - self.index + expected = self.index[:0] + self.assert_(result.equals(expected)) + self.assertEqual(result.names, self.index.names) + + # names not the same + chunklet = self.index[-3:] + chunklet.names = ['foo', 'baz'] + result = first - chunklet + self.assertEqual(result.names, [None, None]) + + # empty, but non-equal + result = self.index - self.index.sortlevel(1)[0] + self.assert_(len(result) == 0) + + # raise Exception called with non-MultiIndex + self.assertRaises(Exception, first.diff, first.get_tuple_index()) + + def test_from_tuples(self): + self.assertRaises(Exception, MultiIndex.from_tuples, []) def test_argsort(self): result = self.index.argsort() @@ -967,6 +1076,10 @@ def test_insert(self): # key wrong length self.assertRaises(Exception, self.index.insert, 0, ('foo2',)) + def test_take_preserve_name(self): + taken = self.index.take([3,0,1]) + self.assertEqual(taken.names, self.index.names) + class TestFactor(unittest.TestCase): def setUp(self): diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 4603a07d294e1..f29fbad9d790b 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -227,7 +227,7 @@ def test_set_change_dtype(self): self.assert_(mgr2.get('baz').dtype == np.object_) mgr2.set('quux', randn(N).astype(int)) - self.assert_(mgr2.get('quux').dtype == np.int_) + self.assert_(mgr2.get('quux').dtype == np.int64) mgr2.set('quux', randn(N)) self.assert_(mgr2.get('quux').dtype == np.float_) @@ -249,7 +249,7 @@ def test_as_matrix_int_bool(self): blocks = [get_int_ex(['a']), get_int_ex(['b'])] mgr = BlockManager.from_blocks(blocks, np.arange(index_sz)) - self.assert_(mgr.as_matrix().dtype == np.int_) + self.assert_(mgr.as_matrix().dtype == np.int64) def test_xs(self): pass diff --git a/pandas/tests/test_multilevel.py 
b/pandas/tests/test_multilevel.py index b6cfac1a7b023..69ad9f6996b65 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -5,7 +5,7 @@ from numpy.random import randn import numpy as np -from pandas.core.index import MultiIndex +from pandas.core.index import Index, MultiIndex from pandas import Panel, DataFrame, Series, notnull, isnull from pandas.util.testing import (assert_almost_equal, @@ -23,7 +23,7 @@ def setUp(self): [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) self.frame = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + columns=Index(['A', 'B', 'C'], name='exp')) self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], labels=[[0, 1, 2, 3]], @@ -75,6 +75,10 @@ def test_reindex_preserve_levels(self): chunk = ymdT.ix[:, new_index] self.assert_(chunk.columns is new_index) + def test_sort_index_preserve_levels(self): + result = self.frame.sort_index() + self.assertEquals(result.index.names, self.frame.index.names) + def test_repr_to_string(self): repr(self.frame) repr(self.ymd) @@ -277,6 +281,24 @@ def _check_counts(frame, axis=0): df = tm.makeTimeDataFrame() self.assertRaises(Exception, df.count, level=0) + def test_count_level_series(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz'], + ['one', 'two', 'three', 'four']], + labels=[[0, 0, 0, 2, 2], + [2, 0, 1, 1, 2]]) + + s = Series(np.random.randn(len(index)), index=index) + + result = s.count(level=0) + expected = s.groupby(level=0).count() + assert_series_equal(result.astype('f8'), + expected.reindex(result.index).fillna(0)) + + result = s.count(level=1) + expected = s.groupby(level=1).count() + assert_series_equal(result.astype('f8'), + expected.reindex(result.index).fillna(0)) + def test_count_level_corner(self): s = self.frame['A'][:0] result = s.count(level=0) @@ -346,6 +368,10 @@ def test_stack(self): ymd_stacked = self.ymd.stack() assert_series_equal(stacked, ymd_stacked.reindex(stacked.index)) + # stack with negative number + result = self.ymd.unstack(0).stack(-2) + expected = self.ymd.unstack(0).stack(0) + def test_stack_mixed_dtype(self): df = self.frame.T df['foo', 'four'] = 'foo' @@ -372,11 +398,26 @@ def test_unstack_bug(self): def test_stack_unstack_preserve_names(self): unstacked = self.frame.unstack() self.assertEquals(unstacked.index.name, 'first') - self.assertEquals(unstacked.columns.names, [None, 'second']) + self.assertEquals(unstacked.columns.names, ['exp', 'second']) restacked = unstacked.stack() self.assertEquals(restacked.index.names, self.frame.index.names) + def test_unstack_level_name(self): + result = self.frame.unstack('second') + expected = self.frame.unstack(level=1) + assert_frame_equal(result, expected) + + def test_stack_level_name(self): + unstacked = self.frame.unstack('second') + result = unstacked.stack('exp') + expected = self.frame.unstack().stack(0) + assert_frame_equal(result, expected) + + result = self.frame.stack('exp') + expected = self.frame.stack() + assert_series_equal(result, expected) + def test_groupby_transform(self): s = self.frame['A'] grouper = s.index.get_level_values(0) diff --git a/pandas/tests/test_ndframe.py b/pandas/tests/test_ndframe.py index e09d78b177a23..70a5d79d2c428 100644 --- a/pandas/tests/test_ndframe.py +++ b/pandas/tests/test_ndframe.py @@ -13,15 +13,15 @@ def setUp(self): def test_constructor(self): # with cast - ndf = NDFrame(self.ndf._data, dtype=int) - self.assert_(ndf.values.dtype == np.int_) + ndf = NDFrame(self.ndf._data, dtype=np.int64) + 
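A compact usage sketch of stacking and unstacking by level name, which the test_unstack_level_name / test_stack_level_name cases above check: once index and column levels carry names, those names can stand in for integer level numbers. Names and data here are illustrative.

import numpy as np
from pandas import DataFrame, Index, MultiIndex

index = MultiIndex.from_arrays([['foo', 'foo', 'bar', 'bar'],
                                ['one', 'two', 'one', 'two']],
                               names=['first', 'second'])
frame = DataFrame(np.arange(8.0).reshape(4, 2), index=index,
                  columns=Index(['A', 'B'], name='exp'))

unstacked = frame.unstack('second')    # same as frame.unstack(level=1)
restacked = unstacked.stack('exp')     # stack the named column level back in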
self.assert_(ndf.values.dtype == np.int64) def test_ndim(self): self.assertEquals(self.ndf.ndim, 2) def test_astype(self): casted = self.ndf.astype(int) - self.assert_(casted.values.dtype == np.int_) + self.assert_(casted.values.dtype == np.int64) if __name__ == '__main__': import nose diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index fc5cdbcac1603..e54485f2e2059 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -419,6 +419,11 @@ def test_xs(self): itemA_view.values[:] = np.nan self.assert_(np.isnan(self.panel['ItemA'].values).all()) + # mixed-type + self.panel['strings'] = 'foo' + self.assertRaises(Exception, self.panel.xs, 'D', axis=2, + copy=False) + def test_getitem_fancy_labels(self): p = self.panel @@ -560,6 +565,12 @@ def test_constructor_cast(self): data = [[['foo', 'bar', 'baz']]] self.assertRaises(ValueError, Panel, data, dtype=float) + def test_constructor_empty_panel(self): + empty = Panel() + self.assert_(len(empty.items) == 0) + self.assert_(len(empty.major_axis) == 0) + self.assert_(len(empty.minor_axis) == 0) + def test_consolidate(self): self.assert_(self.panel._data.is_consolidated()) @@ -664,6 +675,20 @@ def test_reindex_like(self): smaller_like = self.panel.reindex_like(smaller) assert_panel_equal(smaller, smaller_like) + def test_take(self): + # axis == 0 + result = self.panel.take([2, 0, 1], axis=0) + expected = self.panel.reindex(items=['ItemC', 'ItemA', 'ItemB']) + assert_panel_equal(result, expected) + + # axis >= 1 + result = self.panel.take([3, 0, 1, 2], axis=2) + expected = self.panel.reindex(minor=['D', 'A', 'B', 'C']) + assert_panel_equal(result, expected) + + self.assertRaises(Exception, self.panel.take, [3, -1, 1, 2], axis=2) + self.assertRaises(Exception, self.panel.take, [4, 0, 1, 2], axis=2) + def test_sort_index(self): import random @@ -820,6 +845,30 @@ def test_join_overlap(self): expected = p1_suf.join(p2_suf).join(no_overlap) assert_panel_equal(joined, expected) + def test_repr_empty(self): + empty = Panel() + repr(empty) + + def test_rename(self): + mapper = { + 'ItemA' : 'foo', + 'ItemB' : 'bar', + 'ItemC' : 'baz' + } + + renamed = self.panel.rename_axis(mapper, axis=0) + exp = Index(['foo', 'bar', 'baz']) + self.assert_(renamed.items.equals(exp)) + + renamed = self.panel.rename_axis(str.lower, axis=2) + exp = Index(['a', 'b', 'c', 'd']) + self.assert_(renamed.minor_axis.equals(exp)) + + # don't copy + renamed_nocopy = self.panel.rename_axis(mapper, axis=0, copy=False) + renamed_nocopy['foo'] = 3. 
+ self.assert_((self.panel['ItemA'].values == 3).all()) + class TestLongPanel(unittest.TestCase): def setUp(self): @@ -955,6 +1004,17 @@ def test_combine_scalar(self): expected = DataFrame(self.panel._data) * 2 assert_frame_equal(result, expected) + def test_combine_series(self): + s = self.panel['ItemA'][:10] + result = self.panel.add(s, axis=0) + expected = DataFrame.add(self.panel, s, axis=0) + assert_frame_equal(result, expected) + + s = self.panel.ix[5] + result = self.panel + s + expected = DataFrame.add(self.panel, s, axis=1) + assert_frame_equal(result, expected) + def test_operators(self): wp = self.panel.to_wide() result = (self.panel + 1).to_wide() @@ -970,9 +1030,6 @@ def is_sorted(arr): sorted_major = sorted_minor.sortlevel(level=0) self.assert_(is_sorted(sorted_major.major_labels)) - def test_to_wide(self): - pass - def test_toCSV(self): self.panel.toCSV('__tmp__') os.remove('__tmp__') @@ -1138,6 +1195,7 @@ def test_pivot(self): # corner case, empty df = pivot(np.array([]), np.array([]), np.array([])) + def test_group_agg(): values = np.ones((10, 2)) * np.arange(10).reshape((10, 1)) bounds = np.arange(5) * 2 diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index f8b865ab7f860..6c909f6d40ab3 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -157,6 +157,11 @@ def test_constructor_dict(self): expected = Series([1, 2, nan, 0], index=['b', 'c', 'd', 'a']) assert_series_equal(result, expected) + def test_constructor_tuples(self): + data = [(1, 1), (2, 2), (2, 3)] + s = Series(data) + self.assertEqual(list(s), data) + def test_fromDict(self): data = {'a' : 0, 'b' : 1, 'c' : 2, 'd' : 3} @@ -244,6 +249,17 @@ def test_getitem_regression(self): result = s[range(5)] assert_series_equal(result, s) + def test_getitem_slice_bug(self): + s = Series(range(10), range(10)) + result = s[-12:] + assert_series_equal(result, s) + + result = s[-7:] + assert_series_equal(result, s[3:]) + + result = s[:-12] + assert_series_equal(result, s[:0]) + def test_getitem_int64(self): idx = np.int64(5) self.assertEqual(self.ts[idx], self.ts[5]) @@ -426,7 +442,13 @@ def test_repr(self): def test_to_string(self): from cStringIO import StringIO - self.ts.to_string(buffer=StringIO()) + buf = StringIO() + + s = self.ts.to_string() + + retval = self.ts.to_string(buf=buf) + self.assert_(retval is None) + self.assertEqual(buf.getvalue().strip(), s) def test_iter(self): for i, val in enumerate(self.series): @@ -545,6 +567,13 @@ def test_describe(self): _ = self.series.describe() _ = self.ts.describe() + def test_describe_objects(self): + s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a']) + result = s.describe() + expected = Series({'count' : 7, 'unique' : 4, + 'top' : 'a', 'freq' : 3}, index=result.index) + assert_series_equal(result, expected) + def test_append(self): appendedSeries = self.series.append(self.ts) for idx, value in appendedSeries.iteritems(): @@ -752,6 +781,23 @@ def test_count(self): self.assertEqual(self.ts.count(), np.isfinite(self.ts).sum()) + def test_value_counts(self): + s = Series(['a', 'b', 'b', 'b', 'b', 'a', 'c', 'd', 'd', 'a']) + hist = s.value_counts() + expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) + assert_series_equal(hist, expected) + + # handle NA's properly + s[5:7] = np.nan + hist = s.value_counts() + expected = s.dropna().value_counts() + assert_series_equal(hist, expected) + + s = Series({}) + hist = s.value_counts() + expected = Series([]) + assert_series_equal(hist, expected) + def 
test_sort(self): ts = self.ts.copy() ts.sort() @@ -820,6 +866,11 @@ def test_clip(self): self.assertEqual(self.ts.clip(lower=val).min(), val) self.assertEqual(self.ts.clip(upper=val).max(), val) + result = self.ts.clip(-0.5, 0.5) + expected = np.clip(self.ts, -0.5, 0.5) + assert_series_equal(result, expected) + self.assert_(isinstance(expected, Series)) + def test_valid(self): ts = self.ts.copy() ts[::2] = np.NaN @@ -1025,6 +1076,11 @@ def test_align_nocopy(self): rb[:2] = 5 self.assert_((b[:2] == 5).all()) + def test_align_sameindex(self): + a, b = self.ts.align(self.ts) + self.assert_(a.index is self.ts.index) + self.assert_(b.index is self.ts.index) + def test_reindex(self): identity = self.series.reindex(self.series.index) self.assertEqual(id(self.series.index), id(identity.index)) @@ -1283,6 +1339,11 @@ def test_first_last_valid(self): self.assert_(ser.last_valid_index() is None) self.assert_(ser.first_valid_index() is None) + def test_mpl_compat_hack(self): + result = self.ts[:, np.newaxis] + expected = self.ts.values[:, np.newaxis] + assert_almost_equal(result, expected) + #------------------------------------------------------------------------------- # GroupBy diff --git a/pandas/tests/test_sparse.py b/pandas/tests/test_sparse.py index 3ac04eb4dc385..a361dc70ef39f 100644 --- a/pandas/tests/test_sparse.py +++ b/pandas/tests/test_sparse.py @@ -250,7 +250,7 @@ def test_copy_astype(self): self.assert_((self.bseries.sp_values[:5] == 5).all()) def test_astype(self): - self.assertRaises(Exception, self.bseries.astype, np.int_) + self.assertRaises(Exception, self.bseries.astype, np.int64) def test_kind(self): self.assertEquals(self.bseries.kind, 'block') @@ -958,7 +958,7 @@ def test_applymap(self): self.assert_(isinstance(result, SparseDataFrame)) def test_astype(self): - self.assertRaises(Exception, self.frame.astype, np.int_) + self.assertRaises(Exception, self.frame.astype, np.int64) def test_fillna(self): self.assertRaises(NotImplementedError, self.frame.fillna, 0) diff --git a/pandas/tools/__init__.py b/pandas/tools/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py new file mode 100644 index 0000000000000..9090e8c09eed4 --- /dev/null +++ b/pandas/tools/pivot.py @@ -0,0 +1,104 @@ +from pandas import DataFrame +import numpy as np + +def pivot_table(data, values=None, rows=None, cols=None, aggfunc=np.mean, + fill_value=None): + """ + Create a spreadsheet-style pivot table as a DataFrame. The levels in the + pivot table will be stored in MultiIndex objects (hierarchical indexes) on + the index and columns of the result DataFrame + + Parameters + ---------- + data : DataFrame + values : column to aggregate, optional + rows : list + Columns to group on the x-axis of the pivot table + cols : list + Columns to group on the x-axis of the pivot table + aggfunc : function, default numpy.mean + fill_value : scalar, default None + Value to replace missing values with + + Examples + -------- + >>> df + A B C D + 0 foo one small 1 + 1 foo one large 2 + 2 foo one large 2 + 3 foo two small 3 + 4 foo two small 3 + 5 bar one large 4 + 6 bar one small 5 + 7 bar two small 6 + 8 bar two large 7 + + >>> table = pivot_table(df, values='D', rows=['A', 'B'], + ... 
cols=['C'], aggfunc=np.sum) + >>> table + small large + foo one 1 4 + two 6 NaN + bar one 5 4 + two 6 7 + + Returns + ------- + table : DataFrame + """ + rows = _convert_by(rows) + cols = _convert_by(cols) + + keys = rows + cols + grouped = data.groupby(keys) + + if values is not None: + grouped = grouped[values] + + agged = grouped.agg(aggfunc) + + table = agged + for k in cols: + table = table.unstack(level=k) + + if fill_value is not None: + table = table.fillna(value=fill_value) + + return table + +def _convert_by(by): + if by is None: + by = [] + elif np.isscalar(by): + by = [by] + else: + by = list(by) + return by + +def pprint_table(table): + pass + +if __name__ == '__main__': + def _sample(values, n): + indexer = np.random.randint(0, len(values), n) + return np.asarray(values).take(indexer) + + levels = [['a', 'b', 'c', 'd'], + ['foo', 'bar', 'baz'], + ['one', 'two'], + ['US', 'JP', 'UK']] + names = ['k1', 'k2', 'k3', 'k4'] + + n = 100000 + + data = {} + for name, level in zip(names, levels): + data[name] = _sample(level, n) + + data['values'] = np.random.randn(n) + data = DataFrame(data) + + table = pivot_table(data, values='values', + rows=['k1', 'k2'], cols=['k3', 'k4']) + diff --git a/pandas/tools/tests/__init__.py b/pandas/tools/tests/__init__.py new file mode 100644 index 0000000000000..8b137891791fe --- /dev/null +++ b/pandas/tools/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py new file mode 100644 index 0000000000000..b1cf2546817b2 --- /dev/null +++ b/pandas/tools/tests/test_pivot.py @@ -0,0 +1,54 @@ +import unittest + +import numpy as np + +from pandas import DataFrame +from pandas.tools.pivot import pivot_table +from pandas.util.testing import assert_frame_equal + +class TestPivotTable(unittest.TestCase): + + def setUp(self): + self.data = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B' : ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C' : ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D' : np.random.randn(11), + 'E' : np.random.randn(11)}) + + def test_pivot_table(self): + rows = ['A', 'B'] + cols= 'C' + table = pivot_table(self.data, values='D', rows=rows, cols=cols) + + if len(rows) > 1: + self.assertEqual(table.index.names, rows) + else: + self.assertEqual(table.index.name, rows[0]) + + if len(cols) > 1: + self.assertEqual(table.columns.names, cols) + else: + self.assertEqual(table.columns.name, cols[0]) + + expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack() + assert_frame_equal(table, expected) + + def test_pivot_table_multiple(self): + rows = ['A', 'B'] + cols= 'C' + table = pivot_table(self.data, rows=rows, cols=cols) + expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack() + assert_frame_equal(table, expected) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + + diff --git a/pandas/util/counter.py b/pandas/util/counter.py new file mode 100644 index 0000000000000..f23f6e6fbbad1 --- /dev/null +++ b/pandas/util/counter.py @@ -0,0 +1,290 @@ +# This is copied from collections in Python 2.7, for compatibility with older +# versions of Python. 
It can be dropped when we depend on Python 2.7/3.1 + +import heapq as _heapq +from itertools import repeat as _repeat, chain as _chain, starmap as _starmap +from operator import itemgetter as _itemgetter + +try: + from collections import Mapping +except: + # ABCs were only introduced in Python 2.6, so this is a hack for Python 2.5: + Mapping = dict + +class Counter(dict): + '''Dict subclass for counting hashable items. Sometimes called a bag + or multiset. Elements are stored as dictionary keys and their counts + are stored as dictionary values. + + >>> c = Counter('abcdeabcdabcaba') # count elements from a string + + >>> c.most_common(3) # three most common elements + [('a', 5), ('b', 4), ('c', 3)] + >>> sorted(c) # list all unique elements + ['a', 'b', 'c', 'd', 'e'] + >>> ''.join(sorted(c.elements())) # list elements with repetitions + 'aaaaabbbbcccdde' + >>> sum(c.values()) # total of all counts + 15 + + >>> c['a'] # count of letter 'a' + 5 + >>> for elem in 'shazam': # update counts from an iterable + ... c[elem] += 1 # by adding 1 to each element's count + >>> c['a'] # now there are seven 'a' + 7 + >>> del c['b'] # remove all 'b' + >>> c['b'] # now there are zero 'b' + 0 + + >>> d = Counter('simsalabim') # make another counter + >>> c.update(d) # add in the second counter + >>> c['a'] # now there are nine 'a' + 9 + + >>> c.clear() # empty the counter + >>> c + Counter() + + Note: If a count is set to zero or reduced to zero, it will remain + in the counter until the entry is deleted or the counter is cleared: + + >>> c = Counter('aaabbc') + >>> c['b'] -= 2 # reduce the count of 'b' by two + >>> c.most_common() # 'b' is still in, but its count is zero + [('a', 3), ('c', 1), ('b', 0)] + + ''' + # References: + # http://en.wikipedia.org/wiki/Multiset + # http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html + # http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm + # http://code.activestate.com/recipes/259174/ + # Knuth, TAOCP Vol. II section 4.6.3 + + def __init__(self, iterable=None, **kwds): + '''Create a new, empty Counter object. And if given, count elements + from an input iterable. Or, initialize the count from another mapping + of elements to their counts. + + >>> c = Counter() # a new, empty counter + >>> c = Counter('gallahad') # a new counter from an iterable + >>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping + >>> c = Counter(a=4, b=2) # a new counter from keyword args + + ''' + super(Counter, self).__init__() + self.update(iterable, **kwds) + + def __missing__(self, key): + 'The count of elements not in the Counter is zero.' + # Needed so that self[missing_item] does not raise KeyError + return 0 + + def most_common(self, n=None): + '''List the n most common elements and their counts from the most + common to the least. If n is None, then list all element counts. + + >>> Counter('abcdeabcdabcaba').most_common(3) + [('a', 5), ('b', 4), ('c', 3)] + + ''' + # Emulate Bag.sortedByCount from Smalltalk + if n is None: + return sorted(self.iteritems(), key=_itemgetter(1), reverse=True) + return _heapq.nlargest(n, self.iteritems(), key=_itemgetter(1)) + + def elements(self): + '''Iterator over elements repeating each as many times as its count. 
+ + >>> c = Counter('ABCABC') + >>> sorted(c.elements()) + ['A', 'A', 'B', 'B', 'C', 'C'] + + # Knuth's example for prime factors of 1836: 2**2 * 3**3 * 17**1 + >>> prime_factors = Counter({2: 2, 3: 3, 17: 1}) + >>> product = 1 + >>> for factor in prime_factors.elements(): # loop over factors + ... product *= factor # and multiply them + >>> product + 1836 + + Note, if an element's count has been set to zero or is a negative + number, elements() will ignore it. + + ''' + # Emulate Bag.do from Smalltalk and Multiset.begin from C++. + return _chain.from_iterable(_starmap(_repeat, self.iteritems())) + + # Override dict methods where necessary + + @classmethod + def fromkeys(cls, iterable, v=None): + # There is no equivalent method for counters because setting v=1 + # means that no element can have a count greater than one. + raise NotImplementedError( + 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.') + + def update(self, iterable=None, **kwds): + '''Like dict.update() but add counts instead of replacing them. + + Source can be an iterable, a dictionary, or another Counter instance. + + >>> c = Counter('which') + >>> c.update('witch') # add elements from another iterable + >>> d = Counter('watch') + >>> c.update(d) # add elements from another counter + >>> c['h'] # four 'h' in which, witch, and watch + 4 + + ''' + # The regular dict.update() operation makes no sense here because the + # replace behavior results in the some of original untouched counts + # being mixed-in with all of the other counts for a mismash that + # doesn't have a straight-forward interpretation in most counting + # contexts. Instead, we implement straight-addition. Both the inputs + # and outputs are allowed to contain zero and negative counts. + + if iterable is not None: + if isinstance(iterable, Mapping): + if self: + self_get = self.get + for elem, count in iterable.iteritems(): + self[elem] = self_get(elem, 0) + count + else: + super(Counter, self).update(iterable) # fast path when counter is empty + else: + self_get = self.get + for elem in iterable: + self[elem] = self_get(elem, 0) + 1 + if kwds: + self.update(kwds) + + def subtract(self, iterable=None, **kwds): + '''Like dict.update() but subtracts counts instead of replacing them. + Counts can be reduced below zero. Both the inputs and outputs are + allowed to contain zero and negative counts. + + Source can be an iterable, a dictionary, or another Counter instance. + + >>> c = Counter('which') + >>> c.subtract('witch') # subtract elements from another iterable + >>> c.subtract(Counter('watch')) # subtract elements from another counter + >>> c['h'] # 2 in which, minus 1 in witch, minus 1 in watch + 0 + >>> c['w'] # 1 in which, minus 1 in witch, minus 1 in watch + -1 + + ''' + if iterable is not None: + self_get = self.get + if isinstance(iterable, Mapping): + for elem, count in iterable.items(): + self[elem] = self_get(elem, 0) - count + else: + for elem in iterable: + self[elem] = self_get(elem, 0) - 1 + if kwds: + self.subtract(kwds) + + def copy(self): + 'Return a shallow copy.' + return self.__class__(self) + + def __reduce__(self): + return self.__class__, (dict(self),) + + def __delitem__(self, elem): + 'Like dict.__delitem__() but does not raise KeyError for missing values.' 
+ if elem in self: + super(Counter, self).__delitem__(elem) + + def __repr__(self): + if not self: + return '%s()' % self.__class__.__name__ + items = ', '.join(map('%r: %r'.__mod__, self.most_common())) + return '%s({%s})' % (self.__class__.__name__, items) + + # Multiset-style mathematical operations discussed in: + # Knuth TAOCP Volume II section 4.6.3 exercise 19 + # and at http://en.wikipedia.org/wiki/Multiset + # + # Outputs guaranteed to only include positive counts. + # + # To strip negative and zero counts, add-in an empty counter: + # c += Counter() + + def __add__(self, other): + '''Add counts from two counters. + + >>> Counter('abbb') + Counter('bcc') + Counter({'b': 4, 'c': 2, 'a': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem, count in self.items(): + newcount = count + other[elem] + if newcount > 0: + result[elem] = newcount + for elem, count in other.items(): + if elem not in self and count > 0: + result[elem] = count + return result + + def __sub__(self, other): + ''' Subtract count, but keep only results with positive counts. + + >>> Counter('abbbc') - Counter('bccd') + Counter({'b': 2, 'a': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem, count in self.items(): + newcount = count - other[elem] + if newcount > 0: + result[elem] = newcount + for elem, count in other.items(): + if elem not in self and count < 0: + result[elem] = 0 - count + return result + + def __or__(self, other): + '''Union is the maximum of value in either of the input counters. + + >>> Counter('abbb') | Counter('bcc') + Counter({'b': 3, 'c': 2, 'a': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem, count in self.items(): + other_count = other[elem] + newcount = other_count if count < other_count else count + if newcount > 0: + result[elem] = newcount + for elem, count in other.items(): + if elem not in self and count > 0: + result[elem] = count + return result + + def __and__(self, other): + ''' Intersection is the minimum of corresponding counts. 
+ + >>> Counter('abbb') & Counter('bcc') + Counter({'b': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem, count in self.items(): + other_count = other[elem] + newcount = count if count < other_count else other_count + if newcount > 0: + result[elem] = newcount + return result diff --git a/pandas/util/py3compat.py b/pandas/util/py3compat.py index e8bb212e215f2..afb48ef41cc95 100644 --- a/pandas/util/py3compat.py +++ b/pandas/util/py3compat.py @@ -1,3 +1,14 @@ import sys PY3 = (sys.version_info[0] >= 3) + +if PY3: + def isidentifier(s): + return s.isidentifier() + +else: + # Python 2 + import re + _name_re = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*$") + def isidentifier(s, dotted=False): + return bool(_name_re.match(s)) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index c398eebc8cb59..a4b914e437dd2 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1,3 +1,5 @@ +from __future__ import division + # pylint: disable-msg=W0402 from datetime import datetime @@ -70,7 +72,7 @@ def isiterable(obj): def assert_almost_equal(a, b): if isinstance(a, dict) or isinstance(b, dict): return assert_dict_equal(a, b) - + if isinstance(a, basestring): assert a == b, (a, b) return True @@ -116,9 +118,9 @@ def assert_dict_equal(a, b, compare_keys=True): assert_almost_equal(a[k], b[k]) def assert_series_equal(left, right): + assert_almost_equal(left.values, right.values) assert(left.dtype == right.dtype) assert(left.index.equals(right.index)) - assert_almost_equal(left, right) def assert_frame_equal(left, right): assert(isinstance(left, DataFrame)) diff --git a/scripts/bench_join.py b/scripts/bench_join.py index 1a82f8a1762bd..56d97599bb802 100644 --- a/scripts/bench_join.py +++ b/scripts/bench_join.py @@ -136,7 +136,7 @@ def join(a, b, av, bv, how="left"): def bench_python(n=100000, pct_overlap=0.20, K=1): import gc ns = [2, 3, 4, 5, 6] - iterations = 50 + iterations = 200 pct_overlap = 0.2 kinds = ['outer', 'left', 'inner'] @@ -156,11 +156,16 @@ def bench_python(n=100000, pct_overlap=0.20, K=1): for kind in kinds: gc.disable() + elapsed = 0 _s = time.clock() - for _ in range(iterations): + for i in range(iterations): + if i % 10 == 0: + elapsed += time.clock() - _s + gc.collect() + _s = time.clock() a_frame.join(b_frame, how=kind) # join(a, b, avf, bvf, how=kind) - elapsed = time.clock() - _s + elapsed += time.clock() - _s gc.enable() result[kind] = (elapsed / iterations) * 1000 diff --git a/scripts/bench_join_multi.py b/scripts/bench_join_multi.py new file mode 100644 index 0000000000000..c591ca7d833c6 --- /dev/null +++ b/scripts/bench_join_multi.py @@ -0,0 +1,54 @@ +from pandas import * + +import numpy as np +from itertools import izip +from pandas.util.testing import rands +import pandas._tseries as lib + +N = 100000 + +key1 = [rands(10) for _ in xrange(N)] +key2 = [rands(10) for _ in xrange(N)] + +zipped = izip(key1, key2) + +def _zip(*args): + arr = np.empty(N, dtype=object) + arr[:] = zip(*args) + return arr + +def _zip2(*args): + return lib.list_to_object_array(zip(*args)) + +index = MultiIndex.from_arrays([key1, key2]) +to_join = DataFrame({'j1' : np.random.randn(100000)}, index=index) + +data = DataFrame({'A' : np.random.randn(500000), + 'key1' : np.repeat(key1, 5), + 'key2' : np.repeat(key2, 5)}) + +# data.join(to_join, on=['key1', 'key2']) + +""" +Cython function for list_to_object_array + +def list_to_object_array(list obj): + ''' + Convert list to object ndarray. 
Seriously can't believe I had to write this + function + ''' + cdef: + Py_ssize_t i, n + ndarray[object] arr + + n = len(obj) + arr = np.empty(n, dtype=object) + + for i from 0 <= i < n: + arr[i] = obj[i] + + return arr +""" + + + diff --git a/scripts/faster_xs.py b/scripts/faster_xs.py new file mode 100644 index 0000000000000..a539642b78185 --- /dev/null +++ b/scripts/faster_xs.py @@ -0,0 +1,16 @@ +import numpy as np + +import pandas.util.testing as tm + +from pandas.core.internals import _interleaved_dtype + +df = tm.makeDataFrame() + +df['E'] = 'foo' +df['F'] = 'foo' +df['G'] = 2 +df['H'] = df['A'] > 0 + +blocks = df._data.blocks +items = df.columns + diff --git a/setup.py b/setup.py index 318e0bb8e8967..e28cfbd026ed6 100755 --- a/setup.py +++ b/setup.py @@ -128,9 +128,9 @@ ] MAJOR = 0 -MINOR = 4 -MICRO = 3 -ISRELEASED = True +MINOR = 5 +MICRO = 1 +ISRELEASED = False VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) FULLVERSION = VERSION @@ -274,22 +274,28 @@ def run(self): cmdclass['build_ext'] = build_ext cmdclass['sdist'] = CheckSDist -tseries_depends = ['reindex', 'io', 'common', 'groupby' - 'skiplist', 'isnull', 'moments', 'operators'] - +tseries_depends = ['reindex', 'groupby', 'skiplist', 'moments', + 'generated', 'parsing'] def srcpath(name=None, suffix='.pyx', subdir='src'): return pjoin('pandas', subdir, name+suffix) +if suffix == '.pyx': + tseries_depends = [srcpath(f, suffix='.pyx') + for f in tseries_depends] +else: + tseries_depends = None + tseries_ext = Extension('pandas._tseries', + depends=tseries_depends, sources=[srcpath('tseries', suffix=suffix)], - # depends=[srcpath(f, suffix='.pyx') - # for f in tseries_depends], include_dirs=[np.get_include()]) + sparse_ext = Extension('pandas._sparse', sources=[srcpath('sparse', suffix=suffix)], include_dirs=[np.get_include()]) extensions = [tseries_ext, sparse_ext] + # if _have_setuptools: # setuptools_args["test_suite"] = "nose.collector" @@ -300,10 +306,13 @@ def srcpath(name=None, suffix='.pyx', subdir='src'): packages=['pandas', 'pandas.core', 'pandas.io', + 'pandas.rpy', 'pandas.sandbox', 'pandas.stats', 'pandas.util', 'pandas.tests', + 'pandas.tools', + 'pandas.tools.tests', 'pandas.io.tests', 'pandas.stats.tests', ],