diff --git a/doc/source/conf.py b/doc/source/conf.py index 77318cdc41b9f..5c0bb2a609b32 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -1,255 +1,255 @@ -# -*- coding: utf-8 -*- -# -# pandas documentation build configuration file, created by -# -# This file is execfile()d with the current directory set to its containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys, os - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.append(os.path.abspath('.')) -sys.path.insert(0, os.path.abspath('../sphinxext')) - -sys.path.extend([ - - # numpy standard doc extensions - os.path.join(os.path.dirname(__file__), - '..', '../..', - 'sphinxext') - -]) - -# -- General configuration ----------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be extensions -# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. sphinxext. - -extensions = ['sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'numpydoc', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.coverage', - 'sphinx.ext.pngmath', - 'sphinx.ext.ifconfig', - 'sphinx.ext.autosummary', - 'matplotlib.sphinxext.only_directives', - 'matplotlib.sphinxext.plot_directive', - ] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates', '_templates/autosummary'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -#source_encoding = 'utf-8' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'pandas' -copyright = u'2008-2010, AQR Capital Management, LLC' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -import pandas - -# def svn_version(): -# import os, subprocess, re, warnings -# env = os.environ.copy() -# env['LC_ALL'] = 'C' -# try: -# out = subprocess.Popen(['svn', 'info'], stdout=subprocess.PIPE, -# env=env).communicate()[0] -# except OSError: -# warnings.warn(" --- Could not run svn info --- ") -# return "" - -# r = re.compile('Revision: ([0-9]+)') -# svnver = None -# for line in out.split('\n'): -# m = r.match(line) -# if m: -# svnver = m.group(1) - -# if not svnver: -# raise ValueError("Error while parsing svn version ?") -# return svnver - -# version = '%s r%s' % (pandas.__version__, svn_version()) -version = '%s' % (pandas.__version__) - -# The full version, including alpha/beta/rc tags. -release = version - -# JP: added from sphinxdocs -autosummary_generate = True - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -#language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of documents that shouldn't be included in the build. 
-#unused_docs = [] - -# List of directories, relative to source directory, that shouldn't be searched -# for source files. -exclude_trees = [] - -# The reST default role (used for this markup: `text`) to use for all documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - - -# -- Options for HTML output --------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. Major themes that come with -# Sphinx are currently 'default' and 'sphinxdoc'. -html_theme = 'default' - -# The style sheet to use for HTML and HTML Help pages. A file of that name -# must exist either in Sphinx' static/ path, or in one of the custom paths -# given in html_static_path. -#html_style = 'statsmodels.css' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -#html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -#html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -#html_additional_pages = {} - -# If false, no module index is generated. -html_use_modindex = True - -# If false, no index is generated. -#html_use_index = True - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -#html_use_opensearch = '' - -# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 
-#html_file_suffix = '' - -# Output file base name for HTML help builder. -htmlhelp_basename = 'pandas' - - -# -- Options for LaTeX output -------------------------------------------------- - -# The paper size ('letter' or 'a4'). -#latex_paper_size = 'letter' - -# The font size ('10pt', '11pt' or '12pt'). -#latex_font_size = '10pt' - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass [howto/manual]). -latex_documents = [ - ('index', 'pandas.tex', u'pandas Documentation', - u'Wes McKinney', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# Additional stuff for the LaTeX preamble. -#latex_preamble = '' - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. -#latex_use_modindex = True - - -# Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'http://docs.scipy.org/': None} -import glob -autosummary_generate = glob.glob("*.rst") +# -*- coding: utf-8 -*- +# +# pandas documentation build configuration file, created by +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys, os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.append(os.path.abspath('.')) +sys.path.insert(0, os.path.abspath('../sphinxext')) + +sys.path.extend([ + + # numpy standard doc extensions + os.path.join(os.path.dirname(__file__), + '..', '../..', + 'sphinxext') + +]) + +# -- General configuration ----------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. sphinxext. + +extensions = ['sphinx.ext.autodoc', + 'sphinx.ext.doctest', + 'numpydoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.pngmath', + 'sphinx.ext.ifconfig', + 'sphinx.ext.autosummary', + 'matplotlib.sphinxext.only_directives', + 'matplotlib.sphinxext.plot_directive', + ] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates', '_templates/autosummary'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'pandas' +copyright = u'2008-2010, AQR Capital Management, LLC' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. 
+import pandas + +# def svn_version(): +# import os, subprocess, re, warnings +# env = os.environ.copy() +# env['LC_ALL'] = 'C' +# try: +# out = subprocess.Popen(['svn', 'info'], stdout=subprocess.PIPE, +# env=env).communicate()[0] +# except OSError: +# warnings.warn(" --- Could not run svn info --- ") +# return "" + +# r = re.compile('Revision: ([0-9]+)') +# svnver = None +# for line in out.split('\n'): +# m = r.match(line) +# if m: +# svnver = m.group(1) + +# if not svnver: +# raise ValueError("Error while parsing svn version ?") +# return svnver + +# version = '%s r%s' % (pandas.__version__, svn_version()) +version = '%s' % (pandas.__version__) + +# The full version, including alpha/beta/rc tags. +release = version + +# JP: added from sphinxdocs +autosummary_generate = True + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of documents that shouldn't be included in the build. +#unused_docs = [] + +# List of directories, relative to source directory, that shouldn't be searched +# for source files. +exclude_trees = [] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. Major themes that come with +# Sphinx are currently 'default' and 'sphinxdoc'. +html_theme = 'default' + +# The style sheet to use for HTML and HTML Help pages. A file of that name +# must exist either in Sphinx' static/ path, or in one of the custom paths +# given in html_static_path. +#html_style = 'statsmodels.css' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +html_use_modindex = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = '' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pandas' + + +# -- Options for LaTeX output -------------------------------------------------- + +# The paper size ('letter' or 'a4'). +#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +#latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'pandas.tex', u'pandas Documentation', + u'Wes McKinney', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# Additional stuff for the LaTeX preamble. +#latex_preamble = '' + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_use_modindex = True + + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = {'http://docs.scipy.org/': None} +import glob +autosummary_generate = glob.glob("*.rst") diff --git a/doc/source/core.rst b/doc/source/core.rst index e24e704e3af22..7d6565a5af8c9 100755 --- a/doc/source/core.rst +++ b/doc/source/core.rst @@ -1,15 +1,15 @@ -.. _core: - -*************** -Data Structures -*************** - -.. currentmodule:: pandas - -.. toctree:: - :maxdepth: 2 - - indexobj - series - dataframe - panel +.. _core: + +*************** +Data Structures +*************** + +.. currentmodule:: pandas + +.. toctree:: + :maxdepth: 2 + + indexobj + series + dataframe + panel diff --git a/doc/source/index.rst b/doc/source/index.rst index 2758578ae452a..0dd92bbccc48d 100755 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -1,96 +1,96 @@ -.. Pandas documentation master file, created by - - -pandas: a python data analysis library -====================================== - -:mod:`pandas` is a python package providing convenient data structures -for time series, cross-sectional, or any other form of "labeled" data, -with tools for building statistical and econometric models. 
- -This library was created with the following design principles: - - - Working with time series and cross-sectional data should be easy - - The user should not have to worry (much) about handling missing data - - Data alignment should be automatic and transparent - - Speed matters - - Perhaps most importantly: *things should work just like you want them to* - -Many of these principles are here to address the shortcomings -frequently experienced using other languages / scientific research -environments. In MATLAB, for example, you spend a lot of time coercing -data into matrices, cleaning and aligning it, and keeping everything -homogeneous. You have to use lots of functions like **nanmean, nanstd, -repmat** (for broadcasting), and other functions which help you to -maintain reliable data. Using `NumPy `__ and a -Pythonic approach, pandas helps hide the dirty details of working with -unclean data, allowing you to focus on the problem you're trying to -solve rather than the implementation. - -pandas is implemented primarily using NumPy and is intended to be able -to integrate very easily with other NumPy-based scientific libraries, -such as :mod:`scikits.statsmodels`. - -.. note:: - - This documentation assumes general familiarity with NumPy. If you - haven't used NumPy much or at all, please check out the `NumPy - documentation `__ first. - -See the package overview for more detail about what's in the library. - -User manual ------------ - -`PDF Version `__ - -.. module:: pandas - -**Date**: |today| - -**Version**: |version| - -**License:** BSD - -**Requirements:** python 2.4 to 2.7, NumPy, and dateutil - -**Code Repository:** http://github.com/wesm/pandas - -Library documentation -~~~~~~~~~~~~~~~~~~~~~ - -.. toctree:: - :maxdepth: 2 - - overview - core - groupby - datetools - stats - r_interface - io - -Other topics of interest -~~~~~~~~~~~~~~~~~~~~~~~~ - -.. toctree:: - :maxdepth: 2 - - examples - frame_vs_matrix - r_guide - missing_data - related - -Indices and tables ------------------- - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` - -Contact -------- - -Please feel free to send comments or questions directly to -wesmckinn@gmail.com or the pystatsmodels mailing list. +.. Pandas documentation master file, created by + + +pandas: a python data analysis library +====================================== + +:mod:`pandas` is a python package providing convenient data structures +for time series, cross-sectional, or any other form of "labeled" data, +with tools for building statistical and econometric models. + +This library was created with the following design principles: + + - Working with time series and cross-sectional data should be easy + - The user should not have to worry (much) about handling missing data + - Data alignment should be automatic and transparent + - Speed matters + - Perhaps most importantly: *things should work just like you want them to* + +Many of these principles are here to address the shortcomings +frequently experienced using other languages / scientific research +environments. In MATLAB, for example, you spend a lot of time coercing +data into matrices, cleaning and aligning it, and keeping everything +homogeneous. You have to use lots of functions like **nanmean, nanstd, +repmat** (for broadcasting), and other functions which help you to +maintain reliable data. 
Using `NumPy `__ and a +Pythonic approach, pandas helps hide the dirty details of working with +unclean data, allowing you to focus on the problem you're trying to +solve rather than the implementation. + +pandas is implemented primarily using NumPy and is intended to be able +to integrate very easily with other NumPy-based scientific libraries, +such as :mod:`scikits.statsmodels`. + +.. note:: + + This documentation assumes general familiarity with NumPy. If you + haven't used NumPy much or at all, please check out the `NumPy + documentation `__ first. + +See the package overview for more detail about what's in the library. + +User manual +----------- + +`PDF Version `__ + +.. module:: pandas + +**Date**: |today| + +**Version**: |version| + +**License:** BSD + +**Requirements:** python 2.4 to 2.7, NumPy, and dateutil + +**Code Repository:** http://github.com/wesm/pandas + +Library documentation +~~~~~~~~~~~~~~~~~~~~~ + +.. toctree:: + :maxdepth: 2 + + overview + core + groupby + datetools + stats + r_interface + io + +Other topics of interest +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. toctree:: + :maxdepth: 2 + + examples + frame_vs_matrix + r_guide + missing_data + related + +Indices and tables +------------------ + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + +Contact +------- + +Please feel free to send comments or questions directly to +wesmckinn@gmail.com or the pystatsmodels mailing list. diff --git a/doc/source/stats.rst b/doc/source/stats.rst index a0ad21b357738..d0cc5d2669161 100755 --- a/doc/source/stats.rst +++ b/doc/source/stats.rst @@ -1,15 +1,15 @@ -.. currentmodule:: pandas.stats.api - -.. _stats: - -********************************** -Built-in statistical functionality -********************************** - -.. currentmodule:: pandas - -.. toctree:: - :maxdepth: 2 - - stats_moments - stats_ols +.. currentmodule:: pandas.stats.api + +.. _stats: + +********************************** +Built-in statistical functionality +********************************** + +.. currentmodule:: pandas + +.. toctree:: + :maxdepth: 2 + + stats_moments + stats_ols diff --git a/pandas/core/collection.py b/pandas/core/collection.py index fbf02c6796e50..41ad95c1115ba 100644 --- a/pandas/core/collection.py +++ b/pandas/core/collection.py @@ -1,90 +1,90 @@ -from pandas.core.generic import Picklable -from pandas.core.pytools import rands, adjoin -import cPickle -import os - -__all__ = ['PickleContainer'] - -class PickleContainer(Picklable): - """ - Store collection of objects on disk with this dict-like object. 
- - Parameters - ---------- - dirPath: string - Directory where to store the objects - lruSize: int - Number of objects to keep in memory (not implemented yet) - """ - def __init__(self, dirPath, lruSize=5): - self.dirPath = dirPath - if not os.path.exists(dirPath): - os.mkdir(dirPath) - - self._lruSize = lruSize - - self._paths = {} - self._classes = {} - self._lru = {} - - def __repr__(self): - output = str(self.__class__) + '\n' - keys, values = zip(*self._classes.iteritems()) - output += adjoin(5, map(str, keys), map(str, values)) - return output - - def __setitem__(self, key, value): - theKey = rands(10) - filePath = self.dirPath + '/' + theKey - - self._paths[key] = filePath - - if isinstance(value, Picklable): - value.save(filePath) - else: - f = open(filePath, 'w') - try: - cPickle.dump(value, f) - finally: - f.close() - - self._paths[key] = filePath - self._classes[key] = value.__class__ - - def __getitem__(self, key): - if key not in self._paths: - raise Exception('Requested key not in this container!') - - thePath = self._paths[key] - theClass = self._classes[key] - - if issubclass(theClass, Picklable): - obj = theClass.load(thePath) - else: - f = open(thePath, 'rb') - try: - obj = cPickle.load(f) - finally: - f.close() - - return obj - - def __delitem__(self, key): - del self._paths[key] - del self._classes[key] - - def __iter__(self): - return iter(self._paths) - - def iteritems(self): - for key, path in self._paths.iteritems(): - yield key, self[key] - - def keys(self): - return self._paths.keys() - - def values(self): - result = [] - for key in self._paths: - result.append(self[key]) - return result +from pandas.core.generic import Picklable +from pandas.core.pytools import rands, adjoin +import cPickle +import os + +__all__ = ['PickleContainer'] + +class PickleContainer(Picklable): + """ + Store collection of objects on disk with this dict-like object. 
+ + Parameters + ---------- + dirPath: string + Directory where to store the objects + lruSize: int + Number of objects to keep in memory (not implemented yet) + """ + def __init__(self, dirPath, lruSize=5): + self.dirPath = dirPath + if not os.path.exists(dirPath): + os.mkdir(dirPath) + + self._lruSize = lruSize + + self._paths = {} + self._classes = {} + self._lru = {} + + def __repr__(self): + output = str(self.__class__) + '\n' + keys, values = zip(*self._classes.iteritems()) + output += adjoin(5, map(str, keys), map(str, values)) + return output + + def __setitem__(self, key, value): + theKey = rands(10) + filePath = self.dirPath + '/' + theKey + + self._paths[key] = filePath + + if isinstance(value, Picklable): + value.save(filePath) + else: + f = open(filePath, 'w') + try: + cPickle.dump(value, f) + finally: + f.close() + + self._paths[key] = filePath + self._classes[key] = value.__class__ + + def __getitem__(self, key): + if key not in self._paths: + raise Exception('Requested key not in this container!') + + thePath = self._paths[key] + theClass = self._classes[key] + + if issubclass(theClass, Picklable): + obj = theClass.load(thePath) + else: + f = open(thePath, 'rb') + try: + obj = cPickle.load(f) + finally: + f.close() + + return obj + + def __delitem__(self, key): + del self._paths[key] + del self._classes[key] + + def __iter__(self): + return iter(self._paths) + + def iteritems(self): + for key, path in self._paths.iteritems(): + yield key, self[key] + + def keys(self): + return self._paths.keys() + + def values(self): + result = [] + for key in self._paths: + result.append(self[key]) + return result diff --git a/pandas/core/pytools.py b/pandas/core/pytools.py index 9b7e1d86de925..e04c1eddbaa23 100644 --- a/pandas/core/pytools.py +++ b/pandas/core/pytools.py @@ -1,110 +1,110 @@ -"""A collection of tools for various purely Python operations""" -from random import Random -import itertools -import string - -# In theory should be few to no imports outside perhaps stdlib here - -def rands(n): - """Generates a random alphanumeric string of length *n*""" - return ''.join(Random().sample(string.letters+string.digits, n)) - -def adjoin(space, *lists): - """ - Glues together two sets of strings using the amount of space requested. - The idea is to prettify. - """ - outLines = [] - newLists = [] - lengths = [max(map(len, x)) + space for x in lists[:-1]] - - # not the last one - lengths.append(max(map(len, lists[-1]))) - - maxLen = max(map(len, lists)) - for i, lst in enumerate(lists): - nl = [x.ljust(lengths[i]) for x in lst] - nl.extend([' ' * lengths[i]] * (maxLen - len(lst))) - newLists.append(nl) - toJoin = zip(*newLists) - for lines in toJoin: - outLines.append(''.join(lines)) - return '\n'.join(outLines) - - -def iterpairs(seq): - """ - Parameters - ---------- - seq: sequence - - Returns - ------- - iterator returning overlapping pairs of elements - - Example - ------- - >>> iterpairs([1, 2, 3, 4]) - [(1, 2), (2, 3), (3, 4) - """ - # input may not be sliceable - seq_it = iter(seq) - seq_it_next = iter(seq) - _ = seq_it_next.next() - - return itertools.izip(seq_it, seq_it_next) - -def indent(string, spaces=4): - dent = ' ' * spaces - return '\n'.join([dent + x for x in string.split('\n')]) - -def banner(message): - """ - Return 80-char width message declaration with = bars on top and bottom. - """ - bar = '=' * 80 - return '%s\n%s\n%s' % (bar, message, bar) - -class groupby(dict): - """ - A simple groupby different from the one in itertools. 
- - Does not require the sequence elements to be sorted by keys, - however it is slower. - """ - def __init__(self, seq, key=lambda x:x): - for value in seq: - k = key(value) - self.setdefault(k, []).append(value) - __iter__ = dict.iteritems - - -def map_indices_py(arr): - """ - Returns a dictionary with (element, index) pairs for each element in the - given array/list - """ - return dict([(x, i) for i, x in enumerate(arr)]) - -#=============================================================================== -# Set operations -#=============================================================================== - -def union(*seqs): - result = set([]) - for seq in seqs: - if not isinstance(seq, set): - seq = set(seq) - result |= seq - return type(seqs[0])(list(result)) - -def difference(a, b): - return type(a)(list(set(a) - set(b))) - -def intersection(*seqs): - result = set(seqs[0]) - for seq in seqs: - if not isinstance(seq, set): - seq = set(seq) - result &= seq - return type(seqs[0])(list(result)) +"""A collection of tools for various purely Python operations""" +from random import Random +import itertools +import string + +# In theory should be few to no imports outside perhaps stdlib here + +def rands(n): + """Generates a random alphanumeric string of length *n*""" + return ''.join(Random().sample(string.letters+string.digits, n)) + +def adjoin(space, *lists): + """ + Glues together two sets of strings using the amount of space requested. + The idea is to prettify. + """ + outLines = [] + newLists = [] + lengths = [max(map(len, x)) + space for x in lists[:-1]] + + # not the last one + lengths.append(max(map(len, lists[-1]))) + + maxLen = max(map(len, lists)) + for i, lst in enumerate(lists): + nl = [x.ljust(lengths[i]) for x in lst] + nl.extend([' ' * lengths[i]] * (maxLen - len(lst))) + newLists.append(nl) + toJoin = zip(*newLists) + for lines in toJoin: + outLines.append(''.join(lines)) + return '\n'.join(outLines) + + +def iterpairs(seq): + """ + Parameters + ---------- + seq: sequence + + Returns + ------- + iterator returning overlapping pairs of elements + + Example + ------- + >>> iterpairs([1, 2, 3, 4]) + [(1, 2), (2, 3), (3, 4) + """ + # input may not be sliceable + seq_it = iter(seq) + seq_it_next = iter(seq) + _ = seq_it_next.next() + + return itertools.izip(seq_it, seq_it_next) + +def indent(string, spaces=4): + dent = ' ' * spaces + return '\n'.join([dent + x for x in string.split('\n')]) + +def banner(message): + """ + Return 80-char width message declaration with = bars on top and bottom. + """ + bar = '=' * 80 + return '%s\n%s\n%s' % (bar, message, bar) + +class groupby(dict): + """ + A simple groupby different from the one in itertools. + + Does not require the sequence elements to be sorted by keys, + however it is slower. 
+ """ + def __init__(self, seq, key=lambda x:x): + for value in seq: + k = key(value) + self.setdefault(k, []).append(value) + __iter__ = dict.iteritems + + +def map_indices_py(arr): + """ + Returns a dictionary with (element, index) pairs for each element in the + given array/list + """ + return dict([(x, i) for i, x in enumerate(arr)]) + +#=============================================================================== +# Set operations +#=============================================================================== + +def union(*seqs): + result = set([]) + for seq in seqs: + if not isinstance(seq, set): + seq = set(seq) + result |= seq + return type(seqs[0])(list(result)) + +def difference(a, b): + return type(a)(list(set(a) - set(b))) + +def intersection(*seqs): + result = set(seqs[0]) + for seq in seqs: + if not isinstance(seq, set): + seq = set(seq) + result &= seq + return type(seqs[0])(list(result)) diff --git a/pandas/core/setup.py b/pandas/core/setup.py index 0114d5c90c411..ecb8da63e306e 100644 --- a/pandas/core/setup.py +++ b/pandas/core/setup.py @@ -1,11 +1,11 @@ -#!/usr/bin/env python - -def configuration(parent_package='',top_path=None): - from numpy.distutils.misc_util import Configuration - config = Configuration('core', parent_package, top_path) - config.add_data_dir('tests') - return config - -if __name__ == '__main__': - print('This is the wrong setup.py file to run') - +#!/usr/bin/env python + +def configuration(parent_package='',top_path=None): + from numpy.distutils.misc_util import Configuration + config = Configuration('core', parent_package, top_path) + config.add_data_dir('tests') + return config + +if __name__ == '__main__': + print('This is the wrong setup.py file to run') + diff --git a/pandas/core/tests/test_index.py b/pandas/core/tests/test_index.py index 97c18bb5a9593..6cb914645b448 100644 --- a/pandas/core/tests/test_index.py +++ b/pandas/core/tests/test_index.py @@ -1,202 +1,202 @@ -from datetime import timedelta -from pandas.core.index import Index -import pandas.util.testing as common -import pandas.lib.tseries as tseries -import numpy as np -import os -import pickle -import unittest - -class TestIndex(unittest.TestCase): - - def setUp(self): - self.strIndex = common.makeStringIndex(100) - self.dateIndex = common.makeDateIndex(100) - self.intIndex = common.makeIntIndex(100) - - def test_deepcopy(self): - from copy import deepcopy - - copy = deepcopy(self.strIndex) - self.assert_(copy is self.strIndex) - - def test_duplicates(self): - self.assertRaises(Exception, Index, [0, 0, 0]) - - def test_sort(self): - self.assertRaises(Exception, self.strIndex.sort) - - def test_mutability(self): - self.assertRaises(Exception, self.strIndex.__setitem__, 5, 0) - self.assertRaises(Exception, self.strIndex.__setitem__, slice(1,5), 0) - - def test_constructor(self): - # regular instance creation - common.assert_contains_all(self.strIndex, self.strIndex) - common.assert_contains_all(self.dateIndex, self.dateIndex) - - # casting - arr = np.array(self.strIndex) - index = arr.view(Index) - common.assert_contains_all(arr, index) - self.assert_(np.array_equal(self.strIndex, index)) - - # corner case - self.assertRaises(Exception, Index, 0) - - # arr = np.array(5.) 
- # self.assertRaises(Exception, arr.view, Index) - - def test_compat(self): - self.strIndex.tolist() - - def test_equals(self): - # same - self.assert_(Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'c']))) - - # different length - self.assertFalse(Index(['a', 'b', 'c']).equals(Index(['a', 'b']))) - - # same length, different values - self.assertFalse(Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'd']))) - - # Must also be an Index - self.assertFalse(Index(['a', 'b', 'c']).equals(['a', 'b', 'c'])) - - def test_asOfDate(self): - d = self.dateIndex[0] - self.assert_(self.dateIndex.asOfDate(d) is d) - self.assert_(self.dateIndex.asOfDate(d - timedelta(1)) is None) - - d = self.dateIndex[-1] - self.assert_(self.dateIndex.asOfDate(d + timedelta(1)) is d) - - def test_argsort(self): - result = self.strIndex.argsort() - expected = np.array(self.strIndex).argsort() - self.assert_(np.array_equal(result, expected)) - - def test_comparators(self): - index = self.dateIndex - element = index[len(index) // 2] - arr = np.array(index) - - self.assert_(np.array_equal(arr == element, index == element)) - self.assert_(np.array_equal(arr > element, index > element)) - self.assert_(np.array_equal(arr < element, index < element)) - self.assert_(np.array_equal(arr >= element, index >= element)) - self.assert_(np.array_equal(arr <= element, index <= element)) - - def test_booleanindex(self): - boolIdx = np.repeat(True, len(self.strIndex)).astype(bool) - boolIdx[5:30:2] = False - - subIndex = self.strIndex[boolIdx] - common.assert_dict_equal(tseries.map_indices(subIndex), - subIndex.indexMap) - - subIndex = self.strIndex[list(boolIdx)] - common.assert_dict_equal(tseries.map_indices(subIndex), - subIndex.indexMap) - - def test_fancy(self): - sl = self.strIndex[[1,2,3]] - for i in sl: - self.assertEqual(i, sl[sl.indexMap[i]]) - - def test_getitem(self): - arr = np.array(self.dateIndex) - self.assertEquals(self.dateIndex[5], arr[5]) - - def test_add(self): - firstCat = self.strIndex + self.dateIndex - secondCat = self.strIndex + self.strIndex - - self.assert_(common.equalContents(np.append(self.strIndex, - self.dateIndex), firstCat)) - self.assert_(common.equalContents(secondCat, self.strIndex)) - common.assert_contains_all(self.strIndex, firstCat.indexMap) - common.assert_contains_all(self.strIndex, secondCat.indexMap) - common.assert_contains_all(self.dateIndex, firstCat.indexMap) - - # this is valid too - shifted = self.dateIndex + timedelta(1) - - def test_add_string(self): - # from bug report - index = Index(['a', 'b', 'c']) - index2 = index + 'foo' - - self.assert_('a' not in index2.indexMap) - self.assert_('afoo' in index2.indexMap) - - def test_shift(self): - shifted = self.dateIndex.shift(0, timedelta(1)) - self.assert_(shifted is self.dateIndex) - - shifted = self.dateIndex.shift(5, timedelta(1)) - self.assert_(np.array_equal(shifted, self.dateIndex + timedelta(5))) - - def test_intersection(self): - first = self.strIndex[:20] - second = self.strIndex[:10] - intersect = first.intersection(second) - - self.assert_(common.equalContents(intersect, second)) - - # Corner cases - inter = first.intersection(first) - self.assert_(inter is first) - - # non-iterable input - self.assertRaises(Exception, first.intersection, 0.5) - - def test_union(self): - first = self.strIndex[5:20] - second = self.strIndex[:10] - everything = self.strIndex[:20] - union = first.union(second) - self.assert_(common.equalContents(union, everything)) - - # Corner cases - union = first.union(first) - self.assert_(union is first) - - 
union = first.union([]) - self.assert_(union is first) - - # non-iterable input - self.assertRaises(Exception, first.union, 0.5) - - def test_diff(self): - first = self.strIndex[5:20] - second = self.strIndex[:10] - answer = self.strIndex[10:20] - result = first - second - - self.assert_(common.equalContents(result, answer)) - - diff = first.diff(first) - self.assert_(len(diff) == 0) - - # non-iterable input - self.assertRaises(Exception, first.diff, 0.5) - - def test_pickle(self): - def testit(index): - pickled = pickle.dumps(index) - unpickled = pickle.loads(pickled) - - self.assert_(isinstance(unpickled, Index)) - self.assert_(np.array_equal(unpickled, index)) - - common.assert_dict_equal(unpickled.indexMap, index.indexMap) - - testit(self.strIndex) - testit(self.dateIndex) - -if __name__ == '__main__': - import nose - nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], - exit=False) - +from datetime import timedelta +from pandas.core.index import Index +import pandas.util.testing as common +import pandas.lib.tseries as tseries +import numpy as np +import os +import pickle +import unittest + +class TestIndex(unittest.TestCase): + + def setUp(self): + self.strIndex = common.makeStringIndex(100) + self.dateIndex = common.makeDateIndex(100) + self.intIndex = common.makeIntIndex(100) + + def test_deepcopy(self): + from copy import deepcopy + + copy = deepcopy(self.strIndex) + self.assert_(copy is self.strIndex) + + def test_duplicates(self): + self.assertRaises(Exception, Index, [0, 0, 0]) + + def test_sort(self): + self.assertRaises(Exception, self.strIndex.sort) + + def test_mutability(self): + self.assertRaises(Exception, self.strIndex.__setitem__, 5, 0) + self.assertRaises(Exception, self.strIndex.__setitem__, slice(1,5), 0) + + def test_constructor(self): + # regular instance creation + common.assert_contains_all(self.strIndex, self.strIndex) + common.assert_contains_all(self.dateIndex, self.dateIndex) + + # casting + arr = np.array(self.strIndex) + index = arr.view(Index) + common.assert_contains_all(arr, index) + self.assert_(np.array_equal(self.strIndex, index)) + + # corner case + self.assertRaises(Exception, Index, 0) + + # arr = np.array(5.) 
+ # self.assertRaises(Exception, arr.view, Index) + + def test_compat(self): + self.strIndex.tolist() + + def test_equals(self): + # same + self.assert_(Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'c']))) + + # different length + self.assertFalse(Index(['a', 'b', 'c']).equals(Index(['a', 'b']))) + + # same length, different values + self.assertFalse(Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'd']))) + + # Must also be an Index + self.assertFalse(Index(['a', 'b', 'c']).equals(['a', 'b', 'c'])) + + def test_asOfDate(self): + d = self.dateIndex[0] + self.assert_(self.dateIndex.asOfDate(d) is d) + self.assert_(self.dateIndex.asOfDate(d - timedelta(1)) is None) + + d = self.dateIndex[-1] + self.assert_(self.dateIndex.asOfDate(d + timedelta(1)) is d) + + def test_argsort(self): + result = self.strIndex.argsort() + expected = np.array(self.strIndex).argsort() + self.assert_(np.array_equal(result, expected)) + + def test_comparators(self): + index = self.dateIndex + element = index[len(index) // 2] + arr = np.array(index) + + self.assert_(np.array_equal(arr == element, index == element)) + self.assert_(np.array_equal(arr > element, index > element)) + self.assert_(np.array_equal(arr < element, index < element)) + self.assert_(np.array_equal(arr >= element, index >= element)) + self.assert_(np.array_equal(arr <= element, index <= element)) + + def test_booleanindex(self): + boolIdx = np.repeat(True, len(self.strIndex)).astype(bool) + boolIdx[5:30:2] = False + + subIndex = self.strIndex[boolIdx] + common.assert_dict_equal(tseries.map_indices(subIndex), + subIndex.indexMap) + + subIndex = self.strIndex[list(boolIdx)] + common.assert_dict_equal(tseries.map_indices(subIndex), + subIndex.indexMap) + + def test_fancy(self): + sl = self.strIndex[[1,2,3]] + for i in sl: + self.assertEqual(i, sl[sl.indexMap[i]]) + + def test_getitem(self): + arr = np.array(self.dateIndex) + self.assertEquals(self.dateIndex[5], arr[5]) + + def test_add(self): + firstCat = self.strIndex + self.dateIndex + secondCat = self.strIndex + self.strIndex + + self.assert_(common.equalContents(np.append(self.strIndex, + self.dateIndex), firstCat)) + self.assert_(common.equalContents(secondCat, self.strIndex)) + common.assert_contains_all(self.strIndex, firstCat.indexMap) + common.assert_contains_all(self.strIndex, secondCat.indexMap) + common.assert_contains_all(self.dateIndex, firstCat.indexMap) + + # this is valid too + shifted = self.dateIndex + timedelta(1) + + def test_add_string(self): + # from bug report + index = Index(['a', 'b', 'c']) + index2 = index + 'foo' + + self.assert_('a' not in index2.indexMap) + self.assert_('afoo' in index2.indexMap) + + def test_shift(self): + shifted = self.dateIndex.shift(0, timedelta(1)) + self.assert_(shifted is self.dateIndex) + + shifted = self.dateIndex.shift(5, timedelta(1)) + self.assert_(np.array_equal(shifted, self.dateIndex + timedelta(5))) + + def test_intersection(self): + first = self.strIndex[:20] + second = self.strIndex[:10] + intersect = first.intersection(second) + + self.assert_(common.equalContents(intersect, second)) + + # Corner cases + inter = first.intersection(first) + self.assert_(inter is first) + + # non-iterable input + self.assertRaises(Exception, first.intersection, 0.5) + + def test_union(self): + first = self.strIndex[5:20] + second = self.strIndex[:10] + everything = self.strIndex[:20] + union = first.union(second) + self.assert_(common.equalContents(union, everything)) + + # Corner cases + union = first.union(first) + self.assert_(union is first) + + 
union = first.union([]) + self.assert_(union is first) + + # non-iterable input + self.assertRaises(Exception, first.union, 0.5) + + def test_diff(self): + first = self.strIndex[5:20] + second = self.strIndex[:10] + answer = self.strIndex[10:20] + result = first - second + + self.assert_(common.equalContents(result, answer)) + + diff = first.diff(first) + self.assert_(len(diff) == 0) + + # non-iterable input + self.assertRaises(Exception, first.diff, 0.5) + + def test_pickle(self): + def testit(index): + pickled = pickle.dumps(index) + unpickled = pickle.loads(pickled) + + self.assert_(isinstance(unpickled, Index)) + self.assert_(np.array_equal(unpickled, index)) + + common.assert_dict_equal(unpickled.indexMap, index.indexMap) + + testit(self.strIndex) + testit(self.dateIndex) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/lib/bench.py b/pandas/lib/bench.py index 251da5cfa362e..8f08afc8ec145 100644 --- a/pandas/lib/bench.py +++ b/pandas/lib/bench.py @@ -1,176 +1,176 @@ -import time - -import numpy as np - -from pandas import Series, Index, isnull -import pandas.lib.tseries as tseries -from pandas.util.testing import assert_almost_equal, assert_dict_equal - -def _timeit(f, n=10): - _s = time.clock() - for i in xrange(n): - f() - - return (time.clock() - _s) / n - -def bench_reindex(): - K = 100000 - index = Index(np.arange(K)) - values = np.arange(float(K)) - obj_vals = values.astype(object) - - new_index = np.arange(K) - np.random.shuffle(new_index) - new_index = Index(new_index) - - f = lambda: tseries.reindex(new_index, values, index.indexMap) - print 'tseries.reindex: %.2f ms per iteration' % (_timeit(f, n=50) * 1000) - - def _test(): - filler, mask = tseries.getMergeVec(new_index, index.indexMap) - result = values.take(filler) - np.putmask(result, -mask, np.NaN) - - return result - - timing = _timeit(_test, n=50) * 1000 - print 'getMergeVec method: %.2f ms per iteration' % timing - - f2 = lambda: tseries.reindexObj(new_index, values, index.indexMap) - print ('tseries.reindexObj with floats: %.2f ms per iteration' - % (_timeit(f2, n=50) * 1000)) - - f3 = lambda: tseries.reindexObj(new_index, obj_vals, index.indexMap) - print ('tseries.reindexObj with objects: %.2f ms per iteration' - % (_timeit(f3, n=50) * 1000)) - - f4 = lambda: tseries.reindexObject(new_index, obj_vals, index.indexMap) - print ('tseries.reindexObject buffers: %.2f ms per iteration' - % (_timeit(f4, n=50) * 1000)) - - def _test2(): - filler, mask = tseries.getMergeVec(new_index, index.indexMap) - result = obj_vals.take(filler) - np.putmask(result, -mask, np.NaN) - - return result - - timing = _timeit(_test2, n=50) * 1000 - print 'getMergeVec method: %.2f ms per iteration' % timing - - assert_almost_equal(_test(), f()) - assert_almost_equal(f2(), f3()) - assert_almost_equal(f3(), f4()) - assert_almost_equal(f2(), f4()) - assert_almost_equal(f2(), _test2()) - - -def _isnan(obj): - return obj != obj - -def test_groupby(): - mapping = Series({ - 1 : 2., - 2 : 2., - 3 : np.NaN, - 4 : np.NaN, - 5 : 3., - 6 : 3., - 7 : np.NaN - }) - - index = Index([1, 2, 3, 4, 5, 6, 7]) - - expected = { - 2 : [1, 2], - 3 : [5, 6], - np.NaN : [3, 4, 7] - } - - def compare_with_null(d1, d2): - d1_nulls = None - d2_nulls = None - for k, v in d1.iteritems(): - if _isnan(k): - d1_nulls = v - else: - assert(k in d2) - assert(np.array_equal(v, d2[k])) - - for k, v in d2.iteritems(): - if _isnan(k): - d2_nulls = v - else: - assert(k in d1) - - if 
d1_nulls is not None or d2_nulls is not None: - assert(np.array_equal(d1_nulls, d2_nulls)) - - grouped = tseries.groupby(index, mapping.get) - compare_with_null(grouped, expected) - -def groupby_nocython(index, mapper, output=None): - if output is None: - result = {} - else: - result = output - - index = np.asarray(index) - mapped_index = np.array([mapper(x) for x in index]) - - # A little hack here - if issubclass(mapped_index.dtype.type, basestring): - mapped_index = mapped_index.astype(object) - - mask = isnull(mapped_index) - nullkeys = index[mask] - - if nullkeys is not None and len(nullkeys) > 0: - result[np.NaN] = nullkeys - - notmask = -mask - index = index[notmask] - mapped_index = mapped_index[notmask] - - for idx, key in zip(index, mapped_index): - result.setdefault(key, []).append(idx) - - return result - -def bench_groupby(): - N = 200 - - arr = np.arange(10000).astype(object) - values = np.random.randn(10000) - keys = arr // 10 - d = dict(zip(arr, keys)) - - f = lambda: groupby_nocython(arr, d.get) - print 'no cython: %.2f ms per iteration' % (_timeit(f, n=N) * 1000) - - f = lambda: tseries.arrmap(arr, d.get) - timing = _timeit(f, n=N) * 1000 - print 'arrmap: %.2f ms per iteration' % timing - - f = lambda: isnull(tseries.arrmap(arr, d.get)) - print 'isnull: %.2f ms per iteration' % (_timeit(f, n=N) * 1000 - timing) - - f = lambda: tseries.groupby(arr, d.get) - print 'groupby: %.2f ms per iteration' % (_timeit(f, n=N) * 1000) - - f = lambda: tseries.groupby_indices(arr, d.get) - print 'groupby_inds: %.2f ms per iteration' % (_timeit(f, n=N) * 1000) - - def _test(): - groups = tseries.groupby_indices(arr, d.get) - - result = {} - for k, v in groups.iteritems(): - result[k] = np.mean(values.take(v)) - - return result - - print 'test: %.2f ms per iteration' % (_timeit(_test, n=N) * 1000) - -def bench_map_indices(): - pass +import time + +import numpy as np + +from pandas import Series, Index, isnull +import pandas.lib.tseries as tseries +from pandas.util.testing import assert_almost_equal, assert_dict_equal + +def _timeit(f, n=10): + _s = time.clock() + for i in xrange(n): + f() + + return (time.clock() - _s) / n + +def bench_reindex(): + K = 100000 + index = Index(np.arange(K)) + values = np.arange(float(K)) + obj_vals = values.astype(object) + + new_index = np.arange(K) + np.random.shuffle(new_index) + new_index = Index(new_index) + + f = lambda: tseries.reindex(new_index, values, index.indexMap) + print 'tseries.reindex: %.2f ms per iteration' % (_timeit(f, n=50) * 1000) + + def _test(): + filler, mask = tseries.getMergeVec(new_index, index.indexMap) + result = values.take(filler) + np.putmask(result, -mask, np.NaN) + + return result + + timing = _timeit(_test, n=50) * 1000 + print 'getMergeVec method: %.2f ms per iteration' % timing + + f2 = lambda: tseries.reindexObj(new_index, values, index.indexMap) + print ('tseries.reindexObj with floats: %.2f ms per iteration' + % (_timeit(f2, n=50) * 1000)) + + f3 = lambda: tseries.reindexObj(new_index, obj_vals, index.indexMap) + print ('tseries.reindexObj with objects: %.2f ms per iteration' + % (_timeit(f3, n=50) * 1000)) + + f4 = lambda: tseries.reindexObject(new_index, obj_vals, index.indexMap) + print ('tseries.reindexObject buffers: %.2f ms per iteration' + % (_timeit(f4, n=50) * 1000)) + + def _test2(): + filler, mask = tseries.getMergeVec(new_index, index.indexMap) + result = obj_vals.take(filler) + np.putmask(result, -mask, np.NaN) + + return result + + timing = _timeit(_test2, n=50) * 1000 + print 'getMergeVec method: %.2f 
ms per iteration' % timing + + assert_almost_equal(_test(), f()) + assert_almost_equal(f2(), f3()) + assert_almost_equal(f3(), f4()) + assert_almost_equal(f2(), f4()) + assert_almost_equal(f2(), _test2()) + + +def _isnan(obj): + return obj != obj + +def test_groupby(): + mapping = Series({ + 1 : 2., + 2 : 2., + 3 : np.NaN, + 4 : np.NaN, + 5 : 3., + 6 : 3., + 7 : np.NaN + }) + + index = Index([1, 2, 3, 4, 5, 6, 7]) + + expected = { + 2 : [1, 2], + 3 : [5, 6], + np.NaN : [3, 4, 7] + } + + def compare_with_null(d1, d2): + d1_nulls = None + d2_nulls = None + for k, v in d1.iteritems(): + if _isnan(k): + d1_nulls = v + else: + assert(k in d2) + assert(np.array_equal(v, d2[k])) + + for k, v in d2.iteritems(): + if _isnan(k): + d2_nulls = v + else: + assert(k in d1) + + if d1_nulls is not None or d2_nulls is not None: + assert(np.array_equal(d1_nulls, d2_nulls)) + + grouped = tseries.groupby(index, mapping.get) + compare_with_null(grouped, expected) + +def groupby_nocython(index, mapper, output=None): + if output is None: + result = {} + else: + result = output + + index = np.asarray(index) + mapped_index = np.array([mapper(x) for x in index]) + + # A little hack here + if issubclass(mapped_index.dtype.type, basestring): + mapped_index = mapped_index.astype(object) + + mask = isnull(mapped_index) + nullkeys = index[mask] + + if nullkeys is not None and len(nullkeys) > 0: + result[np.NaN] = nullkeys + + notmask = -mask + index = index[notmask] + mapped_index = mapped_index[notmask] + + for idx, key in zip(index, mapped_index): + result.setdefault(key, []).append(idx) + + return result + +def bench_groupby(): + N = 200 + + arr = np.arange(10000).astype(object) + values = np.random.randn(10000) + keys = arr // 10 + d = dict(zip(arr, keys)) + + f = lambda: groupby_nocython(arr, d.get) + print 'no cython: %.2f ms per iteration' % (_timeit(f, n=N) * 1000) + + f = lambda: tseries.arrmap(arr, d.get) + timing = _timeit(f, n=N) * 1000 + print 'arrmap: %.2f ms per iteration' % timing + + f = lambda: isnull(tseries.arrmap(arr, d.get)) + print 'isnull: %.2f ms per iteration' % (_timeit(f, n=N) * 1000 - timing) + + f = lambda: tseries.groupby(arr, d.get) + print 'groupby: %.2f ms per iteration' % (_timeit(f, n=N) * 1000) + + f = lambda: tseries.groupby_indices(arr, d.get) + print 'groupby_inds: %.2f ms per iteration' % (_timeit(f, n=N) * 1000) + + def _test(): + groups = tseries.groupby_indices(arr, d.get) + + result = {} + for k, v in groups.iteritems(): + result[k] = np.mean(values.take(v)) + + return result + + print 'test: %.2f ms per iteration' % (_timeit(_test, n=N) * 1000) + +def bench_map_indices(): + pass diff --git a/pandas/lib/setup.py b/pandas/lib/setup.py index 854ccc8d0f28f..0b1f6a62176e4 100644 --- a/pandas/lib/setup.py +++ b/pandas/lib/setup.py @@ -1,32 +1,32 @@ -#!/usr/bin/env python - -from distutils.core import Extension -import numpy - -def get_cython_ext(): - from Cython.Distutils import build_ext - - pyx_ext = Extension('tseries', ['pandas/lib/src/tseries.pyx'], - include_dirs=[numpy.get_include()]) - - - setup(name='pandas.lib.tseries', description='Nothing', - ext_modules=[pyx_ext], - cmdclass = { - 'build_ext' : build_ext - }) - -def configuration(parent_package='', top_path=None): - from numpy.distutils.misc_util import Configuration - - config = Configuration('lib', parent_package, top_path) - config.add_extension('tseries', - sources=['src/tseries.c'], - include_dirs=[numpy.get_include()]) - - - config.add_extension('sparse', - sources=['src/sparse.c'], - 
include_dirs=[numpy.get_include()]) - - return config +#!/usr/bin/env python + +from distutils.core import Extension +import numpy + +def get_cython_ext(): + from Cython.Distutils import build_ext + + pyx_ext = Extension('tseries', ['pandas/lib/src/tseries.pyx'], + include_dirs=[numpy.get_include()]) + + + setup(name='pandas.lib.tseries', description='Nothing', + ext_modules=[pyx_ext], + cmdclass = { + 'build_ext' : build_ext + }) + +def configuration(parent_package='', top_path=None): + from numpy.distutils.misc_util import Configuration + + config = Configuration('lib', parent_package, top_path) + config.add_extension('tseries', + sources=['src/tseries.c'], + include_dirs=[numpy.get_include()]) + + + config.add_extension('sparse', + sources=['src/sparse.c'], + include_dirs=[numpy.get_include()]) + + return config diff --git a/pandas/lib/src/common.pyx b/pandas/lib/src/common.pyx index 763096d667dd7..9c314bb82d995 100644 --- a/pandas/lib/src/common.pyx +++ b/pandas/lib/src/common.pyx @@ -1,177 +1,177 @@ -cimport numpy as np -cimport cython - -from numpy cimport * - -from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, - PyDict_Contains, PyDict_Keys) -from cpython cimport PyFloat_Check - -import numpy as np -isnan = np.isnan -cdef double NaN = np.NaN - -from datetime import datetime as pydatetime - -cdef inline int int_max(int a, int b): return a if a >= b else b -cdef inline int int_min(int a, int b): return a if a >= b else b - -ctypedef unsigned char UChar - -cdef int is_contiguous(ndarray arr): - return np.PyArray_CHKFLAGS(arr, np.NPY_C_CONTIGUOUS) - -cdef int _contiguous_check(ndarray arr): - if not is_contiguous(arr): - raise ValueError('Tried to use data field on non-contiguous array!') - -cdef int16_t *get_int16_ptr(ndarray arr): - _contiguous_check(arr) - - return arr.data - -cdef int32_t *get_int32_ptr(ndarray arr): - _contiguous_check(arr) - - return arr.data - -cdef int64_t *get_int64_ptr(ndarray arr): - _contiguous_check(arr) - - return arr.data - -cdef double_t *get_double_ptr(ndarray arr): - _contiguous_check(arr) - - return arr.data - -cdef extern from "math.h": - double sqrt(double x) - -cdef extern from "cobject.h": - pass # for datetime API - -cdef extern from "datetime.h": - - ctypedef class datetime.datetime [object PyDateTime_DateTime]: - # cdef int *data - # cdef long hashcode - # cdef char hastzinfo - pass - - int PyDateTime_GET_YEAR(datetime o) - int PyDateTime_GET_MONTH(datetime o) - int PyDateTime_GET_DAY(datetime o) - int PyDateTime_DATE_GET_HOUR(datetime o) - int PyDateTime_DATE_GET_MINUTE(datetime o) - int PyDateTime_DATE_GET_SECOND(datetime o) - int PyDateTime_DATE_GET_MICROSECOND(datetime o) - int PyDateTime_TIME_GET_HOUR(datetime o) - int PyDateTime_TIME_GET_MINUTE(datetime o) - int PyDateTime_TIME_GET_SECOND(datetime o) - int PyDateTime_TIME_GET_MICROSECOND(datetime o) - bint PyDateTime_Check(object o) - void PyDateTime_IMPORT() - -# import datetime C API -PyDateTime_IMPORT - -# initialize numpy -import_array() - -cpdef map_indices(ndarray index): - ''' - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. 
- ''' - cdef int i, length - cdef flatiter iter - cdef dict result - cdef object idx - - result = {} - - iter = PyArray_IterNew(index) - length = PyArray_SIZE(index) - - for i from 0 <= i < length: - idx = PyArray_GETITEM(index, PyArray_ITER_DATA(iter)) - result[idx] = i - PyArray_ITER_NEXT(iter) - - return result - -def isAllDates(ndarray index): - cdef int i, length - cdef flatiter iter - cdef object date - - iter = PyArray_IterNew(index) - length = PyArray_SIZE(index) - - if length == 0: - return False - - for i from 0 <= i < length: - date = PyArray_GETITEM(index, PyArray_ITER_DATA(iter)) - - if not PyDateTime_Check(date): - return False - PyArray_ITER_NEXT(iter) - - return True - -def isAllDates2(ndarray[object, ndim=1] arr): - ''' - cannot use - ''' - - cdef int i, size = len(arr) - cdef object date - - if size == 0: - return False - - for i from 0 <= i < size: - date = arr[i] - - if not PyDateTime_Check(date): - return False - - return True - -cdef double __add(double a, double b): - return a + b -cdef double __sub(double a, double b): - return a - b - -cdef double __div(double a, double b): - if b == 0: - return NaN - else: - return a / b - -cdef double __mul(double a, double b): - return a * b -cdef double __eq(double a, double b): - return a == b -cdef double __ne(double a, double b): - return a != b -cdef double __lt(double a, double b): - return a < b -cdef double __gt(double a, double b): - return a > b - -cdef double __pow(double a, double b): - # NaN - if a != a or b != b: - return NaN - return a ** b - -ctypedef double (* double_func)(double a, double b) - +cimport numpy as np +cimport cython + +from numpy cimport * + +from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, + PyDict_Contains, PyDict_Keys) +from cpython cimport PyFloat_Check + +import numpy as np +isnan = np.isnan +cdef double NaN = np.NaN + +from datetime import datetime as pydatetime + +cdef inline int int_max(int a, int b): return a if a >= b else b +cdef inline int int_min(int a, int b): return a if a >= b else b + +ctypedef unsigned char UChar + +cdef int is_contiguous(ndarray arr): + return np.PyArray_CHKFLAGS(arr, np.NPY_C_CONTIGUOUS) + +cdef int _contiguous_check(ndarray arr): + if not is_contiguous(arr): + raise ValueError('Tried to use data field on non-contiguous array!') + +cdef int16_t *get_int16_ptr(ndarray arr): + _contiguous_check(arr) + + return arr.data + +cdef int32_t *get_int32_ptr(ndarray arr): + _contiguous_check(arr) + + return arr.data + +cdef int64_t *get_int64_ptr(ndarray arr): + _contiguous_check(arr) + + return arr.data + +cdef double_t *get_double_ptr(ndarray arr): + _contiguous_check(arr) + + return arr.data + +cdef extern from "math.h": + double sqrt(double x) + +cdef extern from "cobject.h": + pass # for datetime API + +cdef extern from "datetime.h": + + ctypedef class datetime.datetime [object PyDateTime_DateTime]: + # cdef int *data + # cdef long hashcode + # cdef char hastzinfo + pass + + int PyDateTime_GET_YEAR(datetime o) + int PyDateTime_GET_MONTH(datetime o) + int PyDateTime_GET_DAY(datetime o) + int PyDateTime_DATE_GET_HOUR(datetime o) + int PyDateTime_DATE_GET_MINUTE(datetime o) + int PyDateTime_DATE_GET_SECOND(datetime o) + int PyDateTime_DATE_GET_MICROSECOND(datetime o) + int PyDateTime_TIME_GET_HOUR(datetime o) + int PyDateTime_TIME_GET_MINUTE(datetime o) + int PyDateTime_TIME_GET_SECOND(datetime o) + int PyDateTime_TIME_GET_MICROSECOND(datetime o) + bint PyDateTime_Check(object o) + void PyDateTime_IMPORT() + +# import datetime C API +PyDateTime_IMPORT + 
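# Editorial note, not part of the patch: map_indices in common.pyx builds a
# {value: integer location} lookup for an index array; the Cython version walks
# the array with NumPy's flat iterator purely for speed. A minimal pure-Python
# sketch of the same behaviour (the name map_indices_py is illustrative only):

import numpy as np

def map_indices_py(index):
    # value -> position, as consumed later by the reindexing/merge routines
    return dict((val, i) for i, val in enumerate(np.asarray(index)))

# map_indices_py(np.array(['hi', 'there']))  ->  {'hi': 0, 'there': 1}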
+# initialize numpy +import_array() + +cpdef map_indices(ndarray index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + ''' + cdef int i, length + cdef flatiter iter + cdef dict result + cdef object idx + + result = {} + + iter = PyArray_IterNew(index) + length = PyArray_SIZE(index) + + for i from 0 <= i < length: + idx = PyArray_GETITEM(index, PyArray_ITER_DATA(iter)) + result[idx] = i + PyArray_ITER_NEXT(iter) + + return result + +def isAllDates(ndarray index): + cdef int i, length + cdef flatiter iter + cdef object date + + iter = PyArray_IterNew(index) + length = PyArray_SIZE(index) + + if length == 0: + return False + + for i from 0 <= i < length: + date = PyArray_GETITEM(index, PyArray_ITER_DATA(iter)) + + if not PyDateTime_Check(date): + return False + PyArray_ITER_NEXT(iter) + + return True + +def isAllDates2(ndarray[object, ndim=1] arr): + ''' + cannot use + ''' + + cdef int i, size = len(arr) + cdef object date + + if size == 0: + return False + + for i from 0 <= i < size: + date = arr[i] + + if not PyDateTime_Check(date): + return False + + return True + +cdef double __add(double a, double b): + return a + b +cdef double __sub(double a, double b): + return a - b + +cdef double __div(double a, double b): + if b == 0: + return NaN + else: + return a / b + +cdef double __mul(double a, double b): + return a * b +cdef double __eq(double a, double b): + return a == b +cdef double __ne(double a, double b): + return a != b +cdef double __lt(double a, double b): + return a < b +cdef double __gt(double a, double b): + return a > b + +cdef double __pow(double a, double b): + # NaN + if a != a or b != b: + return NaN + return a ** b + +ctypedef double (* double_func)(double a, double b) + diff --git a/pandas/lib/src/io.pyx b/pandas/lib/src/io.pyx index 79321c01301a7..387c021d5438c 100644 --- a/pandas/lib/src/io.pyx +++ b/pandas/lib/src/io.pyx @@ -1,47 +1,47 @@ -cdef int _EPOCH_ORD = 719163 - -from datetime import date as pydate - -cdef inline int64_t gmtime(object date): - cdef int y, m, d, h, mn, s, ms, days - - y = PyDateTime_GET_YEAR(date) - m = PyDateTime_GET_MONTH(date) - d = PyDateTime_GET_DAY(date) - h = PyDateTime_DATE_GET_HOUR(date) - mn = PyDateTime_DATE_GET_MINUTE(date) - s = PyDateTime_DATE_GET_SECOND(date) - ms = PyDateTime_DATE_GET_MICROSECOND(date) / 1000 - - days = pydate(y, m, 1).toordinal() - _EPOCH_ORD + d - 1 - return (( (((days * 24 + h) * 60 + mn))) * 60 + s) * 1000 - -cpdef object to_datetime(int64_t timestamp): - return pydatetime.utcfromtimestamp(timestamp / 1000.0) - -cpdef object to_timestamp(object dt): - return gmtime(dt) - -def array_to_timestamp(ndarray[object, ndim=1] arr): - cdef int i, n - cdef ndarray[int64_t, ndim=1] result - - n = len(arr) - result = np.empty(n, dtype=np.int64) - - for i from 0 <= i < n: - result[i] = gmtime(arr[i]) - - return result - -def array_to_datetime(ndarray[int64_t, ndim=1] arr): - cdef int i, n - cdef ndarray[object, ndim=1] result - - n = len(arr) - result = np.empty(n, dtype=object) - - for i from 0 <= i < n: - result[i] = to_datetime(arr[i]) - - return result +cdef int _EPOCH_ORD = 719163 + +from datetime import date as pydate + +cdef inline int64_t gmtime(object date): + cdef int y, m, d, h, mn, s, ms, days + + y = PyDateTime_GET_YEAR(date) + m = PyDateTime_GET_MONTH(date) + d = PyDateTime_GET_DAY(date) + h = PyDateTime_DATE_GET_HOUR(date) 
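# Editorial note, not part of the patch: the remaining field extractions feed the
# return expression of gmtime, which counts whole milliseconds since the Unix
# epoch (1970-01-01 UTC; _EPOCH_ORD is the proleptic ordinal of that date), and
# to_datetime below inverts it via utcfromtimestamp. A rough equivalence,
# assuming a naive datetime interpreted as UTC:
#
#     import calendar, datetime
#     dt = datetime.datetime(2010, 6, 1, 12, 30, 15)
#     # to_timestamp(dt) == calendar.timegm(dt.timetuple()) * 1000
#
# Note that `ms` is extracted but the returned value only carries whole seconds
# scaled to milliseconds.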
+ mn = PyDateTime_DATE_GET_MINUTE(date) + s = PyDateTime_DATE_GET_SECOND(date) + ms = PyDateTime_DATE_GET_MICROSECOND(date) / 1000 + + days = pydate(y, m, 1).toordinal() - _EPOCH_ORD + d - 1 + return (( (((days * 24 + h) * 60 + mn))) * 60 + s) * 1000 + +cpdef object to_datetime(int64_t timestamp): + return pydatetime.utcfromtimestamp(timestamp / 1000.0) + +cpdef object to_timestamp(object dt): + return gmtime(dt) + +def array_to_timestamp(ndarray[object, ndim=1] arr): + cdef int i, n + cdef ndarray[int64_t, ndim=1] result + + n = len(arr) + result = np.empty(n, dtype=np.int64) + + for i from 0 <= i < n: + result[i] = gmtime(arr[i]) + + return result + +def array_to_datetime(ndarray[int64_t, ndim=1] arr): + cdef int i, n + cdef ndarray[object, ndim=1] result + + n = len(arr) + result = np.empty(n, dtype=object) + + for i from 0 <= i < n: + result[i] = to_datetime(arr[i]) + + return result diff --git a/pandas/lib/src/isnull.pyx b/pandas/lib/src/isnull.pyx index f099b63aefd8a..b69b30562320b 100644 --- a/pandas/lib/src/isnull.pyx +++ b/pandas/lib/src/isnull.pyx @@ -1,34 +1,34 @@ - -cdef double INF = np.inf -cdef double NEGINF = -INF - -cdef inline _checknull(object val): - if isinstance(val, float): - return val != val or val == INF or val == NEGINF - else: - return val is None - -cpdef checknull(object val): - return _checknull(val) - -def isnullobj(ndarray input): - cdef int i, length - cdef object val - cdef ndarray[npy_int8, ndim=1] result - cdef flatiter iter - - length = PyArray_SIZE(input) - - result = np.zeros(length, dtype=np.int8) - - iter= PyArray_IterNew(input) - - for i from 0 <= i < length: - val = PyArray_GETITEM(input, PyArray_ITER_DATA(iter)) - - if _checknull(val): - result[i] = 1 - - PyArray_ITER_NEXT(iter) - - return result + +cdef double INF = np.inf +cdef double NEGINF = -INF + +cdef inline _checknull(object val): + if isinstance(val, float): + return val != val or val == INF or val == NEGINF + else: + return val is None + +cpdef checknull(object val): + return _checknull(val) + +def isnullobj(ndarray input): + cdef int i, length + cdef object val + cdef ndarray[npy_int8, ndim=1] result + cdef flatiter iter + + length = PyArray_SIZE(input) + + result = np.zeros(length, dtype=np.int8) + + iter= PyArray_IterNew(input) + + for i from 0 <= i < length: + val = PyArray_GETITEM(input, PyArray_ITER_DATA(iter)) + + if _checknull(val): + result[i] = 1 + + PyArray_ITER_NEXT(iter) + + return result diff --git a/pandas/lib/src/moments.pyx b/pandas/lib/src/moments.pyx index 36779cca93ae1..d6aee7503a1cd 100644 --- a/pandas/lib/src/moments.pyx +++ b/pandas/lib/src/moments.pyx @@ -1,486 +1,486 @@ -# Cython implementations of rolling sum, mean, variance, skewness, -# other statistical moment functions -# -# Misc implementation notes -# ------------------------- -# -# - In Cython x * x is faster than x ** 2 for C types, this should be -# periodically revisited to see if it's still true. -# -# - - -# original C implementation by N. Devillard. -# This code in public domain. -# Function : kth_smallest() -# In : array of elements, # of elements in the array, rank k -# Out : one element -# Job : find the kth smallest element in the array - -# Reference: - -# Author: Wirth, Niklaus -# Title: Algorithms + data structures = programs -# Publisher: Englewood Cliffs: Prentice-Hall, 1976 -# Physical description: 366 p. 
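# Editorial note, not part of the patch: kth_smallest below is Wirth's in-place
# selection (a partial quicksort). It keeps narrowing a window [l, m] by
# partitioning around a pivot until position k holds the k-th smallest element,
# and median() then calls it once (odd n) or twice (even n). A pure-Python
# sketch of the selection step, for orientation only (kth_smallest_py is an
# illustrative name, not part of the library):

def kth_smallest_py(a, k):
    a = list(a)
    l, m = 0, len(a) - 1
    while l < m:
        x = a[k]
        i, j = l, m
        while True:
            while a[i] < x:
                i += 1
            while x < a[j]:
                j -= 1
            if i <= j:
                a[i], a[j] = a[j], a[i]
                i += 1
                j -= 1
            if i > j:
                break
        if j < k:
            l = i
        if k < i:
            m = j
    return a[k]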
-# Series: Prentice-Hall Series in Automatic Computation - - -def kth_smallest(ndarray[double_t, ndim=1] a, int k): - cdef: - int i,j,l,m,n - double_t x, t - - n = len(a) - - l = 0 - m = n-1 - while (l j: break - - if j < k: l = i - if k < i: m = j - return a[k] - - -def median(ndarray arr): - ''' - A faster median - ''' - cdef int n = len(arr) - - if len(arr) == 0: - return np.NaN - - arr = arr.copy() - - if n % 2: - return kth_smallest(arr, n / 2) - else: - return (kth_smallest(arr, n / 2) + - kth_smallest(arr, n / 2 - 1)) / 2 - -#------------------------------------------------------------------------------- -# Rolling sum - -def roll_sum(ndarray[double_t, ndim=1] input, - int win, int minp): - cdef double val, prev, sum_x = 0 - cdef int nobs = 0, i - cdef int N = len(input) - - cdef ndarray[double_t, ndim=1] output = np.empty(N, dtype=float) - - if minp > N: - minp = N + 1 - - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - sum_x += val - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if i > win - 1: - prev = input[i - win] - if prev == prev: - sum_x -= prev - nobs -= 1 - - if val == val: - nobs += 1 - sum_x += val - - if nobs >= minp: - output[i] = sum_x - else: - output[i] = NaN - - return output - -#------------------------------------------------------------------------------- -# Rolling mean - -def roll_mean(ndarray[double_t, ndim=1] input, - int win, int minp): - cdef double val, prev, sum_x = 0 - cdef int nobs = 0, i - cdef int N = len(input) - - cdef ndarray[double_t, ndim=1] output = np.empty(N, dtype=float) - - if minp > N: - minp = N + 1 - - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - sum_x += val - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if i > win - 1: - prev = input[i - win] - if prev == prev: - sum_x -= prev - nobs -= 1 - - if val == val: - nobs += 1 - sum_x += val - - if nobs >= minp: - output[i] = sum_x / nobs - else: - output[i] = NaN - - return output - -#------------------------------------------------------------------------------- -# Exponentially weighted moving average - -def ewma(ndarray[double_t, ndim=1] input, double_t com): - ''' - Compute exponentially-weighted moving average using center-of-mass. - - Parameters - ---------- - input : ndarray (float64 type) - com : float64 - - Returns - ------- - y : ndarray - ''' - - cdef double cur, prev, neww, oldw, adj - cdef int i - cdef int N = len(input) - - cdef ndarray[double_t, ndim=1] output = np.empty(N, dtype=float) - - - neww = 1. / (1. + com) - oldw = 1. - neww - adj = oldw - - output[0] = neww * input[0] - - for i from 1 <= i < N: - cur = input[i] - prev = output[i - 1] - - if cur == cur: - if prev == prev: - output[i] = oldw * prev + neww * cur - else: - output[i] = neww * cur - else: - output[i] = prev - - for i from 0 <= i < N: - cur = input[i] - output[i] = output[i] / (1. 
- adj) - - if cur == cur: - adj *= oldw - - return output - -#------------------------------------------------------------------------------- -# Rolling variance - -def roll_var(ndarray[double_t, ndim=1] input, - int win, int minp): - cdef double val, prev, sum_x = 0, sum_xx = 0, nobs = 0 - cdef int i - cdef int N = len(input) - - cdef ndarray[double_t, ndim=1] output = np.empty(N, dtype=float) - - if minp > N: - minp = N + 1 - - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - sum_x += val - sum_xx += val * val - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if i > win - 1: - prev = input[i - win] - if prev == prev: - sum_x -= prev - sum_xx -= prev * prev - nobs -= 1 - - if val == val: - nobs += 1 - sum_x += val - sum_xx += val * val - - if nobs >= minp: - output[i] = (nobs * sum_xx - sum_x * sum_x) / (nobs * nobs - nobs) - else: - output[i] = NaN - - return output - -#------------------------------------------------------------------------------- -# Rolling skewness - -def roll_skew(ndarray[double_t, ndim=1] input, - int win, int minp): - cdef double val, prev - cdef double x = 0, xx = 0, xxx = 0 - cdef int nobs = 0, i - cdef int N = len(input) - - cdef ndarray[double_t, ndim=1] output = np.empty(N, dtype=float) - - # 3 components of the skewness equation - cdef double A, B, C, R - - if minp > N: - minp = N + 1 - - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if i > win - 1: - prev = input[i - win] - if prev == prev: - x -= prev - xx -= prev * prev - xxx -= prev * prev * prev - - nobs -= 1 - - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val - - if nobs >= minp: - A = x / nobs - B = xx / nobs - A * A - C = xxx / nobs - A * A * A - 3 * A * B - - R = sqrt(B) - - output[i] = ((sqrt(nobs * (nobs - 1.)) * C) / - ((nobs-2) * R * R * R)) - else: - output[i] = NaN - - return output - -#------------------------------------------------------------------------------- -# Rolling kurtosis - - -def roll_kurt(ndarray[double_t, ndim=1] input, - int win, int minp): - cdef double val, prev - cdef double x = 0, xx = 0, xxx = 0, xxxx = 0 - cdef int nobs = 0, i - cdef int N = len(input) - - cdef ndarray[double_t, ndim=1] output = np.empty(N, dtype=float) - - # 5 components of the kurtosis equation - cdef double A, B, C, D, R, K - - if minp > N: - minp = N + 1 - - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - - # seriously don't ask me why this is faster - x += val - xx += val * val - xxx += val * val * val - xxxx += val * val * val * val - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if i > win - 1: - prev = input[i - win] - if prev == prev: - x -= prev - xx -= prev * prev - xxx -= prev * prev * prev - xxxx -= prev * prev * prev * prev - - nobs -= 1 - - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val - xxxx += val * val * val * val - - if nobs >= minp: - A = x / nobs - R = A * A - B = xx / nobs - R - R = R * A - C = xxx / nobs - R - 3 * A * B - R = R * A - D = xxxx / nobs - R - 6*B*A*A - 4*C*A - - K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2) - K = K / ((nobs - 2.)*(nobs-3.)) - - output[i] = K - else: - output[i] = NaN - - return output - -#------------------------------------------------------------------------------- -# 
Rolling median, min, max - -ctypedef double_t (* skiplist_f)(object sl, int n, int p) - -cdef _roll_skiplist_op(ndarray arg, int win, int minp, skiplist_f op): - cdef ndarray[double_t, ndim=1] input = arg - cdef double val, prev, midpoint - cdef IndexableSkiplist skiplist - cdef int nobs = 0, i - - cdef int N = len(input) - cdef ndarray[double_t, ndim=1] output = np.empty(N, dtype=float) - - skiplist = IndexableSkiplist(win) - - if minp > N: - minp = N + 1 - - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - skiplist.insert(val) - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if i > win - 1: - prev = input[i - win] - - if prev == prev: - skiplist.remove(prev) - nobs -= 1 - - if val == val: - nobs += 1 - skiplist.insert(val) - - output[i] = op(skiplist, nobs, minp) - - return output - -def roll_median(ndarray input, int win, int minp): - ''' - O(N log(window)) implementation using skip list - ''' - return _roll_skiplist_op(input, win, minp, _get_median) - -def roll_max(ndarray input, int win, int minp): - ''' - O(N log(window)) implementation using skip list - ''' - return _roll_skiplist_op(input, win, minp, _get_max) - -def roll_min(ndarray input, int win, int minp): - ''' - O(N log(window)) implementation using skip list - ''' - return _roll_skiplist_op(input, win, minp, _get_min) - -# Unfortunately had to resort to some hackery here, would like for -# Cython to be able to get this right. - -cdef double_t _get_median(object sl, int nobs, int minp): - cdef int midpoint - cdef IndexableSkiplist skiplist = sl - if nobs >= minp: - midpoint = nobs / 2 - if nobs % 2: - return skiplist.get(midpoint) - else: - return (skiplist.get(midpoint) + - skiplist.get(midpoint - 1)) / 2 - else: - return NaN - -cdef double_t _get_max(object skiplist, int nobs, int minp): - if nobs >= minp: - return skiplist.get(nobs - 1) - else: - return NaN - -cdef double_t _get_min(object skiplist, int nobs, int minp): - if nobs >= minp: - return skiplist.get(0) - else: - return NaN +# Cython implementations of rolling sum, mean, variance, skewness, +# other statistical moment functions +# +# Misc implementation notes +# ------------------------- +# +# - In Cython x * x is faster than x ** 2 for C types, this should be +# periodically revisited to see if it's still true. +# +# - + +# original C implementation by N. Devillard. +# This code in public domain. +# Function : kth_smallest() +# In : array of elements, # of elements in the array, rank k +# Out : one element +# Job : find the kth smallest element in the array + +# Reference: + +# Author: Wirth, Niklaus +# Title: Algorithms + data structures = programs +# Publisher: Englewood Cliffs: Prentice-Hall, 1976 +# Physical description: 366 p. 
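# Editorial note, not part of the patch: roll_sum, roll_mean and roll_var below
# all follow one pattern: keep running accumulators over a trailing window of
# length `win`, count only non-NaN observations (val == val), and emit NaN until
# at least `minp` observations are present. A slow reference version of
# roll_mean's semantics (illustrative only, assumes minp >= 1; this is not how
# the Cython loop is organised):

import numpy as np

def roll_mean_reference(values, win, minp):
    values = np.asarray(values, dtype=float)
    out = np.empty(len(values))
    for i in range(len(values)):
        window = values[max(0, i - win + 1): i + 1]
        window = window[window == window]   # NaN != NaN, so this drops NaNs
        out[i] = window.mean() if len(window) >= minp else np.nan
    return out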
+# Series: Prentice-Hall Series in Automatic Computation + + +def kth_smallest(ndarray[double_t, ndim=1] a, int k): + cdef: + int i,j,l,m,n + double_t x, t + + n = len(a) + + l = 0 + m = n-1 + while (l j: break + + if j < k: l = i + if k < i: m = j + return a[k] + + +def median(ndarray arr): + ''' + A faster median + ''' + cdef int n = len(arr) + + if len(arr) == 0: + return np.NaN + + arr = arr.copy() + + if n % 2: + return kth_smallest(arr, n / 2) + else: + return (kth_smallest(arr, n / 2) + + kth_smallest(arr, n / 2 - 1)) / 2 + +#------------------------------------------------------------------------------- +# Rolling sum + +def roll_sum(ndarray[double_t, ndim=1] input, + int win, int minp): + cdef double val, prev, sum_x = 0 + cdef int nobs = 0, i + cdef int N = len(input) + + cdef ndarray[double_t, ndim=1] output = np.empty(N, dtype=float) + + if minp > N: + minp = N + 1 + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + sum_x += val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + if prev == prev: + sum_x -= prev + nobs -= 1 + + if val == val: + nobs += 1 + sum_x += val + + if nobs >= minp: + output[i] = sum_x + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Rolling mean + +def roll_mean(ndarray[double_t, ndim=1] input, + int win, int minp): + cdef double val, prev, sum_x = 0 + cdef int nobs = 0, i + cdef int N = len(input) + + cdef ndarray[double_t, ndim=1] output = np.empty(N, dtype=float) + + if minp > N: + minp = N + 1 + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + sum_x += val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + if prev == prev: + sum_x -= prev + nobs -= 1 + + if val == val: + nobs += 1 + sum_x += val + + if nobs >= minp: + output[i] = sum_x / nobs + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Exponentially weighted moving average + +def ewma(ndarray[double_t, ndim=1] input, double_t com): + ''' + Compute exponentially-weighted moving average using center-of-mass. + + Parameters + ---------- + input : ndarray (float64 type) + com : float64 + + Returns + ------- + y : ndarray + ''' + + cdef double cur, prev, neww, oldw, adj + cdef int i + cdef int N = len(input) + + cdef ndarray[double_t, ndim=1] output = np.empty(N, dtype=float) + + + neww = 1. / (1. + com) + oldw = 1. - neww + adj = oldw + + output[0] = neww * input[0] + + for i from 1 <= i < N: + cur = input[i] + prev = output[i - 1] + + if cur == cur: + if prev == prev: + output[i] = oldw * prev + neww * cur + else: + output[i] = neww * cur + else: + output[i] = prev + + for i from 0 <= i < N: + cur = input[i] + output[i] = output[i] / (1. 
- adj) + + if cur == cur: + adj *= oldw + + return output + +#------------------------------------------------------------------------------- +# Rolling variance + +def roll_var(ndarray[double_t, ndim=1] input, + int win, int minp): + cdef double val, prev, sum_x = 0, sum_xx = 0, nobs = 0 + cdef int i + cdef int N = len(input) + + cdef ndarray[double_t, ndim=1] output = np.empty(N, dtype=float) + + if minp > N: + minp = N + 1 + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + sum_x += val + sum_xx += val * val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + if prev == prev: + sum_x -= prev + sum_xx -= prev * prev + nobs -= 1 + + if val == val: + nobs += 1 + sum_x += val + sum_xx += val * val + + if nobs >= minp: + output[i] = (nobs * sum_xx - sum_x * sum_x) / (nobs * nobs - nobs) + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Rolling skewness + +def roll_skew(ndarray[double_t, ndim=1] input, + int win, int minp): + cdef double val, prev + cdef double x = 0, xx = 0, xxx = 0 + cdef int nobs = 0, i + cdef int N = len(input) + + cdef ndarray[double_t, ndim=1] output = np.empty(N, dtype=float) + + # 3 components of the skewness equation + cdef double A, B, C, R + + if minp > N: + minp = N + 1 + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + x += val + xx += val * val + xxx += val * val * val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + if prev == prev: + x -= prev + xx -= prev * prev + xxx -= prev * prev * prev + + nobs -= 1 + + if val == val: + nobs += 1 + x += val + xx += val * val + xxx += val * val * val + + if nobs >= minp: + A = x / nobs + B = xx / nobs - A * A + C = xxx / nobs - A * A * A - 3 * A * B + + R = sqrt(B) + + output[i] = ((sqrt(nobs * (nobs - 1.)) * C) / + ((nobs-2) * R * R * R)) + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Rolling kurtosis + + +def roll_kurt(ndarray[double_t, ndim=1] input, + int win, int minp): + cdef double val, prev + cdef double x = 0, xx = 0, xxx = 0, xxxx = 0 + cdef int nobs = 0, i + cdef int N = len(input) + + cdef ndarray[double_t, ndim=1] output = np.empty(N, dtype=float) + + # 5 components of the kurtosis equation + cdef double A, B, C, D, R, K + + if minp > N: + minp = N + 1 + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + + # seriously don't ask me why this is faster + x += val + xx += val * val + xxx += val * val * val + xxxx += val * val * val * val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + if prev == prev: + x -= prev + xx -= prev * prev + xxx -= prev * prev * prev + xxxx -= prev * prev * prev * prev + + nobs -= 1 + + if val == val: + nobs += 1 + x += val + xx += val * val + xxx += val * val * val + xxxx += val * val * val * val + + if nobs >= minp: + A = x / nobs + R = A * A + B = xx / nobs - R + R = R * A + C = xxx / nobs - R - 3 * A * B + R = R * A + D = xxxx / nobs - R - 6*B*A*A - 4*C*A + + K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2) + K = K / ((nobs - 2.)*(nobs-3.)) + + output[i] = K + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# 
Rolling median, min, max + +ctypedef double_t (* skiplist_f)(object sl, int n, int p) + +cdef _roll_skiplist_op(ndarray arg, int win, int minp, skiplist_f op): + cdef ndarray[double_t, ndim=1] input = arg + cdef double val, prev, midpoint + cdef IndexableSkiplist skiplist + cdef int nobs = 0, i + + cdef int N = len(input) + cdef ndarray[double_t, ndim=1] output = np.empty(N, dtype=float) + + skiplist = IndexableSkiplist(win) + + if minp > N: + minp = N + 1 + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + skiplist.insert(val) + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + + if prev == prev: + skiplist.remove(prev) + nobs -= 1 + + if val == val: + nobs += 1 + skiplist.insert(val) + + output[i] = op(skiplist, nobs, minp) + + return output + +def roll_median(ndarray input, int win, int minp): + ''' + O(N log(window)) implementation using skip list + ''' + return _roll_skiplist_op(input, win, minp, _get_median) + +def roll_max(ndarray input, int win, int minp): + ''' + O(N log(window)) implementation using skip list + ''' + return _roll_skiplist_op(input, win, minp, _get_max) + +def roll_min(ndarray input, int win, int minp): + ''' + O(N log(window)) implementation using skip list + ''' + return _roll_skiplist_op(input, win, minp, _get_min) + +# Unfortunately had to resort to some hackery here, would like for +# Cython to be able to get this right. + +cdef double_t _get_median(object sl, int nobs, int minp): + cdef int midpoint + cdef IndexableSkiplist skiplist = sl + if nobs >= minp: + midpoint = nobs / 2 + if nobs % 2: + return skiplist.get(midpoint) + else: + return (skiplist.get(midpoint) + + skiplist.get(midpoint - 1)) / 2 + else: + return NaN + +cdef double_t _get_max(object skiplist, int nobs, int minp): + if nobs >= minp: + return skiplist.get(nobs - 1) + else: + return NaN + +cdef double_t _get_min(object skiplist, int nobs, int minp): + if nobs >= minp: + return skiplist.get(0) + else: + return NaN diff --git a/pandas/lib/src/operators.pyx b/pandas/lib/src/operators.pyx index d7f1a3dfee1f4..0d164625a6e04 100644 --- a/pandas/lib/src/operators.pyx +++ b/pandas/lib/src/operators.pyx @@ -1,65 +1,65 @@ -@cython.boundscheck(False) -@cython.wraparound(False) -cdef ndarray _applyFunc(double_func func, ndarray index, ndarray ao, - ndarray bo, dict aMap, dict bMap): - ''' - C function taking a function pointer for quickly adding two Series objects. - ''' - cdef ndarray result - cdef double *result_data, *a_data, *b_data - cdef int length - cdef Py_ssize_t i, aidx, bidx - cdef double nan - cdef object idx - cdef ndarray[object, ndim=1] ibuf - cdef ndarray[double_t, ndim=1] A, B - - A = ao - B = bo - - ibuf = index - - nan = np.NaN - length = len(index) - result = np.empty(length, dtype=float) - result_data = result.data - - for i from 0 <= i < length: - idx = ibuf[i] - - if idx not in aMap or idx not in bMap: - result_data[i] = nan - continue - - aidx = aMap[idx] - bidx = bMap[idx] - result_data[i] = func(A[aidx], B[bidx]) - - return result - -def combineFunc(object name, ndarray index, ndarray ao, - ndarray bo, dict aMap, dict bMap): - ''' - Combine two series (values and index maps for each passed in) using the - indicated function. 
- ''' - if name == "__add__": - return _applyFunc(__add, index, ao, bo, aMap, bMap) - elif name == "__sub__": - return _applyFunc(__sub, index, ao, bo, aMap, bMap) - elif name == "__div__": - return _applyFunc(__div, index, ao, bo, aMap, bMap) - elif name == "__mul__": - return _applyFunc(__mul, index, ao, bo, aMap, bMap) - elif name == "__eq__": - return _applyFunc(__eq, index, ao, bo, aMap, bMap) - elif name == "__ne__": - return _applyFunc(__ne, index, ao, bo, aMap, bMap) - elif name == "__lt__": - return _applyFunc(__lt, index, ao, bo, aMap, bMap) - elif name == "__gt__": - return _applyFunc(__gt, index, ao, bo, aMap, bMap) - elif name == "__pow__": - return _applyFunc(__pow, index, ao, bo, aMap, bMap) - else: - raise Exception('bad funcname requested of Cython code') +@cython.boundscheck(False) +@cython.wraparound(False) +cdef ndarray _applyFunc(double_func func, ndarray index, ndarray ao, + ndarray bo, dict aMap, dict bMap): + ''' + C function taking a function pointer for quickly adding two Series objects. + ''' + cdef ndarray result + cdef double *result_data, *a_data, *b_data + cdef int length + cdef Py_ssize_t i, aidx, bidx + cdef double nan + cdef object idx + cdef ndarray[object, ndim=1] ibuf + cdef ndarray[double_t, ndim=1] A, B + + A = ao + B = bo + + ibuf = index + + nan = np.NaN + length = len(index) + result = np.empty(length, dtype=float) + result_data = result.data + + for i from 0 <= i < length: + idx = ibuf[i] + + if idx not in aMap or idx not in bMap: + result_data[i] = nan + continue + + aidx = aMap[idx] + bidx = bMap[idx] + result_data[i] = func(A[aidx], B[bidx]) + + return result + +def combineFunc(object name, ndarray index, ndarray ao, + ndarray bo, dict aMap, dict bMap): + ''' + Combine two series (values and index maps for each passed in) using the + indicated function. + ''' + if name == "__add__": + return _applyFunc(__add, index, ao, bo, aMap, bMap) + elif name == "__sub__": + return _applyFunc(__sub, index, ao, bo, aMap, bMap) + elif name == "__div__": + return _applyFunc(__div, index, ao, bo, aMap, bMap) + elif name == "__mul__": + return _applyFunc(__mul, index, ao, bo, aMap, bMap) + elif name == "__eq__": + return _applyFunc(__eq, index, ao, bo, aMap, bMap) + elif name == "__ne__": + return _applyFunc(__ne, index, ao, bo, aMap, bMap) + elif name == "__lt__": + return _applyFunc(__lt, index, ao, bo, aMap, bMap) + elif name == "__gt__": + return _applyFunc(__gt, index, ao, bo, aMap, bMap) + elif name == "__pow__": + return _applyFunc(__pow, index, ao, bo, aMap, bMap) + else: + raise Exception('bad funcname requested of Cython code') diff --git a/pandas/lib/src/reindex.pyx b/pandas/lib/src/reindex.pyx index 6ca485fb35cdb..dbcd61cf0dc43 100644 --- a/pandas/lib/src/reindex.pyx +++ b/pandas/lib/src/reindex.pyx @@ -1,234 +1,234 @@ -def getFillVec(ndarray oldIndex, ndarray newIndex, dict oldMap, dict newMap, - kind=None): - - if kind is None: - fillVec, maskVec = getMergeVec(newIndex, oldMap) - elif kind == 'PAD': - fillVec, maskVec = _pad(oldIndex, newIndex, oldMap, newMap) - elif kind == 'BACKFILL': - fillVec, maskVec = _backfill(oldIndex, newIndex, oldMap, newMap) - else: - raise Exception("Don't recognize method: %s" % kind) - - return fillVec, maskVec.astype(np.bool) - -@cython.wraparound(False) -def _backfill(ndarray[object, ndim=1] oldIndex, - ndarray[object, ndim=1] newIndex, - dict oldMap, dict newMap): - ''' - Backfilling logic for generating fill vector - - Diagram of what's going on - - Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 - A A 0 1 - . 
1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - B B 1 1 - . 2 1 - . 2 1 - . 2 1 - C C 2 1 - . 0 - . 0 - D - ''' - cdef int i, j, oldLength, newLength, curLoc - # Make empty vectors - cdef ndarray[int32_t, ndim=1] fillVec - cdef ndarray[int8_t, ndim=1] mask - cdef int newPos, oldPos - cdef object prevOld, curOld - - # Get the size - oldLength = len(oldIndex) - newLength = len(newIndex) - - fillVec = np.empty(len(newIndex), dtype = np.int32) - fillVec.fill(-1) - - mask = np.zeros(len(newIndex), dtype = np.int8) - - # Current positions - oldPos = oldLength - 1 - newPos = newLength - 1 - - # corner case, no filling possible - if newIndex[0] > oldIndex[oldLength - 1]: - return fillVec, mask - - while newPos >= 0: - curOld = oldIndex[oldPos] - - # Until we reach a point where we are before the curOld point - while newIndex[newPos] > curOld: - newPos -= 1 - if newPos < 0: - break - - # Get the location in the old index - curLoc = oldMap[curOld] - - # At the beginning of the old index - if oldPos == 0: - # Make sure we are before the curOld index - if newIndex[newPos] <= curOld: - fillVec[:newPos + 1] = curLoc - mask[:newPos + 1] = 1 - # Exit the main loop - break - else: - # Get the index there - prevOld = oldIndex[oldPos - 1] - - # Until we reach the previous index - while newIndex[newPos] > prevOld: - # Set the current fill location - fillVec[newPos] = curLoc - mask[newPos] = 1 - - newPos -= 1 - if newPos < 0: - break - - # Move one period back - oldPos -= 1 - - return (fillVec, mask) - -@cython.wraparound(False) -def _pad(ndarray[object, ndim=1] oldIndex, - ndarray[object, ndim=1] newIndex, - dict oldMap, dict newMap): - ''' - Padding logic for generating fill vector - - Diagram of what's going on - - Old New Fill vector Mask - . 0 - . 0 - . 0 - A A 0 1 - . 0 1 - . 0 1 - . 0 1 - . 0 1 - . 0 1 - B B 1 1 - . 1 1 - . 1 1 - . 
1 1 - C C 2 1 - ''' - cdef int i, j, oldLength, newLength, curLoc - # Make empty vectors - cdef ndarray[int32_t, ndim=1] fillVec - cdef ndarray[int8_t, ndim=1] mask - cdef int newPos, oldPos - cdef object prevOld, curOld - - # Get the size - oldLength = len(oldIndex) - newLength = len(newIndex) - - fillVec = np.empty(len(newIndex), dtype = np.int32) - fillVec.fill(-1) - - mask = np.zeros(len(newIndex), dtype = np.int8) - - oldPos = 0 - newPos = 0 - - # corner case, no filling possible - if newIndex[newLength - 1] < oldIndex[0]: - return fillVec, mask - - while newPos < newLength: - curOld = oldIndex[oldPos] - - # At beginning, keep going until we go exceed the - # first OLD index in the NEW index - while newIndex[newPos] < curOld: - newPos += 1 - if newPos > newLength - 1: - break - - # We got there, get the current location in the old index - curLoc = oldMap[curOld] - - # We're at the end of the road, need to propagate this value to the end - if oldPos == oldLength - 1: - if newIndex[newPos] >= curOld: - fillVec[newPos:] = curLoc - mask[newPos:] = 1 - break - else: - # Not at the end, need to go about filling - - # Get the next index so we know when to stop propagating this value - nextOld = oldIndex[oldPos + 1] - - done = 0 - - # Until we reach the next OLD value in the NEW index - while newIndex[newPos] < nextOld: - # Use this location to fill - fillVec[newPos] = curLoc - - # Set mask to be 1 so will not be NaN'd - mask[newPos] = 1 - newPos += 1 - - # We got to the end of the new index - if newPos > newLength - 1: - done = 1 - break - - # We got to the end of the new index - if done: - break - - # We already advanced the iterold pointer to the next value, - # inc the count - oldPos += 1 - - return fillVec, mask - -@cython.boundscheck(False) -def getMergeVec(ndarray values, dict oldMap): - cdef int i, j, length, newLength - - cdef flatiter iternew - cdef object idx - cdef ndarray[int32_t, ndim=1] fillVec - cdef ndarray[int8_t, ndim=1] mask - - newLength = len(values) - fillVec = np.empty(newLength, dtype=np.int32) - mask = np.zeros(newLength, dtype=np.int8) - - iternew = PyArray_IterNew(values) - - for i from 0 <= i < newLength: - idx = PyArray_GETITEM(values, PyArray_ITER_DATA(iternew)) - - if idx in oldMap: - fillVec[i] = oldMap[idx] - mask[i] = 1 - - PyArray_ITER_NEXT(iternew) - - for i from 0 <= i < newLength: - if mask[i] == 0: - fillVec[i] = -1 - - return fillVec, mask.astype(bool) - +def getFillVec(ndarray oldIndex, ndarray newIndex, dict oldMap, dict newMap, + kind=None): + + if kind is None: + fillVec, maskVec = getMergeVec(newIndex, oldMap) + elif kind == 'PAD': + fillVec, maskVec = _pad(oldIndex, newIndex, oldMap, newMap) + elif kind == 'BACKFILL': + fillVec, maskVec = _backfill(oldIndex, newIndex, oldMap, newMap) + else: + raise Exception("Don't recognize method: %s" % kind) + + return fillVec, maskVec.astype(np.bool) + +@cython.wraparound(False) +def _backfill(ndarray[object, ndim=1] oldIndex, + ndarray[object, ndim=1] newIndex, + dict oldMap, dict newMap): + ''' + Backfilling logic for generating fill vector + + Diagram of what's going on + + Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 + A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + B B 1 1 + . 2 1 + . 2 1 + . 2 1 + C C 2 1 + . 0 + . 
0 + D + ''' + cdef int i, j, oldLength, newLength, curLoc + # Make empty vectors + cdef ndarray[int32_t, ndim=1] fillVec + cdef ndarray[int8_t, ndim=1] mask + cdef int newPos, oldPos + cdef object prevOld, curOld + + # Get the size + oldLength = len(oldIndex) + newLength = len(newIndex) + + fillVec = np.empty(len(newIndex), dtype = np.int32) + fillVec.fill(-1) + + mask = np.zeros(len(newIndex), dtype = np.int8) + + # Current positions + oldPos = oldLength - 1 + newPos = newLength - 1 + + # corner case, no filling possible + if newIndex[0] > oldIndex[oldLength - 1]: + return fillVec, mask + + while newPos >= 0: + curOld = oldIndex[oldPos] + + # Until we reach a point where we are before the curOld point + while newIndex[newPos] > curOld: + newPos -= 1 + if newPos < 0: + break + + # Get the location in the old index + curLoc = oldMap[curOld] + + # At the beginning of the old index + if oldPos == 0: + # Make sure we are before the curOld index + if newIndex[newPos] <= curOld: + fillVec[:newPos + 1] = curLoc + mask[:newPos + 1] = 1 + # Exit the main loop + break + else: + # Get the index there + prevOld = oldIndex[oldPos - 1] + + # Until we reach the previous index + while newIndex[newPos] > prevOld: + # Set the current fill location + fillVec[newPos] = curLoc + mask[newPos] = 1 + + newPos -= 1 + if newPos < 0: + break + + # Move one period back + oldPos -= 1 + + return (fillVec, mask) + +@cython.wraparound(False) +def _pad(ndarray[object, ndim=1] oldIndex, + ndarray[object, ndim=1] newIndex, + dict oldMap, dict newMap): + ''' + Padding logic for generating fill vector + + Diagram of what's going on + + Old New Fill vector Mask + . 0 + . 0 + . 0 + A A 0 1 + . 0 1 + . 0 1 + . 0 1 + . 0 1 + . 0 1 + B B 1 1 + . 1 1 + . 1 1 + . 1 1 + C C 2 1 + ''' + cdef int i, j, oldLength, newLength, curLoc + # Make empty vectors + cdef ndarray[int32_t, ndim=1] fillVec + cdef ndarray[int8_t, ndim=1] mask + cdef int newPos, oldPos + cdef object prevOld, curOld + + # Get the size + oldLength = len(oldIndex) + newLength = len(newIndex) + + fillVec = np.empty(len(newIndex), dtype = np.int32) + fillVec.fill(-1) + + mask = np.zeros(len(newIndex), dtype = np.int8) + + oldPos = 0 + newPos = 0 + + # corner case, no filling possible + if newIndex[newLength - 1] < oldIndex[0]: + return fillVec, mask + + while newPos < newLength: + curOld = oldIndex[oldPos] + + # At beginning, keep going until we go exceed the + # first OLD index in the NEW index + while newIndex[newPos] < curOld: + newPos += 1 + if newPos > newLength - 1: + break + + # We got there, get the current location in the old index + curLoc = oldMap[curOld] + + # We're at the end of the road, need to propagate this value to the end + if oldPos == oldLength - 1: + if newIndex[newPos] >= curOld: + fillVec[newPos:] = curLoc + mask[newPos:] = 1 + break + else: + # Not at the end, need to go about filling + + # Get the next index so we know when to stop propagating this value + nextOld = oldIndex[oldPos + 1] + + done = 0 + + # Until we reach the next OLD value in the NEW index + while newIndex[newPos] < nextOld: + # Use this location to fill + fillVec[newPos] = curLoc + + # Set mask to be 1 so will not be NaN'd + mask[newPos] = 1 + newPos += 1 + + # We got to the end of the new index + if newPos > newLength - 1: + done = 1 + break + + # We got to the end of the new index + if done: + break + + # We already advanced the iterold pointer to the next value, + # inc the count + oldPos += 1 + + return fillVec, mask + +@cython.boundscheck(False) +def getMergeVec(ndarray 
values, dict oldMap): + cdef int i, j, length, newLength + + cdef flatiter iternew + cdef object idx + cdef ndarray[int32_t, ndim=1] fillVec + cdef ndarray[int8_t, ndim=1] mask + + newLength = len(values) + fillVec = np.empty(newLength, dtype=np.int32) + mask = np.zeros(newLength, dtype=np.int8) + + iternew = PyArray_IterNew(values) + + for i from 0 <= i < newLength: + idx = PyArray_GETITEM(values, PyArray_ITER_DATA(iternew)) + + if idx in oldMap: + fillVec[i] = oldMap[idx] + mask[i] = 1 + + PyArray_ITER_NEXT(iternew) + + for i from 0 <= i < newLength: + if mask[i] == 0: + fillVec[i] = -1 + + return fillVec, mask.astype(bool) + diff --git a/pandas/sandbox/stats/rls.py b/pandas/sandbox/stats/rls.py index 855c8d4b1fe39..b873225ccc715 100644 --- a/pandas/sandbox/stats/rls.py +++ b/pandas/sandbox/stats/rls.py @@ -1,137 +1,137 @@ -"""Restricted least squares""" - -import numpy as np -from scikits.statsmodels.regression import WLS, GLS, RegressionResults - -class RLS(GLS): - """ - Restricted general least squares model that handles linear constraints - - Parameters - ---------- - endog: array-like - n length array containing the dependent variable - exog: array-like - n-by-p array of independent variables - constr: array-like - k-by-p array of linear constraints - param (0.): array-like or scalar - p-by-1 array (or scalar) of constraint parameters - sigma (None): scalar or array-like - The weighting matrix of the covariance. No scaling by default (OLS). - If sigma is a scalar, then it is converted into an n-by-n diagonal - matrix with sigma as each diagonal element. - If sigma is an n-length array, then it is assumed to be a diagonal - matrix with the given sigma on the diagonal (WLS). - - Notes - ----- - endog = exog * beta + epsilon - weights' * constr * beta = param - - See Greene and Seaks, "The Restricted Least Squares Estimator: - A Pedagogical Note", The Review of Economics and Statistics, 1991. - """ - - def __init__(self, endog, exog, constr, param=0., sigma=None): - N, Q = exog.shape - if constr.ndim == 1: - K, P = 1, constr.shape[0] - else: - K, P = constr.shape - if Q != P: - raise Exception('Constraints and design do not align') - self.ncoeffs = Q - self.nconstraint = K - self.constraint = constr - if np.isscalar(param) and K > 1: - param = np.ones((K,)) * param - self.param = param - if sigma is None: - sigma = 1. 
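# Editorial note, not part of the patch: the lines that follow normalise `sigma`
# into an n-by-n covariance matrix and precompute `cholsigmainv`, the factor
# used to whiten endog and exog (the GLS transformation) before the restricted
# normal equations in rwexog/rwendog are formed. In the dense branch the
# identity being relied on is roughly:
#
#     S = np.diag([1., 2., 4.])                     # toy covariance
#     W = np.linalg.cholesky(np.linalg.pinv(S)).T   # cholsigmainv
#     np.allclose(W.T.dot(W), np.linalg.pinv(S))    # True: W'W = S^{-1}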
- if np.isscalar(sigma): - sigma = np.ones(N) * sigma - sigma = np.squeeze(sigma) - if sigma.ndim == 1: - self.sigma = np.diag(sigma) - self.cholsigmainv = np.diag(np.sqrt(sigma)) - else: - self.sigma = sigma - self.cholsigmainv = np.linalg.cholesky(np.linalg.pinv(self.sigma)).T - super(GLS, self).__init__(endog, exog) - - _rwexog = None - @property - def rwexog(self): - """Whitened exogenous variables augmented with restrictions""" - if self._rwexog is None: - P = self.ncoeffs - K = self.nconstraint - design = np.zeros((P + K, P + K)) - design[:P, :P] = np.dot(self.wexog.T, self.wexog) #top left - constr = np.reshape(self.constraint, (K, P)) - design[:P, P:] = constr.T #top right partition - design[P:, :P] = constr #bottom left partition - design[P:, P:] = np.zeros((K, K)) #bottom right partition - self._rwexog = design - return self._rwexog - - _inv_rwexog = None - @property - def inv_rwexog(self): - """Inverse of self.rwexog""" - if self._inv_rwexog is None: - self._inv_rwexog = np.linalg.inv(self.rwexog) - return self._inv_rwexog - - _rwendog = None - @property - def rwendog(self): - """Whitened endogenous variable augmented with restriction parameters""" - if self._rwendog is None: - P = self.ncoeffs - K = self.nconstraint - response = np.zeros((P + K,)) - response[:P] = np.dot(self.wexog.T, self.wendog) - response[P:] = self.param - self._rwendog = response - return self._rwendog - - _ncp = None - @property - def rnorm_cov_params(self): - """Parameter covariance under restrictions""" - if self._ncp is None: - P = self.ncoeffs - self._ncp = self.inv_rwexog[:P, :P] - return self._ncp - - _wncp = None - @property - def wrnorm_cov_params(self): - """ - Heteroskedasticity-consistent parameter covariance - Used to calculate White standard errors. - """ - if self._wncp is None: - df = self.df_resid - pred = np.dot(self.wexog, self.coeffs) - eps = np.diag((self.wendog - pred) ** 2) - sigmaSq = np.sum(eps) - pinvX = np.dot(self.rnorm_cov_params, self.wexog.T) - self._wncp = np.dot(np.dot(pinvX, eps), pinvX.T) * df / sigmaSq - return self._wncp - - _coeffs = None - @property - def coeffs(self): - """Estimated parameters""" - if self._coeffs is None: - betaLambda = np.dot(self.inv_rwexog, self.rwendog) - self._coeffs = betaLambda[:self.ncoeffs] - return self._coeffs - - def fit(self): - rncp = self.wrnorm_cov_params - lfit = RegressionResults(self, self.coeffs, normalized_cov_params=rncp) - return lfit +"""Restricted least squares""" + +import numpy as np +from scikits.statsmodels.regression import WLS, GLS, RegressionResults + +class RLS(GLS): + """ + Restricted general least squares model that handles linear constraints + + Parameters + ---------- + endog: array-like + n length array containing the dependent variable + exog: array-like + n-by-p array of independent variables + constr: array-like + k-by-p array of linear constraints + param (0.): array-like or scalar + p-by-1 array (or scalar) of constraint parameters + sigma (None): scalar or array-like + The weighting matrix of the covariance. No scaling by default (OLS). + If sigma is a scalar, then it is converted into an n-by-n diagonal + matrix with sigma as each diagonal element. + If sigma is an n-length array, then it is assumed to be a diagonal + matrix with the given sigma on the diagonal (WLS). + + Notes + ----- + endog = exog * beta + epsilon + weights' * constr * beta = param + + See Greene and Seaks, "The Restricted Least Squares Estimator: + A Pedagogical Note", The Review of Economics and Statistics, 1991. 
+ """ + + def __init__(self, endog, exog, constr, param=0., sigma=None): + N, Q = exog.shape + if constr.ndim == 1: + K, P = 1, constr.shape[0] + else: + K, P = constr.shape + if Q != P: + raise Exception('Constraints and design do not align') + self.ncoeffs = Q + self.nconstraint = K + self.constraint = constr + if np.isscalar(param) and K > 1: + param = np.ones((K,)) * param + self.param = param + if sigma is None: + sigma = 1. + if np.isscalar(sigma): + sigma = np.ones(N) * sigma + sigma = np.squeeze(sigma) + if sigma.ndim == 1: + self.sigma = np.diag(sigma) + self.cholsigmainv = np.diag(np.sqrt(sigma)) + else: + self.sigma = sigma + self.cholsigmainv = np.linalg.cholesky(np.linalg.pinv(self.sigma)).T + super(GLS, self).__init__(endog, exog) + + _rwexog = None + @property + def rwexog(self): + """Whitened exogenous variables augmented with restrictions""" + if self._rwexog is None: + P = self.ncoeffs + K = self.nconstraint + design = np.zeros((P + K, P + K)) + design[:P, :P] = np.dot(self.wexog.T, self.wexog) #top left + constr = np.reshape(self.constraint, (K, P)) + design[:P, P:] = constr.T #top right partition + design[P:, :P] = constr #bottom left partition + design[P:, P:] = np.zeros((K, K)) #bottom right partition + self._rwexog = design + return self._rwexog + + _inv_rwexog = None + @property + def inv_rwexog(self): + """Inverse of self.rwexog""" + if self._inv_rwexog is None: + self._inv_rwexog = np.linalg.inv(self.rwexog) + return self._inv_rwexog + + _rwendog = None + @property + def rwendog(self): + """Whitened endogenous variable augmented with restriction parameters""" + if self._rwendog is None: + P = self.ncoeffs + K = self.nconstraint + response = np.zeros((P + K,)) + response[:P] = np.dot(self.wexog.T, self.wendog) + response[P:] = self.param + self._rwendog = response + return self._rwendog + + _ncp = None + @property + def rnorm_cov_params(self): + """Parameter covariance under restrictions""" + if self._ncp is None: + P = self.ncoeffs + self._ncp = self.inv_rwexog[:P, :P] + return self._ncp + + _wncp = None + @property + def wrnorm_cov_params(self): + """ + Heteroskedasticity-consistent parameter covariance + Used to calculate White standard errors. 
+ """ + if self._wncp is None: + df = self.df_resid + pred = np.dot(self.wexog, self.coeffs) + eps = np.diag((self.wendog - pred) ** 2) + sigmaSq = np.sum(eps) + pinvX = np.dot(self.rnorm_cov_params, self.wexog.T) + self._wncp = np.dot(np.dot(pinvX, eps), pinvX.T) * df / sigmaSq + return self._wncp + + _coeffs = None + @property + def coeffs(self): + """Estimated parameters""" + if self._coeffs is None: + betaLambda = np.dot(self.inv_rwexog, self.rwendog) + self._coeffs = betaLambda[:self.ncoeffs] + return self._coeffs + + def fit(self): + rncp = self.wrnorm_cov_params + lfit = RegressionResults(self, self.coeffs, normalized_cov_params=rncp) + return lfit diff --git a/pandas/setup.py b/pandas/setup.py index 249709b6dcbf8..af2909a67fe34 100644 --- a/pandas/setup.py +++ b/pandas/setup.py @@ -1,17 +1,17 @@ -#!/usr/bin/env python - -def configuration(parent_package='',top_path=None): - from numpy.distutils.misc_util import Configuration - config = Configuration('pandas', parent_package, top_path) - config.add_subpackage('core') - config.add_subpackage('io') - config.add_subpackage('lib') - config.add_subpackage('rpy') - config.add_subpackage('sandbox') - config.add_subpackage('stats') - config.add_subpackage('util') - return config - -if __name__ == '__main__': - print('This is the wrong setup.py file to run') - +#!/usr/bin/env python + +def configuration(parent_package='',top_path=None): + from numpy.distutils.misc_util import Configuration + config = Configuration('pandas', parent_package, top_path) + config.add_subpackage('core') + config.add_subpackage('io') + config.add_subpackage('lib') + config.add_subpackage('rpy') + config.add_subpackage('sandbox') + config.add_subpackage('stats') + config.add_subpackage('util') + return config + +if __name__ == '__main__': + print('This is the wrong setup.py file to run') + diff --git a/pandas/stats/api.py b/pandas/stats/api.py index fabd2582509e2..3732f9ed39524 100644 --- a/pandas/stats/api.py +++ b/pandas/stats/api.py @@ -1,9 +1,9 @@ -""" -Common namespace of statistical functions -""" - -# pylint: disable-msg=W0611,W0614,W0401 - -from pandas.stats.moments import * -from pandas.stats.interface import ols -from pandas.stats.fama_macbeth import fama_macbeth +""" +Common namespace of statistical functions +""" + +# pylint: disable-msg=W0611,W0614,W0401 + +from pandas.stats.moments import * +from pandas.stats.interface import ols +from pandas.stats.fama_macbeth import fama_macbeth diff --git a/pandas/stats/common.py b/pandas/stats/common.py index ef5031120b882..f88789cce7d64 100644 --- a/pandas/stats/common.py +++ b/pandas/stats/common.py @@ -1,65 +1,65 @@ -TIME = 0 -ENTITY = 1 - -def _get_cluster_type(cluster_type): - if cluster_type in (TIME, ENTITY, None): - return cluster_type - - elif isinstance(cluster_type, basestring): - cluster_type_up = cluster_type.upper() - - if cluster_type_up == 'ENTITY': - return ENTITY - elif cluster_type_up == 'TIME': - return TIME - - raise Exception('Unrecognized clustering type: %s' % cluster_type) - -FULL_SAMPLE = 0 -ROLLING = 1 -EXPANDING = 2 - -def _get_window_type(window_type): - if window_type in (FULL_SAMPLE, ROLLING, EXPANDING): - return window_type - elif isinstance(window_type, basestring): - window_type_up = window_type.upper() - - if window_type_up in ('FULL SAMPLE', 'FULL_SAMPLE'): - return FULL_SAMPLE - elif window_type_up == 'ROLLING': - return ROLLING - elif window_type_up == 'EXPANDING': - return EXPANDING - - raise Exception('Unrecognized window type: %s' % window_type) - -def 
_get_window_type_name(window_type): - names = { - 0 : 'full sample', - 1 : 'rolling', - 2 : 'expanding' - } - return names[window_type] - -def banner(text, width=80): - """ - - """ - toFill = width - len(text) - - left = toFill // 2 - right = toFill - left - - return '%s%s%s' % ('-' * left, text, '-' * right) - -def f_stat_to_dict(result): - f_stat, shape, p_value = result - - result = {} - result['f-stat'] = f_stat - result['DF X'] = shape[0] - result['DF Resid'] = shape[1] - result['p-value'] = p_value - - return result +TIME = 0 +ENTITY = 1 + +def _get_cluster_type(cluster_type): + if cluster_type in (TIME, ENTITY, None): + return cluster_type + + elif isinstance(cluster_type, basestring): + cluster_type_up = cluster_type.upper() + + if cluster_type_up == 'ENTITY': + return ENTITY + elif cluster_type_up == 'TIME': + return TIME + + raise Exception('Unrecognized clustering type: %s' % cluster_type) + +FULL_SAMPLE = 0 +ROLLING = 1 +EXPANDING = 2 + +def _get_window_type(window_type): + if window_type in (FULL_SAMPLE, ROLLING, EXPANDING): + return window_type + elif isinstance(window_type, basestring): + window_type_up = window_type.upper() + + if window_type_up in ('FULL SAMPLE', 'FULL_SAMPLE'): + return FULL_SAMPLE + elif window_type_up == 'ROLLING': + return ROLLING + elif window_type_up == 'EXPANDING': + return EXPANDING + + raise Exception('Unrecognized window type: %s' % window_type) + +def _get_window_type_name(window_type): + names = { + 0 : 'full sample', + 1 : 'rolling', + 2 : 'expanding' + } + return names[window_type] + +def banner(text, width=80): + """ + + """ + toFill = width - len(text) + + left = toFill // 2 + right = toFill - left + + return '%s%s%s' % ('-' * left, text, '-' * right) + +def f_stat_to_dict(result): + f_stat, shape, p_value = result + + result = {} + result['f-stat'] = f_stat + result['DF X'] = shape[0] + result['DF Resid'] = shape[1] + result['p-value'] = p_value + + return result diff --git a/pandas/stats/interface.py b/pandas/stats/interface.py index 30bf47b8e35c3..487feee05f0c1 100644 --- a/pandas/stats/interface.py +++ b/pandas/stats/interface.py @@ -1,117 +1,117 @@ -from pandas.core.api import Series - -from pandas.stats.ols import OLS, MovingOLS -from pandas.stats.plm import PanelOLS, MovingPanelOLS, NonPooledPanelOLS -import pandas.stats.common as common - -def ols(**kwargs): - """Returns the appropriate OLS object depending on whether you need - simple or panel OLS, and a full-sample or rolling/expanding OLS. - - Parameters - ---------- - y: Series for simple OLS. DataFrame for panel OLS. - x: Series, DataFrame, or dict of Series for simple OLS. - Dict of DataFrame for panel OLS. - intercept: bool - True if you want an intercept. Defaults to True. - nw_lags: None or int - Number of Newey-West lags. Defaults to None. - nw_overlap: bool - Whether there are overlaps in the NW lags. Defaults to False. - window_type: {'full sample', 'rolling', 'expanding'} - 'full sample' by default - window: int - size of window (for rolling/expanding OLS). If window passed and no - explicit window_type, 'rolling" will be used as the window_type - - Panel OLS options: - pool: bool - Whether to run pooled panel regression. Defaults to true. - weights: DataFrame - Weight for each observation. The weights are not normalized; - they're multiplied directly by each observation. - entity_effects: bool - Whether to account for entity fixed effects. Defaults to false. - time_effects: bool - Whether to account for time fixed effects. Defaults to false. 
- x_effects: list - List of x's to account for fixed effects. Defaults to none. - dropped_dummies: dict - Key is the name of the variable for the fixed effect. - Value is the value of that variable for which we drop the dummy. - - For entity fixed effects, key equals 'entity'. - - By default, the first dummy is dropped if no dummy is specified. - cluster: {'time', 'entity'} - cluster variances - - Returns - ------- - The appropriate OLS object, which allows you to obtain betas and various - statistics, such as std err, t-stat, etc. - - Examples - -------- - # Run simple OLS. - result = ols(y=y, x=x) - - # Run rolling simple OLS with window of size 10. - result = ols(y=y, x=x, window_type='rolling', window=10) - print result.beta - - result = ols(y=y, x=x, nw_lags=1) - - # Set up LHS and RHS for data across all items - y = A - x = {'B' : B, 'C' : C} - - # Run panel OLS. - result = ols(y=y, x=x) - - # Run expanding panel OLS with window 10 and entity clustering. - result = ols(y=y, x=x, cluster='entity', window_type='expanding', window=10) - """ - pool = kwargs.get('pool') - if 'pool' in kwargs: - del kwargs['pool'] - - window_type = kwargs.get('window_type') - window = kwargs.get('window') - - if window_type is None: - if window is None: - window_type = common.FULL_SAMPLE - else: - window_type = common.ROLLING - else: - window_type = common._get_window_type(window_type) - - if window_type != common.FULL_SAMPLE: - kwargs['window_type'] = common._get_window_type_name(window_type) - - y = kwargs.get('y') - if window_type == common.FULL_SAMPLE: - # HACK (!) - for rolling_field in ('window_type', 'window', 'min_periods'): - if rolling_field in kwargs: - del kwargs[rolling_field] - - if isinstance(y, Series): - klass = OLS - else: - if pool == False: - klass = NonPooledPanelOLS - else: - klass = PanelOLS - else: - if isinstance(y, Series): - klass = MovingOLS - else: - if pool == False: - klass = NonPooledPanelOLS - else: - klass = MovingPanelOLS - - return klass(**kwargs) +from pandas.core.api import Series + +from pandas.stats.ols import OLS, MovingOLS +from pandas.stats.plm import PanelOLS, MovingPanelOLS, NonPooledPanelOLS +import pandas.stats.common as common + +def ols(**kwargs): + """Returns the appropriate OLS object depending on whether you need + simple or panel OLS, and a full-sample or rolling/expanding OLS. + + Parameters + ---------- + y: Series for simple OLS. DataFrame for panel OLS. + x: Series, DataFrame, or dict of Series for simple OLS. + Dict of DataFrame for panel OLS. + intercept: bool + True if you want an intercept. Defaults to True. + nw_lags: None or int + Number of Newey-West lags. Defaults to None. + nw_overlap: bool + Whether there are overlaps in the NW lags. Defaults to False. + window_type: {'full sample', 'rolling', 'expanding'} + 'full sample' by default + window: int + size of window (for rolling/expanding OLS). If window passed and no + explicit window_type, 'rolling" will be used as the window_type + + Panel OLS options: + pool: bool + Whether to run pooled panel regression. Defaults to true. + weights: DataFrame + Weight for each observation. The weights are not normalized; + they're multiplied directly by each observation. + entity_effects: bool + Whether to account for entity fixed effects. Defaults to false. + time_effects: bool + Whether to account for time fixed effects. Defaults to false. + x_effects: list + List of x's to account for fixed effects. Defaults to none. + dropped_dummies: dict + Key is the name of the variable for the fixed effect. 
+ Value is the value of that variable for which we drop the dummy. + + For entity fixed effects, key equals 'entity'. + + By default, the first dummy is dropped if no dummy is specified. + cluster: {'time', 'entity'} + cluster variances + + Returns + ------- + The appropriate OLS object, which allows you to obtain betas and various + statistics, such as std err, t-stat, etc. + + Examples + -------- + # Run simple OLS. + result = ols(y=y, x=x) + + # Run rolling simple OLS with window of size 10. + result = ols(y=y, x=x, window_type='rolling', window=10) + print result.beta + + result = ols(y=y, x=x, nw_lags=1) + + # Set up LHS and RHS for data across all items + y = A + x = {'B' : B, 'C' : C} + + # Run panel OLS. + result = ols(y=y, x=x) + + # Run expanding panel OLS with window 10 and entity clustering. + result = ols(y=y, x=x, cluster='entity', window_type='expanding', window=10) + """ + pool = kwargs.get('pool') + if 'pool' in kwargs: + del kwargs['pool'] + + window_type = kwargs.get('window_type') + window = kwargs.get('window') + + if window_type is None: + if window is None: + window_type = common.FULL_SAMPLE + else: + window_type = common.ROLLING + else: + window_type = common._get_window_type(window_type) + + if window_type != common.FULL_SAMPLE: + kwargs['window_type'] = common._get_window_type_name(window_type) + + y = kwargs.get('y') + if window_type == common.FULL_SAMPLE: + # HACK (!) + for rolling_field in ('window_type', 'window', 'min_periods'): + if rolling_field in kwargs: + del kwargs[rolling_field] + + if isinstance(y, Series): + klass = OLS + else: + if pool == False: + klass = NonPooledPanelOLS + else: + klass = PanelOLS + else: + if isinstance(y, Series): + klass = MovingOLS + else: + if pool == False: + klass = NonPooledPanelOLS + else: + klass = MovingPanelOLS + + return klass(**kwargs) diff --git a/pandas/stats/math.py b/pandas/stats/math.py index c9c3ce68fc7a2..05c6be0cc5389 100644 --- a/pandas/stats/math.py +++ b/pandas/stats/math.py @@ -1,132 +1,132 @@ -# pylint: disable-msg=E1103 -# pylint: disable-msg=W0212 - -from __future__ import division - -import numpy as np -import numpy.linalg as linalg - -def rank(X, cond=1.0e-12): - """ - Return the rank of a matrix X based on its generalized inverse, - not the SVD. - """ - X = np.asarray(X) - if len(X.shape) == 2: - import scipy.linalg as SL - D = SL.svdvals(X) - result = np.add.reduce(np.greater(D / D.max(), cond)) - return int(result.astype(np.int32)) - else: - return int(not np.alltrue(np.equal(X, 0.))) - -def solve(a, b): - """Returns the solution of A X = B.""" - try: - return linalg.solve(a, b) - except linalg.LinAlgError: - return np.dot(linalg.pinv(a), b) - -def inv(a): - """Returns the inverse of A.""" - try: - return np.linalg.inv(a) - except linalg.LinAlgError: - return np.linalg.pinv(a) - -def is_psd(m): - eigvals = linalg.eigvals(m) - return np.isreal(eigvals).all() and (eigvals >= 0).all() - -def newey_west(m, max_lags, nobs, df, nw_overlap=False): - """ - Compute Newey-West adjusted covariance matrix, taking into account - specified number of leads / lags - - Parameters - ---------- - m: (N x K) - max_lags: int - nobs: int - Number of observations in model - df: int - Degrees of freedom in explanatory variables - nw_overlap: boolean - - Returns - ------- - ndarray (K x K) - - Reference - --------- - Newey, W. K. & West, K. D. (1987) A Simple, Positive - Semi-definite, Heteroskedasticity and Autocorrelation Consistent - Covariance Matrix, Econometrica, vol. 
55(3), 703-708 - """ - Xeps = np.dot(m.T, m) - for lag in xrange(1, max_lags + 1): - auto_cov = np.dot(m[:-lag].T, m[lag:]) - weight = lag / (max_lags + 1) - if nw_overlap: - weight = 0 - bb = auto_cov + auto_cov.T - dd = (1 - weight) * bb - Xeps += dd - - Xeps *= nobs / (nobs - df) - - if nw_overlap and not is_psd(Xeps): - new_max_lags = int(np.ceil(max_lags * 1.5)) -# print ('nw_overlap is True and newey_west generated a non positive ' -# 'semidefinite matrix, so using newey_west with max_lags of %d.' -# % new_max_lags) - return newey_west(m, new_max_lags, nobs, df) - - return Xeps - -def calc_F(R, r, beta, var_beta, nobs, df): - """ - Computes the standard F-test statistic for linear restriction - hypothesis testing - - Parameters - ---------- - R: ndarray (N x N) - Restriction matrix - r: ndarray (N x 1) - Restriction vector - beta: ndarray (N x 1) - Estimated model coefficients - var_beta: ndarray (N x N) - Variance covariance matrix of regressors - nobs: int - Number of observations in model - df: int - Model degrees of freedom - - Returns - ------- - F value, (q, df_resid), p value - """ - from scipy.stats import f - - hyp = np.dot(R, beta.reshape(len(beta), 1)) - r - RSR = np.dot(R, np.dot(var_beta, R.T)) - - q = len(r) - - F = np.dot(hyp.T, np.dot(inv(RSR), hyp)).squeeze() / q - - p_value = 1 - f.cdf(F, q, nobs - df) - - return F, (q, nobs - df), p_value - -def chain_dot(*matrices): - """ - Returns the dot product of the given matrices. - - Parameters - ---------- - matrices: argument list of ndarray - """ - return reduce(lambda x, y: np.dot(y, x), matrices[::-1]) +# pylint: disable-msg=E1103 +# pylint: disable-msg=W0212 + +from __future__ import division + +import numpy as np +import numpy.linalg as linalg + +def rank(X, cond=1.0e-12): + """ + Return the rank of a matrix X based on its generalized inverse, + not the SVD. + """ + X = np.asarray(X) + if len(X.shape) == 2: + import scipy.linalg as SL + D = SL.svdvals(X) + result = np.add.reduce(np.greater(D / D.max(), cond)) + return int(result.astype(np.int32)) + else: + return int(not np.alltrue(np.equal(X, 0.))) + +def solve(a, b): + """Returns the solution of A X = B.""" + try: + return linalg.solve(a, b) + except linalg.LinAlgError: + return np.dot(linalg.pinv(a), b) + +def inv(a): + """Returns the inverse of A.""" + try: + return np.linalg.inv(a) + except linalg.LinAlgError: + return np.linalg.pinv(a) + +def is_psd(m): + eigvals = linalg.eigvals(m) + return np.isreal(eigvals).all() and (eigvals >= 0).all() + +def newey_west(m, max_lags, nobs, df, nw_overlap=False): + """ + Compute Newey-West adjusted covariance matrix, taking into account + specified number of leads / lags + + Parameters + ---------- + m: (N x K) + max_lags: int + nobs: int + Number of observations in model + df: int + Degrees of freedom in explanatory variables + nw_overlap: boolean + + Returns + ------- + ndarray (K x K) + + Reference + --------- + Newey, W. K. & West, K. D. (1987) A Simple, Positive + Semi-definite, Heteroskedasticity and Autocorrelation Consistent + Covariance Matrix, Econometrica, vol. 
55(3), 703-708 + """ + Xeps = np.dot(m.T, m) + for lag in xrange(1, max_lags + 1): + auto_cov = np.dot(m[:-lag].T, m[lag:]) + weight = lag / (max_lags + 1) + if nw_overlap: + weight = 0 + bb = auto_cov + auto_cov.T + dd = (1 - weight) * bb + Xeps += dd + + Xeps *= nobs / (nobs - df) + + if nw_overlap and not is_psd(Xeps): + new_max_lags = int(np.ceil(max_lags * 1.5)) +# print ('nw_overlap is True and newey_west generated a non positive ' +# 'semidefinite matrix, so using newey_west with max_lags of %d.' +# % new_max_lags) + return newey_west(m, new_max_lags, nobs, df) + + return Xeps + +def calc_F(R, r, beta, var_beta, nobs, df): + """ + Computes the standard F-test statistic for linear restriction + hypothesis testing + + Parameters + ---------- + R: ndarray (N x N) + Restriction matrix + r: ndarray (N x 1) + Restriction vector + beta: ndarray (N x 1) + Estimated model coefficients + var_beta: ndarray (N x N) + Variance covariance matrix of regressors + nobs: int + Number of observations in model + df: int + Model degrees of freedom + + Returns + ------- + F value, (q, df_resid), p value + """ + from scipy.stats import f + + hyp = np.dot(R, beta.reshape(len(beta), 1)) - r + RSR = np.dot(R, np.dot(var_beta, R.T)) + + q = len(r) + + F = np.dot(hyp.T, np.dot(inv(RSR), hyp)).squeeze() / q + + p_value = 1 - f.cdf(F, q, nobs - df) + + return F, (q, nobs - df), p_value + +def chain_dot(*matrices): + """ + Returns the dot product of the given matrices. + + Parameters + ---------- + matrices: argument list of ndarray + """ + return reduce(lambda x, y: np.dot(y, x), matrices[::-1]) diff --git a/pandas/stats/setup.py b/pandas/stats/setup.py index 0a2302ff07223..d3b1e9ef07ded 100644 --- a/pandas/stats/setup.py +++ b/pandas/stats/setup.py @@ -1,11 +1,11 @@ -#!/usr/bin/env python - -def configuration(parent_package='',top_path=None): - from numpy.distutils.misc_util import Configuration - config = Configuration('stats', parent_package, top_path) - config.add_data_dir('tests') - return config - -if __name__ == '__main__': - print('This is the wrong setup.py file to run') - +#!/usr/bin/env python + +def configuration(parent_package='',top_path=None): + from numpy.distutils.misc_util import Configuration + config = Configuration('stats', parent_package, top_path) + config.add_data_dir('tests') + return config + +if __name__ == '__main__': + print('This is the wrong setup.py file to run') + diff --git a/pandas/stats/tests/test_fama_macbeth.py b/pandas/stats/tests/test_fama_macbeth.py index 1b78f50bec139..85f8d1a74dfd3 100644 --- a/pandas/stats/tests/test_fama_macbeth.py +++ b/pandas/stats/tests/test_fama_macbeth.py @@ -1,32 +1,32 @@ -from pandas.stats.api import fama_macbeth -from common import assert_almost_equal, BaseTest - -class TestFamaMacBeth(BaseTest): - def testFamaMacBethRolling(self): - self.checkFamaMacBethExtended('rolling', self.panel_x, self.panel_y) - - def checkFamaMacBethExtended(self, window_type, x, y, **kwds): - window = 25 - - result = fama_macbeth(y=y, x=x, window_type=window_type, window=window, - **kwds) - - index = result._index - time = len(index) - - for i in xrange(time - window + 1): - if window_type == 'rolling': - start = index[i] - else: - start = index[0] - - end = index[i + window - 1] - - x2 = {} - for k, v in x.iteritems(): - x2[k] = v.truncate(start, end) - y2 = y.truncate(start, end) - - reference = fama_macbeth(y=y2, x=x2, **kwds) - - assert_almost_equal(reference._stats, result._stats[:, i]) +from pandas.stats.api import fama_macbeth +from common import 
assert_almost_equal, BaseTest + +class TestFamaMacBeth(BaseTest): + def testFamaMacBethRolling(self): + self.checkFamaMacBethExtended('rolling', self.panel_x, self.panel_y) + + def checkFamaMacBethExtended(self, window_type, x, y, **kwds): + window = 25 + + result = fama_macbeth(y=y, x=x, window_type=window_type, window=window, + **kwds) + + index = result._index + time = len(index) + + for i in xrange(time - window + 1): + if window_type == 'rolling': + start = index[i] + else: + start = index[0] + + end = index[i + window - 1] + + x2 = {} + for k, v in x.iteritems(): + x2[k] = v.truncate(start, end) + y2 = y.truncate(start, end) + + reference = fama_macbeth(y=y2, x=x2, **kwds) + + assert_almost_equal(reference._stats, result._stats[:, i]) diff --git a/pandas/stats/tests/test_ols_filter.py b/pandas/stats/tests/test_ols_filter.py index 737113dcea848..2224a4cf479c0 100644 --- a/pandas/stats/tests/test_ols_filter.py +++ b/pandas/stats/tests/test_ols_filter.py @@ -1,84 +1,84 @@ -from datetime import datetime -import unittest - -from numpy import NaN -import numpy as np - -from pandas.core.datetools import bday -from pandas.core.api import DateRange, Series, DataFrame -from pandas.stats.ols import _filter_data - -class TestOLSFilter(unittest.TestCase): - - def setUp(self): - date_index = DateRange(datetime(2009, 12, 11), periods=3, offset=bday) - ts = Series([3, 1, 4], index=date_index) - self.TS1 = ts - - date_index = DateRange(datetime(2009, 12, 11), periods=5, offset=bday) - ts = Series([1, 5, 9, 2, 6], index=date_index) - self.TS2 = ts - - date_index = DateRange(datetime(2009, 12, 11), periods=3, offset=bday) - ts = Series([5, NaN, 3], index=date_index) - self.TS3 = ts - - date_index = DateRange(datetime(2009, 12, 11), periods=5, offset=bday) - ts = Series([NaN, 5, 8, 9, 7], index=date_index) - self.TS4 = ts - - data = {'x1' : self.TS2, 'x2' : self.TS4} - self.DF1 = DataFrame(data=data) - - data = {'x1' : self.TS2, 'x2' : self.TS4} - self.DICT1 = data - - def testFilterWithSeriesRHS(self): - (lhs, rhs, rhs_pre, - index, valid) = _filter_data(self.TS1, {'x1' : self.TS2}) - self.tsAssertEqual(self.TS1, lhs) - self.tsAssertEqual(self.TS2[:3], rhs['x1']) - self.tsAssertEqual(self.TS2, rhs_pre['x1']) - - def testFilterWithSeriesRHS2(self): - (lhs, rhs, rhs_pre, - index, valid) = _filter_data(self.TS2, {'x1' : self.TS1}) - self.tsAssertEqual(self.TS2[:3], lhs) - self.tsAssertEqual(self.TS1, rhs['x1']) - self.tsAssertEqual(self.TS1, rhs_pre['x1']) - - def testFilterWithSeriesRHS3(self): - (lhs, rhs, rhs_pre, - index, valid) = _filter_data(self.TS3, {'x1' : self.TS4}) - exp_lhs = self.TS3[2:3] - exp_rhs = self.TS4[2:3] - exp_rhs_pre = self.TS4[1:] - self.tsAssertEqual(exp_lhs, lhs) - self.tsAssertEqual(exp_rhs, rhs['x1']) - self.tsAssertEqual(exp_rhs_pre, rhs_pre['x1']) - - def testFilterWithDataFrameRHS(self): - (lhs, rhs, rhs_pre, - index, valid) = _filter_data(self.TS1, self.DF1) - exp_lhs = self.TS1[1:] - exp_rhs1 = self.TS2[1:3] - exp_rhs2 = self.TS4[1:3] - self.tsAssertEqual(exp_lhs, lhs) - self.tsAssertEqual(exp_rhs1, rhs['x1']) - self.tsAssertEqual(exp_rhs2, rhs['x2']) - - def testFilterWithDictRHS(self): - (lhs, rhs, rhs_pre, - index, valid) = _filter_data(self.TS1, self.DICT1) - exp_lhs = self.TS1[1:] - exp_rhs1 = self.TS2[1:3] - exp_rhs2 = self.TS4[1:3] - self.tsAssertEqual(exp_lhs, lhs) - self.tsAssertEqual(exp_rhs1, rhs['x1']) - self.tsAssertEqual(exp_rhs2, rhs['x2']) - - def tsAssertEqual(self, ts1, ts2): - self.assert_(np.array_equal(ts1, ts2)) - -if __name__ == '__main__': - 
unittest.main() +from datetime import datetime +import unittest + +from numpy import NaN +import numpy as np + +from pandas.core.datetools import bday +from pandas.core.api import DateRange, Series, DataFrame +from pandas.stats.ols import _filter_data + +class TestOLSFilter(unittest.TestCase): + + def setUp(self): + date_index = DateRange(datetime(2009, 12, 11), periods=3, offset=bday) + ts = Series([3, 1, 4], index=date_index) + self.TS1 = ts + + date_index = DateRange(datetime(2009, 12, 11), periods=5, offset=bday) + ts = Series([1, 5, 9, 2, 6], index=date_index) + self.TS2 = ts + + date_index = DateRange(datetime(2009, 12, 11), periods=3, offset=bday) + ts = Series([5, NaN, 3], index=date_index) + self.TS3 = ts + + date_index = DateRange(datetime(2009, 12, 11), periods=5, offset=bday) + ts = Series([NaN, 5, 8, 9, 7], index=date_index) + self.TS4 = ts + + data = {'x1' : self.TS2, 'x2' : self.TS4} + self.DF1 = DataFrame(data=data) + + data = {'x1' : self.TS2, 'x2' : self.TS4} + self.DICT1 = data + + def testFilterWithSeriesRHS(self): + (lhs, rhs, rhs_pre, + index, valid) = _filter_data(self.TS1, {'x1' : self.TS2}) + self.tsAssertEqual(self.TS1, lhs) + self.tsAssertEqual(self.TS2[:3], rhs['x1']) + self.tsAssertEqual(self.TS2, rhs_pre['x1']) + + def testFilterWithSeriesRHS2(self): + (lhs, rhs, rhs_pre, + index, valid) = _filter_data(self.TS2, {'x1' : self.TS1}) + self.tsAssertEqual(self.TS2[:3], lhs) + self.tsAssertEqual(self.TS1, rhs['x1']) + self.tsAssertEqual(self.TS1, rhs_pre['x1']) + + def testFilterWithSeriesRHS3(self): + (lhs, rhs, rhs_pre, + index, valid) = _filter_data(self.TS3, {'x1' : self.TS4}) + exp_lhs = self.TS3[2:3] + exp_rhs = self.TS4[2:3] + exp_rhs_pre = self.TS4[1:] + self.tsAssertEqual(exp_lhs, lhs) + self.tsAssertEqual(exp_rhs, rhs['x1']) + self.tsAssertEqual(exp_rhs_pre, rhs_pre['x1']) + + def testFilterWithDataFrameRHS(self): + (lhs, rhs, rhs_pre, + index, valid) = _filter_data(self.TS1, self.DF1) + exp_lhs = self.TS1[1:] + exp_rhs1 = self.TS2[1:3] + exp_rhs2 = self.TS4[1:3] + self.tsAssertEqual(exp_lhs, lhs) + self.tsAssertEqual(exp_rhs1, rhs['x1']) + self.tsAssertEqual(exp_rhs2, rhs['x2']) + + def testFilterWithDictRHS(self): + (lhs, rhs, rhs_pre, + index, valid) = _filter_data(self.TS1, self.DICT1) + exp_lhs = self.TS1[1:] + exp_rhs1 = self.TS2[1:3] + exp_rhs2 = self.TS4[1:3] + self.tsAssertEqual(exp_lhs, lhs) + self.tsAssertEqual(exp_rhs1, rhs['x1']) + self.tsAssertEqual(exp_rhs2, rhs['x2']) + + def tsAssertEqual(self, ts1, ts2): + self.assert_(np.array_equal(ts1, ts2)) + +if __name__ == '__main__': + unittest.main() diff --git a/pandas/util/decorators.py b/pandas/util/decorators.py index 438e259fa1385..fe88331abe515 100644 --- a/pandas/util/decorators.py +++ b/pandas/util/decorators.py @@ -1,222 +1,222 @@ -""" -Pierre G-M's caching decorators -""" - -import warnings - -__all__ = ['resettable_cache','cache_readonly', 'cache_writable'] - -#------------------------------------------------------------------------------- -# Pierre G-M's caching decorators - -class CacheWriteWarning(UserWarning): - pass - - -class ResettableCache(dict): - """ - Dictionary whose elements mey depend one from another. - - If entry `B` depends on entry `A`, changing the values of entry `A` will - reset the value of entry `B` to a default (None); deleteing entry `A` will - delete entry `B`. The connections between entries are stored in a - `_resetdict` private attribute. 
- - Parameters - ---------- - reset : dictionary, optional - An optional dictionary, associated a sequence of entries to any key - of the object. - items : var, optional - An optional dictionary used to initialize the dictionary - - Examples - -------- - >>> reset = dict(a=('b',), b=('c',)) - >>> cache = resettable_cache(a=0, b=1, c=2, reset=reset) - >>> assert_equal(cache, dict(a=0, b=1, c=2)) - - >>> print "Try resetting a" - >>> cache['a'] = 1 - >>> assert_equal(cache, dict(a=1, b=None, c=None)) - >>> cache['c'] = 2 - >>> assert_equal(cache, dict(a=1, b=None, c=2)) - >>> cache['b'] = 0 - >>> assert_equal(cache, dict(a=1, b=0, c=None)) - - >>> print "Try deleting b" - >>> del(cache['a']) - >>> assert_equal(cache, {}) - """ - - def __init__(self, reset=None, **items): - self._resetdict = reset or {} - dict.__init__(self, **items) - - def __setitem__(self, key, value): - dict.__setitem__(self, key, value) - for mustreset in self._resetdict.get(key, []): - self[mustreset] = None - - def __delitem__(self, key): - dict.__delitem__(self, key) - for mustreset in self._resetdict.get(key, []): - del(self[mustreset]) - -resettable_cache = ResettableCache - -class CachedAttribute(object): - - def __init__(self, func, cachename=None, resetlist=None): - self.fget = func - self.name = func.__name__ - self.cachename = cachename or '_cache' - self.resetlist = resetlist or () - - def __get__(self, obj, type=None): - if obj is None: - return self.fget - # Get the cache or set a default one if needed - _cachename = self.cachename - _cache = getattr(obj, _cachename, None) - if _cache is None: - setattr(obj, _cachename, resettable_cache()) - _cache = getattr(obj, _cachename) - # Get the name of the attribute to set and cache - name = self.name - _cachedval = _cache.get(name, None) -# print "[_cachedval=%s]" % _cachedval - if _cachedval is None: - # Call the "fget" function - _cachedval = self.fget(obj) - # Set the attribute in obj -# print "Setting %s in cache to %s" % (name, _cachedval) - try: - _cache[name] = _cachedval - except KeyError: - setattr(_cache, name, _cachedval) - # Update the reset list if needed (and possible) - resetlist = self.resetlist - if resetlist is not (): - try: - _cache._resetdict[name] = self.resetlist - except AttributeError: - pass -# else: -# print "Reading %s from cache (%s)" % (name, _cachedval) - return _cachedval - - def __set__(self, obj, value): - errmsg = "The attribute '%s' cannot be overwritten" % self.name - warnings.warn(errmsg, CacheWriteWarning) - -class CachedWritableAttribute(CachedAttribute): - # - def __set__(self, obj, value): - _cache = getattr(obj, self.cachename) - name = self.name - try: - _cache[name] = value - except KeyError: - setattr(_cache, name, value) - -class _cache_readonly(object): - """ - Decorator for CachedAttribute - """ - - def __init__(self, cachename=None, resetlist=None): - self.func = None - self.cachename = cachename - self.resetlist = resetlist or None - - def __call__(self, func): - return CachedAttribute(func, - cachename=self.cachename, - resetlist=self.resetlist) -cache_readonly = _cache_readonly() - -class cache_writable(_cache_readonly): - """ - Decorator for CachedWritableAttribute - """ - def __call__(self, func): - return CachedWritableAttribute(func, - cachename=self.cachename, - resetlist=self.resetlist) - - -if __name__ == "__main__": -### Tests resettable_cache ---------------------------------------------------- - - from numpy.testing import * - - reset = dict(a=('b',), b=('c',)) - cache = resettable_cache(a=0, b=1, c=2, 
reset=reset) - assert_equal(cache, dict(a=0, b=1, c=2)) - # - print "Try resetting a" - cache['a'] = 1 - assert_equal(cache, dict(a=1, b=None, c=None)) - cache['c'] = 2 - assert_equal(cache, dict(a=1, b=None, c=2)) - cache['b'] = 0 - assert_equal(cache, dict(a=1, b=0, c=None)) - # - print "Try deleting b" - del(cache['a']) - assert_equal(cache, {}) -### --------------------------------------------------------------------------- - - - class Example(object): - # - def __init__(self): - self._cache = resettable_cache() - self.a = 0 - # - @cache_readonly - def b(self): - return 1 - @cache_writable(resetlist='d') - def c(self): - return 2 - @cache_writable(resetlist=('e', 'f')) - def d(self): - return self.c + 1 - # - @cache_readonly - def e(self): - return 4 - @cache_readonly - def f(self): - return self.e + 1 - - ex = Example() - print "(attrs : %s)" % str(ex.__dict__) - print "(cached : %s)" % str(ex._cache) - print "Try a :", ex.a - print "Try accessing/setting a readonly attribute" - assert_equal(ex.__dict__, dict(a=0, _cache={})) - print "Try b #1:", ex.b - b = ex.b - assert_equal(b, 1) - assert_equal(ex.__dict__, dict(a=0, _cache=dict(b=1,))) -# assert_equal(ex.__dict__, dict(a=0, b=1, _cache=dict(b=1))) - ex.b = -1 - print "Try dict", ex.__dict__ - assert_equal(ex._cache, dict(b=1,)) - # - print "Try accessing/resetting a cachewritable attribute" - c = ex.c - assert_equal(c, 2) - assert_equal(ex._cache, dict(b=1, c=2)) - d = ex.d - assert_equal(d, 3) - assert_equal(ex._cache, dict(b=1, c=2, d=3)) - ex.c = 0 - assert_equal(ex._cache, dict(b=1, c=0, d=None, e=None, f=None)) - d = ex.d - assert_equal(ex._cache, dict(b=1, c=0, d=1, e=None, f=None)) - ex.d = 5 - assert_equal(ex._cache, dict(b=1, c=0, d=5, e=None, f=None)) +""" +Pierre G-M's caching decorators +""" + +import warnings + +__all__ = ['resettable_cache','cache_readonly', 'cache_writable'] + +#------------------------------------------------------------------------------- +# Pierre G-M's caching decorators + +class CacheWriteWarning(UserWarning): + pass + + +class ResettableCache(dict): + """ + Dictionary whose elements mey depend one from another. + + If entry `B` depends on entry `A`, changing the values of entry `A` will + reset the value of entry `B` to a default (None); deleteing entry `A` will + delete entry `B`. The connections between entries are stored in a + `_resetdict` private attribute. + + Parameters + ---------- + reset : dictionary, optional + An optional dictionary, associated a sequence of entries to any key + of the object. 
+ items : var, optional + An optional dictionary used to initialize the dictionary + + Examples + -------- + >>> reset = dict(a=('b',), b=('c',)) + >>> cache = resettable_cache(a=0, b=1, c=2, reset=reset) + >>> assert_equal(cache, dict(a=0, b=1, c=2)) + + >>> print "Try resetting a" + >>> cache['a'] = 1 + >>> assert_equal(cache, dict(a=1, b=None, c=None)) + >>> cache['c'] = 2 + >>> assert_equal(cache, dict(a=1, b=None, c=2)) + >>> cache['b'] = 0 + >>> assert_equal(cache, dict(a=1, b=0, c=None)) + + >>> print "Try deleting b" + >>> del(cache['a']) + >>> assert_equal(cache, {}) + """ + + def __init__(self, reset=None, **items): + self._resetdict = reset or {} + dict.__init__(self, **items) + + def __setitem__(self, key, value): + dict.__setitem__(self, key, value) + for mustreset in self._resetdict.get(key, []): + self[mustreset] = None + + def __delitem__(self, key): + dict.__delitem__(self, key) + for mustreset in self._resetdict.get(key, []): + del(self[mustreset]) + +resettable_cache = ResettableCache + +class CachedAttribute(object): + + def __init__(self, func, cachename=None, resetlist=None): + self.fget = func + self.name = func.__name__ + self.cachename = cachename or '_cache' + self.resetlist = resetlist or () + + def __get__(self, obj, type=None): + if obj is None: + return self.fget + # Get the cache or set a default one if needed + _cachename = self.cachename + _cache = getattr(obj, _cachename, None) + if _cache is None: + setattr(obj, _cachename, resettable_cache()) + _cache = getattr(obj, _cachename) + # Get the name of the attribute to set and cache + name = self.name + _cachedval = _cache.get(name, None) +# print "[_cachedval=%s]" % _cachedval + if _cachedval is None: + # Call the "fget" function + _cachedval = self.fget(obj) + # Set the attribute in obj +# print "Setting %s in cache to %s" % (name, _cachedval) + try: + _cache[name] = _cachedval + except KeyError: + setattr(_cache, name, _cachedval) + # Update the reset list if needed (and possible) + resetlist = self.resetlist + if resetlist is not (): + try: + _cache._resetdict[name] = self.resetlist + except AttributeError: + pass +# else: +# print "Reading %s from cache (%s)" % (name, _cachedval) + return _cachedval + + def __set__(self, obj, value): + errmsg = "The attribute '%s' cannot be overwritten" % self.name + warnings.warn(errmsg, CacheWriteWarning) + +class CachedWritableAttribute(CachedAttribute): + # + def __set__(self, obj, value): + _cache = getattr(obj, self.cachename) + name = self.name + try: + _cache[name] = value + except KeyError: + setattr(_cache, name, value) + +class _cache_readonly(object): + """ + Decorator for CachedAttribute + """ + + def __init__(self, cachename=None, resetlist=None): + self.func = None + self.cachename = cachename + self.resetlist = resetlist or None + + def __call__(self, func): + return CachedAttribute(func, + cachename=self.cachename, + resetlist=self.resetlist) +cache_readonly = _cache_readonly() + +class cache_writable(_cache_readonly): + """ + Decorator for CachedWritableAttribute + """ + def __call__(self, func): + return CachedWritableAttribute(func, + cachename=self.cachename, + resetlist=self.resetlist) + + +if __name__ == "__main__": +### Tests resettable_cache ---------------------------------------------------- + + from numpy.testing import * + + reset = dict(a=('b',), b=('c',)) + cache = resettable_cache(a=0, b=1, c=2, reset=reset) + assert_equal(cache, dict(a=0, b=1, c=2)) + # + print "Try resetting a" + cache['a'] = 1 + assert_equal(cache, dict(a=1, b=None, 
c=None)) + cache['c'] = 2 + assert_equal(cache, dict(a=1, b=None, c=2)) + cache['b'] = 0 + assert_equal(cache, dict(a=1, b=0, c=None)) + # + print "Try deleting b" + del(cache['a']) + assert_equal(cache, {}) +### --------------------------------------------------------------------------- + + + class Example(object): + # + def __init__(self): + self._cache = resettable_cache() + self.a = 0 + # + @cache_readonly + def b(self): + return 1 + @cache_writable(resetlist='d') + def c(self): + return 2 + @cache_writable(resetlist=('e', 'f')) + def d(self): + return self.c + 1 + # + @cache_readonly + def e(self): + return 4 + @cache_readonly + def f(self): + return self.e + 1 + + ex = Example() + print "(attrs : %s)" % str(ex.__dict__) + print "(cached : %s)" % str(ex._cache) + print "Try a :", ex.a + print "Try accessing/setting a readonly attribute" + assert_equal(ex.__dict__, dict(a=0, _cache={})) + print "Try b #1:", ex.b + b = ex.b + assert_equal(b, 1) + assert_equal(ex.__dict__, dict(a=0, _cache=dict(b=1,))) +# assert_equal(ex.__dict__, dict(a=0, b=1, _cache=dict(b=1))) + ex.b = -1 + print "Try dict", ex.__dict__ + assert_equal(ex._cache, dict(b=1,)) + # + print "Try accessing/resetting a cachewritable attribute" + c = ex.c + assert_equal(c, 2) + assert_equal(ex._cache, dict(b=1, c=2)) + d = ex.d + assert_equal(d, 3) + assert_equal(ex._cache, dict(b=1, c=2, d=3)) + ex.c = 0 + assert_equal(ex._cache, dict(b=1, c=0, d=None, e=None, f=None)) + d = ex.d + assert_equal(ex._cache, dict(b=1, c=0, d=1, e=None, f=None)) + ex.d = 5 + assert_equal(ex._cache, dict(b=1, c=0, d=5, e=None, f=None))
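
For quick reference, the class selection inside ols(**kwargs) in pandas/stats/interface.py boils down to three questions: is y a Series or a panel-style input, is pool explicitly False, and is the fit full-sample or rolling/expanding. Below is a hedged summary of that decision table as a plain helper; the string names match the classes imported at the top of the module, and the helper itself is an illustration, not part of the patch.

def pick_ols_class(y_is_series, pool, full_sample):
    # Mirrors the branching at the end of pandas.stats.interface.ols.
    if full_sample:
        if y_is_series:
            return 'OLS'
        return 'NonPooledPanelOLS' if pool == False else 'PanelOLS'
    if y_is_series:
        return 'MovingOLS'
    return 'NonPooledPanelOLS' if pool == False else 'MovingPanelOLS'

Note also that when a window is supplied without a window_type, the interface defaults to 'rolling', and that for full-sample fits it strips the rolling-only keywords (window_type, window, min_periods) before constructing the model.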
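
A note on the Newey-West helper in pandas/stats/math.py: newey_west weights each lagged autocovariance of the moment matrix by the Bartlett kernel, 1 - lag / (max_lags + 1), symmetrizes it, and applies a small-sample scaling of nobs / (nobs - df). A minimal standalone sketch of that weighting scheme follows (written for Python 3, so range replaces xrange; the input m is assumed to be an N x K array of per-observation moment contributions such as X * eps).

import numpy as np

def newey_west_sketch(m, max_lags, nobs, df):
    # Start from the unadjusted outer product of the moments.
    xeps = np.dot(m.T, m)
    for lag in range(1, max_lags + 1):
        auto_cov = np.dot(m[:-lag].T, m[lag:])
        weight = 1 - lag / (max_lags + 1)         # Bartlett kernel weight
        xeps += weight * (auto_cov + auto_cov.T)  # add the symmetrized lag term
    return xeps * nobs / (nobs - df)              # degrees-of-freedom scaling

The nw_overlap branch is omitted here; when the overlap-adjusted result is not positive semidefinite, the original retries with max_lags increased by half (np.ceil(max_lags * 1.5)).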
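
The F-test in calc_F follows the standard restricted-hypothesis form: with restriction matrix R and vector r, F = (R b - r)' [R V R']^-1 (R b - r) / q, compared against an F(q, nobs - df) distribution. A worked sketch with made-up inputs follows; the coefficient vector, its covariance, and the sample size are assumptions for illustration, not values from this patch.

import numpy as np
from scipy.stats import f

beta = np.array([1.2, -0.5])           # assumed estimated coefficients
var_beta = np.diag([0.04, 0.09])       # assumed coefficient covariance
R, r = np.eye(2), np.zeros(2)          # joint restriction: both coefficients are zero
nobs, df = 100, 2                      # assumed sample size and model degrees of freedom

hyp = np.dot(R, beta) - r                                 # discrepancy R*beta - r
RSR = np.dot(R, np.dot(var_beta, R.T))
q = len(r)                                                # number of restrictions
F = np.dot(hyp, np.dot(np.linalg.inv(RSR), hyp)) / q      # F statistic
p_value = 1 - f.cdf(F, q, nobs - df)                      # same tail probability calc_F reports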