misc documentation, some work on rpy2 interface. near git migration
git-svn-id: http://pandas.googlecode.com/svn/trunk@202 d5231056-7de3-11de-ac95-d976489f1ece
wesm committed Sep 19, 2010
1 parent 3f3508f commit 2aeb176
Showing 19 changed files with 639 additions and 31 deletions.
26 changes: 26 additions & 0 deletions .coveragerc
@@ -0,0 +1,26 @@
# .coveragerc to control coverage.py
[run]
branch = False

[report]
# Regexes for lines to exclude from consideration
exclude_lines =
    # Have to re-enable the standard pragma
    pragma: no cover

    # Don't complain about missing debug-only code:
    def __repr__
    if self\.debug

    # Don't complain if tests don't hit defensive assertion code:
    raise AssertionError
    raise NotImplementedError

    # Don't complain if non-runnable code isn't run:
    if 0:
    if __name__ == .__main__.:

ignore_errors = False

[html]
directory = coverage_html_report
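For illustration, here is a sketch of the code patterns those regexes exclude from the coverage report (the module and its contents are hypothetical, not part of this commit):

# hypothetical_module.py -- the marked lines are skipped by the
# exclude_lines regexes in the [report] section above.

class Thing(object):
    debug = False

    def __repr__(self):  # matches 'def __repr__'
        return 'Thing()'

    def work(self, x):
        if self.debug:  # matches 'if self\.debug'
            print('working on %r' % x)
        if x is None:
            raise AssertionError('x must not be None')  # matches
        return x * 2

def scratch():  # pragma: no cover -- the re-enabled standard pragma
    pass

if __name__ == '__main__':  # matches the __main__ regex
    print(Thing().work(21))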
File renamed without changes.
File renamed without changes.
17 changes: 17 additions & 0 deletions RELEASE.rst
@@ -0,0 +1,17 @@
=============
Release Notes
=============

pandas 0.3.0
============

**Release date:**

**New features / modules**

**Improvements**

**API Changes**

**Bug fixes**

File renamed without changes.
21 changes: 21 additions & 0 deletions bench/alignment.py
@@ -0,0 +1,21 @@
# Setup
import numpy as np
import pandas
import la
N = 1000
K = 50
arr1 = np.random.randn(N, K)
arr2 = np.random.randn(N, K)
idx1 = range(N)
idx2 = range(K)

# pandas
dma1 = pandas.DataMatrix(arr1, idx1, idx2)
dma2 = pandas.DataMatrix(arr2, idx1[::-1], idx2[::-1])

# larry
lar1 = la.larry(arr1, [idx1, idx2])
lar2 = la.larry(arr2, [idx1[::-1], idx2[::-1]])

for i in range(100):
    result = lar1 + lar2
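As written, the timing loop only exercises the larry objects; a like-for-like comparison would presumably time the pandas addition too. A minimal sketch (this loop is an assumption, not part of the original script):

# Assumed pandas counterpart of the loop above: DataMatrix addition
# aligns both operands on their (here reversed) indices.
for i in range(100):
    result = dma1 + dma2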
80 changes: 80 additions & 0 deletions bench/serialize.py
@@ -0,0 +1,80 @@
import time, os
import numpy as np

import la
import pandas

def timeit(f, iterations):
    start = time.clock()

    for i in xrange(iterations):
        f()

    return time.clock() - start

def roundtrip_archive(N, iterations=10):

    # Create data
    arr = np.random.randn(N, N)
    lar = la.larry(arr)
    dma = pandas.DataMatrix(arr, range(N), range(N))

    # filenames
    filename_numpy = '/Users/wesm/tmp/numpy.npz'
    filename_larry = '/Users/wesm/tmp/archive.hdf5'
    filename_pandas = '/Users/wesm/tmp/pandas_tmp'

    # Delete old files
    try:
        os.unlink(filename_numpy)
    except:
        pass
    try:
        os.unlink(filename_larry)
    except:
        pass
    try:
        os.unlink(filename_pandas)
    except:
        pass

    # Time a round trip save and load
    numpy_f = lambda: numpy_roundtrip(filename_numpy, arr, arr)
    numpy_time = timeit(numpy_f, iterations) / iterations

    larry_f = lambda: larry_roundtrip(filename_larry, lar, lar)
    larry_time = timeit(larry_f, iterations) / iterations

    pandas_f = lambda: pandas_roundtrip(filename_pandas, dma, dma)
    pandas_time = timeit(pandas_f, iterations) / iterations

    print 'Numpy (npz)   %7.4f seconds' % numpy_time
    print 'larry (HDF5)  %7.4f seconds' % larry_time
    print 'pandas (HDF5) %7.4f seconds' % pandas_time

def numpy_roundtrip(filename, arr1, arr2):
    np.savez(filename, arr1=arr1, arr2=arr2)
    npz = np.load(filename)
    arr1 = npz['arr1']
    arr2 = npz['arr2']

def larry_roundtrip(filename, lar1, lar2):
    io = la.IO(filename)
    io['lar1'] = lar1
    io['lar2'] = lar2
    lar1 = io['lar1']
    lar2 = io['lar2']

def pandas_roundtrip(filename, dma1, dma2):
    from pandas.io.pytables import HDFStore
    store = HDFStore(filename)
    store['dma1'] = dma1
    store['dma2'] = dma2
    dma1 = store['dma1']
    dma2 = store['dma2']

def pandas_roundtrip_pickle(filename, dma1, dma2):
    dma1.save(filename)
    dma1 = pandas.DataMatrix.load(filename)
    dma2.save(filename)
    dma2 = pandas.DataMatrix.load(filename)
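A minimal way to drive the benchmark, assuming the script is run directly (the N=500 and iterations=5 values are illustrative choices, not from the original file):

if __name__ == '__main__':
    # Round-trip a 500x500 array through each storage format;
    # both parameters here are arbitrary for demonstration.
    roundtrip_archive(500, iterations=5)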
65 changes: 65 additions & 0 deletions bench/test.py
@@ -0,0 +1,65 @@
import numpy as np
import itertools
import collections
import scipy.ndimage as ndi

N = 10000

lat = np.random.randint(0, 360, N)
lon = np.random.randint(0, 360, N)
data = np.random.randn(N)

def groupby1(lat, lon, data):
    indexer = np.lexsort((lon, lat))
    lat = lat.take(indexer)
    lon = lon.take(indexer)
    sorted_data = data.take(indexer)

    keys = 1000. * lat + lon
    unique_keys = np.unique(keys)
    bounds = keys.searchsorted(unique_keys)

    result = group_agg(sorted_data, bounds, lambda x: x.mean())

    decoder = keys.searchsorted(unique_keys)

    return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result))

def group_mean(lat, lon, data):
    indexer = np.lexsort((lon, lat))
    lat = lat.take(indexer)
    lon = lon.take(indexer)
    sorted_data = data.take(indexer)

    keys = 1000 * lat + lon
    unique_keys = np.unique(keys)

    result = ndi.mean(sorted_data, labels=keys, index=unique_keys)
    decoder = keys.searchsorted(unique_keys)

    return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result))

def group_mean_naive(lat, lon, data):
    grouped = collections.defaultdict(list)
    for lt, ln, da in zip(lat, lon, data):
        grouped[(lt, ln)].append(da)

    averaged = dict((ltln, np.mean(da)) for ltln, da in grouped.items())

    return averaged

def group_agg(values, bounds, f):
    N = len(values)
    result = np.empty(len(bounds), dtype=float)
    for i, left_bound in enumerate(bounds):
        if i == len(bounds) - 1:
            right_bound = N
        else:
            right_bound = bounds[i + 1]

        result[i] = f(values[left_bound : right_bound])

    return result

# for i in range(10):
#     groupby1(lat, lon, data)
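One sanity check worth appending (illustrative, not part of the committed file): the ndimage-based fast path and the naive defaultdict path should produce the same group means.

# Illustrative consistency check between the two implementations.
fast = group_mean(lat, lon, data)
naive = group_mean_naive(lat, lon, data)
assert set(fast) == set(naive)
for key in naive:
    assert abs(fast[key] - naive[key]) < 1e-10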
6 changes: 6 additions & 0 deletions doc/source/r_guide.rst
@@ -0,0 +1,6 @@
.. currentmodule:: pandas

.. _r_guide:

pandas for R users
------------------
19 changes: 19 additions & 0 deletions pandas/core/tests/test_common.py
@@ -0,0 +1,19 @@
from pandas.core.common import notnull, isnull
import pandas.core.common as common

import numpy as np

def test_notnull():
    assert notnull(1.)
    assert not notnull(None)
    assert not notnull(np.NaN)
    assert not notnull(np.inf)
    assert not notnull(-np.inf)

def test_isnull():
    assert not isnull(1.)
    assert isnull(None)
    assert isnull(np.NaN)
    assert isnull(np.inf)
    assert isnull(-np.inf)

46 changes: 36 additions & 10 deletions pandas/lib/src/moments.pyx
@@ -1,5 +1,13 @@
 # Cython implementations of rolling sum, mean, variance, skewness,
 # other statistical moment functions
+#
+# Misc implementation notes
+# -------------------------
+#
+# - In Cython x * x is faster than x ** 2 for C types, this should be
+#   periodically revisited to see if it's still true.
+#
+# -

 # original C implementation by N. Devillard.
 # This code in public domain.
@@ -32,18 +40,15 @@ def kth_smallest(ndarray[double_t, ndim=1] a, int k):
         j = m

         while 1:
-            while a[i] < x:
-                i += 1
-            while x < a[j]:
-                j -= 1
+            while a[i] < x: i += 1
+            while x < a[j]: j -= 1
             if i <= j:
                 t = a[i]
                 a[i] = a[j]
                 a[j] = t
-                i += 1
-                j -= 1
-            if i > j:
-                break
+                i += 1; j -= 1
+
+            if i > j: break

         if j < k: l = i
         if k < i: m = j
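For readers who do not want to decode the condensed Cython above, a pure-Python re-expression of the same Devillard/Wirth partition-based selection (an illustrative sketch, not the committed code):

def kth_smallest_sketch(values, k):
    # Partition the list around a pivot until index k is pinned
    # between l and m; mirrors the Cython loop structure above.
    a = list(values)
    l, m = 0, len(a) - 1
    while l < m:
        x = a[k]
        i, j = l, m
        while True:
            while a[i] < x:
                i += 1
            while x < a[j]:
                j -= 1
            if i <= j:
                a[i], a[j] = a[j], a[i]
                i += 1
                j -= 1
            if i > j:
                break
        if j < k:
            l = i
        if k < i:
            m = j
    return a[k]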
@@ -158,8 +163,20 @@ def roll_mean(ndarray[double_t, ndim=1] input,
 #-------------------------------------------------------------------------------
 # Exponentially weighted moving average

-def ewma(ndarray[double_t, ndim=1] input,
-         int com):
+def ewma(ndarray[double_t, ndim=1] input, double_t com):
+    '''
+    Compute exponentially-weighted moving average using center-of-mass.
+
+    Parameters
+    ----------
+    input : ndarray (float64 type)
+    com : float64
+
+    Returns
+    -------
+    y : ndarray
+    '''
+
     cdef double cur, prev, neww, oldw, adj
     cdef int i
     cdef int N = len(input)
@@ -423,12 +440,21 @@ cdef _roll_skiplist_op(ndarray arg, int win, int minp, skiplist_f op):
     return output

 def roll_median(ndarray input, int win, int minp):
+    '''
+    O(N log(window)) implementation using skip list
+    '''
     return _roll_skiplist_op(input, win, minp, _get_median)

 def roll_max(ndarray input, int win, int minp):
+    '''
+    O(N log(window)) implementation using skip list
+    '''
     return _roll_skiplist_op(input, win, minp, _get_max)

 def roll_min(ndarray input, int win, int minp):
+    '''
+    O(N log(window)) implementation using skip list
+    '''
     return _roll_skiplist_op(input, win, minp, _get_min)

 cdef double_t _get_median(IndexableSkiplist skiplist, int nobs,
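The hunk above shows only the new signature and docstring of ewma; as a rough illustration of the center-of-mass parameterization, here is a pure-NumPy sketch of an adjusted exponentially weighted average (an assumption about the semantics, not a transcription of the Cython body):

import numpy as np

def ewma_sketch(x, com):
    # alpha = 1 / (1 + com), so each older observation is discounted
    # by decay = com / (1 + com) relative to the next newer one.
    decay = com / (1.0 + com)
    out = np.empty(len(x), dtype=float)
    weighted_sum = 0.0
    weight_total = 0.0
    for i, value in enumerate(x):
        weighted_sum = weighted_sum * decay + value
        weight_total = weight_total * decay + 1.0
        out[i] = weighted_sum / weight_total
    return out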
