misc documentation, some work on rpy2 interface. near git migration
git-svn-id: http://pandas.googlecode.com/svn/trunk@202 d5231056-7de3-11de-ac95-d976489f1ece
wesm committed Sep 19, 2010
1 parent 3f3508f commit 2aeb176
Showing 19 changed files with 639 additions and 31 deletions.
26 changes: 26 additions & 0 deletions .coveragerc
@@ -0,0 +1,26 @@
# .coveragerc to control coverage.py
[run]
branch = False

[report]
# Regexes for lines to exclude from consideration
exclude_lines =
    # Have to re-enable the standard pragma
    pragma: no cover

    # Don't complain about missing debug-only code:
    def __repr__
    if self\.debug

    # Don't complain if tests don't hit defensive assertion code:
    raise AssertionError
    raise NotImplementedError

    # Don't complain if non-runnable code isn't run:
    if 0:
    if __name__ == .__main__.:

ignore_errors = False

[html]
directory = coverage_html_report
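For illustration, here is a sketch of the code patterns those regexes exclude from the coverage report (the module and its contents are hypothetical, not part of this commit):

# hypothetical_module.py -- the marked lines are skipped by the
# exclude_lines regexes in the [report] section above.

class Thing(object):
    debug = False

    def __repr__(self):  # matches 'def __repr__'
        return 'Thing()'

    def work(self, x):
        if self.debug:  # matches 'if self\.debug'
            print('working on %r' % x)
        if x is None:
            raise AssertionError('x must not be None')  # matches
        return x * 2

def scratch():  # pragma: no cover -- the re-enabled standard pragma
    pass

if __name__ == '__main__':  # matches the __main__ regex
    print(Thing().work(21))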
File renamed without changes.
File renamed without changes.
17 changes: 17 additions & 0 deletions RELEASE.rst
@@ -0,0 +1,17 @@
=============
Release Notes
=============

pandas 0.3.0
============

**Release date:**

**New features / modules**

**Improvements**

**API Changes**

**Bug fixes**

File renamed without changes.
21 changes: 21 additions & 0 deletions bench/alignment.py
@@ -0,0 +1,21 @@
# Setup
import numpy as np
import pandas
import la
N = 1000
K = 50
arr1 = np.random.randn(N, K)
arr2 = np.random.randn(N, K)
idx1 = range(N)
idx2 = range(K)

# pandas
dma1 = pandas.DataMatrix(arr1, idx1, idx2)
dma2 = pandas.DataMatrix(arr2, idx1[::-1], idx2[::-1])

# larry
lar1 = la.larry(arr1, [idx1, idx2])
lar2 = la.larry(arr2, [idx1[::-1], idx2[::-1]])

for i in range(100):
    result = lar1 + lar2
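As written, the timing loop only exercises the larry objects; a like-for-like comparison would presumably time the pandas addition too. A minimal sketch (this loop is an assumption, not part of the original script):

# Assumed pandas counterpart of the loop above: DataMatrix addition
# aligns both operands on their (here reversed) indices.
for i in range(100):
    result = dma1 + dma2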
80 changes: 80 additions & 0 deletions bench/serialize.py
@@ -0,0 +1,80 @@
import time, os
import numpy as np

import la
import pandas

def timeit(f, iterations):
    start = time.clock()

    for i in xrange(iterations):
        f()

    return time.clock() - start

def roundtrip_archive(N, iterations=10):

    # Create data
    arr = np.random.randn(N, N)
    lar = la.larry(arr)
    dma = pandas.DataMatrix(arr, range(N), range(N))

    # filenames
    filename_numpy = '/Users/wesm/tmp/numpy.npz'
    filename_larry = '/Users/wesm/tmp/archive.hdf5'
    filename_pandas = '/Users/wesm/tmp/pandas_tmp'

    # Delete old files
    try:
        os.unlink(filename_numpy)
    except:
        pass
    try:
        os.unlink(filename_larry)
    except:
        pass
    try:
        os.unlink(filename_pandas)
    except:
        pass

    # Time a round trip save and load
    numpy_f = lambda: numpy_roundtrip(filename_numpy, arr, arr)
    numpy_time = timeit(numpy_f, iterations) / iterations

    larry_f = lambda: larry_roundtrip(filename_larry, lar, lar)
    larry_time = timeit(larry_f, iterations) / iterations

    pandas_f = lambda: pandas_roundtrip(filename_pandas, dma, dma)
    pandas_time = timeit(pandas_f, iterations) / iterations

    print 'Numpy (npz)   %7.4f seconds' % numpy_time
    print 'larry (HDF5)  %7.4f seconds' % larry_time
    print 'pandas (HDF5) %7.4f seconds' % pandas_time

def numpy_roundtrip(filename, arr1, arr2):
    np.savez(filename, arr1=arr1, arr2=arr2)
    npz = np.load(filename)
    arr1 = npz['arr1']
    arr2 = npz['arr2']

def larry_roundtrip(filename, lar1, lar2):
    io = la.IO(filename)
    io['lar1'] = lar1
    io['lar2'] = lar2
    lar1 = io['lar1']
    lar2 = io['lar2']

def pandas_roundtrip(filename, dma1, dma2):
    from pandas.io.pytables import HDFStore
    store = HDFStore(filename)
    store['dma1'] = dma1
    store['dma2'] = dma2
    dma1 = store['dma1']
    dma2 = store['dma2']

def pandas_roundtrip_pickle(filename, dma1, dma2):
    dma1.save(filename)
    dma1 = pandas.DataMatrix.load(filename)
    dma2.save(filename)
    dma2 = pandas.DataMatrix.load(filename)
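A minimal way to drive the benchmark, assuming the script is run directly (the N=500 and iterations=5 values are illustrative choices, not from the original file):

if __name__ == '__main__':
    # Round-trip a 500x500 array through each storage format;
    # both parameters here are arbitrary for demonstration.
    roundtrip_archive(500, iterations=5)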
65 changes: 65 additions & 0 deletions bench/test.py
@@ -0,0 +1,65 @@
import numpy as np
import itertools
import collections
import scipy.ndimage as ndi

N = 10000

lat = np.random.randint(0, 360, N)
lon = np.random.randint(0, 360, N)
data = np.random.randn(N)

def groupby1(lat, lon, data):
    indexer = np.lexsort((lon, lat))
    lat = lat.take(indexer)
    lon = lon.take(indexer)
    sorted_data = data.take(indexer)

    keys = 1000. * lat + lon
    unique_keys = np.unique(keys)
    bounds = keys.searchsorted(unique_keys)

    result = group_agg(sorted_data, bounds, lambda x: x.mean())

    decoder = keys.searchsorted(unique_keys)

    return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result))

def group_mean(lat, lon, data):
    indexer = np.lexsort((lon, lat))
    lat = lat.take(indexer)
    lon = lon.take(indexer)
    sorted_data = data.take(indexer)

    keys = 1000 * lat + lon
    unique_keys = np.unique(keys)

    result = ndi.mean(sorted_data, labels=keys, index=unique_keys)
    decoder = keys.searchsorted(unique_keys)

    return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result))

def group_mean_naive(lat, lon, data):
    grouped = collections.defaultdict(list)
    for lt, ln, da in zip(lat, lon, data):
        grouped[(lt, ln)].append(da)

    averaged = dict((ltln, np.mean(da)) for ltln, da in grouped.items())

    return averaged

def group_agg(values, bounds, f):
    N = len(values)
    result = np.empty(len(bounds), dtype=float)
    for i, left_bound in enumerate(bounds):
        if i == len(bounds) - 1:
            right_bound = N
        else:
            right_bound = bounds[i + 1]

        result[i] = f(values[left_bound : right_bound])

    return result

# for i in range(10):
#     groupby1(lat, lon, data)
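One sanity check worth appending (illustrative, not part of the committed file): the ndimage-based fast path and the naive defaultdict path should produce the same group means.

# Illustrative consistency check between the two implementations.
fast = group_mean(lat, lon, data)
naive = group_mean_naive(lat, lon, data)
assert set(fast) == set(naive)
for key in naive:
    assert abs(fast[key] - naive[key]) < 1e-10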
6 changes: 6 additions & 0 deletions doc/source/r_guide.rst
@@ -0,0 +1,6 @@
.. currentmodule:: pandas

.. _r_guide:

pandas for R users
------------------
19 changes: 19 additions & 0 deletions pandas/core/tests/test_common.py
@@ -0,0 +1,19 @@
from pandas.core.common import notnull, isnull
import pandas.core.common as common

import numpy as np

def test_notnull():
    assert notnull(1.)
    assert not notnull(None)
    assert not notnull(np.NaN)
    assert not notnull(np.inf)
    assert not notnull(-np.inf)

def test_isnull():
    assert not isnull(1.)
    assert isnull(None)
    assert isnull(np.NaN)
    assert isnull(np.inf)
    assert isnull(-np.inf)

46 changes: 36 additions & 10 deletions pandas/lib/src/moments.pyx
@@ -1,5 +1,13 @@
 # Cython implementations of rolling sum, mean, variance, skewness,
 # other statistical moment functions
+#
+# Misc implementation notes
+# -------------------------
+#
+# - In Cython x * x is faster than x ** 2 for C types, this should be
+#   periodically revisited to see if it's still true.
+#
+# -

 # original C implementation by N. Devillard.
 # This code in public domain.
@@ -32,18 +40,15 @@ def kth_smallest(ndarray[double_t, ndim=1] a, int k):
         j = m

         while 1:
-            while a[i] < x:
-                i += 1
-            while x < a[j]:
-                j -= 1
+            while a[i] < x: i += 1
+            while x < a[j]: j -= 1
             if i <= j:
                 t = a[i]
                 a[i] = a[j]
                 a[j] = t
-                i += 1
-                j -= 1
-            if i > j:
-                break
+                i += 1; j -= 1
+
+            if i > j: break

         if j < k: l = i
         if k < i: m = j
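For readers who do not want to decode the condensed Cython above, a pure-Python re-expression of the same Devillard/Wirth partition-based selection (an illustrative sketch, not the committed code):

def kth_smallest_sketch(values, k):
    # Partition the list around a pivot until index k is pinned
    # between l and m; mirrors the Cython loop structure above.
    a = list(values)
    l, m = 0, len(a) - 1
    while l < m:
        x = a[k]
        i, j = l, m
        while True:
            while a[i] < x:
                i += 1
            while x < a[j]:
                j -= 1
            if i <= j:
                a[i], a[j] = a[j], a[i]
                i += 1
                j -= 1
            if i > j:
                break
        if j < k:
            l = i
        if k < i:
            m = j
    return a[k]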
@@ -158,8 +163,20 @@ def roll_mean(ndarray[double_t, ndim=1] input,
 #-------------------------------------------------------------------------------
 # Exponentially weighted moving average

-def ewma(ndarray[double_t, ndim=1] input,
-         int com):
+def ewma(ndarray[double_t, ndim=1] input, double_t com):
+    '''
+    Compute exponentially-weighted moving average using center-of-mass.
+
+    Parameters
+    ----------
+    input : ndarray (float64 type)
+    com : float64
+
+    Returns
+    -------
+    y : ndarray
+    '''
+
     cdef double cur, prev, neww, oldw, adj
     cdef int i
     cdef int N = len(input)
@@ -423,12 +440,21 @@ cdef _roll_skiplist_op(ndarray arg, int win, int minp, skiplist_f op):
     return output

 def roll_median(ndarray input, int win, int minp):
+    '''
+    O(N log(window)) implementation using skip list
+    '''
     return _roll_skiplist_op(input, win, minp, _get_median)

 def roll_max(ndarray input, int win, int minp):
+    '''
+    O(N log(window)) implementation using skip list
+    '''
     return _roll_skiplist_op(input, win, minp, _get_max)

 def roll_min(ndarray input, int win, int minp):
+    '''
+    O(N log(window)) implementation using skip list
+    '''
     return _roll_skiplist_op(input, win, minp, _get_min)

 cdef double_t _get_median(IndexableSkiplist skiplist, int nobs,
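The hunk above shows only the new signature and docstring of ewma; as a rough illustration of the center-of-mass parameterization, here is a pure-NumPy sketch of an adjusted exponentially weighted average (an assumption about the semantics, not a transcription of the Cython body):

import numpy as np

def ewma_sketch(x, com):
    # alpha = 1 / (1 + com), so each older observation is discounted
    # by decay = com / (1 + com) relative to the next newer one.
    decay = com / (1.0 + com)
    out = np.empty(len(x), dtype=float)
    weighted_sum = 0.0
    weight_total = 0.0
    for i, value in enumerate(x):
        weighted_sum = weighted_sum * decay + value
        weight_total = weight_total * decay + 1.0
        out[i] = weighted_sum / weight_total
    return out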
