/
functions.py
120 lines (92 loc) · 3.62 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from pandas.core.common import isnull
import numpy as np
#-------------------------------------------------------------------------------
# NaN-aware bucketed reductions (mean, std, prod, min, max) over a sorted index
def reduce_mean(values, index, buckets, inclusive=False):
    """
    Bucketed NaN-aware mean.

    For each bucket boundary, averages the non-NaN values whose index
    positions fall into that bucket (boundaries located via searchsorted).

    Parameters
    ----------
    values : array-like of observations
    index : sorted index aligned with values (must expose searchsorted)
    buckets : sorted bucket boundaries
    inclusive : bool, default False
        Passed through to the boundary search (left vs. right side).

    Returns
    -------
    ndarray of per-bucket means; NaNs contribute nothing to the numerator
    and are excluded from the denominator.
    """
    def _reduceat_mean(values, mask, locs):
        # NaNs were replaced with na_fill=0 by _reduce_generic, so the
        # segment sums ignore them.  Divide by the count of valid
        # (non-NaN) observations: ~mask, NOT -mask — unary minus on a
        # boolean array raises TypeError and would count NaNs anyway.
        the_sum = np.add.reduceat(values, locs)
        the_count = np.add.reduceat(~mask, locs)
        return the_sum / the_count
    return _reduce_generic(values, index, buckets, _reduceat_mean,
                           inclusive=inclusive, na_fill=0)
def _reduceat_var(values, mask, locs):
XX = np.add.reduceat(values ** 2, locs)
X = np.add.reduceat(values, locs)
nobs = np.add.reduceat(-mask, locs)
return (XX - X * X) / (nobs - 1)
def reduce_std(values, index, buckets, inclusive=False):
    """Bucketed NaN-aware sample standard deviation: the square root of
    the per-bucket variance produced by _reduceat_var."""
    variances = _reduce_generic(values, index, buckets, _reduceat_var,
                                inclusive=inclusive, na_fill=0)
    return np.sqrt(variances)
def reduce_prod(values, index, buckets, inclusive=False):
    """Bucketed product.  NaNs are filled with 1 beforehand, the
    multiplicative identity, so they drop out of each segment."""
    def _segment_prod(vals, nan_mask, locs):
        # nan_mask is unused here: na_fill=1 already neutralized NaNs
        return np.multiply.reduceat(vals, locs)
    return _reduce_generic(values, index, buckets, _segment_prod,
                           inclusive=inclusive, na_fill=1)
def reduce_min(values, index, buckets, inclusive=False):
    """Bucketed minimum.  NaNs are filled with +inf beforehand, the
    identity for minimum, so they cannot win in any segment."""
    def _segment_min(vals, nan_mask, locs):
        # nan_mask is unused here: na_fill=inf already neutralized NaNs
        return np.minimum.reduceat(vals, locs)
    return _reduce_generic(values, index, buckets, _segment_min,
                           inclusive=inclusive, na_fill=np.inf)
def reduce_max(values, index, buckets, inclusive=False):
    """Bucketed maximum.  NaNs are filled with -inf beforehand, the
    identity for maximum, so they cannot win in any segment."""
    def _segment_max(vals, nan_mask, locs):
        # nan_mask is unused here: na_fill=-inf already neutralized NaNs
        return np.maximum.reduceat(vals, locs)
    return _reduce_generic(values, index, buckets, _segment_max,
                           inclusive=inclusive, na_fill=-np.inf)
def _reduce_generic(values, index, buckets, freduce, inclusive=False,
                    na_fill=None):
    """
    Shared driver for the bucketed reductions above.

    Locates the bucket boundaries in ``index``, optionally replaces NaNs
    in a copy of ``values`` with ``na_fill`` (the reduction's identity
    element), then hands (values, nan_mask, boundary_locations) to
    ``freduce``.
    """
    boundary_locs = _bucket_locs(index, buckets, inclusive=inclusive)
    arr = np.asarray(values)
    nan_mask = isnull(arr)
    if na_fill is not None:
        # copy first so the caller's data is left untouched
        arr = arr.copy()
        np.putmask(arr, nan_mask, na_fill)
    return freduce(arr, nan_mask, boundary_locs)
def _reduceat_count(values, mask, locs):
return np.add.reduceat(-mask, locs)
def _bucket_locs(index, buckets, inclusive=False):
if inclusive:
locs = index.searchsorted(buckets, side='left')
else:
locs = index.searchsorted(buckets, side='right')
return locs
def get_bucket(date, bucks):
    """Map ``date`` to a bucket label in ``bucks``: on an exact match,
    the bucket *after* it; otherwise the first bucket at or beyond the
    date.  ``bucks`` is expected to expose ``indexMap`` (an old pandas
    Index attribute) and ``searchsorted``."""
    if date not in bucks:
        pos = bucks.searchsorted(date)
    else:
        pos = bucks.indexMap[date] + 1
    return bucks[pos]
def dumb_way(series, buckets):
    """
    Reference (slow) implementation of bucketed resampling via groupby:
    assign each timestamp to its bucket with get_bucket, take the
    per-bucket mean, and reindex onto the bucket labels.

    Bug fix: the original grouped an undefined global ``hfseries``
    instead of the ``series`` argument, raising NameError on every call.
    """
    sampled = series.groupby(lambda ts: get_bucket(ts, buckets)).mean()
    return sampled.reindex(buckets)
def ts_upsample(dates, buckets, values, aggfunc, inclusive=True):
    '''
    Aggregate ``values`` into the given bucket boundaries.

    Walks ``dates`` (assumed sorted and aligned with ``values``) once,
    applying ``aggfunc`` to each contiguous slice of values falling
    before the next bucket boundary; the final bucket receives all
    remaining values.

    Parameters
    ----------
    dates : sorted sequence aligned with values
    buckets : sorted bucket boundaries (become the result's index)
    values : sequence of observations
    aggfunc : callable applied to each per-bucket slice
    inclusive : bool, default True
        If True, a date equal to the next boundary belongs to that next
        bucket (strict ``<``); if False it stays in the current bucket
        (``<=``).

    Returns
    -------
    pandas.Series indexed by ``buckets``.

    Fixes relative to the original:
    - the bounds check now runs before indexing ``dates[j]`` (the old
      operand order raised IndexError once all dates were consumed)
    - the last bucket no longer reads ``buckets[i + 1]`` out of range;
      it aggregates whatever values remain
    - ``Series`` is imported (the module never brought it into scope)
    '''
    from pandas import Series

    nbuckets = len(buckets)
    nvalues = len(dates)
    output = np.empty(nbuckets, dtype=float)

    if inclusive:
        _check = lambda x, y: x < y
    else:
        _check = lambda x, y: x <= y

    j = 0
    for i in range(nbuckets - 1):
        next_bound = buckets[i + 1]
        jstart = j
        # advance j past every date belonging to this bucket
        while j < nvalues and _check(dates[j], next_bound):
            j += 1
        output[i] = aggfunc(values[jstart:j])

    if nbuckets > 0:
        # last bucket has no upper boundary: take everything left over
        output[-1] = aggfunc(values[j:])

    return Series(output, index=buckets)
if __name__ == '__main__':
    # Smoke/benchmark driver: bucketed mean over a large synthetic series.
    N = 1000000
    K = 1000
    values = np.random.randn(N)
    index = np.arange(N).astype(object)
    buckets = np.arange(0, N, N // K).astype(object)

    result = reduce_mean(values, index, buckets)

    # Compare against the Cython implementation (old pandas module path).
    import pandas.lib.tseries as tseries
    tseries.ts_upsample_mean(index, buckets, values)