# 重采样及频率转换

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import timedelta
from dateutil.parser import parse
from pandas import DataFrame, Series
from pandas.tseries.offsets import Day, Hour, Minute, MonthEnd

In [2]:
rng = pd.date_range('1/1/2000', periods=100, freq='D')
ts = Series(np.random.randn(len(rng)), index=rng)
ts.resample('M', kind='period').mean() # 统计每月平均数，实际上是根据月份做了group。

2000-01   -0.073327
2000-02    0.093245
2000-03   -0.368806
2000-04    0.493865
Freq: M, dtype: float64

### resample方法的参数

- freq：
- how已经被淘汰。原来的how='mean'写成resample(...).mean()。
- axis：
- fill_method已经被淘汰。原来的fill_method='...'写成resample(...).ffill()。方法为ffill或bfill，默认不插值。
- close：      在降采样中，各时间段的哪一端是闭合的，'right'（默认）或'left'。
- label：      在降采样中，如何设置聚合值的标签。'right'（默认）或'left'（面元的右/左边界）。
-              例如，9:30到9:35之间的这5分钟会被标记成9:30或9:35。
- loffset：    面元标签的时间校正值。比如'-1s'或Second(-1)用于将聚合标签调早1秒。
- limit：      在向前或向后填充时，允许填充的最大时期数。默认None。
- kind：       聚合到时期（'period'）或时间戳（'timestamp'）,默认聚合到时间序列的索引类型。
- convertion： 当重采样时期时，将低频转换到高频采样所采用的约定（'start'或'end'），默认'end'。

## 降采样

In [3]:
rng = pd.date_range('1/1/2000', periods=12, freq='T') # T是按分钟
ts = Series(np.arange(12), index=rng)
ts

2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
Freq: T, dtype: int64

In [4]:
ts.resample(Minute(5)).sum() # 等价：ts.resample('5min').sum()


2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    21
Freq: 5T, dtype: int64

In [5]:
ts.resample('5min', closed='right').sum() # 比较一下left/right不同结果


1999-12-31 23:55:00     0
2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    11
Freq: 5T, dtype: int64

In [7]:
ts.resample('5min', closed='left').sum() # 比较一下left/right不同结果


2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    21
Freq: 5T, dtype: int64

In [6]:
ts.resample('5min', closed='left', label='right').sum() # 比较close/label不同组合的结果


2000-01-01 00:05:00    10
2000-01-01 00:10:00    35
2000-01-01 00:15:00    21
Freq: 5T, dtype: int64

In [8]:
ts.resample('5min', loffset='-1s').sum()



>>> df.resample(freq="3s", loffset="8H")

becomes:

>>> from pandas.tseries.frequencies import to_offset
>>> df = df.resample(freq="3s").mean()
>>> df.index = df.index.to_timestamp() + to_offset("8H")

  ts.resample('5min', loffset='-1s').sum()


1999-12-31 23:59:59    10
2000-01-01 00:04:59    35
2000-01-01 00:09:59    21
Freq: 5T, dtype: int64

## OHLC重采样 Open/High/Low/Close

In [10]:
ts

2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
Freq: T, dtype: int64

In [9]:
ts.resample('5min').ohlc() # 其实就是5分钟k线图


Unnamed: 0,open,high,low,close
2000-01-01 00:00:00,0,4,0,4
2000-01-01 00:05:00,5,9,5,9
2000-01-01 00:10:00,10,11,10,11


## 通过groupby进行重采样

In [11]:
rng = pd.date_range('1/1/2000', periods=100, freq='D')
ts = Series(np.arange(100), index=rng)
ts.groupby(lambda x: x.month).mean()

1    15
2    45
3    75
4    95
dtype: int64

In [12]:
ts.groupby(lambda x: x.weekday).mean() # 0是周日


0    47.5
1    48.5
2    49.5
3    50.5
4    51.5
5    49.0
6    50.0
dtype: float64

## 升采样和插值

In [14]:
frame = DataFrame(np.random.randn(2, 4),
                  index=pd.date_range('1/1/2000', periods=2, freq='W-WED'),
                  columns=['Colorado', 'Texas', 'New York', 'Ohio'])
frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-1.484707,1.261486,0.679905,-1.113999
2000-01-12,0.729956,-0.381346,1.619778,-0.058862


In [16]:
df_daily = frame.resample('D') # 注意警告信息，resample现在生成的是一个延迟计算对象。
df_daily

<pandas.core.resample.DatetimeIndexResampler object at 0x7fe4d0c1e340>

In [17]:
for i in df_daily:
    print(i)

(Timestamp('2000-01-05 00:00:00', freq='D'),             Colorado     Texas  New York      Ohio
2000-01-05 -1.484707  1.261486  0.679905 -1.113999)
(Timestamp('2000-01-06 00:00:00', freq='D'), Empty DataFrame
Columns: [Colorado, Texas, New York, Ohio]
Index: [])
(Timestamp('2000-01-07 00:00:00', freq='D'), Empty DataFrame
Columns: [Colorado, Texas, New York, Ohio]
Index: [])
(Timestamp('2000-01-08 00:00:00', freq='D'), Empty DataFrame
Columns: [Colorado, Texas, New York, Ohio]
Index: [])
(Timestamp('2000-01-09 00:00:00', freq='D'), Empty DataFrame
Columns: [Colorado, Texas, New York, Ohio]
Index: [])
(Timestamp('2000-01-10 00:00:00', freq='D'), Empty DataFrame
Columns: [Colorado, Texas, New York, Ohio]
Index: [])
(Timestamp('2000-01-11 00:00:00', freq='D'), Empty DataFrame
Columns: [Colorado, Texas, New York, Ohio]
Index: [])
(Timestamp('2000-01-12 00:00:00', freq='D'),             Colorado     Texas  New York      Ohio
2000-01-12  0.729956 -0.381346  1.619778 -0.058862)


In [18]:
df_daily = frame.resample('D', ).ffill() # ffill做填充，所以resample必须先展开计算。
df_daily

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-1.484707,1.261486,0.679905,-1.113999
2000-01-06,-1.484707,1.261486,0.679905,-1.113999
2000-01-07,-1.484707,1.261486,0.679905,-1.113999
2000-01-08,-1.484707,1.261486,0.679905,-1.113999
2000-01-09,-1.484707,1.261486,0.679905,-1.113999
2000-01-10,-1.484707,1.261486,0.679905,-1.113999
2000-01-11,-1.484707,1.261486,0.679905,-1.113999
2000-01-12,0.729956,-0.381346,1.619778,-0.058862


In [19]:
df_daily = frame.resample('D', limit=2).ffill() # 最多填充2次
df_daily

TypeError: resample() got an unexpected keyword argument 'limit'

## 通过时期进行重采样

In [20]:
frame = DataFrame(np.random.randn(24, 4),
                  index=pd.period_range('1-2000', '12-2001', freq='M'),
                  columns=['Colorado', 'Texas', 'New York', 'Ohio'])
frame.head()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01,-1.325571,-0.197373,-0.077481,-0.355313
2000-02,-0.018144,-0.702155,0.770545,-1.395699
2000-03,1.391254,1.116864,1.226493,-2.265831
2000-04,-0.311593,0.08991,0.296247,0.886038
2000-05,0.478372,-0.083319,2.488567,1.150839


In [21]:
annual_frame = frame.resample('A-DEC').mean()
annual_frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000,-0.251527,-0.306334,0.170523,-0.29455
2001,0.258466,-0.081873,-0.183205,-0.235174


In [22]:
annual_frame.resample('Q-DEC').ffill()
annual_frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000,-0.251527,-0.306334,0.170523,-0.29455
2001,0.258466,-0.081873,-0.183205,-0.235174


In [23]:
annual_frame.resample('Q-DEC', convention='start').ffill()
annual_frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000,-0.251527,-0.306334,0.170523,-0.29455
2001,0.258466,-0.081873,-0.183205,-0.235174


In [24]:
# 一些讲解：
# - 在降采样中，目标频率必须是源频率的子时期（subperiod）。降采样 <-> 高频到低频
# - 在升采样中，目标频率必须是源频率的超时期（superperiod）。升采样 <-> 低频到高频
# - 如果不满足这些条件，就会引发异常。这主要影响的是按季、年、周计算的频率。
#   例如，由Q-MAR定义的时间区间只能升采样为A-MAR、A-JUN、A-SEP、A-DEC等。(有问题自己慢慢琢磨吧...)

In [25]:
quater_frame = annual_frame.resample('Q-MAR').ffill()
quater_frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q4,-0.251527,-0.306334,0.170523,-0.29455
2001Q1,-0.251527,-0.306334,0.170523,-0.29455
2001Q2,-0.251527,-0.306334,0.170523,-0.29455
2001Q3,-0.251527,-0.306334,0.170523,-0.29455
2001Q4,0.258466,-0.081873,-0.183205,-0.235174
2002Q1,0.258466,-0.081873,-0.183205,-0.235174
2002Q2,0.258466,-0.081873,-0.183205,-0.235174
2002Q3,0.258466,-0.081873,-0.183205,-0.235174
