In [1]:
import numpy as np
import pandas as pd
from datetime import time
from pandas import DataFrame, Series

## 时间序列以及截面对齐

In [3]:
close_px_all = pd.read_csv('data/stock_px.csv', parse_dates=True, index_col=0)
close_px = close_px_all[['AAPL', 'MSFT', 'XOM', 'IBM', 'SPX']]
print(close_px.head())
print(close_px.tail())

            AAPL  MSFT   XOM    IBM     SPX
1990-02-01  7.86  0.51  6.12  16.79  328.79
1990-02-02  8.00  0.51  6.24  16.89  330.92
1990-02-05  8.18  0.51  6.25  17.32  331.85
1990-02-06  8.12  0.51  6.23  17.56  329.66
1990-02-07  7.77  0.51  6.33  17.93  333.75
              AAPL   MSFT    XOM     IBM      SPX
2011-10-10  388.81  26.94  76.28  186.62  1194.89
2011-10-11  400.29  27.00  76.27  185.00  1195.54
2011-10-12  402.19  26.96  77.16  186.12  1207.25
2011-10-13  408.43  27.18  76.37  186.82  1203.66
2011-10-14  422.00  27.27  78.11  190.53  1224.58


In [9]:
close_px

Unnamed: 0,AAPL,MSFT,XOM,IBM,SPX
1990-02-01,7.86,0.51,6.12,16.79,328.79
1990-02-02,8.00,0.51,6.24,16.89,330.92
1990-02-05,8.18,0.51,6.25,17.32,331.85
1990-02-06,8.12,0.51,6.23,17.56,329.66
1990-02-07,7.77,0.51,6.33,17.93,333.75
...,...,...,...,...,...
2011-10-10,388.81,26.94,76.28,186.62,1194.89
2011-10-11,400.29,27.00,76.27,185.00,1195.54
2011-10-12,402.19,26.96,77.16,186.12,1207.25
2011-10-13,408.43,27.18,76.37,186.82,1203.66


In [8]:
volume_all = pd.read_csv('data/volume.csv', parse_dates=True, index_col=0)
volume = volume_all[['AAPL', 'MSFT', 'XOM', 'SPX']]
volume.head()

Unnamed: 0,AAPL,MSFT,XOM,SPX
1990-02-01,4193200.0,89193600.0,2916400.0,154580000.0
1990-02-02,4248800.0,71395200.0,4250000.0,164400000.0
1990-02-05,3653200.0,59731200.0,5880800.0,130950000.0
1990-02-06,2640000.0,81964800.0,4750800.0,134070000.0
1990-02-07,11180800.0,134150400.0,4124800.0,186710000.0


In [10]:
(close_px * volume).head() # 自动匹配对应位置，计算当天成交所额。IBM没有就填充NA。


Unnamed: 0,AAPL,IBM,MSFT,SPX,XOM
1990-02-01,32958552.0,,45488736.0,50824360000.0,17848368.0
1990-02-02,33990400.0,,36411552.0,54403250000.0,26520000.0
1990-02-05,29883176.0,,30462912.0,43455760000.0,36755000.0
1990-02-06,21436800.0,,41802048.0,44197520000.0,29597484.0
1990-02-07,86874816.0,,68416704.0,62314460000.0,26109984.0


In [11]:
vwap = (close_px * volume).sum() / volume.sum()
vwap

AAPL      81.246271
IBM             NaN
MSFT      16.923765
SPX     1134.418556
XOM       50.520303
dtype: float64

In [12]:
vwap.dropna()


AAPL      81.246271
MSFT      16.923765
SPX     1134.418556
XOM       50.520303
dtype: float64

In [13]:
tp = close_px.align(volume, join='inner') # 返回与volume对齐的元组
# tp[0]是close_px，tp[1]是volume。
print(tp[0][:10])
print(tp[1][:10])

            AAPL  MSFT   XOM     SPX
1990-02-01  7.86  0.51  6.12  328.79
1990-02-02  8.00  0.51  6.24  330.92
1990-02-05  8.18  0.51  6.25  331.85
1990-02-06  8.12  0.51  6.23  329.66
1990-02-07  7.77  0.51  6.33  333.75
1990-02-08  7.71  0.51  6.35  332.96
1990-02-09  8.00  0.52  6.37  333.62
1990-02-12  7.94  0.52  6.22  330.08
1990-02-13  8.06  0.52  6.23  331.02
1990-02-14  8.00  0.52  6.20  332.01
                  AAPL         MSFT        XOM          SPX
1990-02-01   4193200.0   89193600.0  2916400.0  154580000.0
1990-02-02   4248800.0   71395200.0  4250000.0  164400000.0
1990-02-05   3653200.0   59731200.0  5880800.0  130950000.0
1990-02-06   2640000.0   81964800.0  4750800.0  134070000.0
1990-02-07  11180800.0  134150400.0  4124800.0  186710000.0
1990-02-08   6680000.0   95225600.0  5651200.0  176240000.0
1990-02-09   6004400.0   62380800.0  3384800.0  146910000.0
1990-02-12   2695600.0   56086400.0  2698000.0  118390000.0
1990-02-13   3653600.0   58752000.0  3564800.0  14449

In [14]:
s1 = Series(range(3), index=['a', 'b', 'c'])
s2 = Series(range(4), index=['d', 'b', 'c', 'e'])
s3 = Series(range(3), index=['f', 'a', 'c'])
df = DataFrame({'one': s1, 'two': s2, 'three': s3})
df

Unnamed: 0,one,two,three
a,0.0,,1.0
b,1.0,1.0,
c,2.0,2.0,2.0
d,,0.0,
e,,3.0,
f,,,0.0


In [15]:
df = DataFrame({'one': s1, 'two': s2, 'three': s3}, index=list('face')) # 显示定义索引对齐
df

Unnamed: 0,one,two,three
f,,,0.0
a,0.0,,1.0
c,2.0,2.0,2.0
e,,3.0,


## 频率不同的时间序列的运算

In [16]:
ts1 = Series(np.random.randn(3),
             index=pd.date_range('2012-6-13', periods=3, freq='W-WED'))
ts1

2012-06-13   -2.053877
2012-06-20    0.734005
2012-06-27   -0.847427
Freq: W-WED, dtype: float64

In [17]:
ts1 = ts1.resample('B').ffill() # 重新采样，如果没有ffill就需要用ts2.iteritems()遍历访问
ts1

2012-06-13   -2.053877
2012-06-14   -2.053877
2012-06-15   -2.053877
2012-06-18   -2.053877
2012-06-19   -2.053877
2012-06-20    0.734005
2012-06-21    0.734005
2012-06-22    0.734005
2012-06-25    0.734005
2012-06-26    0.734005
2012-06-27   -0.847427
Freq: B, dtype: float64

In [18]:
dates = pd.DatetimeIndex(['2012-6-12',
                          '2012-6-17',
                          '2012-6-18',
                          '2012-6-21',
                          '2012-6-22',
                          '2012-6-29'])
ts2 = Series(np.random.randn(6), index=dates)
ts2

2012-06-12    0.878601
2012-06-17   -2.358855
2012-06-18    0.532669
2012-06-21    0.308243
2012-06-22   -1.335175
2012-06-29   -1.661290
dtype: float64

In [19]:
ts2 + ts1.reindex(ts2.index, method='ffill') # ts1的索引先和ts2对齐，然后相加。


2012-06-12         NaN
2012-06-17   -4.412732
2012-06-18   -1.521208
2012-06-21    1.042248
2012-06-22   -0.601170
2012-06-29   -2.508717
dtype: float64

## 使用Period

In [21]:
gdp = Series([1.78, 1.94, 2.08, 2.01, 2.15, 2.31, 2.46],
             index=pd.period_range('1984Q2', periods=7, freq='Q-SEP'))
gdp

1984Q2    1.78
1984Q3    1.94
1984Q4    2.08
1985Q1    2.01
1985Q2    2.15
1985Q3    2.31
1985Q4    2.46
Freq: Q-SEP, dtype: float64

In [22]:
infl = Series([0.025, 0.045, 0.037, 0.04],
              index=pd.period_range('1982', periods=4, freq='A-DEC'))
infl # 显然和gdp的时间频率不一样

1982    0.025
1983    0.045
1984    0.037
1985    0.040
Freq: A-DEC, dtype: float64

In [26]:
infl_q = infl.asfreq('Q-DEC', how='end') # 调整季度
infl_q

1982Q4    0.025
1983Q4    0.045
1984Q4    0.037
1985Q4    0.040
Freq: Q-DEC, dtype: float64

In [27]:
infl_q.reindex(gdp.index, method='ffill') # 索引匹配并且填充缺失值


1984Q2    0.045
1984Q3    0.045
1984Q4    0.037
1985Q1    0.037
1985Q2    0.037
1985Q3    0.037
1985Q4    0.040
Freq: Q-DEC, dtype: float64

## 时间和“最当前”数据选取