# Chapter11 时间序列

## 11.1 日期和时间数据类型及工具

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [4]:
now = datetime.now()

In [5]:
now

datetime.datetime(2019, 3, 12, 9, 2, 14, 766181)

In [8]:
now.year, now.month, now.day

(2019, 3, 12)

In [10]:
delta = datetime(2017,9,20) - datetime(2018,3,24)

In [11]:
delta

datetime.timedelta(days=-185)

In [12]:
delta.days

-185

In [16]:
# timedelta.seconds holds only the sub-day seconds component (0 to 86399),
# not the whole duration. It is 0 here because both datetimes fall on midnight;
# use delta.total_seconds() for the full span in seconds.
delta.seconds

0

In [17]:
from datetime import timedelta

In [18]:
start = datetime(2019,3,12)

In [19]:
start + timedelta(13)

datetime.datetime(2019, 3, 25, 0, 0)

In [20]:
start - timedelta(13)*2

datetime.datetime(2019, 2, 14, 0, 0)

### 字符串和datetime的相互转换

In [21]:
stamp = datetime(2011, 1, 3)

In [22]:
str(stamp)

'2011-01-03 00:00:00'

In [31]:
stamp.strftime('%Y~%m~%d')

'2011~01~03'

In [33]:
# %F is shorthand for %Y-%m-%d. It is handed to the platform's C strftime,
# so it may not be available everywhere -- TODO confirm on target platforms.
stamp.strftime('%F')

'2011-01-03'

In [42]:
value = '2011~01~03'

In [43]:
datetime.strptime(value,'%Y~%m~%d')

datetime.datetime(2011, 1, 3, 0, 0)

strftime用于把datetime对象格式化为字符串，strptime则用于把字符串解析为datetime对象

In [44]:
datestrs = ['7/6/2011', '8/6/2011']

In [45]:
[datetime.strptime(x,'%m/%d/%Y') for x in datestrs]

[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]

datetime.strptime是通过已知格式进行日期解析的最佳方式。但是每次都要编写
格式定义是很麻烦的事情，尤其是对于一些常见的日期格式。这种情况下，你可以
用dateutil这个第三方包中的parser.parse方法（pandas中已经自动安装好了）

In [46]:
from dateutil.parser import parse

In [48]:
parse('2018/9/3')

datetime.datetime(2018, 9, 3, 0, 0)

In [49]:
parse('Jan 31, 1997 10:45 PM')

datetime.datetime(1997, 1, 31, 22, 45)

In [50]:
parse('6/12/2011', dayfirst=True)

datetime.datetime(2011, 12, 6, 0, 0)

to_datetime方法可以解析多种不同的日期表示形式

In [51]:
datestrs = ['2011-07-06 12:00:00', '2011-08-06 00:00:00']

In [52]:
pd.to_datetime(datestrs)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

In [55]:
idx=pd.to_datetime(datestrs + [None])

In [56]:
idx

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

In [57]:
idx[2] # NaT (Not a Time) is pandas' null value for timestamp data

NaT

In [58]:
idx[1]

Timestamp('2011-08-06 00:00:00')

In [59]:
pd.isnull(idx)

array([False, False,  True])

## 11.2 时间序列基础

In [60]:
from datetime import datetime

In [61]:
# Six irregularly spaced days in January 2011, used as the index of the demo series.
dates = [datetime(2011, 1, day) for day in (2, 5, 7, 8, 10, 12)]

In [62]:
ts = pd.Series(np.random.randn(6),index = dates)

In [63]:
ts

2011-01-02    0.629803
2011-01-05    1.899193
2011-01-07   -0.686592
2011-01-08   -1.204324
2011-01-10    1.220419
2011-01-12    1.077004
dtype: float64

In [64]:
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [65]:
ts + ts[::2]

2011-01-02    1.259605
2011-01-05         NaN
2011-01-07   -1.373183
2011-01-08         NaN
2011-01-10    2.440837
2011-01-12         NaN
dtype: float64

In [66]:
ts.index.dtype

dtype('<M8[ns]')

In [67]:
stamp = ts.index[0]

In [68]:
stamp

Timestamp('2011-01-02 00:00:00')

### 索引、选取、子集构造

In [73]:
ts = pd.Series(np.random.randn(1000),
                    index = pd.date_range('1/1/2019',periods=1000))

In [74]:
ts.head()

2019-01-01   -0.074516
2019-01-02   -1.501623
2019-01-03   -0.976814
2019-01-04   -0.193179
2019-01-05   -0.431618
Freq: D, dtype: float64

对时间序列按日期切片（例如 ts['2019-04-05':]）时，数据不会被复制，对切片进行修改会反映到原始数据上。下面的truncate实例方法也可以按日期截取Series

In [77]:
ts.truncate(before='2019-04-05').head()

2019-04-05   -0.573386
2019-04-06    0.618816
2019-04-07    0.445969
2019-04-08   -1.049830
2019-04-09   -1.219154
Freq: D, dtype: float64

In [81]:
dates = pd.date_range('1/1/2019',periods=100,freq='W-WED')

In [83]:
df = pd.DataFrame(np.random.randn(100,4),
                  index=dates,
                  columns=['Colorado', 'Texas','New York', 'Ohio'])

In [84]:
df.head()

Unnamed: 0,Colorado,Texas,New York,Ohio
2019-01-02,-1.14857,-0.264727,0.342389,-0.531049
2019-01-09,0.102369,1.994933,0.854364,-1.003866
2019-01-16,-1.240071,-1.423352,-1.075746,0.049923
2019-01-23,-0.058756,0.633824,-0.645939,0.520304
2019-01-30,0.182016,0.515914,-0.941196,0.757941


In [86]:
df.iloc[2:3,:2]

Unnamed: 0,Colorado,Texas
2019-01-16,-1.240071,-1.423352


In [90]:
df.loc['5-2019'] #amazing

Unnamed: 0,Colorado,Texas,New York,Ohio
2019-05-01,0.523869,0.081149,-0.752695,1.295676
2019-05-08,-0.348604,-1.811567,-0.907079,0.622261
2019-05-15,2.391027,-0.109101,1.272509,-1.312174
2019-05-22,-0.321372,-0.901468,0.585417,0.085627
2019-05-29,-1.693818,1.38512,-0.449105,0.320886


In [91]:
dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000','1/2/2000','1/2/2000', '1/3/2000'])

In [92]:
dup_ts = pd.Series(np.arange(5), index=dates)

In [93]:
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

In [95]:
dup_ts.index.is_unique

False

In [96]:
grouped = dup_ts.groupby(level=0)

In [97]:
grouped.mean()

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32

In [98]:
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64

## 11.3 日期的范围、频率以及移动

In [99]:
pd.date_range('2000-01-01','2000-01-03 23:59',freq='12H')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 12:00:00',
               '2000-01-02 00:00:00', '2000-01-02 12:00:00',
               '2000-01-03 00:00:00', '2000-01-03 12:00:00'],
              dtype='datetime64[ns]', freq='12H')

In [100]:
pd.date_range('2000-01-01', periods=10, freq='1h30min')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00', '2000-01-01 07:30:00',
               '2000-01-01 09:00:00', '2000-01-01 10:30:00',
               '2000-01-01 12:00:00', '2000-01-01 13:30:00'],
              dtype='datetime64[ns]', freq='90T')

WOM（Week Of Month）是一种非常实用的频率类，它以WOM开头。它使你
能获得诸如“每月第3个星期五”（'WOM-3FRI'）之类的日期；下面的例子用'WOM-2FRI'取每月第2个星期五

In [105]:
rng = pd.date_range('2012-01-01', '2012-09-01', freq='WOM-2FRI')

In [106]:
list(rng)

[Timestamp('2012-01-13 00:00:00', freq='WOM-2FRI'),
 Timestamp('2012-02-10 00:00:00', freq='WOM-2FRI'),
 Timestamp('2012-03-09 00:00:00', freq='WOM-2FRI'),
 Timestamp('2012-04-13 00:00:00', freq='WOM-2FRI'),
 Timestamp('2012-05-11 00:00:00', freq='WOM-2FRI'),
 Timestamp('2012-06-08 00:00:00', freq='WOM-2FRI'),
 Timestamp('2012-07-13 00:00:00', freq='WOM-2FRI'),
 Timestamp('2012-08-10 00:00:00', freq='WOM-2FRI')]

In [107]:
ts = pd.Series(np.random.randn(4), index=pd.date_range('1/1/2000', periods=4,freq='M'))

In [108]:
ts

2000-01-31    0.517292
2000-02-29   -0.060005
2000-03-31   -1.969851
2000-04-30   -1.245835
Freq: M, dtype: float64

In [109]:
ts.shift(-2)

2000-01-31   -1.969851
2000-02-29   -1.245835
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64

In [110]:
ts.shift(2)

2000-01-31         NaN
2000-02-29         NaN
2000-03-31    0.517292
2000-04-30   -0.060005
Freq: M, dtype: float64

### 通过偏移量对日期进行位移

In [111]:
from pandas.tseries.offsets import Day, MonthEnd

In [112]:
now = datetime(2011,11,17)

In [114]:
now + Day()*3

Timestamp('2011-11-20 00:00:00')

In [115]:
now + MonthEnd()

Timestamp('2011-11-30 00:00:00')

In [116]:
now + MonthEnd(3)

Timestamp('2012-01-31 00:00:00')

In [117]:
offset = MonthEnd()

In [118]:
offset.rollback(now)

Timestamp('2011-10-31 00:00:00')

In [119]:
offset.rollforward(now)

Timestamp('2011-11-30 00:00:00')

日期偏移量还有一个巧妙的用法，即结合groupby使用这两个“滚动”方法

In [120]:
ts = pd.Series(np.random.randn(20), index=pd.date_range('1/15/2000', periods=20,freq='4d'))

In [121]:
ts.groupby(offset.rollback).mean()

1999-12-31   -0.600776
2000-01-31    0.488623
2000-02-29   -0.142897
2000-03-31   -0.045862
dtype: float64

当然，更简单、更快速地实现该功能的办法是使用resample

In [122]:
ts.resample('M').mean()

2000-01-31   -0.163969
2000-02-29    0.332246
2000-03-31   -0.130767
Freq: M, dtype: float64

## 11.4 时区处理

In [124]:
import pytz

In [125]:
pytz.common_timezones[:5]

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara']

## 11.6 重采样及频率转换

重采样（resampling）指的是将时间序列从一个频率转换到另一个频率的处理过
程。将高频率数据聚合到低频率称为降采样（downsampling），而将低频率数
据转换到高频率则称为升采样（upsampling）。并不是所有的重采样都能被划分
到这两个大类中。例如，将W-WED（每周三）转换为W-FRI既不是降采样也不是升采样。

降采样：将数据聚合到规律的低频率是一件非常普通的时间序列处理任务。待聚合的数据不
必拥有固定的频率，期望的频率会自动定义聚合的面元边界，这些面元用于将时间
序列拆分为多个片段。例如，要转换到月度频率（'M'或'BM'），数据需要被划分
到多个单月时间段中。各时间段都是半开放的。一个数据点只能属于一个时间段，
所有时间段的并集必须能组成整个时间帧。

In [127]:
rng = pd.date_range('2000-01-01',periods=12,freq='T')

In [129]:
ts = pd.Series(np.arange(12),index=rng)

In [130]:
ts

2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
Freq: T, dtype: int32

In [132]:
ts.resample('5min',closed='right').sum()

1999-12-31 23:55:00     0
2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    11
Freq: 5T, dtype: int32

### OHLC重采样

金融领域中有一种无所不在的时间序列聚合方式，即计算各面元的四个值：第一个
值（open，开盘）、最后一个值（close，收盘）、最大值（high，最高）以及
最小值（low，最低）。对重采样结果调用ohlc()方法即可得到一个含有这四种聚合值的
DataFrame

In [133]:
ts.resample('5min',closed='right').ohlc()

Unnamed: 0,open,high,low,close
1999-12-31 23:55:00,0,0,0,0
2000-01-01 00:00:00,1,5,1,5
2000-01-01 00:05:00,6,10,6,10
2000-01-01 00:10:00,11,11,11,11


# Chapter12 pandas高级应用

In [2]:
values = pd.Series(['apple','orange','b','apple'] * 2)

In [3]:
values

0     apple
1    orange
2         b
3     apple
4     apple
5    orange
6         b
7     apple
dtype: object

In [4]:
pd.unique(values)

array(['apple', 'orange', 'b'], dtype=object)

In [7]:
# Use the Series method; the top-level pd.value_counts is deprecated in newer pandas.
values.value_counts()

apple     4
orange    2
b         2
dtype: int64

In [8]:
values = pd.Series([0, 1, 0, 0] * 2)

In [9]:
dim = pd.Series(['apple','orange'])

In [10]:
values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [11]:
dim

0     apple
1    orange
dtype: object

In [13]:
# take() picks rows of dim by integer position, so each code in values is
# replaced by its category label; this works for any number of categories,
# as long as every code is a valid position in dim.
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

In [14]:
fruits = ['apple', 'orange', 'apple', 'apple'] * 2

In [15]:
n = len(fruits)

In [24]:
df = pd.DataFrame({'fruit':fruits,
                 'basket_id':np.arange(n),
                  'count': np.random.randint(3, 15,size=n),
                  'weight': np.random.uniform(0, 4,size=n)}, 
                  columns=['basket_id', 'fruit', 'count','weight'])

In [25]:
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,9,1.68409
1,1,orange,6,2.467682
2,2,apple,8,1.773343
3,3,apple,5,0.811518
4,4,apple,5,2.263127
5,5,orange,12,0.78043
6,6,apple,12,3.119592
7,7,apple,13,1.067444


In [30]:
c=df['fruit'].astype('category')

In [31]:
c.values

[apple, orange, apple, apple, apple, orange, apple, apple]
Categories (2, object): [apple, orange]

In [32]:
type(c)

pandas.core.series.Series

In [35]:
my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo','bar'])

In [36]:
my_categories

[foo, bar, baz, foo, bar]
Categories (3, object): [bar, baz, foo]

In [37]:
categories = ['foo', 'bar', 'baz']

In [38]:
codes = [0,1,2,0,1,1]

In [39]:
cats = pd.Categorical.from_codes(codes,categories)

In [40]:
cats

[foo, bar, baz, foo, bar, bar]
Categories (3, object): [foo, bar, baz]

In [41]:
ordered_cat = pd.Categorical.from_codes(codes, categories, ordered=True)

In [42]:
ordered_cat

[foo, bar, baz, foo, bar, bar]
Categories (3, object): [foo < bar < baz]

In [43]:
n = 10000000

In [44]:
draw = pd.Series(np.random.randn(n))

In [45]:
lables = pd.Series(['foo','bar','baz','qux'] * (n//4))

In [46]:
categories = lables.astype('category')

GroupBy使用分类操作明显更快，是因为底层的算法使用整数编码数组，而不是
字符串数组。

## GroupBy高级应用

In [52]:
# Toy frame for the groupby demos: keys a/b/c cycle four times over values 0.0-11.0.
df = pd.DataFrame(dict(
    key=list('abc') * 4,
    value=np.arange(12.),
))

In [53]:
df

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0
10,b,10.0
11,c,11.0


In [55]:
g = df.groupby('key').value

In [56]:
g.mean()

key
a    4.5
b    5.5
c    6.5
Name: value, dtype: float64

In [57]:
g.transform(lambda x:x.mean())

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [59]:
g.transform(lambda x:x*2)

0      0.0
1      2.0
2      4.0
3      6.0
4      8.0
5     10.0
6     12.0
7     14.0
8     16.0
9     18.0
10    20.0
11    22.0
Name: value, dtype: float64

In [61]:
g.transform(lambda x:x.rank(ascending=True))

0     1.0
1     1.0
2     1.0
3     2.0
4     2.0
5     2.0
6     3.0
7     3.0
8     3.0
9     4.0
10    4.0
11    4.0
Name: value, dtype: float64

In [63]:
def normalize(x):
    """Return the group's (min - max) / max as a single scalar.

    x is the values of one group (e.g. a pandas Series); the result is one
    number per group, not an element-wise rescaling.
    """
    lowest, highest = x.min(), x.max()
    return (lowest - highest) / highest

In [64]:
g.transform(normalize)

0    -1.000000
1    -0.900000
2    -0.818182
3    -1.000000
4    -0.900000
5    -0.818182
6    -1.000000
7    -0.900000
8    -0.818182
9    -1.000000
10   -0.900000
11   -0.818182
Name: value, dtype: float64

In [65]:
g.apply(normalize)

key
a   -1.000000
b   -0.900000
c   -0.818182
Name: value, dtype: float64

In [66]:
g.transform('mean')

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64