In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime

In [2]:
# Six irregularly spaced dates used as the index of a small example series.
dates = [datetime(2011, 1, 2), datetime(2011, 1, 5), datetime(2011, 1, 7),
         datetime(2011, 1, 8), datetime(2011, 1, 10), datetime(2011, 1, 12)]
# Draw the standard-normal values from a seeded Generator so that a fresh
# "Restart & Run All" prints the same numbers every time.
ts = pd.Series(np.random.default_rng(0).standard_normal(6), index=dates)

ts  # time series

2011-01-02    1.608209
2011-01-05    0.150012
2011-01-07   -0.199162
2011-01-08   -0.633558
2011-01-10    1.031363
2011-01-12   -0.108024
dtype: float64

In [3]:
# The list of datetime objects passed as the index is stored as a DatetimeIndex.
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [4]:
ts.index.dtype  # pandas stores timestamps using NumPy's datetime64 dtype at nanosecond resolution

dtype('<M8[ns]')

In [5]:
# A scalar taken from a DatetimeIndex is a pandas Timestamp object.
stamp = ts.index[0]
stamp

Timestamp('2011-01-02 00:00:00')

In [6]:
ts + ts[::2]  # arithmetic aligns on the date index; dates missing from ts[::2] yield NaN

2011-01-02    3.216417
2011-01-05         NaN
2011-01-07   -0.398324
2011-01-08         NaN
2011-01-10    2.062727
2011-01-12         NaN
dtype: float64

### 索引、选择、子集

In [7]:
# A Timestamp taken from the index can be used directly as a label.
stamp = ts.index[2]
print(ts[stamp])

# A string that can be interpreted as a date works as a label too;
# both spellings below refer to 2011-01-10.
for date_label in ('1/10/2011', '20110110'):
    print(ts[date_label])

-0.19916212837293487
1.0313633374270375
1.0313633374270375


In [8]:
# 1000 consecutive daily timestamps starting on 2000-01-01. The standard-normal
# values come from a seeded Generator so the printed numbers are reproducible.
longer_ts = pd.Series(np.random.default_rng(1).standard_normal(1000),
                      index=pd.date_range('1/1/2000', periods=1000))
longer_ts

2000-01-01    1.211994
2000-01-02   -1.133383
2000-01-03   -1.190788
2000-01-04   -1.384380
2000-01-05   -1.927081
                ...   
2002-09-22   -0.425516
2002-09-23    0.449275
2002-09-24    0.236186
2002-09-25    0.579538
2002-09-26   -0.381702
Freq: D, Length: 1000, dtype: float64

In [9]:
longer_ts['2001']  # the string '2001' is interpreted as a year and selects that whole period

2001-01-01    0.910115
2001-01-02   -0.217219
2001-01-03   -0.276753
2001-01-04    0.649023
2001-01-05   -1.471531
                ...   
2001-12-27   -1.145742
2001-12-28    1.445529
2001-12-29    0.214331
2001-12-30    0.364197
2001-12-31   -1.492766
Freq: D, Length: 365, dtype: float64

In [10]:
longer_ts['2001-05']  # a year-month string selects every observation in that month

2001-05-01   -0.006339
2001-05-02    1.603279
2001-05-03    0.503252
2001-05-04   -0.068931
2001-05-05    0.075273
2001-05-06    0.697045
2001-05-07    0.097354
2001-05-08   -0.046130
2001-05-09   -0.890969
2001-05-10    1.239910
2001-05-11   -0.013624
2001-05-12    0.556206
2001-05-13    0.623514
2001-05-14    0.355818
2001-05-15   -0.871199
2001-05-16    1.269865
2001-05-17    1.230412
2001-05-18   -1.126494
2001-05-19   -1.519000
2001-05-20    0.665996
2001-05-21    0.549444
2001-05-22   -0.395556
2001-05-23   -1.908281
2001-05-24   -0.615338
2001-05-25    0.270478
2001-05-26   -1.815768
2001-05-27   -0.821305
2001-05-28   -0.034489
2001-05-29   -0.032945
2001-05-30   -0.229862
2001-05-31   -0.791281
Freq: D, dtype: float64

In [11]:
ts[datetime(2011, 1, 7):]  # slice starting from a datetime object through the end of the series

2011-01-07   -0.199162
2011-01-08   -0.633558
2011-01-10    1.031363
2011-01-12   -0.108024
dtype: float64

In [12]:
ts['1/6/2011':'1/11/2011']  # slice bounds may be timestamps that are not present in the series

2011-01-07   -0.199162
2011-01-08   -0.633558
2011-01-10    1.031363
dtype: float64

In [13]:
ts.truncate(after='1/9/2011')  # the truncate() instance method can also slice; here it drops everything after 2011-01-09

2011-01-02    1.608209
2011-01-05    0.150012
2011-01-07   -0.199162
2011-01-08   -0.633558
dtype: float64

In [14]:
# The selection techniques shown above apply to DataFrames as well.
# Index: 100 weekly timestamps, each falling on a Wednesday (freq='W-WED').
dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
# Standard-normal values from a seeded Generator keep the displayed table
# reproducible across "Restart & Run All".
long_df = pd.DataFrame(np.random.default_rng(2).standard_normal((100, 4)),
                       index=dates,
                       columns=['Colorado', 'Texas', 'New York', 'Ohio'])

# A month-year string passed to .loc selects every row in May 2001.
long_df.loc['5-2001']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,0.74898,-0.642998,0.04898,0.91811
2001-05-09,1.389099,0.260297,-0.29279,-1.419831
2001-05-16,-0.892697,1.828225,0.811874,1.093462
2001-05-23,0.707199,0.436346,-0.727763,-1.637269
2001-05-30,-0.976827,-2.837436,1.05465,-0.291494


### 含有重复索引的时间序列

In [15]:
# Build an index in which 2000-01-02 appears three times, then attach
# the integers 0..4 to it.
dates = pd.DatetimeIndex(['1/1/2000'] + ['1/2/2000'] * 3 + ['1/3/2000'])
dup_ts = pd.Series(data=np.arange(5), index=dates)

dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int64

In [16]:
# is_unique reports whether every index label occurs only once;
# it is False here because 2000-01-02 appears three times.
dup_ts.index.is_unique

False

In [17]:
dup_ts['1/3/2000']  # label is not duplicated -> a scalar value

4

In [18]:
dup_ts['1/2/2000']  # label is duplicated -> a Series of all matching rows

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int64

In [19]:
# Group on the index itself (level=0) so that rows sharing a timestamp
# can be aggregated into a single value per timestamp.
grouped = dup_ts.groupby(level=0)
grouped.mean()

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int64

In [20]:
# Number of observations that share each timestamp.
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64