# 时间序列基础

In [4]:
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import timedelta
from dateutil.parser import parse
from pandas import DataFrame, Series

In [2]:
dates = [datetime(2011, 1, 2),
         datetime(2011, 1, 5),
         datetime(2011, 1, 7),
         datetime(2011, 1, 8),
         datetime(2011, 1, 10),
         datetime(2011, 1, 12)]
ts = Series(np.random.randn(6), index=dates)
print(type(ts)) # 注意，和书上说的TimeSeries不一样。
print(type(ts.index))
ts

<class 'pandas.core.series.Series'>
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>


2011-01-02    1.046782
2011-01-05   -0.750990
2011-01-07   -0.553274
2011-01-08    0.906677
2011-01-10   -0.503824
2011-01-12   -0.386186
dtype: float64

In [5]:
ts[::2] 

2011-01-02    1.046782
2011-01-07   -0.553274
2011-01-10   -0.503824
dtype: float64

In [4]:
ts + ts[::2] # 无法对齐的地方自动填充NA，且用+计算时，nan+1 = nan


2011-01-02    2.093565
2011-01-05         NaN
2011-01-07   -1.106548
2011-01-08         NaN
2011-01-10   -1.007648
2011-01-12         NaN
dtype: float64

In [6]:
ts.index.dtype # 以ns为单位保存时间戳


dtype('<M8[ns]')

In [7]:
stamp = ts.index[0]
stamp

Timestamp('2011-01-02 00:00:00')

## 索引、选取、子集构造

In [13]:
ts

2011-01-02    1.046782
2011-01-05   -0.750990
2011-01-07   -0.553274
2011-01-08    0.906677
2011-01-10   -0.503824
2011-01-12   -0.386186
dtype: float64

In [14]:
print(ts['1/10/2011'])
print(ts['20110110'])

# cc：当字段类型是datetime的时候；可以用不同格式来访问

-0.5038238867252985
-0.5038238867252985


### pd.date_range - 时间范围

In [15]:
longer_ts = Series(np.random.randn(1000),
                   index=pd.date_range('1/1/2000', periods=1000)) # 连续1000天的数据
print(longer_ts.head())
print(longer_ts.tail())

2000-01-01   -0.179448
2000-01-02    0.095848
2000-01-03    0.248319
2000-01-04   -0.812250
2000-01-05   -1.671145
Freq: D, dtype: float64
2002-09-22   -0.616625
2002-09-23    0.701875
2002-09-24   -1.682981
2002-09-25   -0.191608
2002-09-26   -1.456629
Freq: D, dtype: float64


### 时间索引

In [16]:
longer_ts['2001'].head() # 直接选年份

# cc：强！

2001-01-01    0.658280
2001-01-02   -0.997487
2001-01-03    1.116950
2001-01-04    0.652873
2001-01-05    0.051359
Freq: D, dtype: float64

In [17]:
longer_ts['2001-05'].head() # 年 + 月


2001-05-01    0.303465
2001-05-02   -1.188541
2001-05-03   -0.876990
2001-05-04   -1.465803
2001-05-05    2.142942
Freq: D, dtype: float64

### 切片

In [19]:
dates = [datetime(2011, 1, 2),
         datetime(2011, 1, 5),
         datetime(2011, 1, 7),
         datetime(2011, 1, 8),
         datetime(2011, 1, 10),
         datetime(2011, 1, 12)] # 重新构造一遍，便于查询数据
ts = Series(np.random.randn(6), index=dates)
ts

2011-01-02    0.892750
2011-01-05    0.715010
2011-01-07    0.695968
2011-01-08   -1.054984
2011-01-10    0.992040
2011-01-12   -0.458572
dtype: float64

In [20]:
ts[datetime(2011, 1, 7):] # 切片返回 2011-01-07 及以后


2011-01-07    0.695968
2011-01-08   -1.054984
2011-01-10    0.992040
2011-01-12   -0.458572
dtype: float64

In [21]:
ts['1/6/2011':'1/20/2011'] # 1/6和1/20不存在没关系，自动会查。


2011-01-07    0.695968
2011-01-08   -1.054984
2011-01-10    0.992040
2011-01-12   -0.458572
dtype: float64

### ts.truncate - 类似切片做时间

In [24]:
ts.truncate(after='1/8/2011') # 截取到2011/1/8

# cc：和切片一个道理。
# 见官方文档：https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.truncate.html

2011-01-02    0.892750
2011-01-05    0.715010
2011-01-07    0.695968
2011-01-08   -1.054984
dtype: float64

In [25]:
dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
long_df = DataFrame(np.random.randn(100, 4),
                    index=dates,
                    columns=['Colorado', 'Texas', 'New York', 'Ohio'])
long_df.head()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,0.698443,1.208438,-0.186166,-0.60344
2000-01-12,-0.696785,2.436136,-1.413933,-1.361121
2000-01-19,-0.83101,0.622417,0.040842,0.617581
2000-01-26,-0.161445,-0.461966,1.308103,0.310308
2000-02-02,-2.140641,-0.610529,1.333593,0.516684


In [26]:
long_df.loc['5-2001'] # 2001年5月


Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,0.555311,-0.503932,0.394058,-1.205138
2001-05-09,-0.625032,-0.8616,-0.366383,0.296367
2001-05-16,-0.270204,-0.55695,-0.149391,-0.149297
2001-05-23,0.277581,1.325862,-0.479486,0.705575
2001-05-30,-0.273454,0.101053,1.416024,0.056522


In [27]:
long_df['5-2001'] # cc：虽然这种方式也可以，但是见warning，建议还是用上面的方式；
# （这是对DataFrame的要求，对Series就没有这个要求了！）

  long_df['5-2001']


Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,0.555311,-0.503932,0.394058,-1.205138
2001-05-09,-0.625032,-0.8616,-0.366383,0.296367
2001-05-16,-0.270204,-0.55695,-0.149391,-0.149297
2001-05-23,0.277581,1.325862,-0.479486,0.705575
2001-05-30,-0.273454,0.101053,1.416024,0.056522


## 带有重复索引的时间序列

In [30]:
dates = pd.DatetimeIndex(['1/1/2000',
                          '1/2/2000',
                          '1/2/2000',
                          '1/2/2000',
                          '1/3/2000'])
dup_ts = Series(np.arange(5), index=dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int64

In [31]:
dup_ts.index.is_unique


False

In [32]:
dup_ts['1/3/2000'] # 不重复


4

In [33]:
dup_ts['1/2/2000'] # 重复


2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int64

In [5]:
# 查看相关参数
DataFrame.groupby?

In [35]:
grouped = dup_ts.groupby(level=0)
# 参数level：
# If the axis is a MultiIndex (hierarchical), group by a particular level or levels.
# 如果轴是 MultiIndex (分层) ，则按特定级别进行分组。
print(grouped.mean())
print(grouped.count())


2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int64
2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64
