# datetime模块

In [115]:
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
from datetime import datetime
from datetime import timedelta
from dateutil.parser import parse

In [101]:
# Capture the current date and time; the bare name on the last line displays it.
now = datetime.now()
now

datetime.datetime(2018, 5, 30, 16, 51, 57, 182245)

In [102]:
# Year, month and day fields of `now`, shown together as a tuple.
now.year,now.month,now.day

(2018, 5, 30)

# delta

In [103]:
# Subtracting one datetime from another yields a timedelta.
earlier = datetime(2008, 6, 24, 8, 15)
later = datetime(2011, 1, 7)
delta = later - earlier

delta  # 926 days, 15:45:00

datetime.timedelta(926, 56700)

In [104]:
# Whole-day component of the timedelta.
delta.days

926

In [105]:
# Seconds component of the timedelta (the part left over after whole days).
delta.seconds

56700

In [107]:
# Adding a timedelta to a datetime moves it forward in time.
start = datetime(year=2011, month=1, day=7)
start + timedelta(days=12)  # 12 days later

datetime.datetime(2011, 1, 19, 0, 0)

In [108]:
# A timedelta can be scaled by an integer: step back 2 * 12 = 24 days from `start`.
start - 2 * timedelta(12)

datetime.datetime(2010, 12, 14, 0, 0)

# 字符串和datetime的相互转换

In [110]:
# A fixed datetime used by the string-conversion examples that follow.
stamp = datetime(year=2011, month=1, day=3)
stamp

datetime.datetime(2011, 1, 3, 0, 0)

In [111]:
# str() renders the datetime as date and time separated by a space.
str(stamp)

'2011-01-03 00:00:00'

## strftime格式化

In [112]:
stamp.strftime('%Y-%m-%d')    # format the datetime as a string using an explicit strftime pattern

'2011-01-03'

## strptime已知格式日期解析

In [113]:
# strptime parses a string with a known layout back into a datetime.
value = '2011-01-03'
iso_day_format = '%Y-%m-%d'
datetime.strptime(value, iso_day_format)

datetime.datetime(2011, 1, 3, 0, 0)

In [114]:
# Parse several month/day/year strings with one shared format.
datestrs = ['7/6/2011','8/6/2011']
month_day_year = '%m/%d/%Y'
[datetime.strptime(text, month_day_year) for text in datestrs]

[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]

## dateutil中的parser.parse

In [116]:
# dateutil's parse() works out the layout itself; no format string is supplied.
parse('2011-01-03')

datetime.datetime(2011, 1, 3, 0, 0)

In [117]:
# A month name and a 12-hour clock time with AM/PM are also understood.
parse('Jan 31, 1997 10:45 PM')

datetime.datetime(1997, 1, 31, 22, 45)

### dayfirst = True

In [122]:
# dayfirst=True reads the leading number as the day: 6 December 2011 rather than 12 June.
parse('6/12/2011',dayfirst = True)

datetime.datetime(2011, 12, 6, 0, 0)

### to_datetime

In [119]:
# Re-display the date strings defined earlier.
datestrs

['7/6/2011', '8/6/2011']

In [120]:
# pd.to_datetime converts the whole list of date strings into a DatetimeIndex.
pd.to_datetime(datestrs)

DatetimeIndex(['2011-07-06', '2011-08-06'], dtype='datetime64[ns]', freq=None)

### 处理缺失值

In [123]:
# A None entry in the input becomes NaT in the resulting index.
idx = pd.to_datetime(datestrs + [None])
idx

DatetimeIndex(['2011-07-06', '2011-08-06', 'NaT'], dtype='datetime64[ns]', freq=None)

In [124]:
idx[2] # NaT (Not a Time) is pandas' NA value for timestamp data

NaT

In [125]:
# Boolean mask marking which entries of idx are missing (NaT).
pd.isnull(idx)

array([False, False,  True], dtype=bool)

# 时间序列基础

In [126]:
# Six irregularly spaced days in January 2011.
dates = [datetime(2011, 1, day) for day in (2, 5, 7, 8, 10, 12)]
dates

[datetime.datetime(2011, 1, 2, 0, 0),
 datetime.datetime(2011, 1, 5, 0, 0),
 datetime.datetime(2011, 1, 7, 0, 0),
 datetime.datetime(2011, 1, 8, 0, 0),
 datetime.datetime(2011, 1, 10, 0, 0),
 datetime.datetime(2011, 1, 12, 0, 0)]

In [127]:
# Six random values labelled by the datetime objects in `dates`.
ts = Series(data=np.random.randn(6), index=dates)
ts   # a Series indexed by dates, i.e. a time series

2011-01-02    1.010280
2011-01-05   -1.346911
2011-01-07    0.897244
2011-01-08   -1.101381
2011-01-10   -1.088600
2011-01-12   -0.772412
dtype: float64

In [129]:
# Arithmetic aligns on the date index; dates missing from ts[::2] produce NaN.
ts + ts[::2]

2011-01-02    2.020561
2011-01-05         NaN
2011-01-07    1.794489
2011-01-08         NaN
2011-01-10   -2.177200
2011-01-12         NaN
dtype: float64

In [None]:
# pandas stores timestamps using NumPy's datetime64 dtype at nanosecond resolution

In [130]:
# dtype of the date index built from the datetime objects.
ts.index.dtype

dtype('<M8[ns]')

In [133]:
# Take the first label out of the index and display it.
# Note: this rebinds `stamp`, which earlier held a plain datetime.
stamp = ts.index[0]
stamp

Timestamp('2011-01-02 00:00:00')

# 索引、选取、子集构造

In [134]:
# Re-display ts for reference before indexing into it.
ts

2011-01-02    1.010280
2011-01-05   -1.346911
2011-01-07    0.897244
2011-01-08   -1.101381
2011-01-10   -1.088600
2011-01-12   -0.772412
dtype: float64

## 数据索引

In [135]:
# Look up a value using a label taken from the index itself (the third entry).
stamp = ts.index[2]
ts[stamp]

0.89724445392747554

In [136]:
# A string that can be read as a date also works as a label.
ts['1/10/2011']

-1.088599889389624

In [137]:
# The same lookup written as a compact YYYYMMDD string.
ts['20110110']

-1.088599889389624

## 数据切片

In [138]:
# A longer series: 1000 consecutive dates starting 2000-01-01 with random values.
thousand_days = pd.date_range('1/1/2000', periods=1000)  # 1000 days
longer_ts = Series(np.random.randn(1000), index=thousand_days)
longer_ts

2000-01-01    0.794262
2000-01-02   -0.604685
2000-01-03    0.215134
2000-01-04    0.737016
2000-01-05   -2.328088
2000-01-06    1.385121
2000-01-07    2.253525
2000-01-08    0.629555
2000-01-09   -1.066542
2000-01-10   -0.182923
2000-01-11    0.847798
2000-01-12   -0.564487
2000-01-13   -0.293050
2000-01-14    0.465107
2000-01-15    0.364380
2000-01-16    0.532751
2000-01-17    1.029708
2000-01-18   -0.536929
2000-01-19   -0.031914
2000-01-20    0.621849
2000-01-21   -0.802742
2000-01-22    0.098112
2000-01-23    0.334317
2000-01-24    0.231633
2000-01-25   -1.509906
2000-01-26    0.164808
2000-01-27   -1.090271
2000-01-28   -0.544581
2000-01-29    0.821682
2000-01-30   -0.834326
                ...   
2002-08-28    0.915462
2002-08-29   -0.469665
2002-08-30    0.971101
2002-08-31   -0.123663
2002-09-01    1.054634
2002-09-02    1.567873
2002-09-03   -1.256964
2002-09-04   -0.227342
2002-09-05    1.715745
2002-09-06   -0.316304
2002-09-07    0.973518
2002-09-08    0.013857
2002-09-09 

In [139]:
# Passing only a year selects every row that falls in that year.
longer_ts['2001']

2001-01-01   -0.936508
2001-01-02   -1.212426
2001-01-03    0.407243
2001-01-04    0.222173
2001-01-05    0.705255
2001-01-06    0.533705
2001-01-07   -0.271035
2001-01-08    1.010877
2001-01-09   -1.007625
2001-01-10   -0.059173
2001-01-11   -1.643034
2001-01-12    0.546990
2001-01-13    0.413876
2001-01-14   -0.561846
2001-01-15    0.131112
2001-01-16   -0.113195
2001-01-17   -2.152430
2001-01-18   -0.825520
2001-01-19    0.617260
2001-01-20   -1.251264
2001-01-21   -1.211746
2001-01-22   -0.485480
2001-01-23    0.174256
2001-01-24    0.007197
2001-01-25   -1.792217
2001-01-26    2.066472
2001-01-27    0.116981
2001-01-28    1.441366
2001-01-29   -0.316217
2001-01-30   -0.254210
                ...   
2001-12-02   -0.312558
2001-12-03   -0.773886
2001-12-04   -0.862418
2001-12-05   -0.387320
2001-12-06   -0.658100
2001-12-07    1.498470
2001-12-08   -0.058024
2001-12-09    0.269673
2001-12-10    0.040084
2001-12-11   -0.694445
2001-12-12    1.477408
2001-12-13   -1.980331
2001-12-14 

In [140]:
# Slice from a datetime object through to the end of the series.
ts[datetime(2011,1,7):]

2011-01-07    0.897244
2011-01-08   -1.101381
2011-01-10   -1.088600
2011-01-12   -0.772412
dtype: float64

In [141]:
# Slice bounds need not be present in the index: neither 1/6 nor 1/11 is a label here.
ts['1/6/2011':'1/11/2011']

2011-01-07    0.897244
2011-01-08   -1.101381
2011-01-10   -1.088600
dtype: float64

## truncate

In [142]:
# truncate(after=...) drops every row later than the given date.
ts.truncate(after = '1/9/2011')

2011-01-02    1.010280
2011-01-05   -1.346911
2011-01-07    0.897244
2011-01-08   -1.101381
dtype: float64

## 以上索引与切片操作对DataFrame同样有效

In [143]:
# 100 weekly timestamps anchored on Wednesdays, beginning at the first one on or after 2000-01-01.
dates = pd.date_range(start='1/1/2000', periods=100, freq='W-WED')
dates

DatetimeIndex(['2000-01-05', '2000-01-12', '2000-01-19', '2000-01-26',
               '2000-02-02', '2000-02-09', '2000-02-16', '2000-02-23',
               '2000-03-01', '2000-03-08', '2000-03-15', '2000-03-22',
               '2000-03-29', '2000-04-05', '2000-04-12', '2000-04-19',
               '2000-04-26', '2000-05-03', '2000-05-10', '2000-05-17',
               '2000-05-24', '2000-05-31', '2000-06-07', '2000-06-14',
               '2000-06-21', '2000-06-28', '2000-07-05', '2000-07-12',
               '2000-07-19', '2000-07-26', '2000-08-02', '2000-08-09',
               '2000-08-16', '2000-08-23', '2000-08-30', '2000-09-06',
               '2000-09-13', '2000-09-20', '2000-09-27', '2000-10-04',
               '2000-10-11', '2000-10-18', '2000-10-25', '2000-11-01',
               '2000-11-08', '2000-11-15', '2000-11-22', '2000-11-29',
               '2000-12-06', '2000-12-13', '2000-12-20', '2000-12-27',
               '2001-01-03', '2001-01-10', '2001-01-17', '2001-01-24',
      

In [144]:
# Random 100x4 frame indexed by `dates`, with one column per state.
state_columns = ['Colorado','Texas','New York','Ohio']
long_df = DataFrame(np.random.randn(100, 4), index=dates, columns=state_columns)
long_df

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,0.869380,1.776743,0.897022,0.217430
2000-01-12,0.424569,0.969125,-0.545287,0.672040
2000-01-19,-0.820177,2.787700,-1.246517,0.239366
2000-01-26,0.573652,0.206010,1.909127,0.161647
2000-02-02,-1.512479,-1.252114,-0.975916,1.184940
2000-02-09,-0.826200,0.606439,-0.351658,-0.405876
2000-02-16,-0.073725,0.378664,0.823275,-0.589091
2000-02-23,-0.006472,0.627340,-2.113636,-0.772990
2000-03-01,-0.659447,-0.208751,0.972212,1.183451
2000-03-08,-1.231874,0.599023,-1.399217,0.080979


# 带有重复索引的时间序列

In [145]:
# Five timestamps in which 2000-01-02 occurs three times.
date_labels = ['1/1/2000'] + ['1/2/2000'] * 3 + ['1/3/2000']
dates = pd.DatetimeIndex(date_labels)
dates

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-02', '2000-01-02',
               '2000-01-03'],
              dtype='datetime64[ns]', freq=None)

In [146]:
# Values 0..4 labelled by the index that contains repeated dates.
dup_ts = Series(data=np.arange(5), index=dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

In [147]:
# Reports whether every index label occurs exactly once.
dup_ts.index.is_unique

False

In [148]:
# A label that occurs once yields a scalar.
dup_ts['1/3/2000']

4

In [149]:
# A label that occurs several times yields a Series with every matching row.
dup_ts['1/2/2000']

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int32

In [150]:
# Group on the index (level 0) so that rows sharing a timestamp fall in the same group.
grouped = dup_ts.groupby(level=0)

In [151]:
# Mean of the values at each distinct timestamp.
grouped.mean()

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32

In [152]:
# Number of rows at each distinct timestamp.
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64

# 日期的范围、频率及移动

In [153]:
# Resample to a fixed daily frequency; days without an observation come out as NaN.
ts.resample('D').mean()

2011-01-02    1.010280
2011-01-03         NaN
2011-01-04         NaN
2011-01-05   -1.346911
2011-01-06         NaN
2011-01-07    0.897244
2011-01-08   -1.101381
2011-01-09         NaN
2011-01-10   -1.088600
2011-01-11         NaN
2011-01-12   -0.772412
Freq: D, dtype: float64

# 生成日期范围

In [154]:
# Every calendar day from 2012-04-01 through 2012-06-01 (both endpoints included).
index = pd.date_range(start='4/1/2012', end='6/1/2012')

In [155]:
# Give a start date and a count instead of an end date: 20 dates going forward.
pd.date_range(start='4/1/2012',periods=20)

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20'],
              dtype='datetime64[ns]', freq='D')

In [156]:
# Give an end date and a count: the 20 dates finishing on 2012-06-01.
pd.date_range(end='6/1/2012',periods=20)

DatetimeIndex(['2012-05-13', '2012-05-14', '2012-05-15', '2012-05-16',
               '2012-05-17', '2012-05-18', '2012-05-19', '2012-05-20',
               '2012-05-21', '2012-05-22', '2012-05-23', '2012-05-24',
               '2012-05-25', '2012-05-26', '2012-05-27', '2012-05-28',
               '2012-05-29', '2012-05-30', '2012-05-31', '2012-06-01'],
              dtype='datetime64[ns]', freq='D')

# BM（business end of month）

In [157]:
# Last business day of every month in the range.
# The offset object replaces the 'BM' alias string: pandas 2.2 deprecated 'BM'
# (renamed to 'BME'), whereas the BMonthEnd offset is accepted by old and new releases.
bm_dates = pd.date_range('1/1/2000', '12/1/2000', freq=pd.offsets.BMonthEnd())
bm_dates

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-28',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',
               '2000-09-29', '2000-10-31', '2000-11-30'],
              dtype='datetime64[ns]', freq='BM')

In [158]:
# A start timestamp that includes a time of day carries that time through the range.
pd.date_range('5/2/2012 12:56:31',periods=5)

DatetimeIndex(['2012-05-02 12:56:31', '2012-05-03 12:56:31',
               '2012-05-04 12:56:31', '2012-05-05 12:56:31',
               '2012-05-06 12:56:31'],
              dtype='datetime64[ns]', freq='D')

In [159]:
# normalize=True snaps the generated timestamps to midnight.
pd.date_range('5/2/2012 12:56:31',periods=5,normalize=True)

DatetimeIndex(['2012-05-02', '2012-05-03', '2012-05-04', '2012-05-05',
               '2012-05-06'],
              dtype='datetime64[ns]', freq='D')

# 频率和日期偏移量

In [161]:
from pandas.tseries.offsets import Hour,Minute

In [162]:
# An Hour offset created with no argument stands for a single hour.
hour = Hour()
hour

<Hour>

In [163]:
# Passing an integer gives a multiple of the base offset: four hours.
four_hours = Hour(n=4)
four_hours

<4 * Hours>

In [164]:
# The multiple can also be written directly in the frequency string: one timestamp every 4 hours.
pd.date_range('1/1/2000','1/3/2000 23:59',freq='4h')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00', '2000-01-03 04:00:00',
               '2000-01-03 08:00:00', '2000-01-03 12:00:00',
               '2000-01-03 16:00:00', '2000-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [165]:
# Offsets combine by addition: 2 hours + 30 minutes = 150 minutes.
Hour(2)+Minute(30)

<150 * Minutes>

In [166]:
# A compound frequency string: steps of 1 hour 30 minutes.
pd.date_range('1/1/2000',periods=10,freq='1h30min')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00', '2000-01-01 07:30:00',
               '2000-01-01 09:00:00', '2000-01-01 10:30:00',
               '2000-01-01 12:00:00', '2000-01-01 13:30:00'],
              dtype='datetime64[ns]', freq='90T')

# WOM日期(week of month)

In [167]:
# Week-of-month frequency: the third Friday of each month in the range.
rng = pd.date_range(start='1/1/2012', end='9/1/2012', freq='WOM-3FRI')

In [168]:
# Display the third-Friday dates generated above.
rng

DatetimeIndex(['2012-01-20', '2012-02-17', '2012-03-16', '2012-04-20',
               '2012-05-18', '2012-06-15', '2012-07-20', '2012-08-17'],
              dtype='datetime64[ns]', freq='WOM-3FRI')

# 移动数据

In [169]:
# Four random values stamped at consecutive month ends, starting January 2000.
# The MonthEnd offset object replaces the 'M' alias string: pandas 2.2 deprecated 'M'
# (renamed to 'ME'), whereas the offset object is accepted by old and new releases.
ts = Series(np.random.randn(4),
           index = pd.date_range('1/1/2000',periods=4,freq=pd.offsets.MonthEnd()))
ts

2000-01-31    0.927850
2000-02-29   -1.213890
2000-03-31   -1.220074
2000-04-30   -0.524616
Freq: M, dtype: float64

In [170]:
# Shift the values two positions forward; the index stays put and the first two slots become NaN.
ts.shift(2)

2000-01-31        NaN
2000-02-29        NaN
2000-03-31    0.92785
2000-04-30   -1.21389
Freq: M, dtype: float64

In [171]:
# Shift the values two positions backward; the last two slots become NaN.
ts.shift(-2)

2000-01-31   -1.220074
2000-02-29   -0.524616
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64

In [172]:
# With a frequency given, the timestamps move (two month ends forward) and no values are lost.
# The MonthEnd offset object replaces the 'M' alias string, which pandas 2.2 deprecated
# in favour of 'ME'; the offset object is accepted by old and new releases.
ts.shift(2, freq=pd.offsets.MonthEnd())

2000-03-31    0.927850
2000-04-30   -1.213890
2000-05-31   -1.220074
2000-06-30   -0.524616
Freq: M, dtype: float64

In [173]:
# Move every timestamp forward by three steps of one day; the values are unchanged.
ts.shift(3,freq='D')

2000-02-03    0.927850
2000-03-03   -1.213890
2000-04-03   -1.220074
2000-05-03   -0.524616
dtype: float64

In [174]:
# Same shift expressed as one step of a 3-day frequency.
ts.shift(1,freq='3D')

2000-02-03    0.927850
2000-03-03   -1.213890
2000-04-03   -1.220074
2000-05-03   -0.524616
dtype: float64

In [175]:
# Move every timestamp forward by 90 minutes.
# 'min' is used instead of the 'T' alias, which pandas 2.2 deprecated; 'min' is
# understood by older releases as well.
ts.shift(1, freq='90min')

2000-01-31 01:30:00    0.927850
2000-02-29 01:30:00   -1.213890
2000-03-31 01:30:00   -1.220074
2000-04-30 01:30:00   -0.524616
Freq: M, dtype: float64