In [1]:
import numpy as np
import pandas as pd

# Notebook-wide display settings: plain-text (non-HTML) reprs, at most
# 10 columns and 7 rows per repr, and a float display precision of 7.
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 7)
# use the fully qualified option name: the bare 'precision' shortcut can
# match more than one option and is rejected by current pandas releases
pd.set_option('display.precision', 7)

# useful for date/time manipulations
import datetime
from datetime import datetime

# And some items for matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 
# give all figures a ggplot-like look through matplotlib's own style
# sheets; the pandas 'display.mpl_style' option is no longer available
# in current pandas releases
plt.style.use('ggplot')

# Time-series data and the DatetimeIndex

In [2]:
# build a DatetimeIndex directly from a list of datetime objects
first_day = datetime(2014, 8, 1)
second_day = datetime(2014, 8, 2)
dates = [first_day, second_day]
dti = pd.DatetimeIndex(data=dates)
dti

<class 'pandas.tseries.index.DatetimeIndex'>
[2014-08-01, 2014-08-02]
Length: 2, Freq: None, Timezone: None

In [3]:
# handing a list of datetime objects to Series as its index
# produces a DatetimeIndex without any explicit conversion
np.random.seed(123456)
values = np.random.randn(2)
ts = pd.Series(values, index=dates)
type(ts.index)

pandas.tseries.index.DatetimeIndex

In [4]:
# look up a single value in ts by passing a datetime object
# as the index label
ts[datetime(2014, 8, 2)]

-0.28286334432866328

In [5]:
# the same lookup also works when the label is written as a
# date string; it returns the same value as the datetime lookup
ts['2014-8-2']

-0.28286334432866328

In [6]:
# create a Series with a DatetimeIndex using strings as dates
np.random.seed(123456)
dates = ['2014-08-01', '2014-08-02']
# a plain list of strings would be kept as an index of strings, so
# convert it explicitly to obtain a real DatetimeIndex
ts = pd.Series(np.random.randn(2), pd.to_datetime(dates))
ts

2014-08-01    0.469112
2014-08-02   -0.282863
dtype: float64

In [7]:
# convert a list of differently formatted items to a DatetimeIndex
# (the None entry ends up as NaT)
raw_dates = ['Aug 1, 2014', '2014-08-02',
             '2014.8.3', None]
# parse every item on its own so that each string may use its own
# format; converting the whole list in one call makes current pandas
# expect a single shared format and reject this mixed list
dti = pd.DatetimeIndex([pd.to_datetime(item) for item in raw_dates])
dti

<class 'pandas.tseries.index.DatetimeIndex'>
[2014-08-01, ..., NaT]
Length: 4, Freq: None, Timezone: None

In [8]:
# watch out: when an item in the list cannot be converted to a
# date/time, the recorded result of this cell is a NumPy array
# instead of a DatetimeIndex
# NOTE(review): this reflects the pandas version the notebook was run
# with; newer releases presumably raise on 'foo' instead -- confirm
# before re-running
dti2 = pd.to_datetime(['Aug 1, 2014', 'foo'])
type(dti2)

numpy.ndarray

In [9]:
# force every item to be converted, yielding a DatetimeIndex in which
# values that can not be converted are replaced by NaT
# (errors='coerce' is the spelling accepted by current pandas; the
# older coerce=True keyword is no longer available there)
coerced = pd.to_datetime(['Aug 1, 2014', 'foo'], errors='coerce')
coerced

<class 'pandas.tseries.index.DatetimeIndex'>
[2014-08-01, NaT]
Length: 2, Freq: None, Timezone: None

In [10]:
# the same calendar date written month-first and day-first;
# with dayfirst=True the second string parses to the same
# Timestamp as the first
month_first = ['8/1/2014']
day_first = ['1/8/2014']
dti1 = pd.to_datetime(month_first)
dti2 = pd.to_datetime(day_first, dayfirst=True)
(dti1[0], dti2[0])

(Timestamp('2014-08-01 00:00:00'), Timestamp('2014-08-01 00:00:00'))

In [11]:
# build a Series of 10 random values indexed by 10 consecutive
# days beginning on 8/1/2014, then show the first five
np.random.seed(123456)
dates = pd.date_range(start='8/1/2014', periods=10)
s1 = pd.Series(data=np.random.randn(10), index=dates)
s1.head(5)

2014-08-01    0.469112
2014-08-02   -0.282863
2014-08-03   -1.509059
2014-08-04   -1.135632
2014-08-05    1.212112
Freq: D, dtype: float64

In [12]:
# for examples of data retrieval / slicing, we will use the
# following data from Yahoo! Finance: MSFT quotes requested for
# 2012-1-1 through 2013-12-30
# NOTE(review): pandas.io.data is not shipped with newer pandas
# releases (presumably superseded by the separate pandas-datareader
# package) and this call needs network access -- confirm before
# re-running
import pandas.io.data as web
msft = web.DataReader("MSFT", 'yahoo', '2012-1-1', '2013-12-30')
msft.head(5)

             Open   High    Low  Close    Volume  Adj Close
Date                                                       
2012-01-03  26.55  26.96  26.39  26.77  64731500   24.42183
2012-01-04  26.82  27.47  26.78  27.40  80516100   24.99657
2012-01-05  27.38  27.73  27.29  27.68  56081400   25.25201
2012-01-06  27.53  28.19  27.53  28.11  99455500   25.64429
2012-01-09  28.05  28.10  27.72  27.74  59706800   25.30675

In [13]:
# pull the 'Adj Close' column out as a standalone Series and
# show its first three entries
msftAC = msft.loc[:, 'Adj Close']
msftAC[:3]

Date
2012-01-03    24.42183
2012-01-04    24.99657
2012-01-05    25.25201
Name: Adj Close, dtype: float64

In [14]:
# rows of a DatetimeIndex-ed frame can be sliced with the
# start and end dates given as strings
msft.loc['2012-01-01':'2012-01-05']

             Open   High    Low  Close    Volume  Adj Close
Date                                                       
2012-01-03  26.55  26.96  26.39  26.77  64731500   24.42183
2012-01-04  26.82  27.47  26.78  27.40  80516100   24.99657
2012-01-05  27.38  27.73  27.29  27.68  56081400   25.25201

In [15]:
# .loc with a single date label returns that one row as a Series
# whose index consists of the column names
msft.loc['2012-01-03']

Open               26.55000
High               26.96000
Low                26.39000
Close              26.77000
Volume       64731500.00000
Adj Close          24.42183
Name: 2012-01-03 00:00:00, dtype: float64

In [16]:
# this is an error: indexing the DataFrame with [] tries to retrieve
# a column named '2012-01-03' rather than a row; use
# msft.loc['2012-01-03'] for the row lookup
# msft['2012-01-03'] # commented to prevent killing the notebook

In [17]:
# msftAC is a Series, so [] looks the date up in the index
# and the lookup works
msftAC['2012-01-03']

24.42183

In [18]:
# we can look up rows using partial date specifications
# such as only year and month
# (.loc is used because [] on a DataFrame selects columns; row
# selection by a bare date string through [] is not supported by
# current pandas)
msft.loc['2012-02'].head(5)

             Open   High    Low  Close    Volume  Adj Close
Date                                                       
2012-02-01  29.79  30.05  29.76  29.89  67409900   27.26815
2012-02-02  29.90  30.17  29.71  29.95  52223300   27.32289
2012-02-03  30.14  30.40  30.09  30.24  41838500   27.58745
2012-02-06  30.04  30.22  29.97  30.20  28039700   27.55096
2012-02-07  30.15  30.49  30.05  30.35  39242400   27.68781

In [19]:
# take the rows from the start of Feb 2012 through
# Feb 9 2012 and show the first five of them
msft.loc['2012-02':'2012-02-09'].head(5)

             Open   High    Low  Close    Volume  Adj Close
Date                                                       
2012-02-01  29.79  30.05  29.76  29.89  67409900   27.26815
2012-02-02  29.90  30.17  29.71  29.95  52223300   27.32289
2012-02-03  30.14  30.40  30.09  30.24  41838500   27.58745
2012-02-06  30.04  30.22  29.97  30.20  28039700   27.55096
2012-02-07  30.15  30.49  30.05  30.35  39242400   27.68781

# Creating time-series with specific frequencies

In [20]:
# create a time-series with one minute frequency covering
# 2014-08-01 00:00 through 2014-10-29 23:59 (90 days)
# 'min' is the minute alias accepted by both old and current pandas;
# the one-letter 'T' alias is deprecated in recent releases
minute_index = pd.date_range('2014-08-01',
                             '2014-10-29 23:59:00',
                             freq='min')
# number the minutes 0..n-1; the length is taken from the index so the
# values can never fall out of step with the date range
bymin = pd.Series(np.arange(len(minute_index)), minute_index)
bymin

2014-08-01 00:00:00         0
2014-08-01 00:01:00         1
2014-08-01 00:02:00         2
                        ...  
2014-10-29 23:57:00    129597
2014-10-29 23:58:00    129598
2014-10-29 23:59:00    129599
Freq: T, dtype: int64

In [21]:
# slice at the minute level: both end points are given as
# date strings with hour and minute, and both are included
bymin['2014-08-01 12:30':'2014-08-01 12:59']

2014-08-01 12:30:00    750
2014-08-01 12:31:00    751
2014-08-01 12:32:00    752
                      ... 
2014-08-01 12:57:00    777
2014-08-01 12:58:00    778
2014-08-01 12:59:00    779
Freq: T, dtype: int64

# Representing intervals of time using periods

In [22]:
# a Period that starts at 2014-08 and lasts
# for one month
aug2014 = pd.Period(year=2014, month=8, freq='M')
aug2014

Period('2014-08', 'M')

In [23]:
# the start and end timestamps pandas determined for the
# period (the end is the last instant of the month)
aug2014.start_time, aug2014.end_time

(Timestamp('2014-08-01 00:00:00'), Timestamp('2014-08-31 23:59:59.999999999'))

In [24]:
# what is the one month period following the given period?
# adding 1 to a Period moves it forward by one unit of its
# frequency, here one month
sep2014 = aug2014 + 1
sep2014

Period('2014-09', 'M')

In [25]:
# the calculated start and end of the following period;
# the end reflects September's 30 days
sep2014.start_time, sep2014.end_time

(Timestamp('2014-09-01 00:00:00'), Timestamp('2014-09-30 23:59:59.999999999'))

In [26]:
# a PeriodIndex holding the monthly periods of 2013
mp2013 = pd.period_range(start='1/1/2013', end='12/31/2013', freq='M')
mp2013

<class 'pandas.tseries.period.PeriodIndex'>
[2013-01, ..., 2013-12]
Length: 12, Freq: M

In [27]:
# dump all the calculated periods: one line per period with its
# label, frequency, start time and end time
for p in mp2013: 
    # print is called with a single parenthesized argument so the
    # cell runs unchanged under both Python 2 and Python 3
    print("{0} {1} {2} {3}".format(p, 
                                   p.freq, 
                                   p.start_time, 
                                   p.end_time))

2013-01 M 2013-01-01 00:00:00 2013-01-31 23:59:59.999999999
2013-02 M 2013-02-01 00:00:00 2013-02-28 23:59:59.999999999
2013-03 M 2013-03-01 00:00:00 2013-03-31 23:59:59.999999999
2013-04 M 2013-04-01 00:00:00 2013-04-30 23:59:59.999999999
2013-05 M 2013-05-01 00:00:00 2013-05-31 23:59:59.999999999
2013-06 M 2013-06-01 00:00:00 2013-06-30 23:59:59.999999999
2013-07 M 2013-07-01 00:00:00 2013-07-31 23:59:59.999999999
2013-08 M 2013-08-01 00:00:00 2013-08-31 23:59:59.999999999
2013-09 M 2013-09-01 00:00:00 2013-09-30 23:59:59.999999999
2013-10 M 2013-10-01 00:00:00 2013-10-31 23:59:59.999999999
2013-11 M 2013-11-01 00:00:00 2013-11-30 23:59:59.999999999
2013-12 M 2013-12-01 00:00:00 2013-12-31 23:59:59.999999999


In [28]:
# a Series of 12 random values, one per monthly period of
# the PeriodIndex
np.random.seed(123456)
monthly_values = np.random.randn(12)
ps = pd.Series(monthly_values, index=mp2013)
ps

2013-01    0.469112
2013-02   -0.282863
2013-03   -1.509059
             ...   
2013-10   -2.104569
2013-11   -0.494929
2013-12    1.071804
Freq: M, dtype: float64

# Shifting and lagging time-series data

In [29]:
# refresh our memory on the data in the MSFT adjusted closing
# prices Series (first five entries)
msftAC[:5]

Date
2012-01-03    24.42183
2012-01-04    24.99657
2012-01-05    25.25201
2012-01-06    25.64429
2012-01-09    25.30675
Name: Adj Close, dtype: float64

In [30]:
# move every price one index position later; the first
# position is left without a value
shifted_forward = msftAC.shift(periods=1)
shifted_forward.head(5)

Date
2012-01-03         NaN
2012-01-04    24.42183
2012-01-05    24.99657
2012-01-06    25.25201
2012-01-09    25.64429
Name: Adj Close, dtype: float64

In [31]:
# the last item is also shifted away: compare the tail of the
# original Series with the tail of the shifted one
msftAC.tail(5), shifted_forward.tail(5)

(Date
 2013-12-23    35.39210
 2013-12-24    35.83668
 2013-12-26    36.18461
 2013-12-27    36.03964
 2013-12-30    36.03964
 Name: Adj Close, dtype: float64, Date
 2013-12-23    35.56607
 2013-12-24    35.39210
 2013-12-26    35.83668
 2013-12-27    36.18461
 2013-12-30    36.03964
 Name: Adj Close, dtype: float64)

In [32]:
# move every price two index positions earlier
shifted_backwards = msftAC.shift(periods=-2)
shifted_backwards.head(5)

Date
2012-01-03    25.25201
2012-01-04    25.64429
2012-01-05    25.30675
2012-01-06    25.39797
2012-01-09    25.28850
Name: Adj Close, dtype: float64

In [33]:
# the backwards shift has left 2 NaN values at
# the end of the resulting Series
shifted_backwards.tail(5)

Date
2013-12-23    36.18461
2013-12-24    36.03964
2013-12-26    36.03964
2013-12-27         NaN
2013-12-30         NaN
Name: Adj Close, dtype: float64

In [36]:
# shifting with an explicit frequency does not realign the data:
# the values stay where they are and every index label is moved
# by the given amount of time (here one second)
msftAC.shift(1, freq="S")

Date
2012-01-03 00:00:01    24.42183
2012-01-04 00:00:01    24.99657
2012-01-05 00:00:01    25.25201
                         ...   
2013-12-26 00:00:01    36.18461
2013-12-27 00:00:01    36.03964
2013-12-30 00:00:01    36.03964
Name: Adj Close, dtype: float64

In [37]:
# resulting Series has one day added to all index labels
# (shift with freq= moves the labels instead of the values; it is
# used in place of tshift, which current pandas no longer provides)
msftAC.shift(1, freq="D")

Date
2012-01-04    24.42183
2012-01-05    24.99657
2012-01-06    25.25201
                ...   
2013-12-27    36.18461
2013-12-28    36.03964
2013-12-31    36.03964
Name: Adj Close, dtype: float64

In [38]:
# percentage change in closing price: each price divided by
# the price one position earlier, minus one
previous_close = msftAC.shift(1)
msftAC / previous_close - 1

Date
2012-01-03         NaN
2012-01-04    0.023534
2012-01-05    0.010219
                ...   
2013-12-26    0.009709
2013-12-27   -0.004006
2013-12-30    0.000000
Name: Adj Close, dtype: float64

# Frequency conversion of time-series data

In [39]:
# keep only the first two msftAC entries as a small sample
# for the demonstrations that follow
sample = msftAC.iloc[:2]
sample

Date
2012-01-03    24.42183
2012-01-04    24.99657
Name: Adj Close, dtype: float64

In [40]:
# demonstrate converting the sample to hourly frequency with asfreq;
# the realignment causes many NaN's because only the two original
# timestamps have values
sample.asfreq("H")

2012-01-03 00:00:00    24.42183
2012-01-03 01:00:00         NaN
2012-01-03 02:00:00         NaN
                         ...   
2012-01-03 22:00:00         NaN
2012-01-03 23:00:00         NaN
2012-01-04 00:00:00    24.99657
Freq: H, Name: Adj Close, dtype: float64

In [41]:
# fill NaN's with the last known non-NaN value (forward fill)
sample.asfreq("H", method="ffill")

2012-01-03 00:00:00    24.42183
2012-01-03 01:00:00    24.42183
2012-01-03 02:00:00    24.42183
                         ...   
2012-01-03 22:00:00    24.42183
2012-01-03 23:00:00    24.42183
2012-01-04 00:00:00    24.99657
Freq: H, Name: Adj Close, dtype: float64

In [42]:
# fill NaN's with the "next known" value (backward fill)
sample.asfreq("H", method="bfill")

2012-01-03 00:00:00    24.42183
2012-01-03 01:00:00    24.99657
2012-01-03 02:00:00    24.99657
                         ...   
2012-01-03 22:00:00    24.99657
2012-01-03 23:00:00    24.99657
2012-01-04 00:00:00    24.99657
Freq: H, Name: Adj Close, dtype: float64

## Up and down resampling of time-series

In [43]:
# cumulative daily returns for MSFT: compute the simple daily
# return first, then take the running product of (1 + return)
daily_returns = msftAC / msftAC.shift() - 1
msft_cum_ret = (1 + daily_returns).cumprod()
msft_cum_ret

Date
2012-01-03         NaN
2012-01-04    1.023534
2012-01-05    1.033993
                ...   
2013-12-26    1.481650
2013-12-27    1.475714
2013-12-30    1.475714
Name: Adj Close, dtype: float64

In [44]:
# resample to monthly frequency, taking the mean of the cumulative
# return within each month
# (the aggregation is spelled out because resample() on its own only
# groups the data in current pandas; it no longer averages implicitly)
msft_monthly_cum_ret = msft_cum_ret.resample("M").mean()
msft_monthly_cum_ret

Date
2012-01-31    1.068675
2012-02-29    1.155697
2012-03-31    1.210570
                ...   
2013-10-31    1.350398
2013-11-30    1.471915
2013-12-31    1.482362
Freq: M, Name: Adj Close, dtype: float64

In [45]:
# verify the monthly average for 2012-01 by selecting that month
# with a partial date string and averaging it directly
msft_cum_ret['2012-01'].mean()

1.0686746674033674

In [46]:
# monthly means requested explicitly, for comparison with the
# monthly figures computed earlier
# (the aggregation is a method call on the resampled object; the
# how= keyword is not accepted by current pandas)
msft_cum_ret.resample("M").mean()

Date
2012-01-31    1.068675
2012-02-29    1.155697
2012-03-31    1.210570
                ...   
2013-10-31    1.350398
2013-11-30    1.471915
2013-12-31    1.482362
Freq: M, Name: Adj Close, dtype: float64

In [47]:
# resample to monthly and give us open, high, low, close
# (ohlc is called as a method on the resampled object; the how=
# keyword is not accepted by current pandas)
msft_cum_ret.resample("M").ohlc()[:5]

                open      high       low     close
Date                                              
2012-01-31  1.023534  1.110572  1.023534  1.103100
2012-02-29  1.116548  1.198349  1.116548  1.193461
2012-03-31  1.214142  1.235198  1.186693  1.213014
2012-04-30  1.214142  1.219030  1.141195  1.203990
2012-05-31  1.203613  1.203613  1.099860  1.104780

In [48]:
# this will return an index with periods instead of timestamps:
# take the monthly means, then relabel the month-end timestamps as
# monthly Periods (replaces the how=/kind= keywords, which current
# pandas does not accept)
by_periods = msft_cum_ret.resample("M").mean().to_period("M")
# print start, end and mean for the first five periods
for i in by_periods.index[:5]: 
    print ("{0}:{1} {2}".format(i.start_time, 
                                i.end_time, 
                                by_periods[i]))

2012-01-01 00:00:00:2012-01-31 23:59:59.999999999 1.0686746674
2012-02-01 00:00:00:2012-02-29 23:59:59.999999999 1.15569744364
2012-03-01 00:00:00:2012-03-31 23:59:59.999999999 1.21056956383
2012-04-01 00:00:00:2012-04-30 23:59:59.999999999 1.18464361598
2012-05-01 00:00:00:2012-05-31 23:59:59.999999999 1.14051599439


In [49]:
# the upsampling demonstration uses the second and third
# cumulative-return values, since the first one is NaN
sample = msft_cum_ret.iloc[1:3]
sample

Date
2012-01-04    1.023534
2012-01-05    1.033993
Name: Adj Close, dtype: float64

In [50]:
# upsampling this will have a lot of NaN's: only the two original
# timestamps carry values, every new hourly slot is empty
# (asfreq() turns the resampled object into the hourly Series; a bare
# resample() call does not return data in current pandas)
by_hour = sample.resample("H").asfreq()
by_hour

Date
2012-01-04 00:00:00    1.023534
2012-01-04 01:00:00         NaN
2012-01-04 02:00:00         NaN
                         ...   
2012-01-04 22:00:00         NaN
2012-01-04 23:00:00         NaN
2012-01-05 00:00:00    1.033993
Freq: H, Name: Adj Close, dtype: float64

In [51]:
# fill the NaN's between the two known values by interpolation;
# the recorded result rises in even steps from the first value
# to the second
by_hour.interpolate()

Date
2012-01-04 00:00:00    1.023534
2012-01-04 01:00:00    1.023970
2012-01-04 02:00:00    1.024405
                         ...   
2012-01-04 22:00:00    1.033122
2012-01-04 23:00:00    1.033558
2012-01-05 00:00:00    1.033993
Freq: H, Name: Adj Close, dtype: float64