## Date Ranges, Frequencies and Shifting

*[Coding along with Python for Data Analysis: Data Wrangling with Pandas, NumPy, and IPython, Wes McKinney, O'Reilly, 1st Edition October 2012; the notebook accompanying this chapter can be found on [GitHub](https://github.com/wesm/pydata-book/blob/3rd-edition/ch11.ipynb)]*

In [1]:
# https://github.com/wesm/pydata-book/blob/3rd-edition/ch11.ipynb
from datetime import datetime
import pandas as pd
import numpy as np

In [2]:
# generic time series in pandas are assumed to be irregular (= have no fixed frequency)
# if we want a fixed frequency (or dates relative to a fixed frequency) -> pandas to the rescue
# pandas has a full suite of standard time series frequencies and tools for resampling, inferring frequencies
# and generating fixed-frequency date ranges

# a basic kind of time series in pandas is a Series indexed by timestamps
dates = [datetime(2011, 1, 2), datetime(2011, 1, 5),
         datetime(2011, 1, 7), datetime(2011, 1, 8),
         datetime(2011, 1, 10), datetime(2011, 1, 12)]
# seeded generator: the sample values are identical on every Restart & Run All
ts = pd.Series(np.random.default_rng(12345).standard_normal(len(dates)), index=dates)
ts

2011-01-02    0.808089
2011-01-05    0.129277
2011-01-07   -1.133082
2011-01-08    0.014581
2011-01-10    1.124164
2011-01-12    0.869448
dtype: float64

In [25]:
# converting the sample time series to a fixed daily frequency by calling resample
# resample on its own returns a Resampler object (see the repr below), not a new Series;
# an aggregation such as .mean() has to be called on it to get data back
resampler = ts.resample("D") # D is interpreted as daily frequency
resampler

<pandas.core.resample.DatetimeIndexResampler object at 0x11b166450>

### Generating Date Ranges

In [27]:
# pd.date_range builds a DatetimeIndex between a start and an end date
# when no freq is given, consecutive timestamps are one calendar day apart
index = pd.date_range(start="2012-04-01", end="2012-06-01")
index

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',
               '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',
               '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',
               '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',
               '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',
               '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
      

In [28]:
# if you pass only a start or an end date, you must also pass the number of periods to generate
# (the frequency stays at the daily default)
pd.date_range(start="2012-04-01", periods=20)

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20'],
              dtype='datetime64[ns]', freq='D')

In [29]:
# with only an end date, the 20 daily periods are counted backwards so that the range finishes on that date
pd.date_range(end="2012-06-01", periods=20)

DatetimeIndex(['2012-05-13', '2012-05-14', '2012-05-15', '2012-05-16',
               '2012-05-17', '2012-05-18', '2012-05-19', '2012-05-20',
               '2012-05-21', '2012-05-22', '2012-05-23', '2012-05-24',
               '2012-05-25', '2012-05-26', '2012-05-27', '2012-05-28',
               '2012-05-29', '2012-05-30', '2012-05-31', '2012-06-01'],
              dtype='datetime64[ns]', freq='D')

In [30]:
# example: a date index containing the last business day of each month by passing the 'BME' frequency
# the end date 2000-12-01 falls before December's last business day, so the range stops at November
pd.date_range("2000-01-01", "2000-12-01", freq="BME")

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-28',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',
               '2000-09-29', '2000-10-31', '2000-11-30'],
              dtype='datetime64[ns]', freq='BME')

In [7]:
# by default the time of day of the start (or end) timestamp is preserved in every generated timestamp
pd.date_range("2012-05-02 12:56:31", periods=5)

DatetimeIndex(['2012-05-02 12:56:31', '2012-05-03 12:56:31',
               '2012-05-04 12:56:31', '2012-05-05 12:56:31',
               '2012-05-06 12:56:31'],
              dtype='datetime64[ns]', freq='D')

In [31]:
# normalizing the time of day of the generated timestamps to midnight
pd.date_range("2012-05-02 12:56:31", periods=5, normalize=True) # midnight is the convention for normalize=True

DatetimeIndex(['2012-05-02', '2012-05-03', '2012-05-04', '2012-05-05',
               '2012-05-06'],
              dtype='datetime64[ns]', freq='D')

### Frequencies and Date Offsets

In [32]:
# frequencies in pandas are composed of a base frequency and a multiplier
# base frequencies are referred to by string aliases like 'ME' (month end) or 'h' (hourly)
# for each base frequency there's an object referred to as a date offset
# e.g. the hourly frequency is represented by the Hour class
from pandas.tseries.offsets import Hour, Minute
hour = Hour()
hour

<Hour>

In [33]:
four_hours = Hour(4) # defining a multiple of the offset by passing an integer
four_hours

<4 * Hours>

In [34]:
# the same multiple written as a string alias: put the integer in front of the base frequency
pd.date_range("2000-01-01", "2000-01-03 23:59", freq="4h")

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00', '2000-01-03 04:00:00',
               '2000-01-03 08:00:00', '2000-01-03 12:00:00',
               '2000-01-03 16:00:00', '2000-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4h')

In [35]:
# the sum is expressed as a single offset in the smaller unit (150 minutes here)
Hour(2) + Minute(30) # combining offsets with addition

<150 * Minutes>

In [36]:
# frequency strings may combine several units: "1h30min" is parsed as a 90-minute frequency
pd.date_range("2000-01-01", periods=10, freq="1h30min")

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00', '2000-01-01 07:30:00',
               '2000-01-01 09:00:00', '2000-01-01 10:30:00',
               '2000-01-01 12:00:00', '2000-01-01 13:30:00'],
              dtype='datetime64[ns]', freq='90min')

#### Week of month dates

In [37]:
# "WOM-..." is the week-of-month frequency family
# WOM-3FRI picks the third Friday of every month inside the requested window
monthly_dates = pd.date_range(
    start="2012-01-01",
    end="2012-09-01",
    freq="WOM-3FRI",
)
list(monthly_dates)

[Timestamp('2012-01-20 00:00:00'),
 Timestamp('2012-02-17 00:00:00'),
 Timestamp('2012-03-16 00:00:00'),
 Timestamp('2012-04-20 00:00:00'),
 Timestamp('2012-05-18 00:00:00'),
 Timestamp('2012-06-15 00:00:00'),
 Timestamp('2012-07-20 00:00:00'),
 Timestamp('2012-08-17 00:00:00')]

### Shifting (Leading and Lagging) Data

In [38]:
# shifting refers to moving data forwards and backwards in time
# Series and DataFrame have shift methods (forwards and backwards) that leave the index unmodified
ts = pd.Series(np.random.standard_normal(4),
               index=pd.date_range("2000-01-01", periods=4, freq="ME"))
ts

2000-01-31   -0.744564
2000-02-29   -1.978780
2000-03-31    0.844636
2000-04-30   -1.154180
Freq: ME, dtype: float64

In [41]:
ts.shift(2) # values move two rows later; missing data (NaN) is introduced at the start of the time series

2000-01-31         NaN
2000-02-29         NaN
2000-03-31   -0.744564
2000-04-30   -1.978780
Freq: ME, dtype: float64

In [43]:
ts.shift(-2)  # values move two rows earlier; missing data (NaN) is introduced at the end of the time series

2000-01-31    0.844636
2000-02-29   -1.154180
2000-03-31         NaN
2000-04-30         NaN
Freq: ME, dtype: float64

In [44]:
# common use of shift: computing percent changes in a time series or multiple time series as DataFrame columns
# each value is divided by the previous one; the first entry has no predecessor and is therefore NaN
ts / ts.shift(1) - 1

2000-01-31         NaN
2000-02-29    1.657637
2000-03-31   -1.426847
2000-04-30   -2.366483
Freq: ME, dtype: float64

In [46]:
# passing freq makes shift move the index timestamps instead of the values:
# every date advances by two month ends, the data stays attached to its row and no NaN is introduced
ts.shift(2, freq="ME")

2000-03-31   -0.744564
2000-04-30   -1.978780
2000-05-31    0.844636
2000-06-30   -1.154180
Freq: ME, dtype: float64

In [50]:
# other frequencies can be passed too, giving some flexibility in how to lead and lag the data
# here every timestamp is moved forward by three days
ts.shift(3, freq="D")

2000-02-03   -0.744564
2000-03-03   -1.978780
2000-04-03    0.844636
2000-05-03   -1.154180
dtype: float64

In [51]:
ts.shift(1, freq="90min") # uses the 'min' alias; the older 'T' alias triggers a FutureWarning ('T' is deprecated, use 'min' instead)

2000-01-31 01:30:00   -0.744564
2000-02-29 01:30:00   -1.978780
2000-03-31 01:30:00    0.844636
2000-04-30 01:30:00   -1.154180
dtype: float64

#### Shifting dates with offsets

In [52]:
# pandas date offsets can also be added to datetime or Timestamp objects
from pandas.tseries.offsets import Day, MonthEnd
now = datetime(2011, 11, 17)
now + 3 * Day() # the result is a pandas Timestamp even though now is a plain datetime

Timestamp('2011-11-20 00:00:00')

In [53]:
# adding an anchored offset like MonthEnd
# -> the first increment will "roll forward" a date to the next date according to the frequency rule
#    (here: from 2011-11-17 to the end of November)
now + MonthEnd()

Timestamp('2011-11-30 00:00:00')

In [54]:
now + MonthEnd(2) # two increments: first to the end of November, then on to the end of December

Timestamp('2011-12-31 00:00:00')

In [56]:
# using rollforward and rollback to let anchored offsets explicitly "roll" dates forward or backward
offset = MonthEnd()
offset.rollforward(now) # next month end on or after the given date

Timestamp('2011-11-30 00:00:00')

In [57]:
offset.rollback(now) # rolls back to the previous month end (2011-10-31 for a date in mid-November)

Timestamp('2011-10-31 00:00:00')

In [58]:
# using date offsets with groupby: sample series with one value every four days
# seeded generator: the sample values are identical on every Restart & Run All
ts = pd.Series(np.random.default_rng(12345).standard_normal(20),
               index=pd.date_range("2000-01-15", periods=20, freq="4D"))
ts

2000-01-15   -0.820358
2000-01-19    0.249372
2000-01-23   -0.580437
2000-01-27   -0.134233
2000-01-31   -0.092852
2000-02-04    1.344548
2000-02-08    1.178955
2000-02-12    0.641349
2000-02-16   -0.298810
2000-02-20   -2.182057
2000-02-24   -0.663204
2000-02-28    0.669059
2000-03-03    0.542601
2000-03-07    2.302754
2000-03-11   -0.046770
2000-03-15    0.877305
2000-03-19    0.853229
2000-03-23    1.589876
2000-03-27    0.453524
2000-03-31    0.400898
Freq: 4D, dtype: float64

In [59]:
# rollforward is applied to every index date, mapping it to its month end;
# groupby then buckets the values by that month end before averaging
ts.groupby(MonthEnd().rollforward).mean()

2000-01-31   -0.275702
2000-02-29    0.098549
2000-03-31    0.871677
dtype: float64

In [60]:
# using resample to achieve the same monthly means more directly
# (unlike the groupby version, the result keeps the ME frequency on its index)
ts.resample("ME").mean()

2000-01-31   -0.275702
2000-02-29    0.098549
2000-03-31    0.871677
Freq: ME, dtype: float64