In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
    # The standard datetime time series module
from datetime import timedelta
    # For adding and subtracting time deltas
    
from dateutil.parser import parse
%matplotlib inline 
# THIS LINE MAKES PLOTS SHOW IN JUPYTER

# Key Lessons & Topics

'''
## Very important things learned

- Pandas Series with a DatetimeIndex give access to a new range of datetime functions.
- It's easy to create a DatetimeIndex from a list of standard python datetime values
- Random not from other learnings: matplotlib does not play well with a DatetimeIndex. 
 - Just pass integer values then reset the xticks to be the values from the DatetimeIndex

- Pandas lets you easily generate date ranges is extremely flexible with frequencies between dates.  See Table 10-4
- pd.date_range() -- much specify two of start, end, periods.




### Common date and time functions
- Find current timestamp
        datetime.now()

- Datetime object to a string
        str(datetime(2011, 1, 3))
        
- Datetime object to a string with a particular format - format_string follows pattern like '%Y-%m-$d'
        datetime(2011, 1, 3).strftime(format_string)
        
- String --> Datetime.  Need to tell the format of the passed value.
        value = '2011-01-03'
        datetime.strptime(value, '%Y-%m-%d')
        
- An array of strings to dates using list comprehension
        values = ['7/6/2011', '8/6/2011']
        [datetime.strptime(x, '%d/%m/%Y') for x in values]
        
- Create a Pandas datetime index object by passing a list
        datestrs=['7/6/2011', '8/6/2011']
        pd.to_datetime(datestrs)
        
- Pandas Time Series iwth DatetimeIndex
        dates = [datetime(2011, 1, 2), datetime(2011, 1, 5), datetime(2011, 1, 7), datetime(2011, 1, 8), datetime(2011, 1, 10), datetime(2011, 1, 12)]
        ts = pd.Series(np.random.randn(6), index=dates)
        ts.index
        # Pandas automatically makes the index a DatetimeIndex because the index values passed were datetime objects
        # Individual index values are recast as Pandas Timestampe objects
        ts.index[0] # Yields a "Timestamp" object
        
- Creating a series with a specific start date and number of periods
        longer_ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
    
        ## That time series can be accessed by passing just a year or year-month
            longer_ts['2001']     # returns all values in year 2001
            longer_ts['2001-05']  # returns all values in month May 2001
        
= Create a Data Frames and slice on its row index
        n = 100
        dates = pd.date_range('1/1/2000', periods = n, freq='W-WED') # Weekly every Wednesday
        long_df = pd.DataFrame(np.random.randn(n,4), index=dates, columns=['Colorado', 'Texas', 'New York', 'Ohio'])
        long_df.ix['5-2001'] # Returns May 2001 values
        
- Convert any time series to a fixed frequency with resample
        ts.resample('D').asfreq()
        
- pd.date_range requires specification of two of start, end, periods.
        ## freq parameter tells which days to sample between the beginning and ending range
        pd.date_range('1/1/2000', '12/1/2000', freq='BM')
        
        ## normalize=True parameter resets the time to midnight of given day
        pd.date_range('5/2/2012 12:56:31', periods=5)
        
- p.to_timestamp converts a Period or other Date-esque object to a timestamp objects
        p = pd.Period('2007-01-01', freq='Q-JAN')
        # Combine operations - e.g. "timestamp at 4 PM on the 2nd to last business day of the quarter"
        (p.asfreq('B', how='end') - 1) # Second to last business day of the month.
        (p.asfreq('B', how='end') - 1).asfreq('T', how='start') # Get 0:00 (midnight) for that day
        (p.asfreq('B', how='end') - 1).asfreq('T', how='start') + 16 * 60 # Add 16 hours.  'T' is based on minutes.
        (p.asfreq('B', how='end') - 1).asfreq('T', how='start').to_timestamp() # Convert to timestamp

### Frequencies 
#### Can pass compound frequencies like '1h30min' or simple frequencies
- pd.date_range('1/1/2000', periods = 10, freq='1H30MIN')
- pd.date_range('1/1/2012', '9/1/2012', freq='WOM-3FRI') 
            # Go CRAZY!!
        
- See Table 10-4 for the long list of understood frequencies.

#### These are optimized around financial use so have ones like "business day" or "last business day of month" or "quarter end"
    
- Time Shifts
  ts.shift(2)
- Loses values punted out, creates NaN values
ts.shift(2, 'D')
- Shifts the index with the values; no NaNs, no loss of data

### Periods and Period Arithmetic
- Period = a data type - represents a time span
- Adding / subtracting by integers will add/subtract by the frequency of their object
- Adding / subtracting two instances of the same frequency will yield integers
 - An error will occur if they have different frequencies

#### Period Index - can serve as an axis index in a time series
- rng = pd.period_range('1/1/2000', '6/30/2000', freq='M')
- pd.Series(np.random.randn(6), index=rng)
- Can create the PeriodIndex directly 
 - values = ['2001Q3', '2002Q2', '2003Q1']
 - pd.PeriodIndex(values, freq='Q-DEC')

#### Convert PeriodIndex objects (or Series indexed by such objects) to another frequency with asfreq
- Going from low to high frequency must specify whether to start at beginning or end of low frequency range:
 - p = pd.Period('2007', freq = 'A-DEC')
 - p.asfreq('M', 'start') or pd.asfreq('M', 'end')
- From high --> low frequnecy will by default start at the next instance of the specified frequency:
 - p = pd.Period('Aug-2007', 'M')
 - p.asfreq('A-JUN')
  - Output is "Period('2008', 'A-JUN')" because the next A-JUN would be period ending June, 2008.
- Change a whole PeriodIndex object similarly
 - rng = pd.period_range('2006', '2009', freq='A-DEC')
 - ts = p.Series(np.random.randn(len(rng)), index = rng)
 - ts.asfreq('M', how='start') # Output is a list with [2006-01, 2007-01, 2008-01, 2009-01]

### Quarterly Period Frequencies

- p  = pd.Period('2012Q4', freq='Q-JAN')
 - Q-JAN is the Quarter End (last calendar day of each month) for the year specified.
- Ranges work as expected
 - rng = pd.period_range('2012Q1', '2015Q4', freq='Q-DEC')
- Have fun changing ranges of period any arithmetically feasible date range you want
 - (new_rng = rng.asfreq('B', how='end').asfreq('T', how='start') + 16*60).to_timestamp
  - 4 PM on the last day of each quarter.
  - to_timestamp() gives it a seconds field and sets frequency to 'BQ'
  - Without it the frequency is left as "T" because the last operation done was an asfreq('T')
  - With to_timestamp() Pandas will turn it back into 'BQ' - end of frequency quarter

### Timestamps to Periods (and back)
- A timestamp-indexed time series by default shows the last day of the frequency range of each element
 - rng = pd.date_range(start='1/1/2000', periods = 3, freq='M')
 - ts = pd.Series(np.random.randn(3), index=rng)
   - By default the timestamps will be at the END of the date range specified 
 - pts = ts.to_period()
 - new_ts = pts.to_timestamp()
   - By default the timestamps will be set to the BEGINNING of the period range
   
### PeriodIndex from Arrays
- Roles in a PeriodIndex be set explicitly from DataFrame columns (or any derived Series) where needed
    - index = pd.PeriodIndex(year = data['year'], quarter = data['quarter'], freq='Q-DEC')

## Resampling and Frequency Conversion

### Example 1
### Example 2


### Date & Time Data Types
    datetime.datetime() --> (Y, m, d, H, M, S, u)
    datetime.timedelta() --> (d, s)



### Date Parsing with dateutil library
    from dateutil.parser import parse
    parse('2011-01-03')
    parse('Jan 31, 2017 11:52 PM')
    parse('6/12/2011', dayfirst=True)




# Work

## Date and Time Data Types and Tools - Core Python

In [2]:
### Date and Time Data Types and Tools
now = datetime.now()
now # Units are (year, month, day, hour, minute, second, microseconds)

datetime.datetime(2017, 2, 27, 14, 56, 14, 583471)

In [3]:
# Can access the individual attributes of now in the usual way
now.year, now.month, now.day

(2017, 2, 27)

In [4]:
# Can do arithmetic on datetime objects

delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)
delta  # delta only has methods days and seconds WTF 

datetime.timedelta(926, 56700)

In [5]:
# Can do math between datetime and timedelta objects

start = datetime(2011, 1, 7)
start + timedelta(12)

datetime.datetime(2011, 1, 19, 0, 0)

In [6]:
start - 2 * timedelta(12)

datetime.datetime(2010, 12, 14, 0, 0)

## Converting between string and datetime

In [7]:
# Cast a datetime as a string
stamp = datetime(2011, 1, 3)
str(stamp)

'2011-01-03 00:00:00'

In [8]:
# Cast a datetime as a string with a specific format
stamp.strftime('%Y-%m-%d')

'2011-01-03'

In [9]:
# Stings to dates
value = '2011-01-03'
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2011, 1, 3, 0, 0)

In [10]:
# An array of strings to dates using list comprehension
values = ['7/6/2011', '8/6/2011']
[datetime.strptime(x, '%d/%m/%Y') for x in values]

[datetime.datetime(2011, 6, 7, 0, 0), datetime.datetime(2011, 6, 8, 0, 0)]

In [11]:
# Use dateutil.parser's methods to parse nearly any common format
parse('2011-01-03')

datetime.datetime(2011, 1, 3, 0, 0)

In [12]:
# Parse a human-readable format
parse('Jan 31, 1997 10:45 PM')

datetime.datetime(1997, 1, 31, 22, 45)

In [13]:
# Pandas to_datetime
datestrs=['7/6/2011', '8/6/2011']
pd.to_datetime(datestrs)

DatetimeIndex(['2011-07-06', '2011-08-06'], dtype='datetime64[ns]', freq=None)

In [14]:
# pd.to_datetime will accept empty values and convert them to "NaT"
idx = pd.to_datetime(datestrs + [None])
print idx[2]
print pd.isnull(idx)

NaT
[False False  True]


## Pandas Time Series Basics

In [15]:
# Create a pd.Series with an index of type DatetimeIndex
dates = [datetime(2011, 1, 2), datetime(2011, 1, 5), datetime(2011, 1, 7), datetime(2011, 1, 8), datetime(2011, 1, 10), datetime(2011, 1, 12)]
ts = pd.Series(np.random.randn(6), index=dates)
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [16]:
# Arithetic operations align dates and then perform the operations
ts + ts[::2]

2011-01-02   -0.889312
2011-01-05         NaN
2011-01-07    1.332112
2011-01-08         NaN
2011-01-10   -1.195339
2011-01-12         NaN
dtype: float64

### Indexing, Selection, Subsetting

In [17]:
# You can index a time series like a usual series
stamp = ts.index[2]
ts[stamp]

0.66605602975129896

In [18]:
# Can be indexed by pasing a interpretable string - Example #1
ts['1/10/2011'] # returns a value even though the form of the index is 2011-01-10

-0.59766947394952563

In [19]:
# Can be indexed by pasing a interpretable string - Example #2
ts['20110110'] # returns a value even though the form of the index is 2011-01-10

-0.59766947394952563

In [20]:
# Creating a time series from periods and accessing via year or year-month
longer_ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
longer_ts['2001-01']
longer_ts['2001-05']

2001-05-01    0.003405
2001-05-02   -1.013181
2001-05-03   -0.955466
2001-05-04    1.381803
2001-05-05   -1.017602
2001-05-06    0.374289
2001-05-07    0.076382
2001-05-08   -0.279276
2001-05-09   -0.363604
2001-05-10   -3.323995
2001-05-11    0.083612
2001-05-12    2.727985
2001-05-13   -1.095151
2001-05-14    0.794943
2001-05-15    1.283253
2001-05-16    0.660082
2001-05-17    0.769284
2001-05-18    0.461196
2001-05-19    2.052661
2001-05-20   -1.262820
2001-05-21   -1.195756
2001-05-22    0.613881
2001-05-23   -0.586687
2001-05-24    1.165092
2001-05-25    0.274173
2001-05-26   -0.785689
2001-05-27    0.964138
2001-05-28    0.079778
2001-05-29   -0.578807
2001-05-30    1.443541
2001-05-31   -0.368346
Freq: D, dtype: float64

In [21]:
# Slicing works as before
ts[:datetime(2011, 1, 7)]

2011-01-02   -0.444656
2011-01-05    0.048787
2011-01-07    0.666056
dtype: float64

In [22]:
# You can slice by passing date values that are not in the Series itself because Pandas knows about date ordering
ts['2011-01-03':'2011-01-011']

2011-01-05    0.048787
2011-01-07    0.666056
2011-01-08    0.510230
2011-01-10   -0.597669
dtype: float64

In [23]:
# Create a Data Frames and slice on its row index
dates = pd.date_range('1/1/2000', periods = 100, freq='W-WED') # Weekly every Wednesday
long_df = pd.DataFrame(np.random.randn(100,4), index=dates, columns=['Colorado', 'Texas', 'New York', 'Ohio'])
long_df.ix['5-2001']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,-0.042188,0.257519,-0.058993,-0.216478
2001-05-09,-1.400603,-0.609729,-0.034599,-0.987848
2001-05-16,2.525571,0.205301,-0.798865,0.755072
2001-05-23,2.284552,-0.849102,-0.301464,-1.398773
2001-05-30,-1.241376,-0.009889,0.742892,0.811432


### Time Series with Duplicate Indices are fine

In [24]:
# Create a DatetimeIndex by passing a list of strings (in a date format).  Some of the strings are duplicated.
dates = pd.DatetimeIndex(['1/1/2000','1/2/2000','1/2/2000','1/2/2000','1/3/2000'])
dup_ts = pd.Series(np.arange(5), index=dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int64

In [25]:
# Grouping by a DatetimeIndex requires passing a Level
grouped = dup_ts.groupby(level=0)
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64

## Date Ranges, Frequencies, and Shifting

In [26]:
# Convert any time series to a fixed frequency with resample
# Note that the Pandas book version I have uses an older version of resample.  
# The modern version of resample is essentially a groupby()
ts.resample('D').asfreq()

2011-01-02   -0.444656
2011-01-03         NaN
2011-01-04         NaN
2011-01-05    0.048787
2011-01-06         NaN
2011-01-07    0.666056
2011-01-08    0.510230
2011-01-09         NaN
2011-01-10   -0.597669
2011-01-11         NaN
2011-01-12    0.460854
Freq: D, dtype: float64

### Generating Date Ranges

In [27]:
# By default, daily date ranges are generated
index = pd.date_range('4/1/2012', '6/1/2012')
index

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',
               '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',
               '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',
               '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',
               '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',
               '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
      

In [28]:
# Pass the start or end and # of periods 
index_start = pd.date_range('4/1/2012', periods = 20)
index_end = pd.date_range('6/1/2012', periods = 20)
print index_start
print index_end

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20'],
              dtype='datetime64[ns]', freq='D')
DatetimeIndex(['2012-06-01', '2012-06-02', '2012-06-03', '2012-06-04',
               '2012-06-05', '2012-06-06', '2012-06-07', '2012-06-08',
               '2012-06-09', '2012-06-10', '2012-06-11', '2012-06-12',
               '2012-06-13', '2012-06-14', '2012-06-15', '2012-06-16',
               '2012-06-17', '2012-06-18', '2012-06-19', '2012-06-20'],
              dtype='datetime64[ns]', freq='D')


In [29]:
# Pass a frequency.  It accepts off-the-wall frequencies like "last business day of month'
pd.date_range('1/1/2000', '12/1/2000', freq='BM')

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-28',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',
               '2000-09-29', '2000-10-31', '2000-11-30'],
              dtype='datetime64[ns]', freq='BM')

In [30]:
# If timestamps are included with the past dates, then they're preserved in the future dates created
pd.date_range('5/2/2012 12:56:31', periods=5)

DatetimeIndex(['2012-05-02 12:56:31', '2012-05-03 12:56:31',
               '2012-05-04 12:56:31', '2012-05-05 12:56:31',
               '2012-05-06 12:56:31'],
              dtype='datetime64[ns]', freq='D')

In [31]:
# Reset time information to midnight using "normalize"
pd.date_range(start='5/2/2012 12:56:31', periods=5, normalize=True)

DatetimeIndex(['2012-05-02', '2012-05-03', '2012-05-04', '2012-05-05',
               '2012-05-06'],
              dtype='datetime64[ns]', freq='D')

### Frequencies and Date Offsets

In [32]:
# Typically are referenced by a string alias
    # 'M' --> calendar month end
    # 'Q' --> quarterly
    # 'H' --> Hourly
pd.date_range('1/1/2000', '1/3/2000 23:59', freq='4H')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00', '2000-01-03 04:00:00',
               '2000-01-03 08:00:00', '2000-01-03 12:00:00',
               '2000-01-03 16:00:00', '2000-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [33]:
# Can pass compound frequencies like '1h30min'
pd.date_range('1/1/2000', periods = 10, freq='1H30MIN')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00', '2000-01-01 07:30:00',
               '2000-01-01 09:00:00', '2000-01-01 10:30:00',
               '2000-01-01 12:00:00', '2000-01-01 13:30:00'],
              dtype='datetime64[ns]', freq='90T')

In [34]:
rng = pd.date_range('1/1/2012', '9/1/2012', freq='WOM-3FRI')
list(rng)

[Timestamp('2012-01-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-02-17 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-03-16 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-04-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-05-18 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-06-15 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-07-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-08-17 00:00:00', freq='WOM-3FRI')]

### Use time shifting to calculate changes over time.
- Move data in a time series forward or backward by a pre-specified amount
- By default the date outside the original range is lost, but pass a parameter to shift() to shift the indexes as well

In [36]:
# Create a 4-value time series
ts = pd.Series(np.random.randn(4), index=pd.date_range('1/1/2000', periods = 4, freq='M'))
ts

2000-01-31    0.454212
2000-02-29   -0.495571
2000-03-31    1.379088
2000-04-30    0.278667
Freq: M, dtype: float64

In [39]:
# ts.shift(1) shifts everything forward by 1 month.
# The 1st index is preserved but its value is a NaN.
# The last index value is popped out of the Series.
ts.shift(1)

2000-01-31         NaN
2000-02-29    0.454212
2000-03-31   -0.495571
2000-04-30    1.379088
Freq: M, dtype: float64

In [41]:
# Calculate the month-on-month deltas
ts.shift(1) / ts - 1

2000-01-31         NaN
2000-02-29   -1.916544
2000-03-31   -1.359347
2000-04-30    3.948874
Freq: M, dtype: float64

In [43]:
# Pass freq into shift() to move the indexes as well so no data points are lost.
ts.shift(2, freq='M')

2000-03-31    0.454212
2000-04-30   -0.495571
2000-05-31    1.379088
2000-06-30    0.278667
Freq: M, dtype: float64

In [52]:
# Passing another frequency shift everything forward by the passed frequency.
ts.shift(3, freq='D')

2000-02-03    0.454212
2000-03-03   -0.495571
2000-04-03    1.379088
2000-05-03    0.278667
dtype: float64

#### Anchoring date shifts - e.g. Month End and Roll Forward

In [56]:
from pandas.tseries.offsets import Day, MonthEnd
now = datetime.now()
now

datetime.datetime(2017, 2, 27, 15, 14, 33, 181216)

In [57]:
# Adding 3 days directly does what you'd expect
now + 3 * Day()

Timestamp('2017-03-02 15:14:33.181216')

In [66]:
# An achored value will roll forward a day to the next date - like setting you to the 0th spot of MonthEnd()
print "The date for the end of this month is: ", now + MonthEnd()

The date for the end of this month is:  2017-02-28 15:14:33.181216


In [69]:
# An anchored offset can be used with "rollforward" or "rollback" but that seems stupid & redundant
offset = MonthEnd()
print "End of this month", offset.rollforward(now)
print "End of last month",offset.rollback(now)

End of this month 2017-02-28 15:14:33.181216
End of last month 2017-01-31 15:14:33.181216


## Time Zones are hard and there is stuff in the book about them.  Check it out later.

In [None]:
See Location 7095

### Periods and Period Arithmetic
- Period = a data type - represents a time span
- Adding / subtracting by integers will add/subtract by the frequency of their object
- Adding / subtracting two instances of the same frequency will yield integers
 - An error will occur if they have different frequencies

#### Period Index - can serve as an axis index in a time series
- rng = pd.period_range('1/1/2000', '6/30/2000', freq='M')
- pd.Series(np.random.randn(6), index=rng)
- Can create the PeriodIndex directly 
 - values = ['2001Q3', '2002Q2', '2003Q1']
 - pd.PeriodIndex(values, freq='Q-DEC')
 
#### Convert PeriodIndex objects (or Series indexed by such objects) to another frequency with asfreq
- Going from low to high frequency must specify whether to start at beginning or end of low frequency range:
 - p = pd.Period('2007', freq = 'A-DEC')
 - p.asfreq('M', 'start') or pd.asfreq('M', 'end')
- From high --> low frequnecy will by default start at the next instance of the specified frequency:
 - p = pd.Period('Aug-2007', 'M')
 - p.asfreq('A-JUN')
  - Output is "Period('2008', 'A-JUN')" because the next A-JUN would be period ending June, 2008.
- Change a whole PeriodIndex object similarly
 - rng = pd.period_range('2006', '2009', freq='A-DEC')
 - ts = p.Series(np.random.randn(len(rng)), index = rng)
 - ts.asfreq('M', how='start') # Output is a list with [2006-01, 2007-01, 2008-01, 2009-01]

In [73]:
# Year 2007, Annual, finishing in December (i.e. normal year span)
p = pd.Period(2007, freq='A-DEC')
p

Period('2007', 'A-DEC')

In [74]:
# Adding / subtracting by integers will add/subtract by the frequency of their object
p + 5

Period('2012', 'A-DEC')

In [77]:
# Adding / subtracting two instances of the same frequency will yield integers
pd.Period('2014', freq='A-DEC') - p

7

In [80]:
# An error will occur if they have different frequencies
pd.Period('2014', freq='M') - p

IncompatibleFrequency: Input has different freq=A-DEC from Period(freq=M)

In [81]:
# Create a PeriodIndex that can serve as an axis index
rng = pd.period_range('1/1/2000', '6/30/2000', freq='M')
rng

PeriodIndex(['2000-01', '2000-02', '2000-03', '2000-04', '2000-05', '2000-06'], dtype='period[M]', freq='M')

In [82]:
# Apply the PeriodIndex to the series to create a time series.
pd.Series(np.random.randn(6), index=rng)

2000-01    0.642514
2000-02   -1.732654
2000-03    0.450163
2000-04    1.120333
2000-05    1.145970
2000-06    1.531645
Freq: M, dtype: float64

In [84]:
# Create a PeriodIndex directly from a list
values = ['2001Q1', '2002Q2', '2003Q3']
index = pd.PeriodIndex(values, freq='Q-DEC')
index

PeriodIndex(['2001Q1', '2002Q2', '2003Q3'], dtype='period[Q-DEC]', freq='Q-DEC')

In [88]:
# Convert PeriodIndex to another frequency
# Low --> High Frequency
p = pd.Period('2007', freq='A-DEC')
p

Period('2007', 'A-DEC')

In [89]:
# Convert annual to monthly, beginning with beginning of annual phase
p.asfreq('M', how='start')

Period('2007-01', 'M')

In [90]:
# Convert annual to monthly, beginning with end of annual phase
p.asfreq('M', how='end')

Period('2007-12', 'M')

In [93]:
# High --> Low frequency 
p = pd.Period('Aug-2007', 'M')
p

Period('2007-08', 'M')

In [94]:
# Changed to A-JUN, the period reads '2008' because you're saying the year period that ends with the next June,
# which in this case in June 2008
p.asfreq('A-JUN')

Period('2008', 'A-JUN')

In [96]:
# Whole ranges rather than individual periods can be changed.  
# Recall a PeriodIndex is just composed of  period start, period end, and a frequency
rng = pd.period_range('2006', '2009', freq='A-DEC')
rng

PeriodIndex(['2006', '2007', '2008', '2009'], dtype='period[A-DEC]', freq='A-DEC')

In [98]:
ts = pd.Series(np.random.randn(len(rng)), index = rng)
ts

2006    1.487996
2007   -0.316745
2008   -0.882702
2009   -0.625288
Freq: A-DEC, dtype: float64

In [100]:
ts.asfreq('M', how='start')

2006-01    1.487996
2007-01   -0.316745
2008-01   -0.882702
2009-01   -0.625288
Freq: M, dtype: float64

### Quarterly Period Frequencies

- p  = pd.Period('2012Q4', freq='Q-JAN')
 - Q-JAN is the Quarter End (last calendar day of each month) for the year specified.
- Ranges work as expected
 - rng = pd.period_range('2012Q1', '2015Q4', freq='Q-DEC')
- Have fun changing ranges of period any arithmetically feasible date range you want
 - (new_rng = rng.asfreq('B', how='end').asfreq('T', how='start') + 16*60).to_timestamp
  - 4 PM on the last day of each quarter.
  - to_timestamp() gives it a seconds field and sets frequency to 'BQ'
    - Without it the frequency is left as "T" because the last operation done was an asfreq('T')
    - With to_timestamp() Pandas will turn it back into 'BQ' - end of frequency quarter

In [104]:
p  = pd.Period('2012Q4', freq='Q-JAN')
# Q-JAN is the Quarter End (last calendar day of each month) for the year specified.
# So Jan 31, 2012
p

Period('2012Q4', 'Q-JAN')

In [103]:
# Verify that by setting the freq to Monthly and pegging to the end of the range
p.asfreq('M', how='end')

Period('2012-01', 'M')

In [112]:
# Combine operations - e.g. "timestamp at 4 PM on the 2nd to last business day of the quarter"
(p.asfreq('B', how='end') - 1) # Second to last business day of the month.
(p.asfreq('B', how='end') - 1).asfreq('T', how='start') # Get 0:00 (midnight) for that day
(p.asfreq('B', how='end') - 1).asfreq('T', how='start') + 16 * 60 # Add 16 hours.  'T' is based on minutes.
(p.asfreq('B', how='end') - 1).asfreq('T', how='start').to_timestamp() # Convert to timestamp

Timestamp('2012-01-30 00:00:00')

In [134]:
rng = pd.period_range('2012Q1', '2015Q4', freq='Q-DEC')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts

2012Q1    0.570928
2012Q2    1.493439
2012Q3    1.370811
2012Q4   -0.267135
2013Q1    0.030314
2013Q2   -0.648338
2013Q3   -1.486801
2013Q4    1.327489
2014Q1   -0.337925
2014Q2    0.122831
2014Q3   -0.329096
2014Q4   -0.078348
2015Q1   -0.030753
2015Q2    0.585982
2015Q3    1.129129
2015Q4   -1.130287
Freq: Q-DEC, dtype: float64

In [136]:
# Change the ranges to 4 PM on the last business day of each month.
new_range = (rng.asfreq('B', how='end').asfreq('T', how='start') + 16*60).to_timestamp()
# to_timestamp() will tell Pandas to figure out a new frequency based on this data
# Without it the frequency is left as "T" because the last operation done was an asfreq('T')
# With to_timestamp() Pandas will turn it back into 'BQ' - end of frequency quarter
ts.index = new_range
ts

2012-03-30 16:00:00    0.570928
2012-06-29 16:00:00    1.493439
2012-09-28 16:00:00    1.370811
2012-12-31 16:00:00   -0.267135
2013-03-29 16:00:00    0.030314
2013-06-28 16:00:00   -0.648338
2013-09-30 16:00:00   -1.486801
2013-12-31 16:00:00    1.327489
2014-03-31 16:00:00   -0.337925
2014-06-30 16:00:00    0.122831
2014-09-30 16:00:00   -0.329096
2014-12-31 16:00:00   -0.078348
2015-03-31 16:00:00   -0.030753
2015-06-30 16:00:00    0.585982
2015-09-30 16:00:00    1.129129
2015-12-31 16:00:00   -1.130287
Freq: BQ-DEC, dtype: float64

### Timestamps to Periods (and back)
- A timestamp-indexed time series by default shows the last day of the frequency range of each element
 - rng = pd.date_range(start='1/1/2000', periods = 3, freq='M')
 - ts = pd.Series(np.random.randn(3), index=rng)
   - By default the timestamps will be at the END of the date range specified 
 - pts = ts.to_period()
 - new_ts = pts.to_timestamp()
   - By default the timestamps will be set to the BEGINNING of the period range

In [148]:
# A timestamp-indexed time series by default shows the last day of the frequency range of each element
rng = pd.date_range(start='1/1/2000', periods = 3, freq='M')
rng
ts = pd.Series(np.random.randn(3), index=rng)
ts

2000-01-31    1.548425
2000-02-29    0.215686
2000-03-31   -0.934329
Freq: M, dtype: float64

In [150]:
# A period-indexed time series hows the period identifier
pts = ts.to_period()
pts

2000-01    1.548425
2000-02    0.215686
2000-03   -0.934329
Freq: M, dtype: float64

In [151]:
# When converting back to a timestamp, for some reason Pandas gives you the beginning of the range
pts.to_timestamp()

2000-01-01    1.548425
2000-02-01    0.215686
2000-03-01   -0.934329
Freq: MS, dtype: float64

In [152]:
# Specify how = 'end' to get the end of the range
pts.to_timestamp(how='end')

2000-01-31    1.548425
2000-02-29    0.215686
2000-03-31   -0.934329
Freq: M, dtype: float64

### PeriodIndex from Arrays
- Roles in a PeriodIndex be set explicitly from DataFrame columns (or any derived Series) where needed
    - index = pd.PeriodIndex(year = data['year'], quarter = data['quarter'], freq='Q-DEC')

In [168]:
# Combine year and quarter from different columns
data = pd.read_csv('ch08/macrodata.csv')
data[:5]

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [163]:
# The role of each data column can be set explicitly in declaring the PeriodIndex object
index = pd.PeriodIndex(year = data['year'], quarter = data['quarter'], freq='Q-DEC')
index
    # freq='Q-DEC' is the default behaviour

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', length=203, freq='Q-DEC')

In [169]:
# Set this as a new index
data.index = index
data[:5]

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
1959Q1,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1959Q2,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
1959Q3,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
1959Q4,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
1960Q1,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [170]:
## Resampling and Frequency Conversion