In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
    # The standard datetime time series module
from datetime import timedelta
    # For adding and subtracting time deltas
    
from dateutil.parser import parse
%matplotlib inline 
# THIS LINE MAKES PLOTS SHOW IN JUPYTER

# Key Lessons & Topics

In [59]:
'''
-- Very important things learned

Pandas Series with a DatetimeIndex give access to a new range of datetime functions.
It's easy to create a DatetimeIndex from a list of standard python datetime values
# Random not from other learnings: matplotlib does not play well with a DatetimeIndex. 
# Just pass integer values then reset the xticks to be the values from the DatetimeIndex

Pandas lets you easily generate date ranges is extremely flexible with frequencies between dates.  See Table 10-4




-- Common date and time functions
    # Find current timestamp
        datetime.now()

    # Datetime object to a string
        str(datetime(2011, 1, 3))
        
    # Datetime object to a string with a particular format - format_string follows pattern like '%Y-%m-$d'
        datetime(2011, 1, 3).strftime(format_string)
        
    # String --> Datetime.  Need to tell the format of the passed value.
        value = '2011-01-03'
        datetime.strptime(value, '%Y-%m-%d')
        
    # An array of strings to dates using list comprehension
        values = ['7/6/2011', '8/6/2011']
        [datetime.strptime(x, '%d/%m/%Y') for x in values]
        
    # Create a Pandas datetime index object by passing a list
        datestrs=['7/6/2011', '8/6/2011']
        pd.to_datetime(datestrs)
        
    # Pandas Time Series iwth DatetimeIndex
        dates = [datetime(2011, 1, 2), datetime(2011, 1, 5), datetime(2011, 1, 7), datetime(2011, 1, 8), datetime(2011, 1, 10), datetime(2011, 1, 12)]
        ts = pd.Series(np.random.randn(6), index=dates)
        ts.index
        # Pandas automatically makes the index a DatetimeIndex because the index values passed were datetime objects
        # Individual index values are recast as Pandas Timestampe objects
        ts.index[0] # Yields a "Timestamp" object
        
    # Creating a series with a specific start date and number of periods
        longer_ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
    
        ## That time series can be accessed by passing just a year or year-month
            longer_ts['2001']     # returns all values in year 2001
            longer_ts['2001-05']  # returns all values in month May 2001
        
    # Create a Data Frames and slice on its row index
        n = 100
        dates = pd.date_range('1/1/2000', periods = n, freq='W-WED') # Weekly every Wednesday
        long_df = pd.DataFrame(np.random.randn(n,4), index=dates, columns=['Colorado', 'Texas', 'New York', 'Ohio'])
        long_df.ix['5-2001'] # Returns May 2001 values
        
    # Convert any time series to a fixed frequency with resample
        ts.resample('D').asfreq()
        
    # pd.date_range requires specification of two of start, end, periods.
        ## freq parameter tells which days to sample between the beginning and ending range
        pd.date_range('1/1/2000', '12/1/2000', freq='BM')
        
        ## normalize=True parameter resets the time to midnight of given day
        pd.date_range('5/2/2012 12:56:31', periods=5)
        

-- Frequencies 
    # Can pass compound frequencies like '1h30min' or simple frequencies
        pd.date_range('1/1/2000', periods = 10, freq='1H30MIN')
        pd.date_range('1/1/2012', '9/1/2012', freq='WOM-3FRI') 
            # Go CRAZY!!
        
    # See Table 10-4 for the long list of understood frequencies.
    # These are optimized around financial use so have ones like "business day" or "last business day of month" or "quarter end"
    
    # 

    
-- Date & Time Data Types
    datetime.datetime() --> (Y, m, d, H, M, S, u)
    datetime.timedelta() --> (d, s)



-- Date Parsing with dateutil library
    from dateutil.parser import parse
    parse('2011-01-03')
    parse('Jan 31, 2017 11:52 PM')
    parse('6/12/2011', dayfirst=True)


'''

'\n\n-- Common date and time functions\n    # Find current timestamp\n        datetime.now()\n\n    # Datetime object to a string\n        str(datetime(2011, 1, 3))\n        \n    # Datetime object to a string with a particular format - format_string follows pattern like \'%Y-%m-$d\'\n        datetime(2011, 1, 3).strftime(format_string)\n        \n    # String --> Datetime.  Need to tell the format of the passed value.\n        value = \'2011-01-03\'\n        datetime.strptime(value, \'%Y-%m-%d\')\n        \n    # An array of strings to dates using list comprehension\n        values = [\'7/6/2011\', \'8/6/2011\']\n        [datetime.strptime(x, \'%d/%m/%Y\') for x in values]\n        \n    # Create a Pandas datetime index object\n        datestrs=[\'7/6/2011\', \'8/6/2011\']\n        pd.to_datetime(datestrs)\n        \n    # Pandas Time Series iwth DatetimeIndex\n        dates = [datetime(2011, 1, 2), datetime(2011, 1, 5), datetime(2011, 1, 7), datetime(2011, 1, 8), datetime(2011, 1, 10

# Work

## Date and Time Data Types and Tools - Core Python

In [6]:
### Date and Time Data Types and Tools
now = datetime.now()
now # Units are (year, month, day, hour, minute, second, microseconds)

datetime.datetime(2017, 2, 24, 16, 24, 49, 240626)

In [7]:
# Can access the individual attributes of now in the usual way
now.year, now.month, now.day

(2017, 2, 24)

In [13]:
# Can do arithmetic on datetime objects

delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)
delta  # delta only has methods days and seconds WTF 

datetime.timedelta(926, 56700)

In [5]:
# Can do math between datetime and timedelta objects

start = datetime(2011, 1, 7)
start + timedelta(12)

datetime.datetime(2011, 1, 19, 0, 0)

In [6]:
start - 2 * timedelta(12)

datetime.datetime(2010, 12, 14, 0, 0)

## Converting between string and datetime

In [11]:
# Cast a datetime as a string
stamp = datetime(2011, 1, 3)
str(stamp)

'2011-01-03 00:00:00'

In [12]:
# Cast a datetime as a string with a specific format
stamp.strftime('%Y-%m-%d')

'2011-01-03'

In [13]:
# Stings to dates
value = '2011-01-03'
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2011, 1, 3, 0, 0)

In [15]:
# An array of strings to dates using list comprehension
values = ['7/6/2011', '8/6/2011']
[datetime.strptime(x, '%d/%m/%Y') for x in values]

[datetime.datetime(2011, 6, 7, 0, 0), datetime.datetime(2011, 6, 8, 0, 0)]

In [19]:
# Use dateutil.parser's methods to parse nearly any common format
parse('2011-01-03')

datetime.datetime(2011, 1, 3, 0, 0)

In [20]:
# Parse a human-readable format
parse('Jan 31, 1997 10:45 PM')

datetime.datetime(1997, 1, 31, 22, 45)

In [21]:
# Pandas to_datetime
datestrs=['7/6/2011', '8/6/2011']
pd.to_datetime(datestrs)

DatetimeIndex(['2011-07-06', '2011-08-06'], dtype='datetime64[ns]', freq=None)

In [27]:
# pd.to_datetime will accept empty values and convert them to "NaT"
idx = pd.to_datetime(datestrs + [None])
print idx[2]
print pd.isnull(idx)

NaT
[False False  True]


## Pandas Time Series Basics

In [32]:
# Create a pd.Series with an index of type DatetimeIndex
dates = [datetime(2011, 1, 2), datetime(2011, 1, 5), datetime(2011, 1, 7), datetime(2011, 1, 8), datetime(2011, 1, 10), datetime(2011, 1, 12)]
ts = pd.Series(np.random.randn(6), index=dates)
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [34]:
# Arithetic operations align dates and then perform the operations
ts + ts[::2]

2011-01-02   -1.228432
2011-01-05         NaN
2011-01-07   -1.977642
2011-01-08         NaN
2011-01-10   -0.468716
2011-01-12         NaN
dtype: float64

### Indexing, Selection, Subsetting

In [44]:
# You can index a time series like a usual series
stamp = ts.index[2]
ts[stamp]

-0.988821198409642

In [46]:
# Can be indexed by pasing a interpretable string - Example #1
ts['1/10/2011'] # returns a value even though the form of the index is 2011-01-10

-0.234357877450382

In [47]:
# Can be indexed by pasing a interpretable string - Example #2
ts['20110110'] # returns a value even though the form of the index is 2011-01-10

-0.234357877450382

In [51]:
# Creating a time series from periods and accessing via year or year-month
longer_ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
longer_ts['2001-01']
longer_ts['2001-05']

2001-05-01    0.546238
2001-05-02    0.280615
2001-05-03   -0.410547
2001-05-04   -0.354834
2001-05-05    1.084424
2001-05-06    1.327322
2001-05-07    1.347081
2001-05-08   -1.888758
2001-05-09   -1.155923
2001-05-10   -0.502922
2001-05-11    0.317845
2001-05-12   -0.361681
2001-05-13   -2.564729
2001-05-14    0.176038
2001-05-15   -1.265134
2001-05-16   -0.757674
2001-05-17    1.380417
2001-05-18   -1.016104
2001-05-19    0.112499
2001-05-20    2.373924
2001-05-21    0.691358
2001-05-22   -0.749373
2001-05-23    0.906505
2001-05-24   -1.507107
2001-05-25    2.271611
2001-05-26    1.323091
2001-05-27   -0.341347
2001-05-28   -0.356685
2001-05-29    1.209649
2001-05-30    0.592155
2001-05-31   -0.379500
Freq: D, dtype: float64

In [54]:
# Slicing works as before
ts[:datetime(2011, 1, 7)]

2011-01-02   -0.614216
2011-01-05    0.572496
2011-01-07   -0.988821
dtype: float64

In [56]:
# You can slice by passing date values that are not in the Series itself because Pandas knows about date ordering
ts['2011-01-03':'2011-01-011']

2011-01-05    0.572496
2011-01-07   -0.988821
2011-01-08    1.270666
2011-01-10   -0.234358
dtype: float64

In [60]:
# Create a Data Frames and slice on its row index
dates = pd.date_range('1/1/2000', periods = 100, freq='W-WED') # Weekly every Wednesday
long_df = pd.DataFrame(np.random.randn(100,4), index=dates, columns=['Colorado', 'Texas', 'New York', 'Ohio'])
long_df.ix['5-2001']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,0.045626,-0.607504,0.556818,-0.004877
2001-05-09,-0.461817,0.359351,-1.179367,0.209323
2001-05-16,-1.517911,-0.889724,-0.162152,-1.166955
2001-05-23,-0.270613,0.437736,0.835822,-0.683391
2001-05-30,0.452099,0.75832,-1.103144,-1.162637


### Time Series with Duplicate Indices are fine

In [73]:
# Create a DatetimeIndex by passing a list of strings (in a date format).  Some of the strings are duplicated.
dates = pd.DatetimeIndex(['1/1/2000','1/2/2000','1/2/2000','1/2/2000','1/3/2000'])
dup_ts = pd.Series(np.arange(5), index=dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int64

In [76]:
# Grouping by a DatetimeIndex requires passing a Level
grouped = dup_ts.groupby(level=0)
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64

## Date Ranges, Frequencies, and Shifting

In [84]:
# Convert any time series to a fixed frequency with resample
# Note that the Pandas book version I have uses an older version of resample.  
# The modern version of resample is essentially a groupby()
ts.resample('D').asfreq()

2011-01-02   -0.614216
2011-01-03         NaN
2011-01-04         NaN
2011-01-05    0.572496
2011-01-06         NaN
2011-01-07   -0.988821
2011-01-08    1.270666
2011-01-09         NaN
2011-01-10   -0.234358
2011-01-11         NaN
2011-01-12   -0.352303
Freq: D, dtype: float64

### Generating Date Ranges

In [86]:
# By default, daily date ranges are generated
index = pd.date_range('4/1/2012', '6/1/2012')
index

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',
               '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',
               '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',
               '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',
               '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',
               '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
      

In [90]:
# Pass the start or end and # of periods 
index_start = pd.date_range('4/1/2012', periods = 20)
index_end = pd.date_range('6/1/2012', periods = 20)
print index_start
print index_end

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20'],
              dtype='datetime64[ns]', freq='D')
DatetimeIndex(['2012-06-01', '2012-06-02', '2012-06-03', '2012-06-04',
               '2012-06-05', '2012-06-06', '2012-06-07', '2012-06-08',
               '2012-06-09', '2012-06-10', '2012-06-11', '2012-06-12',
               '2012-06-13', '2012-06-14', '2012-06-15', '2012-06-16',
               '2012-06-17', '2012-06-18', '2012-06-19', '2012-06-20'],
              dtype='datetime64[ns]', freq='D')


In [91]:
# Pass a frequency.  It accepts off-the-wall frequencies like "last business day of month'
pd.date_range('1/1/2000', '12/1/2000', freq='BM')

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-28',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',
               '2000-09-29', '2000-10-31', '2000-11-30'],
              dtype='datetime64[ns]', freq='BM')

In [92]:
# If timestamps are included with the past dates, then they're preserved in the future dates created
pd.date_range('5/2/2012 12:56:31', periods=5)

DatetimeIndex(['2012-05-02 12:56:31', '2012-05-03 12:56:31',
               '2012-05-04 12:56:31', '2012-05-05 12:56:31',
               '2012-05-06 12:56:31'],
              dtype='datetime64[ns]', freq='D')

In [95]:
# Reset time information to midnight using "normalize"
pd.date_range(start='5/2/2012 12:56:31', periods=5, normalize=True)

DatetimeIndex(['2012-05-02', '2012-05-03', '2012-05-04', '2012-05-05',
               '2012-05-06'],
              dtype='datetime64[ns]', freq='D')

### Frequencies and Date Offsets

In [97]:
# Typically are referenced by a string alias
    # 'M' --> calendar month end
    # 'Q' --> quarterly
    # 'H' --> Hourly
pd.date_range('1/1/2000', '1/3/2000 23:59', freq='4H')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00', '2000-01-03 04:00:00',
               '2000-01-03 08:00:00', '2000-01-03 12:00:00',
               '2000-01-03 16:00:00', '2000-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [98]:
# Can pass compound frequencies like '1h30min'
pd.date_range('1/1/2000', periods = 10, freq='1H30MIN')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00', '2000-01-01 07:30:00',
               '2000-01-01 09:00:00', '2000-01-01 10:30:00',
               '2000-01-01 12:00:00', '2000-01-01 13:30:00'],
              dtype='datetime64[ns]', freq='90T')

In [102]:
rng = pd.date_range('1/1/2012', '9/1/2012', freq='WOM-3FRI')
list(rng)

[Timestamp('2012-01-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-02-17 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-03-16 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-04-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-05-18 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-06-15 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-07-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-08-17 00:00:00', freq='WOM-3FRI')]

[Timestamp('2012-01-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-02-17 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-03-16 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-04-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-05-18 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-06-15 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-07-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-08-17 00:00:00', freq='WOM-3FRI')]