# Time Series 
How you mark and refer to time series data depends on the application, and you may have one of the following:  
* Timestamps: specific instants in time
* Fixed periods, such as the month of January 2007 or the full year 2010
* Intervals of time, indicated by a start and end timestamp. Periods can be thought of as special cases of intervals
* Experiment or elapsed time: each timestamp is a measure of time relative to a particular start time (e.g. the diameter of a cookie baking, measured each second since being placed in the oven)

In [1]:
import pandas as pd

In [2]:

from datetime import datetime
now = datetime.now()
now

datetime.datetime(2020, 8, 4, 9, 30, 29, 933331)

In [3]:
now.year, now.month, now.day

(2020, 8, 4)

In [4]:
delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)
delta

datetime.timedelta(926, 56700)

In [5]:
from datetime import timedelta
start = datetime(2011, 1, 7)
start + 2 * timedelta(12)

datetime.datetime(2011, 1, 31, 0, 0)

## Converting Between String and Datetime

In [6]:
stamp = datetime(1998, 12, 28)
str(stamp)

'1998-12-28 00:00:00'

In [7]:
stamp.strftime('%Y-%m-%d')

'1998-12-28'

## Datetime format specification 
%Y -- Four-digit year  
%y -- Two-digit year  
%m -- Two-digit month [01, 12]  
%d -- Two-digit day [01, 31]  
%H -- Hour (24-hour clock) [00, 23]  
%I -- Hour (12-hour clock) [01, 12]  
%M -- Two-digit minute [00, 59]  
%S -- Second [00, 61] (seconds 60 and 61 account for leap seconds)  
%w -- Weekday as integer [0 (Sunday), 6]  
%U -- Week number of the year [00, 53]; Sunday is considered the first day of the week, and days before the first Sunday of the year are "week 0"  
%W -- Week number of the year [00, 53]; Monday is considered the first day of the week, and days before the first Monday of the year are "week 0"  
%z -- UTC time zone offset as +HHMM or -HHMM; empty if time zone naive  
%F -- Shortcut for %Y-%m-%d  
%D -- Shortcut for %m/%d/%y  
<br/>
You can use the same format codes to convert strings to dates using datetime.strptime.

In [8]:
value = '2011-01-03'
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2011, 1, 3, 0, 0)

In [12]:
import numpy as np
dates = [datetime(2011, 1, 2), datetime(2011, 1, 5), datetime(2011, 1, 7), datetime(2011, 1, 8), datetime(2011, 1, 10), datetime(2011, 1, 12)]
ts = pd.Series(np.random.randn(6), index=dates)
ts

2011-01-02    0.495205
2011-01-05   -0.269290
2011-01-07    0.856273
2011-01-08   -0.707436
2011-01-10    0.942841
2011-01-12   -0.609319
dtype: float64

In [13]:
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [14]:
# Adds ts to a slice holding every other point. Arithmetic aligns on the
# dates, so dates present in both operands are doubled and the rest become NaN.
ts + ts[::2]

2011-01-02    0.990409
2011-01-05         NaN
2011-01-07    1.712547
2011-01-08         NaN
2011-01-10    1.885682
2011-01-12         NaN
dtype: float64

In [16]:
stamp = ts.index[0]
stamp

Timestamp('2011-01-02 00:00:00')

In [17]:
ts[stamp]

0.4952045210405504

In [18]:
ts['2011/01/02'] # can pass a date as a string 

0.4952045210405504

In [23]:
# Build a longer daily series (1000 points starting 2000-01-01) so that
# selecting by year or by year-month string has something to slice.
longer_ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
longer_ts.tail()

2002-09-22   -0.535371
2002-09-23   -1.157224
2002-09-24   -0.263808
2002-09-25    1.188756
2002-09-26    2.559750
Freq: D, dtype: float64

In [28]:
longer_ts['2001'].tail()

2001-12-27    1.400298
2001-12-28    0.590972
2001-12-29    0.281058
2001-12-30    1.065767
2001-12-31   -0.380578
Freq: D, dtype: float64

In [30]:
longer_ts['2002-05'].tail()

2002-05-27   -1.366162
2002-05-28    1.653971
2002-05-29    0.288634
2002-05-30    1.218602
2002-05-31   -0.278151
Freq: D, dtype: float64

In [32]:
ts[datetime(2009, 5, 5):]

2011-01-02    0.495205
2011-01-05   -0.269290
2011-01-07    0.856273
2011-01-08   -0.707436
2011-01-10    0.942841
2011-01-12   -0.609319
dtype: float64

In [47]:
ts['1/6/2011':'1/11/2011']

2011-01-07    0.856273
2011-01-08   -0.707436
2011-01-10    0.942841
dtype: float64

In [48]:
ts.truncate(after='1/9/2011')

2011-01-02    0.495205
2011-01-05   -0.269290
2011-01-07    0.856273
2011-01-08   -0.707436
dtype: float64

## Time Series with Duplicate Indices

In [53]:
dates = pd.DatetimeIndex(['1/1/2002', '1/2/2002', '1/3/2002', '1/2/2002', '1/2/2002' ])
dup_ts = pd.Series(np.random.randn(5), index=dates)
dup_ts

2002-01-01    0.338043
2002-01-02    0.420432
2002-01-03    1.040398
2002-01-02    0.015719
2002-01-02    0.919147
dtype: float64

In [55]:
dup_ts.index.unique()

DatetimeIndex(['2002-01-01', '2002-01-02', '2002-01-03'], dtype='datetime64[ns]', freq=None)

In [59]:
# Group on the index itself (level 0) so that values sharing the same
# timestamp can be aggregated together.
non_unique = dup_ts.groupby(level=0)
non_unique.mean()

2002-01-01    0.338043
2002-01-02    0.451766
2002-01-03    1.040398
dtype: float64

In [60]:
non_unique.count()

2002-01-01    1
2002-01-02    3
2002-01-03    1
dtype: int64

## Date Ranges, Frequencies, and Shifting  
Often your time series may have irregularly spaced dates. For most purposes this is fine, but should you wish to have equidistant dates, you may wish to use the following:

In [61]:
ts

2011-01-02    0.495205
2011-01-05   -0.269290
2011-01-07    0.856273
2011-01-08   -0.707436
2011-01-10    0.942841
2011-01-12   -0.609319
dtype: float64

In [66]:
# Resample onto a fixed daily ('D') frequency; days with no observation
# show up as NaN once aggregated.
resampler = ts.resample('D')
resampler.mean()

2011-01-02    0.495205
2011-01-03         NaN
2011-01-04         NaN
2011-01-05   -0.269290
2011-01-06         NaN
2011-01-07    0.856273
2011-01-08   -0.707436
2011-01-09         NaN
2011-01-10    0.942841
2011-01-11         NaN
2011-01-12   -0.609319
Freq: D, dtype: float64

In [68]:
index = pd.date_range('2009-04-01', '2009-05-01')
index

DatetimeIndex(['2009-04-01', '2009-04-02', '2009-04-03', '2009-04-04',
               '2009-04-05', '2009-04-06', '2009-04-07', '2009-04-08',
               '2009-04-09', '2009-04-10', '2009-04-11', '2009-04-12',
               '2009-04-13', '2009-04-14', '2009-04-15', '2009-04-16',
               '2009-04-17', '2009-04-18', '2009-04-19', '2009-04-20',
               '2009-04-21', '2009-04-22', '2009-04-23', '2009-04-24',
               '2009-04-25', '2009-04-26', '2009-04-27', '2009-04-28',
               '2009-04-29', '2009-04-30', '2009-05-01'],
              dtype='datetime64[ns]', freq='D')

In [72]:
# Fix the length with `periods` instead of giving both endpoints
# (`end` can be used in place of `start`).
index2 = pd.date_range(start='1998-12-28', periods=365)
# To keep only the last business day of each month, pass the
# 'BM' (BusinessMonthEnd) frequency. Note that this rebinds index2.
index2 = pd.date_range(start='1998-12-28', end='2020-12-28', freq='BM')

### Base time series frequencies 
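D -- Day   
B -- BusinessDay   
H -- Hour  
T/min -- Minute   
S -- Second  
L/ms -- Milli (1/1,000 of a second)  
U -- Micro (1/1,000,000 of a second)  
M -- MonthEnd  
BM -- BusinessMonthEnd  
MS -- MonthBegin  
BMS -- BusinessMonthBegin   
W-MON, W-TUE, ... -- Week; weekly on the given day of the week    
WOM-1MON, WOM-2MON, ... -- WeekOfMonth; generates weekly dates in the first, second, third, or fourth week of the month (e.g. WOM-3FRI for the third Friday of each month)    
Q-JAN, Q-FEB, ... -- QuarterEnd; quarterly dates anchored on the last calendar day of each month, for year ending in the indicated month  
BQ-JAN, BQ-FEB, ... -- BusinessQuarterEnd; quarterly dates anchored on the last weekday of each month, for year ending in the indicated month  
QS-JAN, QS-FEB, ... -- QuarterBegin; quarterly dates anchored on the first calendar day of each month, for year ending in the indicated month    
BQS-JAN, BQS-FEB, ... -- BusinessQuarterBegin; quarterly dates anchored on the first weekday of each month, for year ending in the indicated month   
A-JAN, A-FEB, ... -- YearEnd; annual dates anchored on the last calendar day of the given month   
BA-JAN, BA-FEB, ... -- BusinessYearEnd; annual dates anchored on the last weekday of the given month   
AS-JAN, AS-FEB, ... -- YearBegin; annual dates anchored on the first day of the given month  
BAS-JAN, BAS-FEB, ... -- BusinessYearBegin; annual dates anchored on the first weekday of the given month  

Note: pandas 2.2 and later deprecate several of these aliases in favor of new spellings (e.g. M -> ME, BM -> BME, H -> h, T -> min, S -> s, A -> YE, AS -> YS); check the offset-alias table for your pandas version.  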


In [73]:
pd.date_range('2001-11-23 12:56:31', periods=5)

DatetimeIndex(['2001-11-23 12:56:31', '2001-11-24 12:56:31',
               '2001-11-25 12:56:31', '2001-11-26 12:56:31',
               '2001-11-27 12:56:31'],
              dtype='datetime64[ns]', freq='D')

In [74]:
# normalize=True resets every timestamp to midnight while keeping the dates.
pd.date_range('2001-11-23 12:56:31', periods=5, normalize=True)

DatetimeIndex(['2001-11-23', '2001-11-24', '2001-11-25', '2001-11-26',
               '2001-11-27'],
              dtype='datetime64[ns]', freq='D')

## Shifting Data
Shifting moves data backward or forward through time, either by moving the values along a fixed index or by moving the index itself.

In [87]:
dates = pd.date_range('1998-12-05', periods = 6)
ts = pd.Series(np.random.randn(6), index=dates)
ts

1998-12-05    0.800759
1998-12-06    0.549604
1998-12-07    0.309382
1998-12-08    0.343953
1998-12-09    0.609053
1998-12-10    1.144723
Freq: D, dtype: float64

In [88]:
ts.shift(3)

1998-12-05         NaN
1998-12-06         NaN
1998-12-07         NaN
1998-12-08    0.800759
1998-12-09    0.549604
1998-12-10    0.309382
Freq: D, dtype: float64

In [89]:
ts.shift(-2) 

1998-12-05    0.309382
1998-12-06    0.343953
1998-12-07    0.609053
1998-12-08    1.144723
1998-12-09         NaN
1998-12-10         NaN
Freq: D, dtype: float64

In [92]:
# Passing freq shifts the timestamps themselves (here by 90 seconds) and
# leaves the values where they are, so no data is pushed off the ends.
ts.shift(1, freq='90S')

1998-12-05 00:01:30    0.800759
1998-12-06 00:01:30    0.549604
1998-12-07 00:01:30    0.309382
1998-12-08 00:01:30    0.343953
1998-12-09 00:01:30    0.609053
1998-12-10 00:01:30    1.144723
Freq: D, dtype: float64

# Time Zone Handling
Working with time zones is generally considered a pain. As a result, many time series users choose to work with UTC, the successor to GMT and the current international standard. Time zones are expressed as offsets from UTC. In Python, time zone information comes from the pytz library, which you can install with pip or conda. pandas wraps pytz's functionality, so you can ignore its API outside of the time zone names. Time zone names can be found interactively and in the docs.

In [93]:
import pytz 
pytz.common_timezones[-5:]

['US/Eastern', 'US/Hawaii', 'US/Mountain', 'US/Pacific', 'UTC']

In [95]:
# To get a time zone object from pytz, use pytz.timezone
tz = pytz.timezone('America/New_York')
tz

<DstTzInfo 'America/New_York' LMT-1 day, 19:04:00 STD>

## Time Zone Localization and Conversion
By default, time series in pandas are time zone naive. For example, consider the following time series:

In [99]:
rng = pd.date_range('3/9/2012 9:20', periods=7, freq='D')
rng

DatetimeIndex(['2012-03-09 09:20:00', '2012-03-10 09:20:00',
               '2012-03-11 09:20:00', '2012-03-12 09:20:00',
               '2012-03-13 09:20:00', '2012-03-14 09:20:00',
               '2012-03-15 09:20:00'],
              dtype='datetime64[ns]', freq='D')

In [103]:
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts

2012-03-09 09:20:00    0.625552
2012-03-10 09:20:00    0.098082
2012-03-11 09:20:00    1.700966
2012-03-12 09:20:00    1.195476
2012-03-13 09:20:00   -0.120678
2012-03-14 09:20:00   -0.109268
2012-03-15 09:20:00    1.063441
Freq: D, dtype: float64

In [105]:
print(ts.index.tz) # Shows the series does not have an associated timezone

None


In [107]:
# date_range accepts a tz argument to produce a time zone-aware index.
pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC')

DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
               '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',
               '2012-03-15 09:30:00+00:00', '2012-03-16 09:30:00+00:00',
               '2012-03-17 09:30:00+00:00', '2012-03-18 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [112]:
ts_utc = ts.tz_localize('UTC') #localises time series to UTC 
ts_utc

2012-03-09 09:20:00+00:00    0.625552
2012-03-10 09:20:00+00:00    0.098082
2012-03-11 09:20:00+00:00    1.700966
2012-03-12 09:20:00+00:00    1.195476
2012-03-13 09:20:00+00:00   -0.120678
2012-03-14 09:20:00+00:00   -0.109268
2012-03-15 09:20:00+00:00    1.063441
Freq: D, dtype: float64

In [113]:
ts_utc.index

DatetimeIndex(['2012-03-09 09:20:00+00:00', '2012-03-10 09:20:00+00:00',
               '2012-03-11 09:20:00+00:00', '2012-03-12 09:20:00+00:00',
               '2012-03-13 09:20:00+00:00', '2012-03-14 09:20:00+00:00',
               '2012-03-15 09:20:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [114]:
ts_utc.tz_convert('America/New_York') # converts it to another time zone

2012-03-09 04:20:00-05:00    0.625552
2012-03-10 04:20:00-05:00    0.098082
2012-03-11 05:20:00-04:00    1.700966
2012-03-12 05:20:00-04:00    1.195476
2012-03-13 05:20:00-04:00   -0.120678
2012-03-14 05:20:00-04:00   -0.109268
2012-03-15 05:20:00-04:00    1.063441
Freq: D, dtype: float64

In [119]:
# Localize the naive series to America/New_York, then convert it to UTC.
# The UTC hour changes on 2012-03-11 because New York's UTC offset changes
# that day (daylight saving time begins).
ts_eastern = ts.tz_localize('America/New_York')
ts_eastern.tz_convert('UTC')

2012-03-09 14:20:00+00:00    0.625552
2012-03-10 14:20:00+00:00    0.098082
2012-03-11 13:20:00+00:00    1.700966
2012-03-12 13:20:00+00:00    1.195476
2012-03-13 13:20:00+00:00   -0.120678
2012-03-14 13:20:00+00:00   -0.109268
2012-03-15 13:20:00+00:00    1.063441
Freq: D, dtype: float64

In [120]:
ts_eastern.tz_convert('Europe/Berlin')

2012-03-09 15:20:00+01:00    0.625552
2012-03-10 15:20:00+01:00    0.098082
2012-03-11 14:20:00+01:00    1.700966
2012-03-12 14:20:00+01:00    1.195476
2012-03-13 14:20:00+01:00   -0.120678
2012-03-14 14:20:00+01:00   -0.109268
2012-03-15 14:20:00+01:00    1.063441
Freq: D, dtype: float64