In [1]:
import pandas as pd
import numpy as np

# Time in Pandas
https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases

4 kinds in Pandas:
- Date times: 

    Creation: `to_datetime`, `date_range`

    Scalar (single value): `Timestamp`

- Time deltas:

    Creation: `to_timedelta`, `timedelta_range`

    Scalar (single value): `Timedelta`

- Time spans:

    Creation: `Period`, `Period_range`

    Scalar (single value): `Period`

- Date offsets:

    Creation: `DateOffset`

    Scalar: `DateOffset`


## Frequency

*S after a freq represents start. B in front of a freq means business setting*. 

B: business day

C: custom business day

D: calendar day

W: weekly

M: month **end**

SM: semi-month end

BM: business month end

MS: month **start**

Q: quarter

BQ: business quarter end

Y: year end

H: hour

T: minute

S: second

In [2]:
pd.date_range('2022-01-01', periods=7, freq='AS')

DatetimeIndex(['2022-01-01', '2023-01-01', '2024-01-01', '2025-01-01',
               '2026-01-01', '2027-01-01', '2028-01-01'],
              dtype='datetime64[ns]', freq='AS-JAN')

**Custom Frequency**

When customizing frequency, one have to use `bdate_range` rather than `date_range`.

In [3]:
weekmask = 'Mon Wed Fri'
holidays = ['2022-01-05']
pd.bdate_range('2022-01-01', periods=10, freq='C', weekmask=weekmask, holidays=holidays)

DatetimeIndex(['2022-01-03', '2022-01-07', '2022-01-10', '2022-01-12',
               '2022-01-14', '2022-01-17', '2022-01-19', '2022-01-21',
               '2022-01-24', '2022-01-26'],
              dtype='datetime64[ns]', freq='C')

## Time Zone
- `tz_localize` sets time zone for an unset time series.
- `tz_convert` turns same time to a different time zone only. Time will change.
> What if you want to change time zone while keep time value unchanged? Firstly, set time zone to None, then set it to the one you want. 
`
s = s.tz_localize("UTC")
s = s.tz_convert("America/New_York")
s = s.tz_localize(None)
s.tz_localize("UTC")
`


In [31]:
time = pd.date_range('2022-01-01', periods=7, freq='D', tz='UTC')
# time = time.tz_localize('UTC')
print(time)
time = time.tz_convert('US/Pacific')
print(time)

DatetimeIndex(['2022-01-01 00:00:00+00:00', '2022-01-02 00:00:00+00:00',
               '2022-01-03 00:00:00+00:00', '2022-01-04 00:00:00+00:00',
               '2022-01-05 00:00:00+00:00', '2022-01-06 00:00:00+00:00',
               '2022-01-07 00:00:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')
DatetimeIndex(['2021-12-31 16:00:00-08:00', '2022-01-01 16:00:00-08:00',
               '2022-01-02 16:00:00-08:00', '2022-01-03 16:00:00-08:00',
               '2022-01-04 16:00:00-08:00', '2022-01-05 16:00:00-08:00',
               '2022-01-06 16:00:00-08:00'],
              dtype='datetime64[ns, US/Pacific]', freq='D')


## Resample
Only apply to `pd.Series`.

In [5]:
# time.resample('2D').mean() <- this does not work. 
ts = pd.Series(range(len(time)), index=time)
ts.resample('2D').mean()

2021-12-31 00:00:00-08:00    0.5
2022-01-02 00:00:00-08:00    2.5
2022-01-04 00:00:00-08:00    4.5
2022-01-06 00:00:00-08:00    6.0
Freq: 2D, dtype: float64

## Offsets
A easier way to increment certain time period than `pd.Timedelta`.

In [6]:
tp = pd.Timestamp('2022-01-01')
print(tp.day_name, tp.day_of_week, tp.day_of_year, tp.is_leap_year, tp.is_month_start)
tp = tp + pd.Timedelta('1 day')
print(tp)

tp = tp + pd.offsets.MonthEnd(2)
print(tp)

<built-in method day_name of Timestamp object at 0x7feab5dba3d0> 5 1 False True
2022-01-02 00:00:00
2022-02-28 00:00:00


In [7]:
from pandas.tseries.holiday import USFederalHolidayCalendar
bm_us = pd.offsets.CustomBusinessMonthBegin(calendar=USFederalHolidayCalendar())
dt = pd.to_datetime('2013-12-17')
print(dt + bm_us)

2014-01-02 00:00:00


## Periods

In [8]:
pd.period_range('2022-01-01', freq='M', periods=3)

PeriodIndex(['2022-01', '2022-02', '2022-03'], dtype='period[M]')

In [9]:
pd.Period('2022')

Period('2022', 'A-DEC')

## Timestamps
`errors` parameter can be `raise`, `ignore`, `coerce`.

In [10]:
pd.to_datetime(['2006-12-31 00:00', '2022-01-01 00:00'], format='%Y-%m-%d %H:%M')

DatetimeIndex(['2006-12-31', '2022-01-01'], dtype='datetime64[ns]', freq=None)

Generate time from data frame. 

In [11]:
df = pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5], "hour": [2, 3]})
pd.to_datetime(df[['year', 'month', 'day']])

0   2015-02-04
1   2016-03-05
dtype: datetime64[ns]

Generate from `unit`, which can be D, s, ms, us, ns.

> **Warning**
cannot take in tz for `to_datetime`

In [12]:
pd.to_datetime([1349720105, 1349806505, 1349892905, 1349979305, 1350065705], unit="s").tz_localize(
    'UTC')

DatetimeIndex(['2012-10-08 18:15:05+00:00', '2012-10-09 18:15:05+00:00',
               '2012-10-10 18:15:05+00:00', '2012-10-11 18:15:05+00:00',
               '2012-10-12 18:15:05+00:00'],
              dtype='datetime64[ns, UTC]', freq=None)

In [13]:
pd.to_datetime([1, 2, 3], unit='D', origin='1960-01-01')

DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)

## Indexing

In [14]:
dft = pd.DataFrame(np.random.randn(100000, 1), columns=["A"], index=pd.date_range(
    "20130101", periods=100000, freq="T"))
dft['2013-02']
dft.loc['2013-02', ]

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,A
2013-02-01 00:00:00,1.107649
2013-02-01 00:01:00,0.017097
2013-02-01 00:02:00,1.145414
2013-02-01 00:03:00,-0.284023
2013-02-01 00:04:00,-0.094950
...,...
2013-02-28 23:55:00,-0.623602
2013-02-28 23:56:00,-0.205251
2013-02-28 23:57:00,0.818553
2013-02-28 23:58:00,-0.692909


In particular, ``loc`` includes both start time/index and end time/index. 

Now, consider multi-indexing...

In [15]:
dft2 = pd.DataFrame(np.random.randn(20, 1), columns=["A"], 
                    index=pd.MultiIndex.from_product([pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]]), )

# dft2['2013-01-05'] <- this can not work. 
dft2.loc['2013-01-05']

Unnamed: 0,Unnamed: 1,A
2013-01-05 00:00:00,a,-0.903898
2013-01-05 00:00:00,b,-1.341907
2013-01-05 12:00:00,a,0.974539
2013-01-05 12:00:00,b,-1.667954


Another way to select: `truncate`. Use `before` and `after` to truncate the unwanted dates. 

In [16]:
dft.truncate(before='2013-02-05', after='2013-02-20')

Unnamed: 0,A
2013-02-05 00:00:00,-0.597508
2013-02-05 00:01:00,-0.366146
2013-02-05 00:02:00,-0.524385
2013-02-05 00:03:00,0.538110
2013-02-05 00:04:00,-0.630086
...,...
2013-02-19 23:56:00,0.328519
2013-02-19 23:57:00,-0.904153
2013-02-19 23:58:00,0.222557
2013-02-19 23:59:00,-2.068167


## Shift
When takes in a `freq` parameter, the dataframe change the datetime index instead of the values. 

In [17]:
index = pd.date_range(start='2020-01-01', periods=60, freq='2B')
ts = pd.Series(range(len(index)), index=index)
ts.head()

2020-01-01    0
2020-01-03    1
2020-01-07    2
2020-01-09    3
2020-01-13    4
Freq: 2B, dtype: int64

In [18]:
ts.shift(1, freq='B').head()

2020-01-02    0
2020-01-06    1
2020-01-08    2
2020-01-10    3
2020-01-14    4
dtype: int64

## Frequency Change
For `asfreq`, basically it's based on date_range and reindex. 

https://pandas.pydata.org/docs/reference/api/pandas.Series.asfreq.html#pandas.Series.asfreq

‘pad’ / ‘ffill’: propagate last valid observation forward to next valid

‘backfill’ / ‘bfill’: use NEXT valid observation to fill.

In [19]:
print(ts.asfreq(freq='B', fill_value=-1))
ts.asfreq(freq='3B', method="pad").head()

2020-01-01     0
2020-01-02    -1
2020-01-03     1
2020-01-06    -1
2020-01-07     2
              ..
2020-06-09    57
2020-06-10    -1
2020-06-11    58
2020-06-12    -1
2020-06-15    59
Freq: B, Length: 119, dtype: int64


2020-01-01    0
2020-01-06    1
2020-01-09    3
2020-01-14    4
2020-01-17    6
Freq: 3B, dtype: int64

Another powerful tool is `resample`, which is based on groupby.
For downsampling, it should be followed by methods:
- sum, mean, std, sem, max, min, median, first, last, `ohlc`, first, last, quantile, agg

In [20]:
ts.resample('10B', closed="left", label='left').ohlc()

Unnamed: 0,open,high,low,close
2020-01-01,0,4,0,4
2020-01-15,5,9,5,9
2020-01-29,10,14,10,14
2020-02-12,15,19,15,19
2020-02-26,20,24,20,24
2020-03-11,25,29,25,29
2020-03-25,30,34,30,34
2020-04-08,35,39,35,39
2020-04-22,40,44,40,44
2020-05-06,45,49,45,49


In [21]:
ts.resample('10B').agg(['mean', np.std])

Unnamed: 0,mean,std
2020-01-01,2.0,1.581139
2020-01-15,7.0,1.581139
2020-01-29,12.0,1.581139
2020-02-12,17.0,1.581139
2020-02-26,22.0,1.581139
2020-03-11,27.0,1.581139
2020-03-25,32.0,1.581139
2020-04-08,37.0,1.581139
2020-04-22,42.0,1.581139
2020-05-06,47.0,1.581139


For upsampling, it should be followed by: 
- pad, asfreq, bfill, apply, interpolate, transform(lambda...), ffill, bfill

In [22]:
ts.resample('T').interpolate(method='polynomial', order=2)

2020-01-01 00:00:00     0.000000
2020-01-01 00:01:00     0.000439
2020-01-01 00:02:00     0.000877
2020-01-01 00:03:00     0.001316
2020-01-01 00:04:00     0.001754
                         ...    
2020-06-14 23:56:00    58.999889
2020-06-14 23:57:00    58.999916
2020-06-14 23:58:00    58.999944
2020-06-14 23:59:00    58.999972
2020-06-15 00:00:00    59.000000
Freq: T, Length: 239041, dtype: float64

## Period
`Period`, `period_range`, `PeriodIndex`

In [23]:
pd.Period('2012-1-1', freq='H')

Period('2012-01-01 00:00', 'H')

In [24]:
pd.Period('2012-1-1', freq='H') + pd.offsets.Hour(2)
pd.Period('2012-1-1', freq='H') + pd.Timedelta('2H')

Period('2012-01-01 02:00', 'H')

In [25]:
pd.period_range('2011-01-01', '2012-01-01', freq='M')
pd.PeriodIndex(["2011-1", "2011-2", "2011-3"], freq="M")

PeriodIndex(['2011-01', '2011-02', '2011-03'], dtype='period[M]')

In [26]:
pd.PeriodIndex(["2011-1", "2011-2", "2011-3"], freq="M").astype('period[D]')

PeriodIndex(['2011-01-31', '2011-02-28', '2011-03-31'], dtype='period[D]')

`asfreq`

In [27]:
p = pd.Period("2011", freq="A-DEC") # Annual, end in December
p.asfreq('M', how='start')

Period('2011-01', 'M')

## Link between Periods and Timestamp

In [28]:
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(range(len(rng)), index=rng)
ps = ts.to_period()
ps

2012-01    0
2012-02    1
2012-03    2
2012-04    3
2012-05    4
Freq: M, dtype: int64

In [29]:
ps.to_timestamp()

2012-01-01    0
2012-02-01    1
2012-03-01    2
2012-04-01    3
2012-05-01    4
Freq: MS, dtype: int64

## More on Time deltas
> **Limitations** timedeltas use 64 bit integers in nanosecond. can be looked up by `pd.Timedelta.min` and `pd.Timedelta.max`.

- Timedelta supports numerical calculations.
- pd.Timedelta_range.