In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Daily FB prices indexed by date, plus a three-level bucket of the volume.
fb = pd.read_csv(
    '../datasets/fb.csv', index_col='date', parse_dates=True
).assign(
    # pd.cut splits the observed volume range into 3 bins labelled low/med/high
    trading_volume=lambda prices: pd.cut(
        prices.volume, bins=3, labels=['low', 'med', 'high']
    )
)

In [3]:
# Preview the first three rows: OHLC prices, volume and the derived trading_volume bucket.
fb.head(3)

Unnamed: 0_level_0,high,low,open,close,volume,trading_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,181.580002,177.550003,177.679993,181.419998,18151900,low
2018-01-03,184.779999,181.330002,181.880005,184.669998,16886600,low
2018-01-04,186.210007,184.100006,184.899994,184.330002,13880900,low


In [4]:
# Slice rows by date: on a DatetimeIndex a string slice in [] selects rows,
# and both endpoints are included (2018-10-11 and 2018-10-15 both appear).
fb['2018-10-11':'2018-10-15']

Unnamed: 0_level_0,high,low,open,close,volume,trading_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-10-11,154.809998,149.160004,150.130005,153.350006,35338900,low
2018-10-12,156.889999,151.300003,156.729996,153.740005,25293500,low
2018-10-15,155.570007,152.550003,153.320007,153.520004,15433500,low


In [5]:
# Equivalent to the plain [] date slice, but explicit about label-based row selection.
fb.loc['2018-10-11':'2018-10-15']

Unnamed: 0_level_0,high,low,open,close,volume,trading_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-10-11,154.809998,149.160004,150.130005,153.350006,35338900,low
2018-10-12,156.889999,151.300003,156.729996,153.740005,25293500,low
2018-10-15,155.570007,152.550003,153.320007,153.520004,15433500,low


In [6]:
# A single date label with .loc returns that row as a Series (columns become the index).
fb.loc['2018-10-11']

high              154.809998
low               149.160004
open              150.130005
close             153.350006
volume              35338900
trading_volume           low
Name: 2018-10-11 00:00:00, dtype: object

In [8]:
# Unlike a date *slice*, a single date string in [] is looked up as a column
# label, so this raises KeyError (use fb.loc['2018-10-11'] to get the row).
# Catch and display the error so the notebook still runs top to bottom.
try:
    fb['2018-10-11']
except KeyError as err:
    print(f'KeyError: {err}')

In [10]:
# Other slicing options: partial-string indexing also accepts a quarter,
# so '2018-q1' selects the rows for the first quarter of 2018.
fb.loc['2018-q1'].tail(2) 

Unnamed: 0_level_0,high,low,open,close,volume,trading_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-03-28,155.880005,150.800003,151.649994,153.029999,60029200,low
2018-03-29,161.419998,154.139999,155.149994,159.789993,59434300,low


#### Reindex Time Series (all days, including weekends)

In [52]:
# Every calendar day of 2018, so that non-trading days become explicit rows.
all_days_2018 = pd.date_range(start='2018-01-01', end='2018-12-31', freq='1D')
fb_reindex = fb.reindex(all_days_2018)
fb_reindex.head(2)  # weekends and holidays get null values

Unnamed: 0,high,low,open,close,volume,trading_volume
2018-01-01,,,,,,
2018-01-02,181.580002,177.550003,177.679993,181.419998,18151900.0,low


Use `first_valid_index` and `last_valid_index` to get the index labels of the first and last rows that contain non-null values.

In [55]:
# First rows of Q1 in the reindexed frame: 2018-01-01 carries no trading data, so it is all-null.
fb_reindex.loc['2018-Q1'].head(2)

Unnamed: 0,high,low,open,close,volume,trading_volume
2018-01-01,,,,,,
2018-01-02,181.580002,177.550003,177.679993,181.419998,18151900.0,low


In [57]:
# Last rows of Q1: the final two calendar days (03-30 and 03-31) carry no trading data.
fb_reindex.loc['2018-Q1'].tail(3)

Unnamed: 0,high,low,open,close,volume,trading_volume
2018-03-29,161.419998,154.139999,155.149994,159.789993,59434300.0,low
2018-03-30,,,,,,
2018-03-31,,,,,,


In [58]:
# Index label of the first Q1 row that holds non-null data.
fb_reindex.loc['2018-Q1'].first_valid_index()

Timestamp('2018-01-02 00:00:00', freq='D')

In [59]:
# Index label of the last Q1 row that holds non-null data.
fb_reindex.loc['2018-Q1'].last_valid_index()

Timestamp('2018-03-29 00:00:00', freq='D')

In [64]:
# first('1D') keeps only the first calendar day of the index; squeeze() turns
# that single row into a Series, and all() confirms every value on it is null.
# NOTE(review): DataFrame.first/last are deprecated as of pandas 2.1 — confirm
# the pandas version this notebook targets.
fb_reindex.first('1D').isna().squeeze().all()

True

In [67]:
# last('1D') keeps only the final calendar day of Q1 (2018-03-31), which is all-null.
fb_reindex.loc['2018-Q1'].last('1D')

Unnamed: 0,high,low,open,close,volume,trading_volume
2018-03-31,,,,,,


#### `asof`: returns the last valid value

In [68]:
# The last day of the 1st quarter was a weekend, so it has no data of its own;
# asof falls back to the most recent earlier row that has no nulls.
fb_reindex.asof('2018-03-31')

high              161.419998
low               154.139999
open              155.149994
close             159.789993
volume            59434300.0
trading_volume           low
Name: 2018-03-31 00:00:00, dtype: object

In [69]:
# asof on the quarter-end date should reproduce the last trading day's row.
quarter_end_asof = fb_reindex.asof('2018-03-31')
last_trading_day = fb_reindex.loc['2018-03-29']
quarter_end_asof == last_trading_day

high              True
low               True
open              True
close             True
volume            True
trading_volume    True
dtype: bool

#### `first` and `last` to slice the date

In [15]:
# first('1M') keeps the rows from the first month of the data set; show two of them.
fb.first('1M').head(2)

Unnamed: 0_level_0,high,low,open,close,volume,trading_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,181.580002,177.550003,177.679993,181.419998,18151900,low
2018-01-03,184.779999,181.330002,181.880005,184.669998,16886600,low


In [16]:
# Last week of March 2018 — the frame is restricted to '2018-03' first, so this
# is the final week of that month, not of the whole data set.
fb.loc['2018-03'].last('1W')

Unnamed: 0_level_0,high,low,open,close,volume,trading_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-03-26,161.100006,149.020004,160.820007,160.059998,126116600,high
2018-03-27,162.850006,150.75,156.309998,152.220001,79117000,med
2018-03-28,155.880005,150.800003,151.649994,153.029999,60029200,low
2018-03-29,161.419998,154.139999,155.149994,159.789993,59434300,low


`first` and `last` with `groupby` and __aggregation__

In [18]:
# Collapse the daily bars into one OHLC bar per calendar month.
monthly_ohlc_rules = {
    'open': 'first',   # first open of the month
    'high': 'max',     # highest high reached during the month
    'low': 'min',      # lowest low reached during the month
    'close': 'last',   # last close of the month
}
by_month = pd.Grouper(freq='1M')
fb.groupby(by_month).agg(monthly_ohlc_rules)

Unnamed: 0_level_0,open,high,low,close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-31,177.679993,190.660004,175.800003,186.889999
2018-02-28,188.220001,195.320007,167.179993,178.320007
2018-03-31,179.009995,186.100006,149.020004,159.789993
2018-04-30,157.809998,177.100006,150.509995,172.0
2018-05-31,172.0,192.720001,170.229996,191.779999
2018-06-30,193.070007,203.550003,186.429993,194.320007
2018-07-31,193.369995,218.619995,166.559998,172.580002
2018-08-31,173.929993,188.300003,170.270004,175.729996
2018-09-30,173.5,173.889999,158.869995,164.460007
2018-10-31,163.029999,165.880005,139.029999,151.789993


#### Working with minutes

[Parsing dates Python documentation](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior)

Example:
`pd.to_datetime(df, format='%Y-%m-%d %H-%M')`

In [20]:
# Load the minute bars. The explicit format ('%Y-%m-%d %H-%M') tells pandas how
# to parse the file's date strings into the DatetimeIndex.
fb_per_minute = pd.read_csv(
    '../datasets/fb_per_minute.csv',
    index_col='date',
    parse_dates=True,
    # date_parser must be a callable that receives the raw date strings.
    # Passing pd.to_datetime(fb_per_minute, format=...) directly does not work
    # because fb_per_minute is not defined until read_csv returns — hence the lambda.
    # NOTE(review): date_parser is deprecated in pandas 2.0 in favour of
    # date_format — confirm the pandas version this notebook targets.
    date_parser=lambda x: pd.to_datetime(x, format='%Y-%m-%d %H-%M')
)

In [21]:
# Confirm the parse: the index should now hold minute-resolution timestamps.
fb_per_minute.head(2)

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-05-20 09:30:00,181.62,181.62,181.62,181.62,159049.0
2019-05-20 09:31:00,182.61,182.61,182.61,182.61,468017.0


`at_time` and `between_time`

In [22]:
# at_time selects the rows stamped exactly 10:38, whatever the date.
fb_per_minute.at_time('10:38')

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-05-20 10:38:00,183.92,183.92,183.92,183.92,30764.0
2019-05-21 10:38:00,184.533,184.533,184.533,184.533,22949.0
2019-05-22 10:38:00,185.94,185.94,185.94,185.94,41518.0
2019-05-23 10:38:00,182.66,182.66,182.66,182.66,56811.0
2019-05-24 10:38:00,182.199,182.199,182.199,182.199,45535.0


In [23]:
# between_time selects rows by time of day across all dates; both endpoints
# (10:38 and 10:40) are included.
fb_per_minute.between_time('10:38', '10:40').head(10)

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-05-20 10:38:00,183.92,183.92,183.92,183.92,30764.0
2019-05-20 10:39:00,183.89,183.89,183.89,183.89,9535.0
2019-05-20 10:40:00,183.98,183.98,183.98,183.98,8618.0
2019-05-21 10:38:00,184.533,184.533,184.533,184.533,22949.0
2019-05-21 10:39:00,184.485,184.485,184.485,184.485,20916.0
2019-05-21 10:40:00,184.4538,184.4538,184.4538,184.4538,12996.0
2019-05-22 10:38:00,185.94,185.94,185.94,185.94,41518.0
2019-05-22 10:39:00,185.98,185.98,185.98,185.98,36220.0
2019-05-22 10:40:00,186.045,186.045,186.045,186.045,28520.0
2019-05-23 10:38:00,182.66,182.66,182.66,182.66,56811.0


In [28]:
# fb_per_minute.resample('1D').last()

In [29]:
# Final rows of the file — the last bar is stamped 16:00.
fb_per_minute.tail(2)

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-05-24 15:59:00,181.07,181.07,181.07,181.07,52994.0
2019-05-24 16:00:00,181.06,181.06,181.06,181.06,764906.0


In [31]:
# Check the volume in the last 2 minutes of each day (the 15:59 and 16:00 bars).
# Selecting [['volume']] with double brackets keeps the result a DataFrame.
fb_per_minute[['volume']].between_time('15:59', '16:00')

Unnamed: 0_level_0,volume
date,Unnamed: 1_level_1
2019-05-20 15:59:00,134569.0
2019-05-20 16:00:00,1113672.0
2019-05-21 15:59:00,61606.0
2019-05-21 16:00:00,801080.0
2019-05-22 15:59:00,96099.0
2019-05-22 16:00:00,1220993.0
2019-05-23 15:59:00,109648.0
2019-05-23 16:00:00,1329217.0
2019-05-24 15:59:00,52994.0
2019-05-24 16:00:00,764906.0


Every single day the volume is higher in the very last minute of trades. Check if the __first 30 min__ of trading volume is higher than the __last 30 min__

In [32]:
# Re-check the first rows: the earliest bar shown is stamped 09:30.
fb_per_minute.head(2)

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-05-20 09:30:00,181.62,181.62,181.62,181.62,159049.0
2019-05-20 09:31:00,182.61,182.61,182.61,182.61,468017.0


In [45]:
# Mean per-minute volume inside the 09:00-09:30 window, keeping only the days
# on which every bar in the window traded (volume > 0).
# NOTE(review): the first bar shown by head() is stamped 09:30, so this window
# appears to match a single bar per day — the opening minute — rather than the
# first 30 minutes of trading. between_time('09:30', '10:00') looks like the
# intended window; confirm, and re-run the cells that reuse this window if changed.
first_30_min = fb_per_minute.between_time('09:00', '09:30').groupby(
    pd.Grouper(freq='1D')
    ).filter(
        lambda x: (x.volume > 0).all()
    ).volume.mean()
first_30_min

86683.2

In [46]:
# Same result via the mean of the per-day mean volumes. This equals the overall
# mean only when every day contributes the same number of rows to the window.
fb_per_minute.between_time('09:00', '09:30').groupby(
    pd.Grouper(freq='1D')).volume.mean().mean()

86683.2

In [48]:
# Inspect the rows that survive the filter: filter() returns the original rows
# of every daily group that passes the volume check, not an aggregate.
fb_per_minute.between_time('09:00', '09:30').groupby(
    pd.Grouper(freq='1D')
    ).filter(
        lambda x: (x.volume > 0).all()
    )

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-05-20 09:30:00,181.62,181.62,181.62,181.62,159049.0
2019-05-21 09:30:00,184.53,184.53,184.53,184.53,58171.0
2019-05-22 09:30:00,184.81,184.81,184.81,184.81,41585.0
2019-05-23 09:30:00,182.5,182.5,182.5,182.5,121930.0
2019-05-24 09:30:00,182.33,182.33,182.33,182.33,52681.0


In [50]:
# Daily means over the same window. The values match the filtered rows only
# because each day has a single row in this window; here the index is the day,
# not the bar's timestamp.
fb_per_minute.between_time('09:00', '09:30').groupby(
    pd.Grouper(freq='1D')
    ).mean()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-05-20,181.62,181.62,181.62,181.62,159049.0
2019-05-21,184.53,184.53,184.53,184.53,58171.0
2019-05-22,184.81,184.81,184.81,184.81,41585.0
2019-05-23,182.5,182.5,182.5,182.5,121930.0
2019-05-24,182.33,182.33,182.33,182.33,52681.0


#### Shifting - lagged data
Before: `shift` shifts the values, `tshift` shifts the index. Now: `tshift` is deprecated and will be removed in a future version (pandas warns: "Please use `shift` instead").
- With a `freq` argument, `shift()` moves the index and leaves the values unchanged; without `freq`, it moves the values and leaves the index unchanged.

In [71]:
# Baseline rows to compare against the shifted versions.
fb_per_minute.head(2)

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-05-20 09:30:00,181.62,181.62,181.62,181.62,159049.0
2019-05-20 09:31:00,182.61,182.61,182.61,182.61,468017.0


In [76]:
# Shifts the index forward by one day and leaves the values untouched.
# tshift is deprecated (it emits a warning here); shift with a freq replaces it.
fb_per_minute.tshift(periods=1, freq='D').head(2)

  fb_per_minute.tshift(periods=1, freq='D').head(2)


Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-05-21 09:30:00,181.62,181.62,181.62,181.62,159049.0
2019-05-21 09:31:00,182.61,182.61,182.61,182.61,468017.0


In [81]:
# With a freq argument ('D'), shift moves the DatetimeIndex by one day and
# leaves the values as they are — the same result tshift gives.
fb_per_minute.shift(1, 'D').head(2)

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-05-21 09:30:00,181.62,181.62,181.62,181.62,159049.0
2019-05-21 09:31:00,182.61,182.61,182.61,182.61,468017.0


#### Differencing
$diff_t = x_t - x_{t-1}$; in other words, `data.diff() = data - data.shift()`

In [83]:
# diff() should equal each value minus the previous one (the value shifted down a row).
manual_diff = fb.volume - fb.volume.shift()
manual_diff.equals(fb.volume.diff())

True