In [1]:
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

In [2]:
import os
from datetime import datetime
from pathlib import Path

import pandas as pd
import numpy as np

import dask
from dask import dataframe as dd
from dask import array as da

data_path = Path(os.getcwd()).parent/"data"

In [56]:
parquet_path = data_path/"repartition_wp_w_div.parquet"

read_ddf = dd.read_parquet(parquet_path)
print(f"Division know : {read_ddf.known_divisions}, number of partition : {read_ddf.npartitions}")

Division know : False, number of partition : 55


### Create sample data for code testing

In [164]:
sample_ddf = \
(read_ddf
 .query("User in (0, 1, 2)")
 .groupby("User")
 .apply(lambda x: x.iloc[0:5])
 .reset_index(drop=True)
 .astype({"Year":"string"})
)
# sample_ddf.compute()
sample_ddf.to_parquet(data_path/"sample.parquet", write_index=False)

**!!IMPORTANT!!** to `order by` have to `sort_values` the dataframe before .groupby()

`Categorical` datatype will interfere the Aggregation, even not in the .groupby, better cast to `string`

In [177]:
sample_ddf = dd.read_parquet(data_path/"sample.parquet")[["Year", "dt", "User", "amount"]]
sample_ddf.compute()

Unnamed: 0,Year,dt,User,amount
0,2007,2007-07-06 23:53:00,2,1.73
1,2007,2007-07-06 16:35:00,2,16.42
2,2007,2007-07-06 09:40:00,2,44.9
3,2007,2007-07-06 04:36:00,2,35.59
4,2007,2007-07-05 14:54:00,2,118.64
0,2007,2007-03-31 06:02:00,0,33.1
1,2007,2007-03-29 09:43:00,0,176.53
2,2007,2007-03-29 06:00:00,0,114.56
3,2007,2007-03-29 05:45:00,0,117.62
4,2007,2007-03-27 16:22:00,0,110.43


`.groupby().first()` = First row of each group

In [178]:
first_of_each_group = (sample_ddf
 .sort_values("dt", ascending = True)
 .groupby(["User"])
 .first()
 )
first_of_each_group.compute()

Unnamed: 0_level_0,Year,dt,amount
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2007,2007-03-27 16:22:00,110.43
2,2007,2007-07-05 14:54:00,118.64
1,2007,2007-12-06 22:18:00,80.0


In [179]:
sample_ddf.query("User == 0").sort_values("dt").compute().head(1)

Unnamed: 0,Year,dt,User,amount
4,2007,2007-03-27 16:22:00,0,110.43


In [180]:
sample_ddf.query("User == 1").sort_values("dt").compute().head(1)

Unnamed: 0,Year,dt,User,amount
4,2007,2007-12-06 22:18:00,1,80.0


`.groupby().last()` for last row in the group

In [181]:
(sample_ddf
 .sort_values("dt", ascending = True)
 .groupby(["User"])
 .last()
 ).compute()

Unnamed: 0_level_0,Year,dt,amount
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2007,2007-03-31 06:02:00,33.1
2,2007,2007-07-06 23:53:00,1.73
1,2007,2007-12-14 06:47:00,15.26


In [182]:
sample_ddf.query("User == 0").sort_values("dt").compute().tail(1)

Unnamed: 0,Year,dt,User,amount
0,2007,2007-03-31 06:02:00,0,33.1


## Lead , Lag function
**IMPORTANT!! sort data first before lag / lead**

In [187]:
(sample_ddf
 .sort_values(["User", "dt"], ascending=[True, True])
 .assign(lagged_amount = lambda x : x["amount"].shift(1))
 .assign(ledded_amount = lambda x : x["amount"].shift(-1))
).compute()

Unnamed: 0,Year,dt,User,amount,lagged_amount,ledded_amount
4,2007,2007-03-27 16:22:00,0,110.43,,117.62
3,2007,2007-03-29 05:45:00,0,117.62,110.43,114.56
2,2007,2007-03-29 06:00:00,0,114.56,117.62,176.53
1,2007,2007-03-29 09:43:00,0,176.53,114.56,33.1
0,2007,2007-03-31 06:02:00,0,33.1,176.53,80.0
4,2007,2007-12-06 22:18:00,1,80.0,33.1,21.52
3,2007,2007-12-07 14:42:00,1,21.52,80.0,17.4
2,2007,2007-12-09 06:36:00,1,17.4,21.52,62.18
1,2007,2007-12-09 19:49:00,1,62.18,17.4,15.26
0,2007,2007-12-14 06:47:00,1,15.26,62.18,118.64


## Lead , Lag with Partition
Currenly Dask support partly window function with partition.  
**Better use Pandas for this task**

In [185]:
# Pandas style
pdf = sample_ddf.compute()
(pdf
 .sort_values(["User", "dt"], ascending=True)
 .assign(by_usr_lag_amt = lambda x : x.groupby("User")["amount"].shift(1))
 .assign(by_usr_led_amt = lambda x : x.groupby("User")["amount"].shift(-1))
)

Unnamed: 0,Year,dt,User,amount,by_usr_lag_amt,by_usr_led_amt
4,2007,2007-03-27 16:22:00,0,110.43,,117.62
3,2007,2007-03-29 05:45:00,0,117.62,110.43,114.56
2,2007,2007-03-29 06:00:00,0,114.56,117.62,176.53
1,2007,2007-03-29 09:43:00,0,176.53,114.56,33.1
0,2007,2007-03-31 06:02:00,0,33.1,176.53,
4,2007,2007-12-06 22:18:00,1,80.0,,21.52
3,2007,2007-12-07 14:42:00,1,21.52,80.0,17.4
2,2007,2007-12-09 06:36:00,1,17.4,21.52,62.18
1,2007,2007-12-09 19:49:00,1,62.18,17.4,15.26
0,2007,2007-12-14 06:47:00,1,15.26,62.18,


In [None]:
# Pandas style, NOT APPLICABLE with Dask Dataframe
(sample_ddf
 .sort_values(["User", "dt"], ascending=True)
 .assign(by_usr_lag_amt = lambda x : x.groupby("User")["amount"].shift(1))
 .assign(by_usr_led_amt = lambda x : x.groupby("User")["amount"].shift(-1))
)#.compute() will error

**Dask Series**, have SeriesGroupby that could do the partition lag/lead with `.groupby()[].apply(lambda x : x.shift()`

Note, with `.apply()` have to specifiy keyward `meta`=(column name:data type)  

In [188]:
sorted_ddf = sample_ddf.sort_values(["User", "dt"], ascending=True)
lag_amt = sorted_ddf.groupby("User")["amount"].apply(lambda x: x.shift(1), meta=("lag_amt","float"))
lag_series = lag_amt.compute()

In [189]:
lag_series.reset_index().query("User == 0").head()

Unnamed: 0,User,level_1,lag_amt
5,0,4,
6,0,3,110.43
7,0,2,117.62
8,0,1,114.56
9,0,0,176.53


In [190]:
sorted_ddf.query("User == 0").compute().head()

Unnamed: 0,Year,dt,User,amount
4,2007,2007-03-27 16:22:00,0,110.43
3,2007,2007-03-29 05:45:00,0,117.62
2,2007,2007-03-29 06:00:00,0,114.56
1,2007,2007-03-29 09:43:00,0,176.53
0,2007,2007-03-31 06:02:00,0,33.1


In [191]:
# Exploring on transform, not Done
(sample_ddf
 .sort_values(["User", "dt"], ascending=True)
 .groupby("User")
 [["amount"]]
 .transform(lambda x : x.shift(1))
).compute().reset_index()

Unnamed: 0,index,amount
0,4,
1,3,118.64
2,2,35.59
3,1,44.9
4,0,16.42
5,4,
6,3,110.43
7,2,117.62
8,1,114.56
9,0,176.53


## Window Aggregate 
2 types of windows aggregate    
A) Without Partition  
B) With Partition  

with row position parameter like
- Rows between
- Unbound precedingm
- Unbound following  

In [3]:
sample_ddf = dd.read_parquet(data_path/"sample.parquet")[["Year", "dt", "User", "amount"]]
pdf = sample_ddf.compute()

In [4]:
display(pdf.sort_values(["User", "dt"], ascending=[True, True]))

Unnamed: 0,Year,dt,User,amount
4,2007,2007-03-27 16:22:00,0,110.43
3,2007,2007-03-29 05:45:00,0,117.62
2,2007,2007-03-29 06:00:00,0,114.56
1,2007,2007-03-29 09:43:00,0,176.53
0,2007,2007-03-31 06:02:00,0,33.1
4,2007,2007-12-06 22:18:00,1,80.0
3,2007,2007-12-07 14:42:00,1,21.52
2,2007,2007-12-09 06:36:00,1,17.4
1,2007,2007-12-09 19:49:00,1,62.18
0,2007,2007-12-14 06:47:00,1,15.26


**SQL equivalant**
```
select *
, sum(amount) over (order by dt asc 
                    rows between 2 preceding and current row) as rolling3_sum
from sample_ddf
```

#### Pandas

In [21]:
# Pandas - result same as SQL
(pdf
 .sort_values(["User", "dt"], ascending=True)
 .assign(rolling3_sum = lambda x : x["amount"].rolling(window=3, min_periods=1).sum().reset_index(drop=True).values)
)

Unnamed: 0,Year,dt,User,amount,rolling3_sum
4,2007,2007-03-27 16:22:00,0,110.43,110.43
3,2007,2007-03-29 05:45:00,0,117.62,228.05
2,2007,2007-03-29 06:00:00,0,114.56,342.61
1,2007,2007-03-29 09:43:00,0,176.53,408.71
0,2007,2007-03-31 06:02:00,0,33.1,324.19
4,2007,2007-12-06 22:18:00,1,80.0,289.63
3,2007,2007-12-07 14:42:00,1,21.52,134.62
2,2007,2007-12-09 06:36:00,1,17.4,118.92
1,2007,2007-12-09 19:49:00,1,62.18,101.1
0,2007,2007-12-14 06:47:00,1,15.26,94.84


In [23]:
# Pandas - without out min_period=1, Pandas will give NaN for row with not enough data to calculate windos
(pdf
 .sort_values(["User", "dt"], ascending=True)
 .assign(rolling3_sum = lambda x : x["amount"].rolling(window=3).sum().reset_index(drop=True).values)
)

Unnamed: 0,Year,dt,User,amount,rolling3_sum
4,2007,2007-03-27 16:22:00,0,110.43,
3,2007,2007-03-29 05:45:00,0,117.62,
2,2007,2007-03-29 06:00:00,0,114.56,342.61
1,2007,2007-03-29 09:43:00,0,176.53,408.71
0,2007,2007-03-31 06:02:00,0,33.1,324.19
4,2007,2007-12-06 22:18:00,1,80.0,289.63
3,2007,2007-12-07 14:42:00,1,21.52,134.62
2,2007,2007-12-09 06:36:00,1,17.4,118.92
1,2007,2007-12-09 19:49:00,1,62.18,101.1
0,2007,2007-12-14 06:47:00,1,15.26,94.84


#### Dask - Rolling
Dask need to `set_index` on datetime datatype (and convert to know division), then could do the rolling operation

In [74]:
sample_ddf.known_divisions

False

In [75]:
timed_idx_ddf = sample_ddf.set_index("dt", sort=True) 

In [76]:
timed_idx_ddf.known_divisions

True

In [77]:
timed_idx_ddf.npartitions

55

In [78]:
timed_idx_ddf.divisions[:10]

(Timestamp('2007-03-27 16:22:00'),
 Timestamp('2007-03-28 02:13:20.181818112'),
 Timestamp('2007-03-28 12:04:40.363636480'),
 Timestamp('2007-03-28 21:56:00.545454592'),
 Timestamp('2007-03-29 05:45:24.545454592'),
 Timestamp('2007-03-29 05:47:23.181818112'),
 Timestamp('2007-03-29 05:49:21.818181888'),
 Timestamp('2007-03-29 05:51:20.454545408'),
 Timestamp('2007-03-29 05:53:19.090909184'),
 Timestamp('2007-03-29 05:55:17.727272704'))

In [79]:
(timed_idx_ddf
 .assign(roll3_sum = lambda x : x["amount"].rolling(window = 2, min_periods=1).sum())
).compute()

Unnamed: 0_level_0,Year,User,amount,roll3_sum
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-03-27 16:22:00,2007,0,110.43,110.43
2007-03-29 05:45:00,2007,0,117.62,228.05
2007-03-29 06:00:00,2007,0,114.56,232.18
2007-03-29 09:43:00,2007,0,176.53,291.09
2007-03-31 06:02:00,2007,0,33.1,209.63
2007-07-05 14:54:00,2007,2,118.64,151.74
2007-07-06 04:36:00,2007,2,35.59,154.23
2007-07-06 09:40:00,2007,2,44.9,80.49
2007-07-06 16:35:00,2007,2,16.42,61.32
2007-07-06 23:53:00,2007,2,1.73,18.15


Anyway, when increase rolling windows, there are conflict with npartition

In [80]:
(timed_idx_ddf
 .assign(roll3_sum = lambda x : x["amount"].rolling(window = 4, min_periods=1).sum())
).compute()

NotImplementedError: Partition size is less than overlapping window size. Try using ``df.repartition`` to increase the partition size.

## Windows aggregate With Partition

**SQL equivalant**
```
select *
, sum(amount) over (partition by User order by dt asc 
                    rows between 2 preceding and current row) as rolling3_sum
from sample_ddf
```

In [19]:
# Pandas - result same as SQL
(pdf
 .sort_values(["User", "dt"], ascending=True)
 .assign(rolling3_sum = lambda x : x["amount"].rolling(window=3, min_periods=1).sum().reset_index(drop=True).values)
)

Unnamed: 0,Year,dt,User,amount,rolling3_sum
4,2007,2007-03-27 16:22:00,0,110.43,110.43
3,2007,2007-03-29 05:45:00,0,117.62,228.05
2,2007,2007-03-29 06:00:00,0,114.56,342.61
1,2007,2007-03-29 09:43:00,0,176.53,408.71
0,2007,2007-03-31 06:02:00,0,33.1,324.19
4,2007,2007-12-06 22:18:00,1,80.0,289.63
3,2007,2007-12-07 14:42:00,1,21.52,134.62
2,2007,2007-12-09 06:36:00,1,17.4,118.92
1,2007,2007-12-09 19:49:00,1,62.18,101.1
0,2007,2007-12-14 06:47:00,1,15.26,94.84


### Pandas

In [81]:
(pdf
 .sort_values(["User", "dt"], ascending=True)
 .assign(rolling3_sum = lambda x : x.groupby(["User"])["amount"].rolling(window=3, min_periods=1).sum().reset_index(drop=True).values)
)

Unnamed: 0,Year,dt,User,amount,rolling3_sum
4,2007,2007-03-27 16:22:00,0,110.43,110.43
3,2007,2007-03-29 05:45:00,0,117.62,228.05
2,2007,2007-03-29 06:00:00,0,114.56,342.61
1,2007,2007-03-29 09:43:00,0,176.53,408.71
0,2007,2007-03-31 06:02:00,0,33.1,324.19
4,2007,2007-12-06 22:18:00,1,80.0,80.0
3,2007,2007-12-07 14:42:00,1,21.52,101.52
2,2007,2007-12-09 06:36:00,1,17.4,118.92
1,2007,2007-12-09 19:49:00,1,62.18,101.1
0,2007,2007-12-14 06:47:00,1,15.26,94.84


In [None]:
# Dask , must rolling with known division
(sample_ddf
 .sort_values(["User", "dt"], ascending=True)
 .assign(rolling3_sum = lambda x : x.groupby(["User"])["amount"].rolling(window=3).sum().reset_index(drop=True).values)
)

### Dask - Rolling
Dask need to `set_index` on datetime Series first, then could do the rolling operation

In [13]:
timed_idx_ddf = sample_ddf.set_index("dt", sort=True) 

In [15]:
timed_idx_ddf.known_divisions

True

In [16]:
timed_idx_ddf.divisions

(Timestamp('2007-03-27 16:22:00'),
 Timestamp('2007-03-28 02:13:20.181818112'),
 Timestamp('2007-03-28 12:04:40.363636480'),
 Timestamp('2007-03-28 21:56:00.545454592'),
 Timestamp('2007-03-29 05:45:24.545454592'),
 Timestamp('2007-03-29 05:47:23.181818112'),
 Timestamp('2007-03-29 05:49:21.818181888'),
 Timestamp('2007-03-29 05:51:20.454545408'),
 Timestamp('2007-03-29 05:53:19.090909184'),
 Timestamp('2007-03-29 05:55:17.727272704'),
 Timestamp('2007-03-29 05:57:16.363636480'),
 Timestamp('2007-03-29 05:59:15'),
 Timestamp('2007-03-29 06:36:29.454545408'),
 Timestamp('2007-03-29 07:35:16.909090816'),
 Timestamp('2007-03-29 08:34:04.363636480'),
 Timestamp('2007-03-29 09:32:51.818181888'),
 Timestamp('2007-03-30 05:03:17.454545408'),
 Timestamp('2007-03-31 04:25:18.545454592'),
 Timestamp('2007-05-17 13:26:26.181818112'),
 Timestamp('2007-07-05 15:01:28.363636480'),
 Timestamp('2007-07-05 18:38:10.909090816'),
 Timestamp('2007-07-05 22:14:53.454545408'),
 Timestamp('2007-07-06 01:51:3

In [205]:
(timed_idx_ddf
 .assign(roll_sum = timed_idx_ddf["amount"].rolling(2).sum()
timed_idx_ddf.head()

Unnamed: 0_level_0,Year,User,amount,roll_sum
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1992-01-12 09:44:00,1992,486,3.29,
1992-01-16 06:33:00,1992,486,120.0,123.29
1992-04-18 12:35:00,1992,1683,47.51,167.51
1992-05-21 07:22:00,1992,791,-96.0,-48.49
1992-06-28 06:51:00,1992,486,100.0,4.0


In [206]:
# Use `min_period` to handle NnN on the first nth row
timed_idx_ddf["roll_sum"] = timed_idx_ddf["amount"].rolling(2, min_periods=1).sum()
timed_idx_ddf.head()

Unnamed: 0_level_0,Year,User,amount,roll_sum
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1992-01-12 09:44:00,1992,486,3.29,3.29
1992-01-16 06:33:00,1992,486,120.0,123.29
1992-04-18 12:35:00,1992,1683,47.51,167.51
1992-05-21 07:22:00,1992,791,-96.0,-48.49
1992-06-28 06:51:00,1992,486,100.0,4.0


SQL equivalant
```
select *
, sum(amount) over (partition by User order by dt asc 
                    rows between 2 preceding and current row) as rolling3_sum
from sample_ddf
```

### Time aware rolling
Dask need to `set_index` on datetime Series first, then could do the rolling operation

In [232]:
timed_idx_ddf = ddf_samp.set_index("dt", sort=True) 

In [233]:
timed_idx_ddf["roll_sum"] = timed_idx_ddf["amount"].rolling("3D").sum()
timed_idx_ddf.head()

Unnamed: 0_level_0,Year,User,amount,roll_sum
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1992-01-12 09:44:00,1992,486,3.29,3.29
1992-01-16 06:33:00,1992,486,120.0,120.0
1992-04-18 12:35:00,1992,1683,47.51,47.51
1992-05-21 07:22:00,1992,791,-96.0,-96.0
1992-06-28 06:51:00,1992,486,100.0,100.0


In [234]:
timed_idx_ddf["roll_sum_default"] = timed_idx_ddf["amount"].rolling("2D").sum()
timed_idx_ddf.head()

Unnamed: 0_level_0,Year,User,amount,roll_sum,roll_sum_default
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1992-01-12 09:44:00,1992,486,3.29,3.29,3.29
1992-01-16 06:33:00,1992,486,120.0,120.0,120.0
1992-04-18 12:35:00,1992,1683,47.51,47.51,47.51
1992-05-21 07:22:00,1992,791,-96.0,-96.0,-96.0
1992-06-28 06:51:00,1992,486,100.0,100.0,100.0


In [235]:
timed_idx_ddf["roll_sum"] = timed_idx_ddf["amount"].rolling("5D").sum()
timed_idx_ddf.head()

Unnamed: 0_level_0,Year,User,amount,roll_sum,roll_sum_default
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1992-01-12 09:44:00,1992,486,3.29,3.29,3.29
1992-01-16 06:33:00,1992,486,120.0,123.29,120.0
1992-04-18 12:35:00,1992,1683,47.51,47.51,47.51
1992-05-21 07:22:00,1992,791,-96.0,-96.0,-96.0
1992-06-28 06:51:00,1992,486,100.0,100.0,100.0


In [236]:
timed_idx_ddf["roll_sum"] = timed_idx_ddf["amount"].rolling("30D").sum()
timed_idx_ddf.head()

Unnamed: 0_level_0,Year,User,amount,roll_sum,roll_sum_default
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1992-01-12 09:44:00,1992,486,3.29,3.29,3.29
1992-01-16 06:33:00,1992,486,120.0,123.29,120.0
1992-04-18 12:35:00,1992,1683,47.51,47.51,47.51
1992-05-21 07:22:00,1992,791,-96.0,-96.0,-96.0
1992-06-28 06:51:00,1992,486,100.0,100.0,100.0


Time awared rolling at monthly have issue on the cut point of each month end not equal (29, 30, 31)

In [237]:
sorted_ddf["rolling_mnth_sum"] = sorted_ddf["amount"].rolling("1M").sum()

ValueError: Can only rolling dataframes with known divisions
See https://docs.dask.org/en/latest/dataframe-design.html#partitions
for more information.