In [1]:
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

In [55]:
import os
from datetime import datetime
from pathlib import Path

import pandas as pd
import numpy as np

import dask
from dask import dataframe as dd
from dask import array as da

data_path = Path(os.getcwd()).parent/"data"

In [56]:
parquet_path = data_path/"repartition_wp_w_div.parquet"

read_ddf = dd.read_parquet(parquet_path)
print(f"Division know : {read_ddf.known_divisions}, number of partition : {read_ddf.npartitions}")

Division know : False, number of partition : 55


## Windows function with Dask
**To order by have to `sort_values` the dataframe before .groupby()**

In [57]:
read_ddf.dtypes

User                        int16
Card                        int16
Amount            string[pyarrow]
Use Chip          string[pyarrow]
Merchant Name     string[pyarrow]
Merchant City     string[pyarrow]
Merchant State    string[pyarrow]
Zip                       float64
MCC                         int16
Errors?           string[pyarrow]
Is Fraud?         string[pyarrow]
amount                    float64
Year                     category
dt                 datetime64[ns]
dtype: object

`Categorical` datatype will interfere the Aggregation, even not in the .groupby, better cast to `string`

In [58]:
ddf_small = read_ddf[["Year", "dt", "User", "amount"]].astype({"Year":"string"})

ddf_small.head()

Unnamed: 0,Year,dt,User,amount
0,1991,1991-02-08 13:39:00,791,42.41
1,1991,1991-12-10 11:19:00,791,16.08
2,1991,1991-12-10 22:59:00,791,17.96
3,1991,1991-12-12 10:38:00,791,145.92
4,1991,1991-12-12 11:01:00,791,9.28


`.groupby().first()` = First row of each group

In [59]:
first_of_each_group = (ddf_small
 .sort_values("dt")
 .groupby(["User"])
 .first()
 )
first_of_each_group.compute()

Unnamed: 0_level_0,Year,dt,amount
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2002,2002-09-01 06:21:00,134.09
1,2003,2003-07-01 06:45:00,17.23
2,2002,2002-03-01 06:59:00,16.66
4,1999,1999-11-26 15:03:00,1300.73
5,2002,2002-01-01 12:24:00,45.61
...,...,...,...
1972,2020,2020-02-01 05:31:00,1.16
1973,2020,2020-01-01 03:35:00,14.51
1984,2020,2020-01-01 06:54:00,1.03
1991,2020,2020-02-01 22:21:00,28.59


In [60]:
ddf_small.query("User == 0").sort_values("dt").compute().head(1)

Unnamed: 0,Year,dt,User,amount
78930,2002,2002-09-01 06:21:00,0,134.09


In [54]:
ddf_small.query("User == 1").sort_values("dt").compute().head(1)

Unnamed: 0,Year,dt,User,amount
159203,2003,2003-07-01 06:45:00,1,17.23


`.groupby().last()` for last row in the group

In [61]:
ddf_small.groupby(["User"]).last().compute()

Unnamed: 0_level_0,Year,dt,amount
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2020,2020-02-18 06:10:00,119.87
1,2020,2020-01-09 21:17:00,-239.00
2,2020,2020-01-08 13:25:00,27.92
4,2020,2020-01-30 15:09:00,44.58
5,2020,2020-02-28 18:42:00,113.72
...,...,...,...
1972,2020,2020-02-13 19:27:00,0.93
1973,2020,2020-01-06 05:43:00,3.47
1984,2020,2020-01-26 19:19:00,137.56
1991,2020,2020-02-11 22:29:00,30.21


In [None]:
ddf_small.groupby(["User"]).last().compute()

Unnamed: 0_level_0,Year,dt,amount
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2020,2020-02-18 06:10:00,119.87
1,2020,2020-01-09 21:17:00,-239.00
2,2020,2020-01-08 13:25:00,27.92
4,2020,2020-01-30 15:09:00,44.58
5,2020,2020-02-28 18:42:00,113.72
...,...,...,...
1972,2020,2020-02-13 19:27:00,0.93
1973,2020,2020-01-06 05:43:00,3.47
1984,2020,2020-01-26 19:19:00,137.56
1991,2020,2020-02-11 22:29:00,30.21


### Lead , Lag function
**IMPORTANT!! sort data first before lag / lead**

In [63]:
(ddf_small
 .sort_values(["User", "dt"], ascending=False)
 .assign(lagged_amount = lambda x : x["amount"].shift(1))
 .assign(ledded_amount = lambda x : x["amount"].shift(-1))
)

Unnamed: 0,Year,dt,User,amount,lagged_amount,ledded_amount
14018,2020,2020-02-28 23:10:00,1999,45.13,,43.12
14017,2020,2020-02-28 20:10:00,1999,43.12,45.13,59.15
14016,2020,2020-02-28 07:43:00,1999,59.15,43.12,54.00
14015,2020,2020-02-27 22:24:00,1999,54.00,59.15,-54.00
14025,2020,2020-02-27 22:23:00,1999,-54.00,54.00,63.43
...,...,...,...,...,...,...
78907,2002,2002-09-03 06:23:00,0,104.71,86.19,128.95
78906,2002,2002-09-02 17:45:00,0,128.95,104.71,120.34
78905,2002,2002-09-02 06:22:00,0,120.34,128.95,38.48
78928,2002,2002-09-01 06:42:00,0,38.48,120.34,134.09


In [27]:
# Missing one row due to lag , fn at the edge
sorted_ddf.isna().sum().compute()

Year             0
dt               0
User             0
amount           0
lagged_amount    1
ledded_amount    1
dtype: int64

In [25]:
sorted_ddf.head()

Unnamed: 0,Year,dt,User,amount,lagged_amount,ledded_amount
427,1991,1991-01-02 07:10:00,791,68.0,,-68.0
428,1991,1991-01-02 07:17:00,791,-68.0,68.0,113.62
429,1991,1991-01-02 07:21:00,791,113.62,-68.0,114.73
430,1991,1991-01-02 17:30:00,791,114.73,113.62,251.71
432,1991,1991-01-03 09:03:00,791,251.71,114.73,16.28


In [26]:
sorted_ddf.tail()

Unnamed: 0,Year,dt,User,amount,lagged_amount,ledded_amount
37646,2020,2020-02-28 23:51:00,1659,7.67,21.27,49.06
60688,2020,2020-02-28 23:53:00,863,49.06,7.67,132.73
24088,2020,2020-02-28 23:56:00,1300,51.29,132.73,42.29
74092,2020,2020-02-28 23:56:00,1366,132.73,49.06,51.29
47216,2020,2020-02-28 23:58:00,446,42.29,51.29,


#### Lead / Lag with Partition
Currenly Dask support partly window function with partition.  
Better use Pandas for this task

In [34]:
# ddf_samp = ddf_small.sample(frac = 0.001)
# ddf_samp.to_parquet(data_path/"data_sample.parquet")

In [69]:
ddf_samp = dd.read_parquet(data_path/"data_sample.parquet")

In [104]:
# Pandas style
pdf = ddf_samp.compute()
pdf = pdf.sort_values("dt", ascending=True)
pdf["by_usr_lag_amt"] = pdf.groupby("User")["amount"].shift(1)
pdf["by_usr_led_amt"] = pdf.groupby("User")["amount"].shift(-1)
pdf[pdf["User"]==2].head(20)

Unnamed: 0,Year,dt,User,amount,by_usr_lag_amt,by_usr_led_amt
79844,2002,2002-06-28 14:48:00,2,46.28,,44.75
80003,2002,2002-07-31 14:53:00,2,44.75,46.28,8.54
77259,2002,2002-12-22 07:37:00,2,8.54,44.75,13.85
22050,2004,2004-04-18 06:28:00,2,13.85,8.54,12.7
21832,2004,2004-10-01 10:59:00,2,12.7,13.85,2.17
281608,2005,2005-09-30 23:47:00,2,2.17,12.7,43.65
281640,2005,2005-10-06 09:45:00,2,43.65,2.17,32.62
358161,2006,2006-03-19 09:48:00,2,32.62,43.65,48.77
357825,2006,2006-06-01 17:20:00,2,48.77,32.62,7.22
41257,2007,2007-07-16 07:31:00,2,7.22,48.77,2.07


In [59]:
# Pandas style, NOT APPLICABLE with Dask Dataframe
ddf_samp = ddf_samp.sort_values("dt", ascending=True)
ddf_samp["by_usr_lag_amt"] = ddf_samp.groupby("User")["amount"].shift(1)
ddf_samp["by_usr_led_amt"] = ddf_samp.groupby("User")["amount"].shift(-1)
ddf_samp[ddf_samp["User"]==1].head(20)

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  ddf_samp["by_usr_lag_amt"] = ddf_samp.groupby("User")["amount"].shift(1)
  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  ddf_samp["by_usr_led_amt"] = ddf_samp.groupby("User")["amount"].shift(-1)


ValueError: cannot reindex on an axis with duplicate labels

For Dask, groupby as Series and do the lag/lead with apply x.shift()   
Also, .apply have to specifiy keyward `meta`=(column name:data type)  

In [103]:
sorted_ddf = ddf_samp.sort_values("dt", ascending=True)
lag_amt = sorted_ddf.groupby("User")["amount"].apply(lambda x: x.shift(1), meta=("lag_amt","float"))
lag_amt.head()


User       
2     79844      NaN
      80003    46.28
      77259    44.75
      22050     8.54
      21832    13.85
Name: lag_amt, dtype: float64

In [150]:
sorted_ddf[sorted_ddf["User"]==2].head()

Unnamed: 0,Year,dt,User,amount
79844,2002,2002-06-28 14:48:00,2,46.28
80003,2002,2002-07-31 14:53:00,2,44.75
77259,2002,2002-12-22 07:37:00,2,8.54
22050,2004,2004-04-18 06:28:00,2,13.85
21832,2004,2004-10-01 10:59:00,2,12.7


In [149]:
lag_amt.compute().reset_index()

Unnamed: 0,User,level_1,lag_amt
0,2,79844,
1,2,80003,46.28
2,2,77259,44.75
3,2,22050,8.54
4,2,21832,13.85
...,...,...,...
24381,1999,171020,
24382,1999,24096,46.35
24383,1999,25425,66.39
24384,1999,272866,29.23


Anyway, this method need to mapped shift series back to Dask DataFrame

### Rolling
Dask need to `set_index` on datetime Series first, then could do the rolling operation

In [203]:
timed_idx_ddf = ddf_samp.set_index("dt", sort=True) 

In [204]:
timed_idx_ddf.divisions

(Timestamp('1992-01-12 09:44:00'),
 Timestamp('2003-10-31 09:45:00'),
 Timestamp('2006-05-20 06:20:00'),
 Timestamp('2007-12-31 22:26:00'),
 Timestamp('2009-06-12 13:59:00'),
 Timestamp('2010-10-22 02:48:00'),
 Timestamp('2011-12-07 00:23:00'),
 Timestamp('2012-10-25 15:28:00'),
 Timestamp('2014-01-01 08:46:00'),
 Timestamp('2015-01-14 12:10:00'),
 Timestamp('2016-02-16 18:29:00'),
 Timestamp('2017-01-26 08:54:00'),
 Timestamp('2018-01-25 09:38:00'),
 Timestamp('2019-02-07 13:23:00'),
 Timestamp('2020-02-28 17:52:00'))

In [205]:
timed_idx_ddf["roll_sum"] = timed_idx_ddf["amount"].rolling(2).sum()
timed_idx_ddf.head()

Unnamed: 0_level_0,Year,User,amount,roll_sum
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1992-01-12 09:44:00,1992,486,3.29,
1992-01-16 06:33:00,1992,486,120.0,123.29
1992-04-18 12:35:00,1992,1683,47.51,167.51
1992-05-21 07:22:00,1992,791,-96.0,-48.49
1992-06-28 06:51:00,1992,486,100.0,4.0


In [206]:
# Use `min_period` to handle NnN on the first nth row
timed_idx_ddf["roll_sum"] = timed_idx_ddf["amount"].rolling(2, min_periods=1).sum()
timed_idx_ddf.head()

Unnamed: 0_level_0,Year,User,amount,roll_sum
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1992-01-12 09:44:00,1992,486,3.29,3.29
1992-01-16 06:33:00,1992,486,120.0,123.29
1992-04-18 12:35:00,1992,1683,47.51,167.51
1992-05-21 07:22:00,1992,791,-96.0,-48.49
1992-06-28 06:51:00,1992,486,100.0,4.0


### Time aware rolling
Dask need to `set_index` on datetime Series first, then could do the rolling operation

In [232]:
timed_idx_ddf = ddf_samp.set_index("dt", sort=True) 

In [233]:
timed_idx_ddf["roll_sum"] = timed_idx_ddf["amount"].rolling("3D").sum()
timed_idx_ddf.head()

Unnamed: 0_level_0,Year,User,amount,roll_sum
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1992-01-12 09:44:00,1992,486,3.29,3.29
1992-01-16 06:33:00,1992,486,120.0,120.0
1992-04-18 12:35:00,1992,1683,47.51,47.51
1992-05-21 07:22:00,1992,791,-96.0,-96.0
1992-06-28 06:51:00,1992,486,100.0,100.0


In [234]:
timed_idx_ddf["roll_sum_default"] = timed_idx_ddf["amount"].rolling("2D").sum()
timed_idx_ddf.head()

Unnamed: 0_level_0,Year,User,amount,roll_sum,roll_sum_default
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1992-01-12 09:44:00,1992,486,3.29,3.29,3.29
1992-01-16 06:33:00,1992,486,120.0,120.0,120.0
1992-04-18 12:35:00,1992,1683,47.51,47.51,47.51
1992-05-21 07:22:00,1992,791,-96.0,-96.0,-96.0
1992-06-28 06:51:00,1992,486,100.0,100.0,100.0


In [235]:
timed_idx_ddf["roll_sum"] = timed_idx_ddf["amount"].rolling("5D").sum()
timed_idx_ddf.head()

Unnamed: 0_level_0,Year,User,amount,roll_sum,roll_sum_default
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1992-01-12 09:44:00,1992,486,3.29,3.29,3.29
1992-01-16 06:33:00,1992,486,120.0,123.29,120.0
1992-04-18 12:35:00,1992,1683,47.51,47.51,47.51
1992-05-21 07:22:00,1992,791,-96.0,-96.0,-96.0
1992-06-28 06:51:00,1992,486,100.0,100.0,100.0


In [236]:
timed_idx_ddf["roll_sum"] = timed_idx_ddf["amount"].rolling("30D").sum()
timed_idx_ddf.head()

Unnamed: 0_level_0,Year,User,amount,roll_sum,roll_sum_default
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1992-01-12 09:44:00,1992,486,3.29,3.29,3.29
1992-01-16 06:33:00,1992,486,120.0,123.29,120.0
1992-04-18 12:35:00,1992,1683,47.51,47.51,47.51
1992-05-21 07:22:00,1992,791,-96.0,-96.0,-96.0
1992-06-28 06:51:00,1992,486,100.0,100.0,100.0


Time awared rolling at monthly have issue on the cut point of each month end not equal (29, 30, 31)

In [237]:
sorted_ddf["rolling_mnth_sum"] = sorted_ddf["amount"].rolling("1M").sum()

ValueError: Can only rolling dataframes with known divisions
See https://docs.dask.org/en/latest/dataframe-design.html#partitions
for more information.