In [84]:
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

In [85]:
import os
from datetime import datetime
from pathlib import Path

import pandas as pd
import numpy as np

import dask
from dask import dataframe as dd
from dask import array as da
from dask_sql import Context


data_path = Path(os.getcwd()).parent/"data"
c = Context()

In [86]:
pdf = pd.read_pickle(data_path/"sample_cust.pkl")

In [124]:
pdf

Unnamed: 0,cc_cst_real,uniq_key,net_interest_rec,mftp_total,annual_fee,interchange_fee,merchant_discount_fee,cash_adv_fee,gain_and_loss,collection_fee,...,util,cr_lmt_amt,mnth_cd,MOB,behv,pymt_amt,dlq_bck,ews,b_scor,cc_cst_first_num
36220,1000000000000230,3228730,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,60000.0,202301,171.0,Transactor,0.0,normal,11.0,2,1
36780,1000000000000230,3228730,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,60000.0,202302,172.0,Transactor,0.0,normal,11.0,2,1
38990,1000000000000230,3228730,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,60000.0,202303,173.0,Transactor,0.0,normal,11.0,2,1
36421,1000000000000230,3228730,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,60000.0,202304,174.0,Transactor,0.0,normal,11.0,2,1
38189,1000000000000230,3228730,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,60000.0,202305,175.0,Transactor,0.0,normal,11.0,2,1
86291,1000000042237360,1744875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,87000.0,202306,24.0,Inactive,0.0,normal,,11,1
58695,1000000045386891,2526582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,24000.0,202306,14.0,Inactive,0.0,90 days up,,11,1
123106,1000000045386891,4281444,0.0,10.211657,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,24000.0,202306,14.0,Inactive,4389.27,90 days up,,11,1
17998,1000000042237360,3055249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,0.0,202306,24.0,Inactive,0.0,normal,,11,1
31708,1000000042237360,2586199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,87000.0,202306,24.0,Inactive,0.0,normal,,11,1


## Use dtype category with order to aggregate customer

In [125]:
pdf.loc[:, ["cc_cst_real", "uniq_key", "mnth_cd", "card_status", "b_scor", "behv", "dlq_bck", "ews"]]

Unnamed: 0,cc_cst_real,uniq_key,mnth_cd,card_status,b_scor,behv,dlq_bck,ews
36220,1000000000000230,3228730,202301,Valid,2,Transactor,normal,11.0
36780,1000000000000230,3228730,202302,Valid,2,Transactor,normal,11.0
38990,1000000000000230,3228730,202303,Valid,2,Transactor,normal,11.0
36421,1000000000000230,3228730,202304,Valid,2,Transactor,normal,11.0
38189,1000000000000230,3228730,202305,Valid,2,Transactor,normal,11.0
86291,1000000042237360,1744875,202306,Valid,11,Inactive,normal,
58695,1000000045386891,2526582,202306,Invalid,11,Inactive,90 days up,
123106,1000000045386891,4281444,202306,Invalid,11,Inactive,90 days up,
17998,1000000042237360,3055249,202306,Invalid,11,Inactive,normal,
31708,1000000042237360,2586199,202306,Invalid,11,Inactive,normal,


In [146]:
ddf = dd.from_pandas(pdf)

In [135]:
ddf["behv"].value_counts().compute()

behv
Inactive       8
Revolver       2
Transactor    12
Name: count, dtype: int64[pyarrow]

In [136]:
ddf["ews"].value_counts().compute()

ews
11    12
Name: count, dtype: int64[pyarrow]

Convert to know category with .categorize()

In [147]:
ddf = (ddf
       .assign(cat_card_status = lambda x : x["card_status"])
       .assign(cat_behv = lambda x : x["behv"])
       .assign(cat_ews = lambda x : x["ews"])
       .categorize(columns=["cat_card_status", "cat_behv", "cat_ews"])
)

In [148]:
ddf["cat_card_status"].cat.known

True

In [149]:
ddf["cat_behv"].cat.known

True

In [150]:
ddf["cat_ews"].cat.known

True

Add order to category with .cat.set_categories([],order=True)

In [152]:
status_order = ["Invalid", "Valid"]
behv_order = ["Inactive", "Transactor", "Revolver"]
ews_order = [4,3,2,1]

ddf = (ddf
 .assign(cat_card_status = lambda x : x["cat_card_status"].cat.set_categories(status_order, ordered=True))
 .assign(cat_behv = lambda x : x["cat_behv"].cat.set_categories(behv_order, ordered=True))
 .assign(cat_ews = lambda x : x["cat_ews"].cat.set_categories(ews_order, ordered=True))
)

Aggregate top order with .groupby().max()  
**all the category dtype must be in the agg**

In [162]:
ddf.groupby(["cc_cst_real", "mnth_cd"]).agg({"uniq_key":"count","cat_behv":"max","cat_card_status":"max", "cat_ews":"max"}).compute()

Unnamed: 0_level_0,Unnamed: 1_level_0,uniq_key,cat_behv,cat_card_status,cat_ews
cc_cst_real,mnth_cd,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000000042237360,202306,6,Revolver,Valid,
1000000000000230,202301,1,Transactor,Valid,
1000000000000230,202304,1,Transactor,Valid,
1000000000000230,202302,1,Transactor,Valid,
1000000000000230,202306,1,Transactor,Valid,
1000000000000230,202308,1,Transactor,Valid,
1000000000000230,202305,1,Transactor,Valid,
1000000000000230,202307,1,Transactor,Valid,
1000000000000230,202303,1,Transactor,Valid,
1000000000000230,202310,1,Transactor,Valid,


Category dtype, with `NaN` will be max/min

In [156]:
pdf.query("cc_cst_real == 1000000042237360").loc[:,["cc_cst_real", "uniq_key", "mnth_cd", "behv", "card_status"]]

Unnamed: 0,cc_cst_real,uniq_key,mnth_cd,behv,card_status
86291,1000000042237360,1744875,202306,Inactive,Valid
17998,1000000042237360,3055249,202306,Inactive,Invalid
31708,1000000042237360,2586199,202306,Inactive,Invalid
127007,1000000042237360,3565641,202306,Revolver,Valid
75692,1000000042237360,1880373,202306,Inactive,Invalid
90171,1000000042237360,4267734,202306,Revolver,Valid


In [159]:
pdf.query("cc_cst_real == 1000000000000230").loc[:,["cc_cst_real", "uniq_key", "mnth_cd", "behv", "card_status"]]

Unnamed: 0,cc_cst_real,uniq_key,mnth_cd,behv,card_status
36220,1000000000000230,3228730,202301,Transactor,Valid
36780,1000000000000230,3228730,202302,Transactor,Valid
38990,1000000000000230,3228730,202303,Transactor,Valid
36421,1000000000000230,3228730,202304,Transactor,Valid
38189,1000000000000230,3228730,202305,Transactor,Valid
37728,1000000000000230,3228730,202306,Transactor,Valid
38405,1000000000000230,3228730,202307,Transactor,Valid
38092,1000000000000230,3228730,202308,Transactor,Valid
39840,1000000000000230,3228730,202309,Transactor,Valid
39114,1000000000000230,3228730,202310,Transactor,Valid


In [160]:
pdf.query("cc_cst_real == 1000000045386891").loc[:,["cc_cst_real", "uniq_key", "mnth_cd", "behv", "card_status"]]

Unnamed: 0,cc_cst_real,uniq_key,mnth_cd,behv,card_status
58695,1000000045386891,2526582,202306,Inactive,Invalid
123106,1000000045386891,4281444,202306,Inactive,Invalid
59345,1000000045386891,2526582,202307,Inactive,Invalid
124320,1000000045386891,4281444,202307,Inactive,Invalid


In [97]:
cat_agg_ddf = ddf.groupby(["cc_cst_real", "mnth_cd"]).agg({"uniq_key":"count","cat_behv":"max","cat_card_status":"max"})

In [98]:
cat_agg_ddf.dtypes

uniq_key              int64
cat_behv           category
cat_card_status    category
dtype: object

Convert category to string type and save

In [101]:
cat_agg_ddf.reset_index().astype({"cat_behv":"string", "cat_card_status":"string"}).to_parquet(data_path/"cat_agg_ddf.parquet")

In [102]:
cat_ddf = dd.read_parquet(data_path/"cat_agg_ddf.parquet")

In [103]:
cat_ddf.dtypes

cc_cst_real                 int64
mnth_cd                     int32
uniq_key                    int64
cat_behv           string[python]
cat_card_status    string[python]
dtype: object

## Use Dask .map to mapping value in series

In [104]:
pdf = pd.read_pickle(data_path/"sample_cust.pkl")
ddf = dd.from_pandas(pdf)

In [120]:
mapped = (ddf
 .assign(corp_flag = lambda x : (x["cc_cst_real"]/1e15).astype("int"))
 .assign(card_typ = lambda x : x["corp_flag"].map({1:"comm", 2:"corp"}, meta=("corp_flag", "string")))
 .assign(card_typ = lambda x : x["corp_flag"].map({2:"corp"}, meta=("corp_flag", "string"))) # Test if not complete list
)

In [123]:
mapped.compute().loc[: ,["cc_cst_real", "corp_flag", "card_typ"]]

Unnamed: 0,cc_cst_real,corp_flag,card_typ
17998,1000000042237360,1,
31708,1000000042237360,1,
36220,1000000000000230,1,
36421,1000000000000230,1,
36780,1000000000000230,1,
37728,1000000000000230,1,
38092,1000000000000230,1,
38189,1000000000000230,1,
38405,1000000000000230,1,
38990,1000000000000230,1,


## Collect as list/set/

In [163]:
pdf = pd.read_pickle(data_path/"sample_cust.pkl")
ddf = dd.from_pandas(pdf)

In [177]:
pdf.groupby("cc_cst_real")["behv"].agg(list).reset_index()

Unnamed: 0,cc_cst_real,behv
0,1000000000000230,"[Transactor, Transactor, Transactor, Transacto..."
1,1000000042237360,"[Inactive, Inactive, Inactive, Revolver, Inact..."
2,1000000045386891,"[Inactive, Inactive, Inactive, Inactive]"


In [171]:
list_agg = ddf.groupby("cc_cst_real")["behv"].agg(list)

In [173]:
list_agg.compute().reset_index()

Unnamed: 0,cc_cst_real,behv
0,1000000042237360,"[Inactive, Inactive, Inactive, Inactive, Revol..."
1,1000000000000230,"[Transactor, Transactor, Transactor, Transacto..."
2,1000000045386891,"[Inactive, Inactive, Inactive, Inactive]"


In [180]:
pdf.groupby("cc_cst_real")["behv"].agg(set_behv= lambda x : set(x))

Unnamed: 0_level_0,set_behv
cc_cst_real,Unnamed: 1_level_1
1000000000000230,{Transactor}
1000000042237360,"{Revolver, Inactive}"
1000000045386891,{Inactive}


In [186]:
set_agg = ddf.groupby("cc_cst_real")["behv"].agg(set_behv=set)

ValueError: unknown aggregate set

## GroupBy, nlargest

In [187]:
pdf = pd.read_pickle(data_path/"sample_cust.pkl")
ddf = dd.from_pandas(pdf)

In [210]:
pdf.groupby("cc_cst_real").nth(3).loc[:, ["cc_cst_real", "mnth_cd", "uniq_key"]]

Unnamed: 0,cc_cst_real,mnth_cd,uniq_key
36421,1000000000000230,202304,3228730
127007,1000000042237360,202306,3565641
124320,1000000045386891,202307,4281444


In [207]:
pdf.query("cc_cst_real == 1000000045386891").loc[:, ["cc_cst_real", "mnth_cd", "uniq_key"]]

Unnamed: 0,cc_cst_real,mnth_cd,uniq_key
58695,1000000045386891,202306,2526582
123106,1000000045386891,202306,4281444
59345,1000000045386891,202307,2526582
124320,1000000045386891,202307,4281444


In [212]:
ddf.groupby("cc_cst_real")["mnth_cd"].nth(3).loc[:, ["cc_cst_real", "mnth_cd", "uniq_key"]]

AttributeError: 'Column not found: nth'