Reference - Performance  
- https://stackoverflow.com/questions/72440603/dask-dataframe-parallel-task/74236686#74236686
- https://targomo.medium.com/how-we-learned-to-love-dask-and-achieved-a-40x-speedup-aa14e72d99c0#fa71

In [1]:
import dask
from dask import dataframe as dd

In [2]:
from dask.distributed import LocalCluster

In [3]:
# Configure worker memory thresholds and the shuffle backend BEFORE the cluster
# is created, so the values are already in the Dask config when workers start.
# The thresholds are fractions of each worker's memory limit.
#
# NOTE: the config namespace is `distributed.worker.memory.*` (singular
# "worker"). Keys written under `distributed.workers.*` are not part of the
# distributed config schema, so the thresholds would silently keep their defaults.
# NOTE(review): `distributed.worker.memory.pause` is not overridden here —
# confirm its value still makes sense next to the raised spill threshold.
#
# Alternative shuffle backend, kept for comparison:
# with dask.config.set({"dataframe.shuffle.method": "tasks"}):
dask.config.set({
    "distributed.worker.memory.target": 0.75,
    "distributed.worker.memory.spill": 0.85,
    "distributed.worker.memory.terminate": 0.98,
    "dataframe.shuffle.method": "p2p"})
client = LocalCluster(n_workers=4).get_client() # reduce # of worker for larger worker memory



In [4]:
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 12,Total memory: 29.66 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:60853,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 12
Started: Just now,Total memory: 29.66 GiB

0,1
Comm: tcp://127.0.0.1:60872,Total threads: 3
Dashboard: http://127.0.0.1:60876/status,Memory: 7.42 GiB
Nanny: tcp://127.0.0.1:60856,
Local directory: C:\Users\O_THAN~1.B\AppData\Local\Temp\dask-scratch-space\worker-qd3qv3uy,Local directory: C:\Users\O_THAN~1.B\AppData\Local\Temp\dask-scratch-space\worker-qd3qv3uy

0,1
Comm: tcp://127.0.0.1:60874,Total threads: 3
Dashboard: http://127.0.0.1:60878/status,Memory: 7.42 GiB
Nanny: tcp://127.0.0.1:60858,
Local directory: C:\Users\O_THAN~1.B\AppData\Local\Temp\dask-scratch-space\worker-n_41ieu0,Local directory: C:\Users\O_THAN~1.B\AppData\Local\Temp\dask-scratch-space\worker-n_41ieu0

0,1
Comm: tcp://127.0.0.1:60875,Total threads: 3
Dashboard: http://127.0.0.1:60882/status,Memory: 7.42 GiB
Nanny: tcp://127.0.0.1:60860,
Local directory: C:\Users\O_THAN~1.B\AppData\Local\Temp\dask-scratch-space\worker-x8bnf0jt,Local directory: C:\Users\O_THAN~1.B\AppData\Local\Temp\dask-scratch-space\worker-x8bnf0jt

0,1
Comm: tcp://127.0.0.1:60873,Total threads: 3
Dashboard: http://127.0.0.1:60880/status,Memory: 7.42 GiB
Nanny: tcp://127.0.0.1:60862,
Local directory: C:\Users\O_THAN~1.B\AppData\Local\Temp\dask-scratch-space\worker-81yj0wn_,Local directory: C:\Users\O_THAN~1.B\AppData\Local\Temp\dask-scratch-space\worker-81yj0wn_


In [5]:
%run "C:\Users\o_thanakrit.b\OneDrive - KASIKORNBANKGROUP\เอกสาร\code_and_finding\template.ipynb"

## Case 1) Compute multiple columns and aggregate 
without materializing the intermediate result as parquet

In [6]:
risk_full = dd.read_parquet(mart_risk_path/"risk_new_full2_blc_profit_usage.parquet")
card_type = dd.read_parquet(staging_path/"card_type.parquet")
ecl = dd.read_parquet(staging_path/"ecl_202112_202404.parquet")

In [7]:
# Keep only the rows of risk_full where mnth_cd == 202404, restricted to the
# uniq_key, card_status and blc_cd columns.
apr24_status_blck = risk_full.query("mnth_cd == 202404").loc[:, ["uniq_key", "card_status", "blc_cd"]]

In [8]:
apr24_status_blck["uniq_key"].count().compute()

4390810

In [9]:
fy2023 = dd.read_parquet(mart_fy2023_path/"data_fy2023_fix_mkt_exp_add_usage_consu_pri_crd_blk_rm_wlx.parquet")

In [10]:
fy2023["uniq_key"].drop_duplicates().count().compute()

3997489

In [27]:
fy2023["cc_cst_real"].drop_duplicates().count().compute()

2329585

In [29]:
fy2023["gain_and_loss"].sum().compute()

339553470.2632297

In [33]:
fy2023.isna().sum().compute()

cc_cst_real                              0
uniq_key                                 0
net_interest_rec                         0
mftp_total                               0
annual_fee                               0
interchange_fee                          0
merchant_discount_fee                    0
cash_adv_fee                             0
gain_and_loss                            0
collection_fee                           0
fee_waived_off                           0
debt_collection_exp                      0
fee_paid_visa                            0
issuer_license_fee                       0
total_issuer_domestic                    0
total_issuer_inter                       0
total_account_card_fee                   0
total_concierge_fee                      0
cashback                                 0
loyalty_point_redem                      0
activity_based_costing_expense           0
specific_business_tax                    0
fraud_loss                               0
credit_limi

In [11]:
# Build the Level-2 revenue / cost components on top of the FY2023 frame.
# The fee_rev, var_cost, fix_cost, profit and w_flag columns are dropped first;
# each new column is an element-wise sum (or a copy) of existing columns.
fy2023_lv2 = (
    fy2023
    .drop(columns=["fee_rev", "var_cost", "fix_cost", "profit", "w_flag"])
    .assign(
        # Level 2 - Revenue
        interchg_fee=lambda df: df["interchange_fee"] + df["merchant_discount_fee"],
        fx_fee=lambda df: df["gain_and_loss"],
        # Level 2 - Cost
        fixed_cost=lambda df: (
            df["debt_collection_exp"]
            + df["activity_based_costing_expense"]
            + df["fee_waived_off"]
            + df["marketing_exp"]
            + df["specific_business_tax"]
        ),
        reward_burn=lambda df: df["loyalty_point_redem"],
        scheme_fee=lambda df: (
            df["fee_paid_visa"]
            + df["issuer_license_fee"]
            + df["total_account_card_fee"]
            + df["total_concierge_fee"]
            + df["total_issuer_domestic"]
            + df["total_issuer_inter"]
        ),
    )
)

In [12]:
# Level 1: roll the Level-2 components up into total revenue and total cost.
fy2023_lv1 = fy2023_lv2.assign(
    revenue=lambda df: (
        df["annual_fee"]
        + df["cash_adv_fee"]
        + df["collection_fee"]
        + df["fx_fee"]
        + df["interchg_fee"]
        + df["int_inc"]
    ),
    cost=lambda df: (
        df["cashback"]
        + df["cof"]
        + df["fixed_cost"]
        + df["fraud_loss"]
        + df["reward_burn"]
        + df["scheme_fee"]
    ),
)

In [13]:
fy2023_lv1.npartitions

243

In [14]:
fy2023_lv1.known_divisions

False

### Learning 1.a) Setting an index makes Dask run with less memory consumption
Without setting an index, the results are as follows:  
- In cluster mode, the error `P2P shuffling d79d3838cc0a585ef76cba9e940d99d8 failed during unpack phase` is raised
- With a single worker, a `MemoryError: Unable to allocate for an array with shape (X, X) and data type X` can be raised

In [20]:
# Sum every measure per (cc_cst_real, uniq_key) pair on the un-indexed frame.
# Missing values are zero-filled before the frame is narrowed to the key and
# measure columns listed below.
fy2023_profit = (
    fy2023_lv1
    .fillna(0.0)[[
        # Grouping keys
        "cc_cst_real",
        "uniq_key",
        # Level 1
        "revenue", "cost",
        # Level 2 - Cost
        "cashback", "cof", "fixed_cost", "fraud_loss", "reward_burn", "scheme_fee",
        # Level 2 - Revenue
        "annual_fee", "cash_adv_fee", "collection_fee", "fx_fee", "interchg_fee", "int_inc",
        # Level 3
        "mftp_total", "activity_based_costing_expense", "debt_collection_exp",
        "fee_waived_off", "marketing_exp", "specific_business_tax",
        "fee_paid_visa", "issuer_license_fee", "total_account_card_fee",
        "total_concierge_fee", "total_issuer_domestic", "total_issuer_inter",
        "gain_and_loss", "interchange_fee", "merchant_discount_fee", "net_interest_rec",
        # Variable
        "past_due_amt", "stmt_amt", "mth_end_bal", "crn_pymt_due_amt", "pymt_amt",
        "cash_draw", "smc_draw", "spd_draw", "smp_draw", "usage",
    ]]
    # multi-groupby - consumed memory, avoid
    .groupby(["cc_cst_real", "uniq_key"])
    .sum()
    .reset_index()
)

In [21]:
fy2023_profit.npartitions

1

In [22]:
output_parquet = process_path/"temp"/"test_case1.parquet"
output_parquet

WindowsPath('C:/Users/o_thanakrit.b/OneDrive - KASIKORNBANKGROUP/เอกสาร/data_process/temp/test_write.parquet')

In [23]:
fy2023_profit.to_parquet(output_parquet, overwrite=True)

RuntimeError: P2P shuffling d79d3838cc0a585ef76cba9e940d99d8 failed during unpack phase

### Learning 1.b) After set_index on one of the aggregation keys (uniq_key)  
- Performance improves, even when grouping by multiple columns that mix index and non-index columns

In [15]:
fy2023_lv1 = fy2023_lv1.set_index("uniq_key", partition_size="100MB")

In [16]:
fy2023_lv1.npartitions

243

In [17]:
# Same aggregation as before, now on the frame indexed by uniq_key: uniq_key is
# no longer selected as a column, but is still named as a grouping key.
fy2023_profit = (
    fy2023_lv1
    .fillna(0.0)[[
        # Grouping key kept as a column (uniq_key is not selected here)
        "cc_cst_real",
        # Level 1
        "revenue", "cost",
        # Level 2 - Cost
        "cashback", "cof", "fixed_cost", "fraud_loss", "reward_burn", "scheme_fee",
        # Level 2 - Revenue
        "annual_fee", "cash_adv_fee", "collection_fee", "fx_fee", "interchg_fee", "int_inc",
        # Level 3
        "mftp_total", "activity_based_costing_expense", "debt_collection_exp",
        "fee_waived_off", "marketing_exp", "specific_business_tax",
        "fee_paid_visa", "issuer_license_fee", "total_account_card_fee",
        "total_concierge_fee", "total_issuer_domestic", "total_issuer_inter",
        "gain_and_loss", "interchange_fee", "merchant_discount_fee", "net_interest_rec",
        # Variable
        "past_due_amt", "stmt_amt", "mth_end_bal", "crn_pymt_due_amt", "pymt_amt",
        "cash_draw", "smc_draw", "spd_draw", "smp_draw", "usage",
    ]]
    .groupby(["cc_cst_real", "uniq_key"])
    .sum()
    .reset_index()
)

In [26]:
# The number of result partitions is still the same as without the index
fy2023_profit.npartitions

1

In [20]:
output_parquet = process_path/"temp"/"test_case1.parquet"
output_parquet
fy2023_profit.to_parquet(output_parquet, overwrite=True)

In [21]:
ddf = dd.read_parquet(output_parquet)

In [22]:
ddf.columns

Index(['cc_cst_real', 'uniq_key', 'revenue', 'cost', 'cashback', 'cof',
       'fixed_cost', 'fraud_loss', 'reward_burn', 'scheme_fee', 'annual_fee',
       'cash_adv_fee', 'collection_fee', 'fx_fee', 'interchg_fee', 'int_inc',
       'mftp_total', 'activity_based_costing_expense', 'debt_collection_exp',
       'fee_waived_off', 'marketing_exp', 'specific_business_tax',
       'fee_paid_visa', 'issuer_license_fee', 'total_account_card_fee',
       'total_concierge_fee', 'total_issuer_domestic', 'total_issuer_inter',
       'gain_and_loss', 'interchange_fee', 'merchant_discount_fee',
       'net_interest_rec', 'past_due_amt', 'stmt_amt', 'mth_end_bal',
       'crn_pymt_due_amt', 'pymt_amt', 'cash_draw', 'smc_draw', 'spd_draw',
       'smp_draw', 'usage'],
      dtype='object')

In [23]:
ddf["uniq_key"].count().compute()

3997498

In [28]:
ddf["cc_cst_real"].drop_duplicates().count().compute()

2329585

In [32]:
ddf["fx_fee"].sum().compute()

339553470.2632297

## Case 2) Merge with multiple key  

In [6]:
fin_profit = dd.read_parquet(staging_path/"fin_profit_full_fix.parquet")
risk_mob = dd.read_parquet(staging_path/"risk_new_full2.parquet")

In [7]:
fin_profit.npartitions

4704

In [32]:
fin_profit["uniq_key"].drop_duplicates().count().compute()

5040042

In [8]:
risk_mob.npartitions

3052

In [33]:
risk_mob["uniq_key"].drop_duplicates().count().compute()

5266283

### Learning 2.a) When all merge keys are non-indexed, the operation is extremely slow

In [12]:
profit_risk = fin_profit.merge(risk_mob, on=["uniq_key", "mnth_cd"], how="inner")

In [13]:
profit_risk.columns

Index(['cc_cst_real', 'uniq_key', 'net_interest_rec', 'mftp_total',
       'annual_fee', 'interchange_fee', 'merchant_discount_fee',
       'cash_adv_fee', 'gain_and_loss', 'collection_fee', 'fee_waived_off',
       'fee_paid', 'debt_collection_exp', 'fee_paid_visa', 'membership_fee',
       'issuer_license_fee', 'total_issuer_domestic', 'total_issuer_inter',
       'total_account_card_fee', 'total_concierge_fee', 'net_total_income',
       'expense', 'marketing_exp', 'cashback', 'loyalty_point_redem',
       'activity_based_costing_expense', 'specific_business_tax', 'fraud_loss',
       'credit_limit_card', 'credit_limit_cust', 'credit_utilization',
       'mnth_cd', 'fund_rev', 'fee_rev', 'var_cost', 'fix_cost', 'profit',
       'opn_dt', 'card_status', 'past_due_amt', 'stmt_amt', 'mth_end_bal',
       'crn_pymt_due_amt', 'util', 'cr_lmt_amt', 'acq_cnl', 'lmt_at_app',
       'MOB', 'behv', 'pymt_amt', 'dlq_bck', 'ews', 'b_scor'],
      dtype='object')

In [14]:
profit_risk.groupby("mnth_cd").agg(n_card = ("uniq_key", "count")).compute()

Unnamed: 0_level_0,n_card
mnth_cd,Unnamed: 1_level_1
202211,4336132
202312,4412985
202310,4339072
202201,3768929
202203,3885359
202208,4174318
202202,3824222
202402,4316013
202204,3939578
202403,4355090


In [None]:
# Materialise the merged frame as parquet.
# `overwrite=True` (as used for the case 1 output) clears the target directory
# first, so part files left behind by an earlier or failed run of any case 2
# variant cannot be mixed into this dataset.
output_parquet = process_path/"temp"/"test_case2.parquet"
profit_risk.to_parquet(output_parquet, overwrite=True)

### Learning 2.b) Set one column as index key  
- The merge is faster; however, the dataframe still could not be materialized

In [17]:
fin_profit = fin_profit.set_index("mnth_cd", partition_size="100MB")

In [18]:
risk_mob = risk_mob.set_index("mnth_cd", partition_size="100MB")

In [19]:
profit_risk = fin_profit.merge(risk_mob, on=["uniq_key", "mnth_cd"], how="inner")

In [20]:
profit_risk.columns

Index(['cc_cst_real', 'uniq_key', 'net_interest_rec', 'mftp_total',
       'annual_fee', 'interchange_fee', 'merchant_discount_fee',
       'cash_adv_fee', 'gain_and_loss', 'collection_fee', 'fee_waived_off',
       'fee_paid', 'debt_collection_exp', 'fee_paid_visa', 'membership_fee',
       'issuer_license_fee', 'total_issuer_domestic', 'total_issuer_inter',
       'total_account_card_fee', 'total_concierge_fee', 'net_total_income',
       'expense', 'marketing_exp', 'cashback', 'loyalty_point_redem',
       'activity_based_costing_expense', 'specific_business_tax', 'fraud_loss',
       'credit_limit_card', 'credit_limit_cust', 'credit_utilization',
       'fund_rev', 'fee_rev', 'var_cost', 'fix_cost', 'profit', 'opn_dt',
       'card_status', 'past_due_amt', 'stmt_amt', 'mth_end_bal',
       'crn_pymt_due_amt', 'util', 'cr_lmt_amt', 'acq_cnl', 'lmt_at_app',
       'MOB', 'behv', 'pymt_amt', 'dlq_bck', 'ews', 'b_scor'],
      dtype='object')

In [21]:
profit_risk.groupby("mnth_cd").agg(n_card = ("uniq_key", "count")).compute()

Unnamed: 0_level_0,n_card
mnth_cd,Unnamed: 1_level_1
202206,4061910
202211,4336132
202312,4412985
202403,4355090
202208,4174318
202404,4390810
202205,4001731
202209,4236370
202306,4181312
202201,3768929


In [22]:
# Materialise the merged frame as parquet.
# `overwrite=True` (as used for the case 1 output) clears the target directory
# first, so part files left behind by an earlier or failed run of any case 2
# variant cannot be mixed into this dataset.
output_parquet = process_path/"temp"/"test_case2.parquet"
profit_risk.to_parquet(output_parquet, overwrite=True)

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


RuntimeError: P2P shuffling 3735961f20e29c8c7a7f914e049d27a7 failed during unpack phase

### Learning 2.c) Create pseudo-key as index
- `np.int64` supports the range -9_223_372_036_854_775_808 to 9_223_372_036_854_775_807
- The key can be created on the fly, then the frames merged and the result materialized.

In [9]:
fin_profit.loc[:, ["uniq_key", "mnth_cd"]].dtypes

uniq_key    int64
mnth_cd     int32
dtype: object

In [10]:
risk_mob.loc[:, ["uniq_key", "mnth_cd"]].dtypes

uniq_key    int64
mnth_cd     int32
dtype: object

In [11]:
fin_profit["uniq_key"].max().compute()

5266283.0

In [12]:
risk_mob["uniq_key"].max().compute()

5266283

In [20]:
def create_pseudo_key(ddf, scale=10_000_000_000):
    """Return a copy of ``ddf`` with an integer ``key`` column combining month and card key.

    The key is ``mnth_cd * scale + uniq_key``, computed in int64 so that it is
    an exact integer (a float multiplier such as ``1e10`` would produce a
    float64 key instead).

    Parameters
    ----------
    ddf : dataframe
        Frame with ``mnth_cd`` and ``uniq_key`` columns. It is not modified.
    scale : int, default 10_000_000_000
        Multiplier applied to ``mnth_cd``. It must be larger than the largest
        ``uniq_key`` so that two different (mnth_cd, uniq_key) pairs can never
        produce the same key.

    Returns
    -------
    dataframe
        A new frame equal to ``ddf`` plus the ``key`` column.

    Notes
    -----
    Both source columns are cast to int64 first: ``mnth_cd`` is stored as int32
    and would not be able to hold the scaled value. The cast fails if either
    column contains missing values.
    """
    month_part = ddf["mnth_cd"].astype("int64") * scale
    card_part = ddf["uniq_key"].astype("int64")
    # assign() returns a new frame, so the caller's frame is left untouched.
    return ddf.assign(key=month_part + card_part)

In [24]:
fin_profit_key = create_pseudo_key(fin_profit)
risk_mob_key = create_pseudo_key(risk_mob)

In [25]:
fin_profit_key = fin_profit_key.set_index("key", partition_size="100MB")
risk_mob_key = risk_mob_key.set_index("key", partition_size="100MB")

In [26]:
profit_risk = fin_profit_key.merge(risk_mob_key, left_index=True, right_index=True, how="inner")

In [27]:
# Materialise the merged frame as parquet.
# `overwrite=True` (as used for the case 1 output) clears the target directory
# first, so part files left behind by an earlier or failed run of any case 2
# variant cannot be mixed into the dataset that is read back afterwards.
output_parquet = process_path/"temp"/"test_case2.parquet"
profit_risk.to_parquet(output_parquet, overwrite=True)

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


In [28]:
ddf = dd.read_parquet(output_parquet)

In [30]:
ddf.dtypes

cc_cst_real                                 int64
uniq_key_x                                  int64
net_interest_rec                          float64
mftp_total                                float64
annual_fee                                float64
interchange_fee                           float64
merchant_discount_fee                     float64
cash_adv_fee                              float64
gain_and_loss                             float64
collection_fee                            float64
fee_waived_off                            float64
fee_paid                                  float64
debt_collection_exp                       float64
fee_paid_visa                             float64
membership_fee                            float64
issuer_license_fee                        float64
total_issuer_domestic                     float64
total_issuer_inter                        float64
total_account_card_fee                    float64
total_concierge_fee                       float64


In [34]:
ddf.npartitions

7755

In [36]:
ddf["uniq_key_x"].drop_duplicates().count().compute()

5040042