In [1]:
import os
from pathlib import Path

import pandas as pd

import dask
from dask import dataframe as dd
from dask import array as da

data_path = Path(os.getcwd()).parent/"data"

## Split single csv
For data ingestion testing

In [None]:
pdf = pd.read_csv(data_path/"credit_card_transactions-ibm_v2.csv", dtype={'Errors?': 'object'})

AttributeError: 'numpy.ndarray' object has no attribute 'compute'

In [None]:
uniq_year = pdf["Year"].unique().tolist()

In [None]:
# ddf = dd.read_csv(data_path/"credit_card_transactions-ibm_v2.csv", dtype={'Errors?': 'object'})
# uniq_year = ddf["Year"].unique().compute().values.tolist()



In [None]:
pdf.dtypes

User                int64
Card                int64
Year                int64
Month               int64
Day                 int64
Time               object
Amount             object
Use Chip           object
Merchant Name       int64
Merchant City      object
Merchant State     object
Zip               float64
MCC                 int64
Errors?            object
Is Fraud?          object
dtype: object

In [None]:
type(uniq_year)
uniq_year.sort()
uniq_year

[1991,
 1992,
 1993,
 1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020]

In [None]:
for y in uniq_year:
# for y in [1991]:
    print(y)
    yrly_ddf = pdf[pdf["Year"] == y]
    yrly_ddf.to_csv(data_path/"credit"/f"yearly_data_{y}.csv", index=False)

1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


# Working with Distributed Cluster 

Dask distributed client have problems with single machine run  
Fix: upgrade msgpack-python == 1.0.5  
https://github.com/dask/distributed/issues/8038  

 **reference for LocalCluster configuration :**
- https://stackoverflow.com/questions/57760475/difference-between-dask-distributed-localcluster-with-threads-vs-processes
- https://stackoverflow.com/questions/55784232/right-way-to-set-memory-parameters-for-localcluster-in-dask

*Parameter*  

n_workers=4 : Number of workers (Dask show _processes_ ), worker memory after allocated to core service (2G) will be distribued equally among each workers if Local machine have memory 16MB, for 4 workers each have 16 - 2 = 14 / 4 ~ 3.5G per worker.  

threads_per_worker= # machine cores / # worker

memory_limit=machine memory / # n_workers : possible to set beyond the machine limit. 

**reference from Dask official API**
- https://distributed.dask.org/en/stable/worker-memory.html#thresholds-configuration

distributed:  
  worker:  
   Fractions of worker process memory at which we take action to avoid memory  
   blowup. Set any of the values to False to turn off the behavior entirely.  
    memory:  
      target: 0.60     # fraction of managed memory where we start spilling to disk  
      spill: 0.70      # fraction of process memory where we start spilling to disk  
      pause: 0.80      # fraction of process memory at which we pause worker threads  
      terminate: 0.95  # fraction of process memory at which we terminate the worker

In [2]:
from dask.distributed import LocalCluster

dask.config.set({ "distributed.worker.memory.target": 0.6, 
                 "distributed.worker.memory.spill": 0.7, 
                 "distributed.worker.memory.pause": 0.8, 
                 "distributed.worker.memory.terminate": 0.95}) 

client = LocalCluster(n_workers=4).get_client()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 34359 instead


In [10]:
print(client)

<Client: 'tcp://127.0.0.1:38517' processes=4 threads=4, memory=15.02 GiB>


In [3]:
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:34359/status,

0,1
Dashboard: http://127.0.0.1:34359/status,Workers: 4
Total threads: 4,Total memory: 15.02 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:39871,Workers: 4
Dashboard: http://127.0.0.1:34359/status,Total threads: 4
Started: Just now,Total memory: 15.02 GiB

0,1
Comm: tcp://127.0.0.1:35823,Total threads: 1
Dashboard: http://127.0.0.1:44139/status,Memory: 3.75 GiB
Nanny: tcp://127.0.0.1:38501,
Local directory: /tmp/dask-scratch-space/worker-_dnps93i,Local directory: /tmp/dask-scratch-space/worker-_dnps93i

0,1
Comm: tcp://127.0.0.1:34103,Total threads: 1
Dashboard: http://127.0.0.1:33717/status,Memory: 3.75 GiB
Nanny: tcp://127.0.0.1:32857,
Local directory: /tmp/dask-scratch-space/worker-86_92v6y,Local directory: /tmp/dask-scratch-space/worker-86_92v6y

0,1
Comm: tcp://127.0.0.1:35657,Total threads: 1
Dashboard: http://127.0.0.1:34929/status,Memory: 3.75 GiB
Nanny: tcp://127.0.0.1:34187,
Local directory: /tmp/dask-scratch-space/worker-urzedv2u,Local directory: /tmp/dask-scratch-space/worker-urzedv2u

0,1
Comm: tcp://127.0.0.1:33193,Total threads: 1
Dashboard: http://127.0.0.1:42639/status,Memory: 3.75 GiB
Nanny: tcp://127.0.0.1:37665,
Local directory: /tmp/dask-scratch-space/worker-me4jj8nn,Local directory: /tmp/dask-scratch-space/worker-me4jj8nn


In [4]:
client.dashboard_link

'http://127.0.0.1:34359/status'

# Datamart without Hive Partition
Testing
- Set index on each raw file, with non-unique index

In [16]:
schema = {
    "User": "int64",
    "Card": "int64",
    "Year": "int64",
    "Month": "int64",
    "Day": "int64",
    "Time": "string",
    "Amount": "string",
    "Use Chip": "string",
    "Merchant Name": "int64",
    "Merchant City": "string",
    "Merchant State": "string",
    "Zip": "float64",
    "MCC": "int64",
    "Errors?": "string",
    "Is Fraud?": "string"
}

In [18]:
yr_rng = list(range(1991, 2021))
data_combined = data_path/"credit"/"data_combined.parquet"

for y in yr_rng:
    print(y)
    yrly_ddf = dd.read_csv(data_path/"credit"/f"yearly_data_{y}.csv", dtype=schema)
    yrly_ddf = yrly_ddf.set_index("Card", partition_size="100MB")
    
    if data_combined.exists():    
        yrly_ddf.to_parquet(data_combined, append=True, ignore_divisions=True)
    else:
        yrly_ddf.to_parquet(data_combined)

1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020




## Query performance
Reference tips to optimized query performance  

Indexing  
- https://stackoverflow.com/questions/75915860/dask-and-best-practices-with-multiple-indices
- https://stackoverflow.com/questions/16626058/what-is-the-performance-impact-of-non-unique-indexes-in-pandas

Joining performance 
- https://stackoverflow.com/questions/71233619/why-do-i-get-always-a-memory-error-after-i-run-dask-with-big-dataframes
- https://docs.dask.org/en/latest/dataframe-joins.html#sorted-joins
- https://www.coiled.io/blog/dask-dataframe-merge-join

In [19]:
data_combined = data_path/"credit"/"data_combined.parquet"
ddf = dd.read_parquet(data_combined)

The duplicated index (User) force output without division metadata

In [24]:
ddf.known_divisions

False

In [23]:
ddf.head()

Unnamed: 0_level_0,User,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
Card,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,791,1991,12,24,07:16,$-51.00,Swipe Transaction,1799189980464955940,Burke,VA,22015.0,5499,,No
1,791,1991,6,29,18:07,$143.27,Swipe Transaction,1913477460590765860,Vienna,VA,22182.0,5300,,No
1,791,1991,6,30,10:53,$23.34,Swipe Transaction,5701841789931834110,Burke,VA,22015.0,5411,,No
1,791,1991,7,1,10:33,$6.38,Swipe Transaction,2027553650310142703,Burke,VA,22015.0,5541,,No
1,791,1991,7,1,11:07,$16.70,Swipe Transaction,-7269691894846892021,Burke,VA,22015.0,5411,,No


In [29]:
ddf.groupby("User")["MCC"].nunique().compute()

User
1683    89
1741    83
491     84
1490    90
376     84
        ..
476     24
1623    19
312     18
1751    17
1754    18
Name: MCC, Length: 2000, dtype: int64

In [31]:
ddf.groupby("User").agg(uniq_mcc = ("MCC", lambda x : x.nunique())).compute()

ValueError: unknown aggregate lambda

# Datamart with Hive Parition
Reference  
- https://docs.dask.org/en/latest/dataframe-hive.html

## Query performance