In [1]:
import os
from pathlib import Path

import pandas as pd

import dask
from dask import dataframe as dd
from dask import array as da

data_path = Path(os.getcwd()).parent/"data"

# Working with Distributed Cluster 

Dask distributed client have problems with single machine run  
Fix: upgrade msgpack-python == 1.0.5  
https://github.com/dask/distributed/issues/8038  

 **reference for LocalCluster configuration : **
- https://stackoverflow.com/questions/57760475/difference-between-dask-distributed-localcluster-with-threads-vs-processes
- https://stackoverflow.com/questions/55784232/right-way-to-set-memory-parameters-for-localcluster-in-dask

*Parameter*  

\quad n_workers=4 : Number of workers (Dask show _processes_ ), worker memory after allocated to core service (2G) will be distribued equally among each workers if Local machine have memory 16MB, for 4 workers each have 16 - 2 = 14 / 4 ~ 3.5G per worker.  

threads_per_worker= # machine cores / # worker

memory_limit=machine memory / # n_workers : possible to set beyond the machine limit. 

**reference from Dask official API**
- https://distributed.dask.org/en/stable/worker-memory.html#thresholds-configuration

distributed:  
  worker:  
   Fractions of worker process memory at which we take action to avoid memory  
   blowup. Set any of the values to False to turn off the behavior entirely.  
    memory:  
      target: 0.60     # fraction of managed memory where we start spilling to disk  
      spill: 0.70      # fraction of process memory where we start spilling to disk  
      pause: 0.80      # fraction of process memory at which we pause worker threads  
      terminate: 0.95  # fraction of process memory at which we terminate the worker

In [2]:
from dask.distributed import LocalCluster, Client

dask.config.set({ "distributed.worker.memory.target": 0.6, 
                 "distributed.worker.memory.spill": 0.7, 
                 "distributed.worker.memory.pause": 0.8, 
                 "distributed.worker.memory.terminate": 0.95}) 

client = LocalCluster(n_workers=4).get_client()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 46017 instead


In [3]:
print(client)

<Client: 'tcp://127.0.0.1:42377' processes=4 threads=4, memory=15.02 GiB>


In [4]:
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:46017/status,

0,1
Dashboard: http://127.0.0.1:46017/status,Workers: 4
Total threads: 4,Total memory: 15.02 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:42377,Workers: 4
Dashboard: http://127.0.0.1:46017/status,Total threads: 4
Started: Just now,Total memory: 15.02 GiB

0,1
Comm: tcp://127.0.0.1:45937,Total threads: 1
Dashboard: http://127.0.0.1:33723/status,Memory: 3.75 GiB
Nanny: tcp://127.0.0.1:44665,
Local directory: /tmp/dask-scratch-space/worker-n807u951,Local directory: /tmp/dask-scratch-space/worker-n807u951

0,1
Comm: tcp://127.0.0.1:36659,Total threads: 1
Dashboard: http://127.0.0.1:34429/status,Memory: 3.75 GiB
Nanny: tcp://127.0.0.1:40097,
Local directory: /tmp/dask-scratch-space/worker-r95a39fh,Local directory: /tmp/dask-scratch-space/worker-r95a39fh

0,1
Comm: tcp://127.0.0.1:36623,Total threads: 1
Dashboard: http://127.0.0.1:42041/status,Memory: 3.75 GiB
Nanny: tcp://127.0.0.1:44643,
Local directory: /tmp/dask-scratch-space/worker-e_9brkxo,Local directory: /tmp/dask-scratch-space/worker-e_9brkxo

0,1
Comm: tcp://127.0.0.1:35539,Total threads: 1
Dashboard: http://127.0.0.1:43023/status,Memory: 3.75 GiB
Nanny: tcp://127.0.0.1:34619,
Local directory: /tmp/dask-scratch-space/worker-gnue07xy,Local directory: /tmp/dask-scratch-space/worker-gnue07xy


In [5]:
client.dashboard_link

'http://127.0.0.1:46017/status'

## Split single csv
For data ingestion testing

In [None]:
pdf = pd.read_csv(data_path/"credit_card_transactions-ibm_v2.csv", dtype={'Errors?': 'object'})
uniq_year = pdf["Year"].unique().values.tolist()

AttributeError: 'numpy.ndarray' object has no attribute 'compute'

In [4]:
uniq_year = pdf["Year"].unique().tolist()

In [None]:
# ddf = dd.read_csv(data_path/"credit_card_transactions-ibm_v2.csv", dtype={'Errors?': 'object'})
# uniq_year = ddf["Year"].unique().compute().values.tolist()



In [5]:
pdf.dtypes

User                int64
Card                int64
Year                int64
Month               int64
Day                 int64
Time               object
Amount             object
Use Chip           object
Merchant Name       int64
Merchant City      object
Merchant State     object
Zip               float64
MCC                 int64
Errors?            object
Is Fraud?          object
dtype: object

In [6]:
type(uniq_year)
uniq_year.sort()
uniq_year

[1991,
 1992,
 1993,
 1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020]

In [7]:
for y in uniq_year:
# for y in [1991]:
    print(y)
    yrly_ddf = pdf[pdf["Year"] == y]
    yrly_ddf.to_csv(data_path/"credit"/f"yearly_data_{y}.csv", index=False)

1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


# Datamart without Hive Partition
Testing
- Set index on each raw file, with non-unique index

## Query performance

# Datamart with Hive Parition

## Query performance