In [28]:
import os
from pathlib import Path
import warnings

import pandas as pd

import dask
from dask import dataframe as dd
from dask import array as da

# Data folder sits next to the notebook's working directory: <cwd>/../data
data_path = Path.cwd().parent / "data"

# Simulate source data as multiple yearly .csv files

In [None]:
# Load the full transactions file into a pandas DataFrame.
# The 'Errors?' column is explicitly read with object dtype.
pdf = pd.read_csv(
    data_path / "credit_card_transactions-ibm_v2.csv",
    dtype={"Errors?": "object"},
)

AttributeError: 'numpy.ndarray' object has no attribute 'compute'

In [None]:
# Distinct values of the "Year" column, as a plain Python list (order of first appearance).
uniq_year = pd.unique(pdf["Year"]).tolist()

In [None]:
# Dask equivalent of the two pandas cells above (kept for reference, not executed):
# ddf = dd.read_csv(data_path/"credit_card_transactions-ibm_v2.csv", dtype={'Errors?': 'object'})
# uniq_year = ddf["Year"].unique().compute().values.tolist()



In [None]:
# Show the dtype of every column in the loaded pandas frame.
pdf.dtypes

User                int64
Card                int64
Year                int64
Month               int64
Day                 int64
Time               object
Amount             object
Use Chip           object
Merchant Name       int64
Merchant City      object
Merchant State     object
Zip               float64
MCC                 int64
Errors?            object
Is Fraud?          object
dtype: object

In [None]:
# Order the years chronologically and display them as the cell result.
# sorted() rebinds the name instead of mutating the list created in an earlier
# cell, so re-running this cell always yields the same state. The former
# `type(uniq_year)` line was dropped: its value was discarded and never shown
# because it was not the last expression of the cell.
uniq_year = sorted(uniq_year)
uniq_year

[1991,
 1992,
 1993,
 1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020]

In [None]:
# Split the full transaction table into one CSV per year, simulating a source
# that delivers several raw files.
yearly_dir = data_path / "credit"
# to_csv does not create missing parent folders, so make sure the target exists
# (a fresh checkout would otherwise fail on the first write).
yearly_dir.mkdir(parents=True, exist_ok=True)

for y in uniq_year:
    print(y)  # progress marker: one line per exported year
    # Plain pandas slice holding only the rows of year `y`.
    yrly_pdf = pdf[pdf["Year"] == y]
    yrly_pdf.to_csv(yearly_dir / f"yearly_data_{y}.csv", index=False)

1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


# Working with (Distributed) Cluster 

The Dask distributed client has a version conflict with the `msgpack-python` library.  
Fix: upgrade to `msgpack-python==1.0.5`.  
https://github.com/dask/distributed/issues/8038  

 **reference for Cluster configuration n_worker, threads_per_worker:**
- https://stackoverflow.com/questions/57760475/difference-between-dask-distributed-localcluster-with-threads-vs-processes
- https://stackoverflow.com/questions/55784232/right-way-to-set-memory-parameters-for-localcluster-in-dask

*Parameter*  

n_workers=4 : Number of workers (Dask shows them as _processes_). The memory left after reserving some for core services (2 GB) is distributed equally among the workers: if the local machine has 16 GB of memory, each of the 4 workers gets (16 - 2) / 4 = 14 / 4 ≈ 3.5 GB.
**Running Dask without a cluster configuration = single-worker cluster (n_workers = 1)**   

threads_per_worker= # machine cores / # worker

memory_limit=machine memory / # n_workers : possible to set beyond the machine limit. 

**reference for Cluster configuration worker memory**
- https://distributed.dask.org/en/stable/worker-memory.html#thresholds-configuration

distributed worker memory:  
- target: 0.60     # fraction of managed memory where we start spilling to disk  
- spill: 0.70      # fraction of process memory where we start spilling to disk  
- pause: 0.80      # fraction of process memory at which we pause worker threads  
- terminate: 0.95  # fraction of process memory at which we terminate the worker

**reference for Cluster shuffle method**  
https://docs.coiled.io/blog/shuffling-large-data-at-constant-memory.html  

Shuffling is the transfer of data between workers that happens when functions such as `sort`, `merge`, or `groupby` are called.  
P2P shuffle is a shuffling technique that helps reduce the workers' memory footprint.  

In [3]:
from dask.distributed import LocalCluster

# Worker-memory thresholds plus the shuffle implementation:
#   target    - fraction of managed memory where spilling to disk starts
#   spill     - fraction of process memory where spilling to disk starts
#   pause     - fraction of process memory at which worker threads are paused
#   terminate - fraction of process memory at which the worker is terminated
dask_settings = {
    "distributed.worker.memory.target": 0.6,
    "distributed.worker.memory.spill": 0.7,
    "distributed.worker.memory.pause": 0.8,
    "distributed.worker.memory.terminate": 0.95,
    "dataframe.shuffle.method": "p2p",
}
dask.config.set(dask_settings)

# Start a local cluster with four workers and obtain a client connected to it.
local_cluster = LocalCluster(n_workers=4)
client = local_cluster.get_client()

In [6]:
# Print the client's one-line text summary (scheduler address, processes, threads, memory)
print(client)

<Client: 'tcp://127.0.0.1:43487' processes=4 threads=4, memory=15.02 GiB>


In [7]:
# Display the client's rich summary: cluster type, scheduler info, per-worker details
# and the dashboard URL
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 4,Total memory: 15.02 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:43487,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 4
Started: Just now,Total memory: 15.02 GiB

0,1
Comm: tcp://127.0.0.1:39941,Total threads: 1
Dashboard: http://127.0.0.1:39783/status,Memory: 3.75 GiB
Nanny: tcp://127.0.0.1:40781,
Local directory: /tmp/dask-scratch-space/worker-rwljtop6,Local directory: /tmp/dask-scratch-space/worker-rwljtop6

0,1
Comm: tcp://127.0.0.1:39737,Total threads: 1
Dashboard: http://127.0.0.1:39125/status,Memory: 3.75 GiB
Nanny: tcp://127.0.0.1:40365,
Local directory: /tmp/dask-scratch-space/worker-v8dqjo54,Local directory: /tmp/dask-scratch-space/worker-v8dqjo54

0,1
Comm: tcp://127.0.0.1:44775,Total threads: 1
Dashboard: http://127.0.0.1:44761/status,Memory: 3.75 GiB
Nanny: tcp://127.0.0.1:40929,
Local directory: /tmp/dask-scratch-space/worker-ctdu6iuz,Local directory: /tmp/dask-scratch-space/worker-ctdu6iuz

0,1
Comm: tcp://127.0.0.1:45921,Total threads: 1
Dashboard: http://127.0.0.1:34747/status,Memory: 3.75 GiB
Nanny: tcp://127.0.0.1:41215,
Local directory: /tmp/dask-scratch-space/worker-kht6mkwh,Local directory: /tmp/dask-scratch-space/worker-kht6mkwh


In [8]:
# Show only the URL of the Dask dashboard
client.dashboard_link

'http://127.0.0.1:8787/status'

# Case 1) Datamart without Hive Partition

## Datamart creation
**Configuration**
- Append each raw data file to the parquet dataset.
- Set the index on column `Card` for each raw file ingested; since `Card` is non-unique, append with `ignore_divisions=True`.
- The index column is sorted by default.
- Non-Hive-style partition folders (no `partition_on` parameter).

In [43]:
# Column name -> dtype used when reading the yearly CSV files.
# Listed as ordered pairs so the mapping keeps the column order of the raw file.
schema = dict(
    [
        ("User", "int64"),
        ("Card", "int64"),
        ("Year", "int64"),
        ("Month", "int64"),
        ("Day", "int64"),
        ("Time", "string"),
        ("Amount", "string"),
        ("Use Chip", "string"),
        ("Merchant Name", "int64"),
        ("Merchant City", "string"),
        ("Merchant State", "string"),
        ("Zip", "float64"),
        ("MCC", "int64"),
        ("Errors?", "string"),
        ("Is Fraud?", "string"),
    ]
)

In [None]:
# Build the non-Hive datamart: ingest every yearly CSV, index it on "Card" and
# write it into a single parquet dataset.
yr_rng = list(range(1991, 2021))
data_combined = data_path/"credit"/"data_combined_no_hive_sort_idx_no_div.parquet"

for position, y in enumerate(yr_rng):
    print(y)  # progress marker: one line per ingested year
    yrly_ddf = dd.read_csv(data_path/"credit"/f"yearly_data_{y}.csv", dtype=schema)
    yrly_ddf = yrly_ddf.set_index("Card", partition_size="100MB")

    if position == 0:
        # The first year (re)creates the dataset. overwrite=True clears whatever a
        # previous or partially failed run left behind, so re-running this cell
        # rebuilds the datamart instead of appending all years a second time.
        yrly_ddf.to_parquet(data_combined, overwrite=True)
    else:
        # "Card" values repeat across the yearly files, so the divisions of the
        # appended data overlap the existing ones; ignore_divisions=True allows
        # the append anyway.
        yrly_ddf.to_parquet(data_combined, append=True, ignore_divisions=True)

1991


AttributeError: 'DataFrame' object has no attribute 'sort_index'

## Query performance
Reference tips for optimizing query performance  

Indexing  
- https://stackoverflow.com/questions/75915860/dask-and-best-practices-with-multiple-indices
- https://stackoverflow.com/questions/16626058/what-is-the-performance-impact-of-non-unique-indexes-in-pandas

Joining performance 
- https://stackoverflow.com/questions/71233619/why-do-i-get-always-a-memory-error-after-i-run-dask-with-big-dataframes
- https://docs.dask.org/en/latest/dataframe-joins.html#sorted-joins
- https://www.coiled.io/blog/dask-dataframe-merge-join

The duplicated index values (`Card`) force the output to be written without division metadata.

In [45]:
# Open the combined (non-Hive) parquet dataset as a Dask DataFrame.
parquet_path = data_path.joinpath(
    "credit", "data_combined_no_hive_sort_idx_no_div.parquet"
)
ddf = dd.read_parquet(parquet_path)

In [46]:
# Partition count equals the number of raw yearly files ingested (30);
# presumably no file was split further - TODO confirm
ddf.npartitions

30

In [47]:
# Divisions are unknown after reading the dataset back.
# NOTE(review): presumably because the yearly files were appended with ignore_divisions=True - confirm
ddf.known_divisions

False

In [48]:
# Peek at the index to check its dtype and that it is named 'Card'
ddf.index.head()

Index([1, 1, 1, 1, 1], dtype='int64', name='Card')

In [49]:
# 'Card' is the index, so it is not listed among the remaining DataFrame columns
ddf.columns

Index(['User', 'Year', 'Month', 'Day', 'Time', 'Amount', 'Use Chip',
       'Merchant Name', 'Merchant City', 'Merchant State', 'Zip', 'MCC',
       'Errors?', 'Is Fraud?'],
      dtype='object')

In [25]:
# Count the distinct MCC values per User (grouping on a regular, non-index column)
ddf.groupby("User")["MCC"].nunique().compute()



User
1683    89
1741    83
491     84
1490    90
376     84
        ..
476     24
1623    19
312     18
1751    17
1754    18
Name: MCC, Length: 2000, dtype: int64

In [26]:
# turn off FutureWarnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

In [42]:
# Count the distinct MCC values per Card ('Card' is the index of this DataFrame)
ddf.groupby("Card")["MCC"].nunique().compute()

Card
0    109
3    109
5    109
6    109
2    109
8     81
1    109
4    109
7    107
Name: MCC, dtype: int64

# Datamart with Hive Partition
Reference  
- https://docs.dask.org/en/latest/dataframe-hive.html

## Query performance