In [1]:
import os
from pathlib import Path
import warnings

import pandas as pd

import dask
from dask import dataframe as dd
from dask import array as da

data_path = Path(os.getcwd()).parent/"data"

In [2]:
# turn off FutureWarnings
warnings.filterwarnings(action='ignore')

In [3]:
from dask.distributed import LocalCluster

dask.config.set({ "distributed.worker.memory.target": 0.6, 
                 "distributed.worker.memory.spill": 0.7, 
                 "distributed.worker.memory.pause": 0.8, 
                 "distributed.worker.memory.terminate": 0.95,
                 "dataframe.shuffle.method": "p2p" }) 

client = LocalCluster(n_workers=4).get_client()

In [4]:
# Print the client to see the cluster information
print(client)

<Client: 'tcp://127.0.0.1:34917' processes=4 threads=4, memory=15.02 GiB>


In [5]:
# Show the dashboard link
client.dashboard_link

'http://127.0.0.1:8787/status'

In [6]:
parquet_path = data_path/"credit"/"data_combined_no_hive_sort_idx_no_div.parquet"
ddf = dd.read_parquet(parquet_path)

In [7]:
ddf.dtypes

User                        int64
Year                        int64
Month                       int64
Day                         int64
Time              string[pyarrow]
Amount            string[pyarrow]
Use Chip          string[pyarrow]
Merchant Name               int64
Merchant City     string[pyarrow]
Merchant State    string[pyarrow]
Zip                       float64
MCC                         int64
Errors?           string[pyarrow]
Is Fraud?         string[pyarrow]
dtype: object

In [8]:
print(f"Is known division? : {ddf.known_divisions}, number of partitions :{ddf.npartitions}")

Is known division? : False, number of partitions :30


## Data manipulation

### New column creation
Pandas style
- https://medium.com/@michalwesleymnach/the-complete-guide-to-create-columns-based-on-multiple-conditions-in-pandas-dataframes-eedf2c0392a6  

Dask style 
- Pure Dask.Array : https://stackoverflow.com/questions/38608446/create-an-if-else-condition-column-in-dask-dataframe

#### Pure Dask.Array
```python
dff["new_col_name"] = \
   da.where(dask_array cond#1, value if cond#1 true, 
    da.where(dask_array cond#2, value if cond#2 true,
        da.where(dask_array cond#3, value if cond#3 true, 
        default value)
            )
           )
```
Same as `case-when`, have short circuit property = if satisfy the condition will not further check the lower rank condition

In [12]:
from dask import array as da

mcc_arry = ddf["MCC"].to_dask_array(lengths=True)
mcc_flag = da.where(mcc_arry == 6011, 1, da.where(mcc_arry == 6010, 2, 3))
mcc_flag

Unnamed: 0,Array,Chunk
Bytes,186.06 MiB,71.34 MiB
Shape,"(24386900,)","(9351299,)"
Dask graph,5 chunks in 6 graph layers,5 chunks in 6 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 186.06 MiB 71.34 MiB Shape (24386900,) (9351299,) Dask graph 5 chunks in 6 graph layers Data type int64 numpy.ndarray",24386900  1,

Unnamed: 0,Array,Chunk
Bytes,186.06 MiB,71.34 MiB
Shape,"(24386900,)","(9351299,)"
Dask graph,5 chunks in 6 graph layers,5 chunks in 6 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


In [27]:
ddf.npartitions

30

  return pa.Table.from_pandas(obj, **kwargs)
  return pa.Table.from_pandas(obj, **kwargs)


In [29]:
# Repartition mcc_flag to match the number of partitions in ddf
mcc_flag_rechunk = da.rechunk(mcc_flag, ddf.npartitions)
mcc_flag_rechunk
# ddf["MCC_Flag"] = mcc_flag_rechunk

Unnamed: 0,Array,Chunk
Bytes,186.06 MiB,240 B
Shape,"(24386900,)","(30,)"
Dask graph,812897 chunks in 7 graph layers,812897 chunks in 7 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 186.06 MiB 240 B Shape (24386900,) (30,) Dask graph 812897 chunks in 7 graph layers Data type int64 numpy.ndarray",24386900  1,

Unnamed: 0,Array,Chunk
Bytes,186.06 MiB,240 B
Shape,"(24386900,)","(30,)"
Dask graph,812897 chunks in 7 graph layers,812897 chunks in 7 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


#### .apply() with function

## Univariate Aggregation

In [10]:
ddf.describe().compute()

Unnamed: 0,User,Year,Month,Day,Merchant Name,Zip,MCC
count,24386900.0,24386900.0,24386900.0,24386900.0,24386900.0,21508760.0,24386900.0
mean,1001.019,2011.955,6.525064,15.71812,-18958970000.0,50956.44,5561.171
std,569.4612,5.105921,3.472355,8.794073,4.75894e+18,29397.07,879.3154
min,0.0,1991.0,1.0,1.0,-9.222899e+18,501.0,1711.0
25%,777.0,2008.0,4.0,8.0,-4.282467e+18,30317.0,5300.0
50%,1236.0,2012.0,8.0,16.0,-2.484716e+17,57385.0,5499.0
75%,1683.0,2016.0,11.0,23.0,3.215786e+18,95076.0,5813.0
max,1999.0,2020.0,12.0,31.0,9.223292e+18,99928.0,9402.0


In [59]:
ddf[["MCC", "Merchant Name", "Zip"]].quantile(0.1).compute()

MCC              4.829000e+03
Merchant Name   -5.997942e+18
Zip              1.303100e+04
Name: 0.1, dtype: float64

**mode** takes a long time

In [31]:
ddf[["MCC", "Merchant Name", "Zip"]].mode().compute()

Unnamed: 0,MCC,Merchant Name,Zip
0,5411,1799189980464955940,98516.0


In [32]:
ddf[["Merchant City", "Merchant State", "Errors?", "Is Fraud?"]].mode().compute()

Unnamed: 0,Merchant City,Merchant State,Errors?,Is Fraud?
0,ONLINE,CA,Insufficient Balance,No


### GroupBy Aggregate
- **min, max, mean, std, count** support DataFrameGroupBy  
- **mode, percentile** not support in Dask DataFrameGroupBy  
Pandas API
```python
(ddf
 .groupby(["Year"])
 .agg(mode_city = ("Merchant City", lambda x: x.value_counts().index[0]),
      mode_city_n = ("Merchant City", lambda x: x.value_counts()[0]),
      p1_zip = ("Zip", lambda x: np.percentile(x, q=1))
      )
)
```

In [38]:
ddf.groupby(["Year"])[["MCC", "Merchant Name", "Zip"]].agg(["min", "max", "mean", "std", "count"]).compute().head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,MCC,MCC,MCC,MCC,MCC,Merchant Name,Merchant Name,Merchant Name,Merchant Name,Merchant Name,Zip,Zip,Zip,Zip,Zip
Unnamed: 0_level_1,min,max,mean,std,count,min,max,mean,std,count,min,max,mean,std,count
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
1991,3000,9402,5598.574763,838.340394,1585,-9124970903081131467,8919682822789039434,-152124800000000.0,,1585,2360.0,96792.0,29034.311412,29652.649372,1551
1992,1711,9402,5627.777367,912.827976,5134,-9124970903081131467,8985174911232927708,-1079754000000000.0,39110230.0,5134,7974.0,99504.0,61978.076336,39834.772308,5109
1993,1711,9402,5471.33015,877.783569,8378,-9162802609033316367,9138780633511352350,906066900000000.0,,8378,2920.0,99504.0,66492.631834,35654.884903,8268
1994,1711,9402,5566.013062,898.228816,14316,-9143276009416282185,9209155525736858158,-450762500000000.0,,14316,1701.0,99504.0,58028.740925,34452.078528,14243
1995,1711,9402,5585.48275,865.716147,20928,-9208146640057075396,9209155525736858158,217824600000000.0,,20928,1071.0,99504.0,54293.765811,32476.442119,20603


For Mode  
Label of mode `groupby()[].apply(lambda x : x.value_counts().idxmax())`  
Count of mode `groupby()[].apply(lambda x : x.value_counts().idxmax())`

In [39]:
ddf.groupby(["Year"])["Merchant City"].apply(lambda x: x.value_counts(sort=True).idxmax(), meta=("Year", "string")).compute().head()

Year
1994    Watsonville
1995     Palm Coast
2000         ONLINE
2007         ONLINE
2008         ONLINE
Name: Year, dtype: object

In [56]:
ddf["Merchant City"].value_counts(sort=True).head(1)[0]

2720821

In [51]:
ddf.groupby(["Year"]).apply(lambda x: x["Merchant City"].value_counts(sort=True).head(1)[0], meta=("Year", "float")).compute()



Year
1994      1247
1995      1291
2000     11572
2015    213290
2009    142466
1992      1292
2016    215049
2020     42938
2005     68457
2008    127392
2006     87467
2017    215170
1997      1467
2002     27171
2007    106989
2018    215056
1998      3194
2001     17992
1993      1182
1996      1420
2003     38461
1999      6233
2012    182414
2004     50617
2010    162638
1991       432
2013    194343
2014    201614
2019    215489
2011    172586
Name: Year, dtype: int64

In [None]:
ddf.groupby(["Year"]).apply(lambda x: x["Merchant City"].value_counts(sort=True).iloc(0), meta=("Year", "float")).compute()

For Percentile/Qunatile  
`.groupby().apply(lambda x : x[""].quantile(), meta=())`

In [30]:
ddf.groupby(["Year"]).apply(lambda x: x["Zip"].quantile(0.1), meta=("Year", "float")).compute()





Year
1994    10573.0
1995    15650.0
2000    12569.0
2015    11420.0
2009    11414.0
1992    10533.0
2016    11374.0
2020    11375.0
2005    11374.0
2008    11368.0
2006    11233.0
2017    11374.0
1997    11234.0
2002    10573.0
2007    11373.0
2018    11368.0
1998    13031.0
2001    11368.0
1993    10536.0
1996    13035.0
2003    10567.0
1999    12801.0
2012    11368.0
2004    11203.0
2010    11530.0
1991    10533.0
2013    11385.0
2014    11420.0
2019    11368.0
2011    11385.0
Name: Year, dtype: float64

groupby nlargest

### GroupBy Aggregate with set_index

Multiple Case-When

np.select / da.mask  
np.where / da.where  
np.mask / da.mask  



Histogram - Dask Sereis.histogram

In [92]:
rng_min = ddf["amount"].min().compute()
rng_max = ddf["amount"].max().compute()
h, bins = da.histogram(ddf["amount"], bins=50, range=[rng_min, rng_max])

array([ -500.  ,  -242.19,    15.62,   273.43,   531.24,   789.05,
        1046.86,  1304.67,  1562.48,  1820.29,  2078.1 ,  2335.91,
        2593.72,  2851.53,  3109.34,  3367.15,  3624.96,  3882.77,
        4140.58,  4398.39,  4656.2 ,  4914.01,  5171.82,  5429.63,
        5687.44,  5945.25,  6203.06,  6460.87,  6718.68,  6976.49,
        7234.3 ,  7492.11,  7749.92,  8007.73,  8265.54,  8523.35,
        8781.16,  9038.97,  9296.78,  9554.59,  9812.4 , 10070.21,
       10328.02, 10585.83, 10843.64, 11101.45, 11359.26, 11617.07,
       11874.88, 12132.69, 12390.5 ])

In [93]:
bins

array([ -500.  ,  -242.19,    15.62,   273.43,   531.24,   789.05,
        1046.86,  1304.67,  1562.48,  1820.29,  2078.1 ,  2335.91,
        2593.72,  2851.53,  3109.34,  3367.15,  3624.96,  3882.77,
        4140.58,  4398.39,  4656.2 ,  4914.01,  5171.82,  5429.63,
        5687.44,  5945.25,  6203.06,  6460.87,  6718.68,  6976.49,
        7234.3 ,  7492.11,  7749.92,  8007.73,  8265.54,  8523.35,
        8781.16,  9038.97,  9296.78,  9554.59,  9812.4 , 10070.21,
       10328.02, 10585.83, 10843.64, 11101.45, 11359.26, 11617.07,
       11874.88, 12132.69, 12390.5 ])

In [95]:
h.compute()

array([   96385,  8457238, 15512800,   248441,    37945,    19769,
           8704,     3428,     1257,      445,      190,       95,
             56,       45,       31,       15,       11,       11,
              5,        4,        5,        5,        1,        3,
              7,        0,        1,        1,        1,        0,
              0,        0,        0,        0,        0,        0,
              0,        0,        0,        0,        0,        0,
              0,        0,        0,        0,        0,        0,
              0,        1])

Histogram - Pandas.cut

In [106]:
bins = np.linspace(rng_min, rng_max, 50)
ddf["bins"] = ddf["amount"].map_partitions(pd.cut, bins=bins)
ddf["bins"].value_counts().compute().sort_index()

bins
(-500.0, -236.929]           98321
(-236.929, 26.143]        11225340
(26.143, 289.214]         12774339
(289.214, 552.286]          220831
(552.286, 815.357]           36421
(815.357, 1078.429]          18525
(1078.429, 1341.5]            7820
(1341.5, 1604.571]            3037
(1604.571, 1867.643]          1084
(1867.643, 2130.714]           385
(2130.714, 2393.786]           167
(2393.786, 2656.857]            73
(2656.857, 2919.929]            59
(2919.929, 3183.0]              40
(3183.0, 3446.071]              31
(3446.071, 3709.143]            15
(3709.143, 3972.214]            10
(3972.214, 4235.286]             9
(4235.286, 4498.357]             3
(4498.357, 4761.429]             8
(4761.429, 5024.5]               3
(5024.5, 5287.571]               4
(5287.571, 5550.643]             0
(5550.643, 5813.714]             7
(5813.714, 6076.786]             3
(6076.786, 6339.857]             1
(6339.857, 6602.929]             0
(6602.929, 6866.0]               2
(6866.0, 7129.0

## Transform
Groupby-Aggregation and broadcasting back to each group
Dask have issue for group-agg

In [4]:
ddf_samp = dd.read_parquet(data_path/"data_sample.parquet")

In [252]:
ddf_samp["usr_mean"] = ddf_samp.groupby("User")["amount"].transform(np.mean)
ddf_samp.head()

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  ddf_samp["usr_mean"] = ddf_samp.groupby("User")["amount"].transform(np.mean)


ValueError: cannot reindex on an axis with duplicate labels

## Categorical data & calculation

In [42]:
ddf_samp = dd.read_parquet(data_path/"simple.parquet")
ddf_samp.dtypes

User                       int64
Card                       int64
Year                       int64
Month                      int64
Day                        int64
Time                      object
Amount                    object
Use Chip                  object
Merchant Name              int64
Merchant City             object
Merchant State            object
Zip                      float64
MCC                        int64
Errors?           string[python]
Is Fraud?                 object
dtype: object

Convert non category to category .astype -> unknow category / .categorize -> know category

In [43]:
ddf_samp = ddf_samp.astype({"User":"category"})

In [44]:
ddf_samp = ddf_samp.categorize(columns=["Year"])

In [45]:
ddf_samp["User"].cat.known

False

In [46]:
ddf_samp["Year"].cat.known

True

In [47]:
ddf_samp["Year"].cat.ordered

False

In [50]:
# Category make aggretation error
ddf_samp["Day"].unique().compute()

KeyError: 'Year'

In [55]:
ddf_no_cat = dd.read_parquet(data_path/"simple.parquet")
ddf_no_cat["Day"].unique().compute()

0     12
1     20
2     22
0      1
1      6
2      8
3     10
4     16
5     17
6     18
7     23
8     24
9     25
10    27
0      2
1      3
2      7
3     13
4     21
5     28
6     30
0      4
1      5
2      9
3     11
4     14
5     15
6     19
7     26
8     29
9     31
Name: Day, dtype: int64

In [56]:
# Category with un order could not do the arithmatics
ddf_samp[ddf_samp["Year"] >= 2000]

TypeError: Unordered Categoricals can only compare equality or not

In [57]:
ddf_no_cat[ddf_no_cat["Year"] > 2000]

Unnamed: 0_level_0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
npartitions=14,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,int64,int64,int64,int64,int64,object,object,object,int64,object,object,float64,int64,string,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [59]:
ddf_samp["Year"]

Dask Series Structure:
npartitions=14
    category[known]
                ...
         ...       
                ...
                ...
Dask Name: getitem, 4 expressions
Expr=(Categorize(frame=AsType(frame=ReadParquetFSSpec(06c030d), dtypes={'User': 'category'}), categories={'Year': 689642    1991
689882    1992
692537    1993
636612    1994
259846    1995
255060    1996
255980    1997
263252    1998
80977     1999
81056     2000
81933     2001
0         2002
329       2003
734       2004
1060      2005
1332      2006
1619      2007
1899      2008
2188      2009
2466      2010
2754      2011
3065      2012
3394      2013
3668      2014
3895      2015
4136      2016
4338      2017
4540      2018
4776      2019
4983      2020
Name: Year, dtype: int64}))['Year']

In [68]:
# re order category to make the arithmatic operation work
yr_range = [*range(1991, 2021, 1)]
ddf_samp["Year"] = ddf_samp["Year"].cat.reorder_categories(yr_range, ordered=True)

In [69]:
ddf_samp[ddf_samp["Year"] >= 2000]

Unnamed: 0_level_0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
npartitions=14,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,category[unknown],int64,category[known],int64,int64,object,object,object,int64,object,object,float64,int64,string,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [72]:
ddf_samp.groupby("User")["Year"].min().compute()

User
0      2002
1      2003
2      2002
3      2007
4      1999
       ... 
841    2000
842    2008
843    2020
844    2020
845    2007
Name: Year, Length: 846, dtype: category
Categories (30, int64): [1991 < 1992 < 1993 < 1994 ... 2017 < 2018 < 2019 < 2020]

In [73]:
ddf_no_cat.groupby("User")["Year"].min().compute()

User
0      2002
1      2003
2      2002
3      2007
4      1999
       ... 
841    2000
842    2008
843    2020
844    2020
845    2007
Name: Year, Length: 846, dtype: int64

#### Use order categorical to find max in order

In [81]:
ddf_samp = dd.read_parquet(data_path/"simple.parquet")
ddf_samp["Use Chip"].unique().compute()

0    Online Transaction
1      Chip Transaction
0     Swipe Transaction
Name: Use Chip, dtype: object

In [82]:
ddf_samp = ddf_samp.categorize(columns = ["Use Chip"])
chip_range = ["Chip Transaction", "Swipe Transaction", "Online Transaction"]
ddf_samp["Use Chip"] = ddf_samp["Use Chip"].cat.reorder_categories(chip_range, ordered=True)

In [83]:
ddf_samp["Use Chip"].unique().compute()

0    Online Transaction
1      Chip Transaction
0     Swipe Transaction
Name: Use Chip, dtype: category
Categories (3, object): ['Chip Transaction' < 'Swipe Transaction' < 'Online Transaction']

In [86]:
ddf_samp.groupby("User")["Use Chip"].max().compute()

User
0      Online Transaction
1      Online Transaction
2      Online Transaction
3      Online Transaction
4      Online Transaction
              ...        
841    Online Transaction
842    Online Transaction
843    Online Transaction
844    Online Transaction
845    Online Transaction
Name: Use Chip, Length: 846, dtype: category
Categories (3, object): ['Chip Transaction' < 'Swipe Transaction' < 'Online Transaction']

In [90]:
ddf_samp[ddf_samp["User"]==0].compute()["Use Chip"].value_counts()

Use Chip
Swipe Transaction     15840
Chip Transaction       2808
Online Transaction     1315
Name: count, dtype: int64

## Distrubution Plot

In [81]:
ddf.dtypes

User                        int16
Card                        int16
Month                       int16
Day                         int16
Time              string[pyarrow]
Amount            string[pyarrow]
Use Chip          string[pyarrow]
Merchant Name     string[pyarrow]
Merchant City     string[pyarrow]
Merchant State    string[pyarrow]
Zip                       float64
MCC                         int16
Errors?           string[pyarrow]
Is Fraud?         string[pyarrow]
amount                    float64
Year                     category
dtype: object

In [90]:
pdf = ddf.loc[ddf["Year"]==2019, ["amount"]].compute()

In [92]:
pdf.boxplot()

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

# Dask cluster - distributed computaion 
**Reference**
- Dask Client (https://docs.dask.org/en/latest/deploying-python.html)  
- Futures : (https://docs.dask.org/en/latest/futures.html)  
- Indexing : (https://stackoverflow.com/questions/16626058/)what-is-the-performance-impact-of-non-unique-indexes-in-pandas
- Optimization on distributed cluster (https://targomo.medium.com/how-we-learned-to-love-dask-and-achieved-a-40x-speedup-aa14e72d99c0#fa71)