In [1]:
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

In [2]:
import os
from datetime import datetime
from pathlib import Path

import pandas as pd
import numpy as np

import dask
from dask import dataframe as dd
from dask import array as da
from dask_sql import Context

import pyarrow as pa

data_path = Path(os.getcwd()).parent/"data"
c = Context()

# Distributed Cluster (Error for Single machine)

In [3]:
from dask.distributed import Client
client = Client()

In [4]:
client.dashboard_link

'http://127.0.0.1:8787/status'

In [5]:
parquet_path = data_path/"combined_wp_w_div.parquet"

read_ddf = dd.read_parquet(parquet_path)
print(f"Division know : {read_ddf.known_divisions}, number of partition : {read_ddf.npartitions}")

Division know : False, number of partition : 178


In [6]:
read_ddf.dtypes

User                        int16
Card                        int16
Month                       int16
Day                         int16
Time              string[pyarrow]
Amount            string[pyarrow]
Use Chip          string[pyarrow]
Merchant Name     string[pyarrow]
Merchant City     string[pyarrow]
Merchant State    string[pyarrow]
Zip                       float64
MCC                         int16
Errors?           string[pyarrow]
Is Fraud?         string[pyarrow]
amount                    float16
Year                     category
dtype: object

In [7]:
read_ddf.Zip.sum().compute()

2024-08-12 12:49:58,866 - distributed.protocol.core - CRITICAL - Failed to deserialize
Traceback (most recent call last):
  File "/home/danny/miniconda3/envs/dask/lib/python3.12/site-packages/distributed/protocol/core.py", line 175, in loads
    return msgpack.loads(
           ^^^^^^^^^^^^^^
  File "/home/danny/miniconda3/envs/dask/lib/python3.12/site-packages/msgpack/fallback.py", line 136, in unpackb
    raise ExtraData(ret, unpacker._get_extradata())
msgpack.exceptions.ExtraData: unpack(b) received extra data.
2024-08-12 12:49:58,876 - distributed.core - ERROR - Exception while handling op register-client
Traceback (most recent call last):
  File "/home/danny/miniconda3/envs/dask/lib/python3.12/site-packages/distributed/core.py", line 970, in _handle_comm
    result = await result
             ^^^^^^^^^^^^
  File "/home/danny/miniconda3/envs/dask/lib/python3.12/site-packages/distributed/scheduler.py", line 5710, in add_client
    await self.handle_stream(comm=comm, extra={"client":

CancelledError: ('sum-tree-6ae2a5a1d538e50c65b1a046d98b4150', 0)

Dask distributed client have problems with single machine run

# EDA

In [3]:
read_ddf = dd.read_parquet(data_path/"repartition_wp_w_div.parquet")

In [5]:
print(f"Division know : {read_ddf.known_divisions}, number of partition : {read_ddf.npartitions}")

Division know : False, number of partition : 55


In [6]:
read_ddf.dtypes

User                        int16
Card                        int16
Amount            string[pyarrow]
Use Chip          string[pyarrow]
Merchant Name     string[pyarrow]
Merchant City     string[pyarrow]
Merchant State    string[pyarrow]
Zip                       float64
MCC                         int16
Errors?           string[pyarrow]
Is Fraud?         string[pyarrow]
amount                    float64
Year                     category
dt                 datetime64[ns]
dtype: object

In [7]:
read_ddf.partitions[0].memory_usage(deep=True).compute().apply(dask.utils.format_bytes)

Index               4.62 MiB
User                1.15 MiB
Card                1.15 MiB
Amount              8.10 MiB
Use Chip           14.46 MiB
Merchant Name      15.82 MiB
Merchant City       9.78 MiB
Merchant State      5.81 MiB
Zip                 4.62 MiB
MCC                 1.15 MiB
Errors?             4.84 MiB
Is Fraud?           5.77 MiB
amount              4.62 MiB
Year              591.34 kiB
dt                  4.62 MiB
dtype: object

In [8]:
dask.utils.format_bytes(read_ddf.partitions[0].memory_usage(deep=True).compute().sum())

'87.09 MiB'

In [44]:
read_ddf.isnull().sum().compute()

User                     0
Card                     0
Amount                   0
Use Chip                 0
Merchant Name            0
Merchant City            0
Merchant State     2720821
Zip                2878135
MCC                      0
Errors?           23998469
Is Fraud?                0
amount                   0
Year                     0
dt                       0
dtype: int64

In [10]:
ddf = read_ddf

In [46]:
ddf.dtypes

User                        int16
Card                        int16
Amount            string[pyarrow]
Use Chip          string[pyarrow]
Merchant Name     string[pyarrow]
Merchant City     string[pyarrow]
Merchant State    string[pyarrow]
Zip                       float64
MCC                         int16
Errors?           string[pyarrow]
Is Fraud?         string[pyarrow]
amount                    float64
Year                     category
dt                 datetime64[ns]
dtype: object

In [47]:
pdf = ddf.groupby(["Year", "Is Fraud?"])["User"].count().compute().reset_index()

In [48]:
## Dask pivot table, must be categorical
dd.pivot_table(ddf, index="Year", columns="Is Fraud?", values="User", aggfunc="count")

ValueError: 'columns' must be category dtype

## Frequency Table

In [49]:
pd.crosstab(index=pdf["Year"], columns=pdf["Is Fraud?"], values=pdf["User"], aggfunc="sum", margins=True)

Is Fraud?,No,Yes,All
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1991,1585,0,1585
1992,5134,0,5134
1993,8378,0,8378
1994,14316,0,14316
1995,20928,0,20928
1996,29935,10,29945
1997,49721,32,49753
1998,78313,32,78345
1999,118226,24,118250
2000,177558,171,177729


In [50]:
pd.crosstab(index=pdf["Year"], columns=pdf["Is Fraud?"], values=pdf["User"], aggfunc="sum", margins=True, normalize="index")

Is Fraud?,No,Yes
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1991,1.0,0.0
1992,1.0,0.0
1993,1.0,0.0
1994,1.0,0.0
1995,1.0,0.0
1996,0.999666,0.000334
1997,0.999357,0.000643
1998,0.999592,0.000408
1999,0.999797,0.000203
2000,0.999038,0.000962


In [51]:
ddf.groupby(["Year"])["amount"].agg(["min", "max", "mean", "std", "count", "size"]).compute()

Unnamed: 0_level_0,min,max,mean,std,count,size
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1991,-423.0,1824.78,62.817167,127.858428,1585,1585
1992,-495.0,1630.18,55.531578,85.396222,5134,5134
1993,-498.0,2118.11,56.124552,108.624566,8378,8378
1994,-500.0,1919.19,50.952865,92.470315,14316,14316
1995,-496.0,1898.72,47.735061,85.895939,20928,20928
1996,-497.0,2214.55,47.079224,82.615396,29945,29945
1997,-500.0,1953.18,46.889105,84.13884,49753,49753
1998,-500.0,5233.81,46.794463,87.48,78345,78345
1999,-500.0,2432.54,46.379605,87.767542,118250,118250
2000,-500.0,5878.31,46.533204,88.474357,177729,177729


## Univariate stats

In [70]:
ddf.columns

Index(['User', 'Card', 'Amount', 'Use Chip', 'Merchant Name', 'Merchant City',
       'Merchant State', 'Zip', 'MCC', 'Errors?', 'Is Fraud?', 'amount',
       'Year', 'dt'],
      dtype='object')

In [71]:
ddf[["MCC", "Zip", "Use Chip", "Merchant Name", "Merchant City", "Merchant State"]].mode().compute()

Unnamed: 0,MCC,Zip,Use Chip,Merchant Name,Merchant City,Merchant State
0,5411,98516.0,Swipe Transaction,1799189980464955940,ONLINE,CA


In [72]:
ddf["Use Chip"].value_counts().compute()

Use Chip
Swipe Transaction     15386082
Chip Transaction       6287598
Online Transaction     2713220
Name: count, dtype: int64[pyarrow]

In [74]:
ddf["Merchant Name"].value_counts().compute().sort_values(ascending=False)

Merchant Name
1799189980464955940     1130230
-4282466774399734331    1129061
2027553650310142703     1028485
-2088492411650162548     720615
-1288082279022882052     687779
                         ...   
976545292452472256            1
983720339223540555            1
985110798894234348            1
994405403286927176            1
996981766903790322            1
Name: count, Length: 100343, dtype: int64[pyarrow]

In [75]:
ddf["Merchant City"].value_counts().compute().sort_values(ascending=False)

Merchant City
ONLINE           2720821
Houston           246036
Los Angeles       180496
Miami             178653
Brooklyn          155425
                  ...   
West Sayville          1
Western                1
Westside               1
Williford              1
Willow                 1
Name: count, Length: 13429, dtype: int64[pyarrow]

In [76]:
ddf["Merchant State"].value_counts().compute().sort_values(ascending=False)

Merchant State
CA                                  2591830
TX                                  1793298
FL                                  1458699
NY                                  1446864
OH                                   895970
                                     ...   
Democratic Republic of the Congo          2
Tonga                                     2
Paraguay                                  1
Botswana                                  1
Kiribati                                  1
Name: count, Length: 223, dtype: int64[pyarrow]

In [66]:
ddf["MCC"].value_counts().compute().sort_values(ascending=False)

MCC
5411    2860738
5499    2680609
5541    2638982
5812    1797920
5912    1407636
         ...   
3007        666
5722        663
4411        634
3144        632
5733        496
Name: count, Length: 109, dtype: int64

In [69]:
ddf["Zip"].value_counts().compute().sort_values(ascending=False)

Zip
98516.0    55679
43830.0    48815
55024.0    44571
95076.0    43656
94606.0    43512
           ...  
17062.0        1
51551.0        1
66424.0        1
54895.0        1
48476.0        1
Name: count, Length: 27321, dtype: int64

In [79]:
ddf[["amount"]].describe().compute()

Unnamed: 0,amount
count,24386900.0
mean,43.63401
std,82.02239
min,-500.0
25%,9.85
50%,33.34
75%,70.01
max,12390.5


In [85]:
amt_range = ddf["amount"].quantile([0.25, 0.75]).compute()

In [86]:
amt_range

0.25     9.85
0.75    70.01
Name: amount, dtype: float64

In [88]:
ddf.amount.between(*amt_range).sum().compute()

12472687

Histogram - Dask Sereis.histogram

In [92]:
rng_min = ddf["amount"].min().compute()
rng_max = ddf["amount"].max().compute()
h, bins = da.histogram(ddf["amount"], bins=50, range=[rng_min, rng_max])

array([ -500.  ,  -242.19,    15.62,   273.43,   531.24,   789.05,
        1046.86,  1304.67,  1562.48,  1820.29,  2078.1 ,  2335.91,
        2593.72,  2851.53,  3109.34,  3367.15,  3624.96,  3882.77,
        4140.58,  4398.39,  4656.2 ,  4914.01,  5171.82,  5429.63,
        5687.44,  5945.25,  6203.06,  6460.87,  6718.68,  6976.49,
        7234.3 ,  7492.11,  7749.92,  8007.73,  8265.54,  8523.35,
        8781.16,  9038.97,  9296.78,  9554.59,  9812.4 , 10070.21,
       10328.02, 10585.83, 10843.64, 11101.45, 11359.26, 11617.07,
       11874.88, 12132.69, 12390.5 ])

In [93]:
bins

array([ -500.  ,  -242.19,    15.62,   273.43,   531.24,   789.05,
        1046.86,  1304.67,  1562.48,  1820.29,  2078.1 ,  2335.91,
        2593.72,  2851.53,  3109.34,  3367.15,  3624.96,  3882.77,
        4140.58,  4398.39,  4656.2 ,  4914.01,  5171.82,  5429.63,
        5687.44,  5945.25,  6203.06,  6460.87,  6718.68,  6976.49,
        7234.3 ,  7492.11,  7749.92,  8007.73,  8265.54,  8523.35,
        8781.16,  9038.97,  9296.78,  9554.59,  9812.4 , 10070.21,
       10328.02, 10585.83, 10843.64, 11101.45, 11359.26, 11617.07,
       11874.88, 12132.69, 12390.5 ])

In [95]:
h.compute()

array([   96385,  8457238, 15512800,   248441,    37945,    19769,
           8704,     3428,     1257,      445,      190,       95,
             56,       45,       31,       15,       11,       11,
              5,        4,        5,        5,        1,        3,
              7,        0,        1,        1,        1,        0,
              0,        0,        0,        0,        0,        0,
              0,        0,        0,        0,        0,        0,
              0,        0,        0,        0,        0,        0,
              0,        1])

Histogram - Pandas.cut

In [106]:
bins = np.linspace(rng_min, rng_max, 50)
ddf["bins"] = ddf["amount"].map_partitions(pd.cut, bins=bins)
ddf["bins"].value_counts().compute().sort_index()

bins
(-500.0, -236.929]           98321
(-236.929, 26.143]        11225340
(26.143, 289.214]         12774339
(289.214, 552.286]          220831
(552.286, 815.357]           36421
(815.357, 1078.429]          18525
(1078.429, 1341.5]            7820
(1341.5, 1604.571]            3037
(1604.571, 1867.643]          1084
(1867.643, 2130.714]           385
(2130.714, 2393.786]           167
(2393.786, 2656.857]            73
(2656.857, 2919.929]            59
(2919.929, 3183.0]              40
(3183.0, 3446.071]              31
(3446.071, 3709.143]            15
(3709.143, 3972.214]            10
(3972.214, 4235.286]             9
(4235.286, 4498.357]             3
(4498.357, 4761.429]             8
(4761.429, 5024.5]               3
(5024.5, 5287.571]               4
(5287.571, 5550.643]             0
(5550.643, 5813.714]             7
(5813.714, 6076.786]             3
(6076.786, 6339.857]             1
(6339.857, 6602.929]             0
(6602.929, 6866.0]               2
(6866.0, 7129.0

## Windows function with Dask

In [4]:
ddf_small = read_ddf[["Year", "dt", "User", "amount"]]

ddf_small.head()

Unnamed: 0,Year,dt,User,amount
0,1991,1991-02-08 13:39:00,791,42.41
1,1991,1991-12-10 11:19:00,791,16.08
2,1991,1991-12-10 22:59:00,791,17.96
3,1991,1991-12-12 10:38:00,791,145.92
4,1991,1991-12-12 11:01:00,791,9.28


In [12]:
ddf_small.groupby(["User"]).first().compute()

Unnamed: 0_level_0,Year,dt,amount
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2002,2002-12-31 08:22:00,42.33
1,2003,2003-10-04 12:57:00,26.04
2,2002,2002-12-14 10:49:00,41.48
4,1999,1999-12-01 09:20:00,106.61
5,2002,2002-05-01 21:45:00,22.64
...,...,...,...
1972,2020,2020-02-05 09:51:00,1.01
1973,2020,2020-02-11 03:51:00,17.61
1984,2020,2020-01-08 22:28:00,41.83
1991,2020,2020-02-01 22:21:00,28.59


In [13]:
ddf_small.groupby(["User"]).last().compute()

Unnamed: 0_level_0,Year,dt,amount
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2020,2020-02-18 06:10:00,119.87
1,2020,2020-01-09 21:17:00,-239.00
2,2020,2020-01-08 13:25:00,27.92
4,2020,2020-01-30 15:09:00,44.58
5,2020,2020-02-28 18:42:00,113.72
...,...,...,...
1972,2020,2020-02-13 19:27:00,0.93
1973,2020,2020-01-06 05:43:00,3.47
1984,2020,2020-01-26 19:19:00,137.56
1991,2020,2020-02-11 22:29:00,30.21


Lead , Lag function

In [7]:
sorted_ddf = ddf_small.sort_values("dt", ascending=True)
out = sorted_ddf.assign(lagged_amount = sorted_ddf.groupby("User")["amount"].shift(1))
# ddf_small[ddf_small["User"]==0].head()

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  out = sorted_ddf.assign(lagged_amount = sorted_ddf.groupby("User")["amount"].shift(1))


In [6]:
sorted_ddf.head()

Unnamed: 0,Year,dt,User,amount
427,1991,1991-01-02 07:10:00,791,68.0
428,1991,1991-01-02 07:17:00,791,-68.0
429,1991,1991-01-02 07:21:00,791,113.62
430,1991,1991-01-02 17:30:00,791,114.73
432,1991,1991-01-03 09:03:00,791,251.71


In [8]:
out.head()

ValueError: cannot reindex on an axis with duplicate labels

Lead / Lag with Pandas

In [13]:
pdf = ddf_small.compute()
pdf.sort_values("dt", inplace=True)

In [14]:
pdf['Lag_amnt'] = pdf.groupby(["User"])["amount"].shift(1)

In [15]:
pdf[pdf["User"]==0]

Unnamed: 0,Year,dt,User,amount,Lag_amnt
78930,2002,2002-09-01 06:21:00,0,134.09,
78928,2002,2002-09-01 06:42:00,0,38.48,134.09
78905,2002,2002-09-02 06:22:00,0,120.34,38.48
78906,2002,2002-09-02 17:45:00,0,128.95,120.34
78907,2002,2002-09-03 06:23:00,0,104.71,128.95
...,...,...,...,...,...
19074,2020,2020-02-27 15:22:00,0,-295.00,167.96
17559,2020,2020-02-28 06:23:00,0,46.77,-295.00
19020,2020,2020-02-28 06:29:00,0,114.51,46.77
19072,2020,2020-02-28 06:53:00,0,34.11,114.51


## Use dask-sql

In [10]:
from dask_sql import Context

c = Context()
c.create_table("ddf_small", ddf_small)

In [16]:
ddf_small["Year"] = ddf_small["Year"].astype("string")

In [17]:
c.create_table("ddf_small", ddf_small)

In [23]:
ddf_small.groupby("Year").count().compute()

Unnamed: 0_level_0,dt,User,amount
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1991,1585,1585,1585
1992,5134,5134,5134
1993,8378,8378,8378
1994,14316,14316,14316
1995,20928,20928,20928
1996,29945,29945,29945
1997,49753,49753,49753
1998,78345,78345,78345
1999,118250,118250,118250
2000,177729,177729,177729


In [29]:
qry = \
"""
select User
,      amount
,      lag(amount) over (partition by User order by dt)
from   ddf_small
group by User
"""
result = c.sql(qry)
result.compute()

ParsingException: SchemaError(FieldNotFound { field: Column { relation: None, name: "user" }, valid_fields: [Column { relation: Some(Bare { table: "ddf_small" }), name: "Year" }, Column { relation: Some(Bare { table: "ddf_small" }), name: "dt" }, Column { relation: Some(Bare { table: "ddf_small" }), name: "User" }, Column { relation: Some(Bare { table: "ddf_small" }), name: "amount" }] })

## Distrubution Plot

In [81]:
ddf.dtypes

User                        int16
Card                        int16
Month                       int16
Day                         int16
Time              string[pyarrow]
Amount            string[pyarrow]
Use Chip          string[pyarrow]
Merchant Name     string[pyarrow]
Merchant City     string[pyarrow]
Merchant State    string[pyarrow]
Zip                       float64
MCC                         int16
Errors?           string[pyarrow]
Is Fraud?         string[pyarrow]
amount                    float64
Year                     category
dtype: object

In [90]:
pdf = ddf.loc[ddf["Year"]==2019, ["amount"]].compute()

In [92]:
pdf.boxplot()

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.