In [1]:
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

In [2]:
import os
from datetime import datetime
from pathlib import Path

import pandas as pd
import numpy as np

import dask
from dask import dataframe as dd
from dask import array as da
from dask_sql import Context

import pyarrow as pa

data_path = Path(os.getcwd()).parent/"data"
c = Context()

# Distributed Cluster 
Dask not recommended distributed computing if to make to acccelerate the work

In [9]:
from dask.distributed import LocalCluster, Client
client = LocalCluster().get_client()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 45399 instead


In [4]:
client.dashboard_link

'http://127.0.0.1:8787/status'

In [5]:
parquet_path = data_path/"combined_wp_w_div.parquet"

read_ddf = dd.read_parquet(parquet_path)
print(f"Division know : {read_ddf.known_divisions}, number of partition : {read_ddf.npartitions}")

Division know : False, number of partition : 89


In [6]:
read_ddf.dtypes

User                       int16
Card                       int16
Month                      int16
Day                        int16
Time              string[python]
Amount            string[python]
Use Chip          string[python]
Merchant Name     string[python]
Merchant City     string[python]
Merchant State    string[python]
Zip                      float64
MCC                        int16
Errors?           string[python]
Is Fraud?         string[python]
amount                   float64
Year                    category
dtype: object

In [7]:
yr_cnt = client.compute(read_ddf.groupby("Year").agg({"User":"count"}))
yr_cnt

In [8]:
yr_cnt.result()

Unnamed: 0_level_0,User
Year,Unnamed: 1_level_1
1991,1585
1992,5134
1993,8378
1994,14316
1995,20928
1996,29945
1997,49753
1998,78345
1999,118250
2000,177729


Dask distributed client have problems with single machine run  
Fix: upgrade msgpack-python == 1.0.5  
https://github.com/dask/distributed/issues/8038  

# EDA

In [3]:
read_ddf = dd.read_parquet(data_path/"repartition_wp_w_div.parquet")

In [4]:
print(f"Division know : {read_ddf.known_divisions}, number of partition : {read_ddf.npartitions}")

Division know : False, number of partition : 55


In [5]:
read_ddf.dtypes

User                       int16
Card                       int16
Amount            string[python]
Use Chip          string[python]
Merchant Name     string[python]
Merchant City     string[python]
Merchant State    string[python]
Zip                      float64
MCC                        int16
Errors?           string[python]
Is Fraud?         string[python]
amount                   float64
Year                    category
dt                datetime64[ns]
dtype: object

In [6]:
read_ddf.partitions[0].memory_usage(deep=True).compute().apply(dask.utils.format_bytes)

Index               4.62 MiB
User                1.15 MiB
Card                1.15 MiB
Amount             31.77 MiB
Use Chip           38.13 MiB
Merchant Name      39.48 MiB
Merchant City      33.44 MiB
Merchant State     29.60 MiB
Zip                 4.62 MiB
MCC                 1.15 MiB
Errors?            32.41 MiB
Is Fraud?          29.44 MiB
amount              4.62 MiB
Year              591.34 kiB
dt                  4.62 MiB
dtype: object

In [7]:
dask.utils.format_bytes(read_ddf.partitions[0].memory_usage(deep=True).compute().sum())

'256.79 MiB'

In [8]:
read_ddf.isnull().sum().compute()

User                     0
Card                     0
Amount                   0
Use Chip                 0
Merchant Name            0
Merchant City            0
Merchant State     2720821
Zip                2878135
MCC                      0
Errors?           23998469
Is Fraud?                0
amount                   0
Year                     0
dt                       0
dtype: int64

In [9]:
ddf = read_ddf

In [10]:
ddf.dtypes

User                       int16
Card                       int16
Amount            string[python]
Use Chip          string[python]
Merchant Name     string[python]
Merchant City     string[python]
Merchant State    string[python]
Zip                      float64
MCC                        int16
Errors?           string[python]
Is Fraud?         string[python]
amount                   float64
Year                    category
dt                datetime64[ns]
dtype: object

In [11]:
ddf_samp = ddf.groupby(["Year", "Is Fraud?"])["User"].count().compute().reset_index()

In [12]:
ddf_samp

Unnamed: 0,Year,Is Fraud?,User
0,1991,No,1585
1,1991,Yes,0
2,1992,No,5134
3,1992,Yes,0
4,1993,No,8378
...,...,...,...
115,2014,Yes,0
116,2019,No,0
117,2019,Yes,0
118,2020,No,0


In [13]:
## Dask pivot table, must be categorical
dd.pivot_table(ddf, index="Year", columns="Is Fraud?", values="User", aggfunc="count")

ValueError: 'columns' must be category dtype

## Frequency Table - with Pandas

In [14]:
pdf = read_ddf.compute()

In [15]:
pd.crosstab(index=pdf["Year"], columns=pdf["Is Fraud?"], values=pdf["User"], aggfunc="sum", margins=True)

Is Fraud?,No,Yes,All
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1991,1875403,0,1875403
1992,4641033,0,4641033
1993,7950817,0,7950817
1994,14994459,0,14994459
1995,23327017,0,23327017
1996,33217825,8265,33226090
1997,52991788,41138,53032926
1998,81590585,33278,81623863
1999,119517456,16647,119534103
2000,177111607,152599,177264206


In [16]:
pd.crosstab(index=pdf["Year"], columns=pdf["Is Fraud?"], values=pdf["User"], aggfunc="sum", margins=True, normalize="index")

Is Fraud?,No,Yes
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1991,1.0,0.0
1992,1.0,0.0
1993,1.0,0.0
1994,1.0,0.0
1995,1.0,0.0
1996,0.999751,0.000249
1997,0.999224,0.000776
1998,0.999592,0.000408
1999,0.999861,0.000139
2000,0.999139,0.000861


## Univariate stats

In [17]:
ddf.groupby(["Year"])["amount"].agg(["min", "max", "mean", "std", "count", "size"]).compute()

Unnamed: 0_level_0,min,max,mean,std,count,size
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1991,-423.0,1824.78,62.817167,127.858428,1585,1585
1992,-495.0,1630.18,55.531578,85.396222,5134,5134
1993,-498.0,2118.11,56.124552,108.624566,8378,8378
1994,-500.0,1919.19,50.952865,92.470315,14316,14316
1995,-496.0,1898.72,47.735061,85.895939,20928,20928
1996,-497.0,2214.55,47.079224,82.615396,29945,29945
1997,-500.0,1953.18,46.889105,84.13884,49753,49753
1998,-500.0,5233.81,46.794463,87.48,78345,78345
1999,-500.0,2432.54,46.379605,87.767542,118250,118250
2000,-500.0,5878.31,46.533204,88.474357,177729,177729


In [18]:
ddf.columns

Index(['User', 'Card', 'Amount', 'Use Chip', 'Merchant Name', 'Merchant City',
       'Merchant State', 'Zip', 'MCC', 'Errors?', 'Is Fraud?', 'amount',
       'Year', 'dt'],
      dtype='object')

In [19]:
ddf[["MCC", "Zip", "Use Chip", "Merchant Name", "Merchant City", "Merchant State"]].mode().compute()

In [72]:
ddf["Use Chip"].value_counts().compute()

Use Chip
Swipe Transaction     15386082
Chip Transaction       6287598
Online Transaction     2713220
Name: count, dtype: int64[pyarrow]

In [74]:
ddf["Merchant Name"].value_counts().compute().sort_values(ascending=False)

Merchant Name
1799189980464955940     1130230
-4282466774399734331    1129061
2027553650310142703     1028485
-2088492411650162548     720615
-1288082279022882052     687779
                         ...   
976545292452472256            1
983720339223540555            1
985110798894234348            1
994405403286927176            1
996981766903790322            1
Name: count, Length: 100343, dtype: int64[pyarrow]

In [75]:
ddf["Merchant City"].value_counts().compute().sort_values(ascending=False)

Merchant City
ONLINE           2720821
Houston           246036
Los Angeles       180496
Miami             178653
Brooklyn          155425
                  ...   
West Sayville          1
Western                1
Westside               1
Williford              1
Willow                 1
Name: count, Length: 13429, dtype: int64[pyarrow]

In [76]:
ddf["Merchant State"].value_counts().compute().sort_values(ascending=False)

Merchant State
CA                                  2591830
TX                                  1793298
FL                                  1458699
NY                                  1446864
OH                                   895970
                                     ...   
Democratic Republic of the Congo          2
Tonga                                     2
Paraguay                                  1
Botswana                                  1
Kiribati                                  1
Name: count, Length: 223, dtype: int64[pyarrow]

In [66]:
ddf["MCC"].value_counts().compute().sort_values(ascending=False)

MCC
5411    2860738
5499    2680609
5541    2638982
5812    1797920
5912    1407636
         ...   
3007        666
5722        663
4411        634
3144        632
5733        496
Name: count, Length: 109, dtype: int64

In [69]:
ddf["Zip"].value_counts().compute().sort_values(ascending=False)

Zip
98516.0    55679
43830.0    48815
55024.0    44571
95076.0    43656
94606.0    43512
           ...  
17062.0        1
51551.0        1
66424.0        1
54895.0        1
48476.0        1
Name: count, Length: 27321, dtype: int64

In [79]:
ddf[["amount"]].describe().compute()

Unnamed: 0,amount
count,24386900.0
mean,43.63401
std,82.02239
min,-500.0
25%,9.85
50%,33.34
75%,70.01
max,12390.5


In [85]:
amt_range = ddf["amount"].quantile([0.25, 0.75]).compute()

In [86]:
amt_range

0.25     9.85
0.75    70.01
Name: amount, dtype: float64

In [88]:
ddf.amount.between(*amt_range).sum().compute()

12472687

Histogram - Dask Sereis.histogram

In [92]:
rng_min = ddf["amount"].min().compute()
rng_max = ddf["amount"].max().compute()
h, bins = da.histogram(ddf["amount"], bins=50, range=[rng_min, rng_max])

array([ -500.  ,  -242.19,    15.62,   273.43,   531.24,   789.05,
        1046.86,  1304.67,  1562.48,  1820.29,  2078.1 ,  2335.91,
        2593.72,  2851.53,  3109.34,  3367.15,  3624.96,  3882.77,
        4140.58,  4398.39,  4656.2 ,  4914.01,  5171.82,  5429.63,
        5687.44,  5945.25,  6203.06,  6460.87,  6718.68,  6976.49,
        7234.3 ,  7492.11,  7749.92,  8007.73,  8265.54,  8523.35,
        8781.16,  9038.97,  9296.78,  9554.59,  9812.4 , 10070.21,
       10328.02, 10585.83, 10843.64, 11101.45, 11359.26, 11617.07,
       11874.88, 12132.69, 12390.5 ])

In [93]:
bins

array([ -500.  ,  -242.19,    15.62,   273.43,   531.24,   789.05,
        1046.86,  1304.67,  1562.48,  1820.29,  2078.1 ,  2335.91,
        2593.72,  2851.53,  3109.34,  3367.15,  3624.96,  3882.77,
        4140.58,  4398.39,  4656.2 ,  4914.01,  5171.82,  5429.63,
        5687.44,  5945.25,  6203.06,  6460.87,  6718.68,  6976.49,
        7234.3 ,  7492.11,  7749.92,  8007.73,  8265.54,  8523.35,
        8781.16,  9038.97,  9296.78,  9554.59,  9812.4 , 10070.21,
       10328.02, 10585.83, 10843.64, 11101.45, 11359.26, 11617.07,
       11874.88, 12132.69, 12390.5 ])

In [95]:
h.compute()

array([   96385,  8457238, 15512800,   248441,    37945,    19769,
           8704,     3428,     1257,      445,      190,       95,
             56,       45,       31,       15,       11,       11,
              5,        4,        5,        5,        1,        3,
              7,        0,        1,        1,        1,        0,
              0,        0,        0,        0,        0,        0,
              0,        0,        0,        0,        0,        0,
              0,        0,        0,        0,        0,        0,
              0,        1])

Histogram - Pandas.cut

In [106]:
bins = np.linspace(rng_min, rng_max, 50)
ddf["bins"] = ddf["amount"].map_partitions(pd.cut, bins=bins)
ddf["bins"].value_counts().compute().sort_index()

bins
(-500.0, -236.929]           98321
(-236.929, 26.143]        11225340
(26.143, 289.214]         12774339
(289.214, 552.286]          220831
(552.286, 815.357]           36421
(815.357, 1078.429]          18525
(1078.429, 1341.5]            7820
(1341.5, 1604.571]            3037
(1604.571, 1867.643]          1084
(1867.643, 2130.714]           385
(2130.714, 2393.786]           167
(2393.786, 2656.857]            73
(2656.857, 2919.929]            59
(2919.929, 3183.0]              40
(3183.0, 3446.071]              31
(3446.071, 3709.143]            15
(3709.143, 3972.214]            10
(3972.214, 4235.286]             9
(4235.286, 4498.357]             3
(4498.357, 4761.429]             8
(4761.429, 5024.5]               3
(5024.5, 5287.571]               4
(5287.571, 5550.643]             0
(5550.643, 5813.714]             7
(5813.714, 6076.786]             3
(6076.786, 6339.857]             1
(6339.857, 6602.929]             0
(6602.929, 6866.0]               2
(6866.0, 7129.0

## Transform
Groupby-Aggregation and broadcasting back to each group
Dask have issue for group-agg

In [4]:
ddf_samp = dd.read_parquet(data_path/"data_sample.parquet")

In [252]:
ddf_samp["usr_mean"] = ddf_samp.groupby("User")["amount"].transform(np.mean)
ddf_samp.head()

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  ddf_samp["usr_mean"] = ddf_samp.groupby("User")["amount"].transform(np.mean)


ValueError: cannot reindex on an axis with duplicate labels

## Categorical data & calculation

In [42]:
ddf_samp = dd.read_parquet(data_path/"simple.parquet")
ddf_samp.dtypes

User                       int64
Card                       int64
Year                       int64
Month                      int64
Day                        int64
Time                      object
Amount                    object
Use Chip                  object
Merchant Name              int64
Merchant City             object
Merchant State            object
Zip                      float64
MCC                        int64
Errors?           string[python]
Is Fraud?                 object
dtype: object

Convert non category to category .astype -> unknow category / .categorize -> know category

In [43]:
ddf_samp = ddf_samp.astype({"User":"category"})

In [44]:
ddf_samp = ddf_samp.categorize(columns=["Year"])

In [45]:
ddf_samp["User"].cat.known

False

In [46]:
ddf_samp["Year"].cat.known

True

In [47]:
ddf_samp["Year"].cat.ordered

False

In [50]:
# Category make aggretation error
ddf_samp["Day"].unique().compute()

KeyError: 'Year'

In [55]:
ddf_no_cat = dd.read_parquet(data_path/"simple.parquet")
ddf_no_cat["Day"].unique().compute()

0     12
1     20
2     22
0      1
1      6
2      8
3     10
4     16
5     17
6     18
7     23
8     24
9     25
10    27
0      2
1      3
2      7
3     13
4     21
5     28
6     30
0      4
1      5
2      9
3     11
4     14
5     15
6     19
7     26
8     29
9     31
Name: Day, dtype: int64

In [56]:
# Category with un order could not do the arithmatics
ddf_samp[ddf_samp["Year"] >= 2000]

TypeError: Unordered Categoricals can only compare equality or not

In [57]:
ddf_no_cat[ddf_no_cat["Year"] > 2000]

Unnamed: 0_level_0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
npartitions=14,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,int64,int64,int64,int64,int64,object,object,object,int64,object,object,float64,int64,string,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [59]:
ddf_samp["Year"]

Dask Series Structure:
npartitions=14
    category[known]
                ...
         ...       
                ...
                ...
Dask Name: getitem, 4 expressions
Expr=(Categorize(frame=AsType(frame=ReadParquetFSSpec(06c030d), dtypes={'User': 'category'}), categories={'Year': 689642    1991
689882    1992
692537    1993
636612    1994
259846    1995
255060    1996
255980    1997
263252    1998
80977     1999
81056     2000
81933     2001
0         2002
329       2003
734       2004
1060      2005
1332      2006
1619      2007
1899      2008
2188      2009
2466      2010
2754      2011
3065      2012
3394      2013
3668      2014
3895      2015
4136      2016
4338      2017
4540      2018
4776      2019
4983      2020
Name: Year, dtype: int64}))['Year']

In [68]:
# re order category to make the arithmatic operation work
yr_range = [*range(1991, 2021, 1)]
ddf_samp["Year"] = ddf_samp["Year"].cat.reorder_categories(yr_range, ordered=True)

In [69]:
ddf_samp[ddf_samp["Year"] >= 2000]

Unnamed: 0_level_0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
npartitions=14,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,category[unknown],int64,category[known],int64,int64,object,object,object,int64,object,object,float64,int64,string,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [72]:
ddf_samp.groupby("User")["Year"].min().compute()

User
0      2002
1      2003
2      2002
3      2007
4      1999
       ... 
841    2000
842    2008
843    2020
844    2020
845    2007
Name: Year, Length: 846, dtype: category
Categories (30, int64): [1991 < 1992 < 1993 < 1994 ... 2017 < 2018 < 2019 < 2020]

In [73]:
ddf_no_cat.groupby("User")["Year"].min().compute()

User
0      2002
1      2003
2      2002
3      2007
4      1999
       ... 
841    2000
842    2008
843    2020
844    2020
845    2007
Name: Year, Length: 846, dtype: int64

#### Use order categorical to find max in order

In [81]:
ddf_samp = dd.read_parquet(data_path/"simple.parquet")
ddf_samp["Use Chip"].unique().compute()

0    Online Transaction
1      Chip Transaction
0     Swipe Transaction
Name: Use Chip, dtype: object

In [82]:
ddf_samp = ddf_samp.categorize(columns = ["Use Chip"])
chip_range = ["Chip Transaction", "Swipe Transaction", "Online Transaction"]
ddf_samp["Use Chip"] = ddf_samp["Use Chip"].cat.reorder_categories(chip_range, ordered=True)

In [83]:
ddf_samp["Use Chip"].unique().compute()

0    Online Transaction
1      Chip Transaction
0     Swipe Transaction
Name: Use Chip, dtype: category
Categories (3, object): ['Chip Transaction' < 'Swipe Transaction' < 'Online Transaction']

In [86]:
ddf_samp.groupby("User")["Use Chip"].max().compute()

User
0      Online Transaction
1      Online Transaction
2      Online Transaction
3      Online Transaction
4      Online Transaction
              ...        
841    Online Transaction
842    Online Transaction
843    Online Transaction
844    Online Transaction
845    Online Transaction
Name: Use Chip, Length: 846, dtype: category
Categories (3, object): ['Chip Transaction' < 'Swipe Transaction' < 'Online Transaction']

In [90]:
ddf_samp[ddf_samp["User"]==0].compute()["Use Chip"].value_counts()

Use Chip
Swipe Transaction     15840
Chip Transaction       2808
Online Transaction     1315
Name: count, dtype: int64

## Use dask-sql

In [6]:
from dask_sql import Context

ddf_small = dd.read_parquet(data_path/"data_sample.parquet")
c = Context()
c.create_table("ddf_small", ddf_small)

In [7]:
ddf_small["Year"] = ddf_small["Year"].astype("string")

In [8]:
c.create_table("ddf_small", ddf_small)

In [9]:
ddf_small.groupby("Year").count().compute()

Unnamed: 0_level_0,dt,User,amount
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1992,9,9,9
1993,5,5,5
1994,12,12,12
1995,26,26,26
1996,35,35,35
1997,58,58,58
1998,72,72,72
1999,113,113,113
2000,190,190,190
2001,267,267,267


In [10]:
qry = \
"""
select User
,      amount
,      lag(amount) over (partition by User order by dt)
from   ddf_small
group by User
"""
result = c.sql(qry)
result.compute()

ParsingException: SchemaError(FieldNotFound { field: Column { relation: None, name: "user" }, valid_fields: [Column { relation: Some(Bare { table: "ddf_small" }), name: "Year" }, Column { relation: Some(Bare { table: "ddf_small" }), name: "dt" }, Column { relation: Some(Bare { table: "ddf_small" }), name: "User" }, Column { relation: Some(Bare { table: "ddf_small" }), name: "amount" }] })

## Distrubution Plot

In [81]:
ddf.dtypes

User                        int16
Card                        int16
Month                       int16
Day                         int16
Time              string[pyarrow]
Amount            string[pyarrow]
Use Chip          string[pyarrow]
Merchant Name     string[pyarrow]
Merchant City     string[pyarrow]
Merchant State    string[pyarrow]
Zip                       float64
MCC                         int16
Errors?           string[pyarrow]
Is Fraud?         string[pyarrow]
amount                    float64
Year                     category
dtype: object

In [90]:
pdf = ddf.loc[ddf["Year"]==2019, ["amount"]].compute()

In [92]:
pdf.boxplot()

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.