#### Create the Dask client and import the packages

In [1]:
from dask.distributed import Client, progress

# Client() with no arguments starts a local cluster on this machine and
# connects to it; the rendered summary below lists its workers, threads,
# memory and the dashboard URL.
c = Client()
# Bare last expression so the notebook shows the client's rich summary.
c

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 4,Total memory: 7.92 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:7464,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 4
Started: Just now,Total memory: 7.92 GiB

0,1
Comm: tcp://127.0.0.1:7479,Total threads: 1
Dashboard: http://127.0.0.1:7483/status,Memory: 1.98 GiB
Nanny: tcp://127.0.0.1:7467,
Local directory: C:\Users\Jayanth\AppData\Local\Temp\dask-worker-space\worker-oe50_hxd,Local directory: C:\Users\Jayanth\AppData\Local\Temp\dask-worker-space\worker-oe50_hxd

0,1
Comm: tcp://127.0.0.1:7482,Total threads: 1
Dashboard: http://127.0.0.1:7490/status,Memory: 1.98 GiB
Nanny: tcp://127.0.0.1:7468,
Local directory: C:\Users\Jayanth\AppData\Local\Temp\dask-worker-space\worker-6uusqd4e,Local directory: C:\Users\Jayanth\AppData\Local\Temp\dask-worker-space\worker-6uusqd4e

0,1
Comm: tcp://127.0.0.1:7485,Total threads: 1
Dashboard: http://127.0.0.1:7488/status,Memory: 1.98 GiB
Nanny: tcp://127.0.0.1:7469,
Local directory: C:\Users\Jayanth\AppData\Local\Temp\dask-worker-space\worker-zpd1fuj4,Local directory: C:\Users\Jayanth\AppData\Local\Temp\dask-worker-space\worker-zpd1fuj4

0,1
Comm: tcp://127.0.0.1:7492,Total threads: 1
Dashboard: http://127.0.0.1:7493/status,Memory: 1.98 GiB
Nanny: tcp://127.0.0.1:7470,
Local directory: C:\Users\Jayanth\AppData\Local\Temp\dask-worker-space\worker-uhw78l8v,Local directory: C:\Users\Jayanth\AppData\Local\Temp\dask-worker-space\worker-uhw78l8v


### Dask provides parallelism and helps achieve scalability in Python

In [2]:
import os
import time


import dask
import dask.array as da
import dask.dataframe as dd
import numpy as np

In [4]:
# Record the library versions this notebook was run with, one per line.
print(dask.__version__, np.__version__, sep="\n")

2023.3.2
1.23.5


In [5]:
# Explicit dtypes for the columns listed here, so they are parsed the same
# way in every partition rather than left to Dask's sample-based inference.
dtype = {
    'fine_grained_location': 'float64',
    'officer_id': 'object',
    'county_fips': 'float64',
    'search_type': 'object',
    'search_type_raw': 'object',
}

# Location of the input CSV (the file size is around 2.8 GB).
# Set the TX_STOPS_CSV environment variable to point at your own copy; the
# default is the original author's local path, so existing runs are unchanged.
csv_path = os.environ.get(
    "TX_STOPS_CSV",
    "D:\\DataEngineering\\DataAnalysis\\Dask\\archive\\TX_2010_onwards.csv",
)

ddf = dd.read_csv(
    csv_path,
    # blocksize is left at Dask's default, which yields the 42 partitions
    # shown below; pass e.g. blocksize="25MB" for more, smaller partitions.
    dtype=dtype,
    low_memory=False,
    # Treat integer columns not listed in `dtype` as possibly containing
    # missing values, so they are read as floats.
    assume_missing=True,
)
# Display the lazy frame: column names, dtypes and partition count only.
ddf

Unnamed: 0_level_0,id,state,stop_date,stop_time,location_raw,county_name,county_fips,fine_grained_location,police_department,driver_gender,driver_age_raw,driver_age,driver_race_raw,driver_race,violation_raw,violation,search_conducted,search_type_raw,search_type,contraband_found,stop_outcome,is_arrested,lat,lon,officer_id,driver_race_original
npartitions=42,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
,object,object,object,object,object,object,float64,float64,float64,object,float64,float64,object,object,object,object,bool,object,object,bool,object,float64,float64,float64,object,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [6]:
%%time
# Preview the first rows. By default Dask's head() reads only from the first
# partition, which is why this is much faster than the full-frame
# computations further down.
ddf.head()

CPU times: total: 2.11 s
Wall time: 6.19 s


Unnamed: 0,id,state,stop_date,stop_time,location_raw,county_name,county_fips,fine_grained_location,police_department,driver_gender,...,search_conducted,search_type_raw,search_type,contraband_found,stop_outcome,is_arrested,lat,lon,officer_id,driver_race_original
0,TX-2010-0000002,TX,2010-01-01,00:00,Guadalupe,Guadalupe County,48187.0,622.0,,F,...,False,,,False,Warning,,29.622867,-97.778663,11524,Asian
1,TX-2010-0000003,TX,2010-01-01,00:00,Fannin,Fannin County,48147.0,668.0,,F,...,False,,,False,Warning,,33.603183,-96.150215,12274,White
2,TX-2010-0000004,TX,2010-01-01,00:00,Coryell,Coryell County,48099.0,560.0,,M,...,False,,,False,Citation,,31.121599,-97.835418,12365,Black
3,TX-2010-0000005,TX,2010-01-01,00:00,Dallas,Dallas County,48113.0,464.0,,M,...,False,,,False,Citation,,,,10795,Black
4,TX-2010-0000006,TX,2010-01-01,00:00,Denton,Denton County,48121.0,0.0,,M,...,False,,,False,Citation,,,,12571,White


#### How the Dask DataFrame is divided into multiple partitions
- A Dask DataFrame is a large, virtual dataframe divided along the index into multiple pandas DataFrames:

In [7]:
# Apply len() to each partition and collect the per-partition row counts
# into a single result indexed by partition number.
ddf.map_partitions(len).compute()

0     320331
1     319055
2     317988
3     317534
4     317974
5     318156
6     320416
7     320723
8     318603
9     317948
10    315403
11    315560
12    316307
13    316121
14    315790
15    316616
16    315106
17    315204
18    315310
19    315377
20    315152
21    315723
22    317291
23    317568
24    315960
25    315924
26    316432
27    316258
28    316371
29    317325
30    317513
31    315363
32    314852
33    314527
34    315914
35    316771
36    316046
37    317190
38    317145
39    317452
40    319439
41    316180
dtype: int64

##### The data type of each partition

In [9]:
%%time
# Apply type() to each partition to show what kind of object backs it.
# NOTE(review): the timing below suggests this loads every partition from
# the CSV just to inspect its type.
ddf.map_partitions(type).compute()

CPU times: total: 20.9 s
Wall time: 1min 22s


0     <class 'pandas.core.frame.DataFrame'>
1     <class 'pandas.core.frame.DataFrame'>
2     <class 'pandas.core.frame.DataFrame'>
3     <class 'pandas.core.frame.DataFrame'>
4     <class 'pandas.core.frame.DataFrame'>
5     <class 'pandas.core.frame.DataFrame'>
6     <class 'pandas.core.frame.DataFrame'>
7     <class 'pandas.core.frame.DataFrame'>
8     <class 'pandas.core.frame.DataFrame'>
9     <class 'pandas.core.frame.DataFrame'>
10    <class 'pandas.core.frame.DataFrame'>
11    <class 'pandas.core.frame.DataFrame'>
12    <class 'pandas.core.frame.DataFrame'>
13    <class 'pandas.core.frame.DataFrame'>
14    <class 'pandas.core.frame.DataFrame'>
15    <class 'pandas.core.frame.DataFrame'>
16    <class 'pandas.core.frame.DataFrame'>
17    <class 'pandas.core.frame.DataFrame'>
18    <class 'pandas.core.frame.DataFrame'>
19    <class 'pandas.core.frame.DataFrame'>
20    <class 'pandas.core.frame.DataFrame'>
21    <class 'pandas.core.frame.DataFrame'>
22    <class 'pandas.core.frame.

#### The length of the dataframe

In [10]:
%%time
# Total row count across all partitions. This triggers a full computation
# over the data (see the wall time below), unlike the lazy `ddf` display.
len(ddf)

CPU times: total: 20.7 s
Wall time: 1min 20s


13307918

In [11]:
%%time
ddf_rows = ddf.shape[0].compute()
ddf_col = ddf.shape[1]
print(f"Number of rows: {ddf_rows}")
print(f"Number of columns: {ddf_col}")

Number of rows: 13307918
Number of columns: 26
CPU times: total: 22.6 s
Wall time: 1min 20s


In [12]:
%%time
(ddf.isna().sum().compute() / ddf_rows) * 100

CPU times: total: 29.7 s
Wall time: 2min 11s


id                         0.000000
state                      0.000000
stop_date                  0.000000
stop_time                  0.000000
location_raw               0.000751
county_name                0.000751
county_fips                0.000751
fine_grained_location      0.001390
police_department        100.000000
driver_gender              0.002239
driver_age_raw           100.000000
driver_age               100.000000
driver_race_raw            0.001796
driver_race                2.957750
violation_raw              0.006244
violation                  0.006244
search_conducted           0.000000
search_type_raw           98.170495
search_type               98.170495
contraband_found           0.000000
stop_outcome               0.000000
is_arrested              100.000000
lat                       16.357254
lon                       16.354031
officer_id                 0.003103
driver_race_original       6.084941
dtype: float64