#### GPU device information

In [27]:
# Shell escape: print the driver/CUDA versions and current GPU memory usage.
!nvidia-smi

Fri Jan 22 17:47:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 455.23.05    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:89:00.0 Off |                    0 |
| N/A   32C    P0    55W / 300W |   7617MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

---

In [7]:
from cudf import read_csv, read_parquet, read_orc, DataFrame, concat
import numpy as np

In [8]:
# Warm up the GPU: build a tiny throwaway frame so start-up cost is paid here
# rather than inside the timed cells. `df` is reassigned by the first join later on.
df = DataFrame({'a':range(10)})

---
#### Reading the cal_dim table from `/data/calendar.csv` stored in csv format

In [9]:
%%time
cal = read_csv('/data/calendar.csv')
cal['cal_dt'] = cal['cal_dt'].astype('datetime64[ms]')
cal['ly_date'] = cal['ly_date'].astype('datetime64[ms]')
cal['ly_comp_visit_dt'] = cal['ly_comp_visit_dt'].astype('datetime64[ms]')

CPU times: user 14.4 ms, sys: 10.5 ms, total: 25 ms
Wall time: 22 ms


In [10]:
# Preview the first rows of the calendar table after the datetime casts.
cal.head()

Unnamed: 0,cal_dt,ly_date,ly_comp_visit_dt,wm_yr_wk_nbr
0,2017-04-11,2016-04-11,2016-04-12,201711


In [11]:
# Unpivot the calendar: for each indicator, emit (cal_dt, cal_ind, join_date)
# where join_date is taken from the calendar column paired with that indicator.
indicator_columns = [('C', 'cal_dt'), ('L', 'ly_date'), ('R', 'ly_comp_visit_dt')]

indicator_frames = []
for indicator, source_col in indicator_columns:
    indicator_frames.append(
        DataFrame({
            'cal_dt': cal['cal_dt'],
            'cal_ind': indicator,
            'join_date': cal[source_col],
        })
    )

# Stack the per-indicator frames; the original calendar index is kept as-is.
join_dates = concat(indicator_frames)
join_dates.head()

Unnamed: 0,cal_dt,cal_ind,join_date
0,2017-04-11,C,2017-04-11
0,2017-04-11,L,2016-04-11
0,2017-04-11,R,2016-04-12


---
#### Reading the store_visit_scan table from `/data/store_visit_scan_mask.parquet` stored in parquet format

In [12]:
%%time
svs = read_parquet('/data/store_visit_scan_mask.parquet')
svs['load_ts'] = svs['load_ts'].astype('datetime64[ms]')
svs['visit_dt'] = svs['visit_dt'].astype('datetime64[ms]')

CPU times: user 43.4 ms, sys: 87.4 ms, total: 131 ms
Wall time: 128 ms


In [13]:
# Report the (rows, columns) shape of the scan table and preview its first rows.
print("shape : ", svs.shape)
svs.head()

shape :  (6599219, 8)


Unnamed: 0,visit_nbr,store_nbr,scan_seq_nbr,scan_id,scan_type,load_ts,visit_dt,op_cmpny_cd
0,710103784,1099,8,877534402,0,2017-06-23 12:46:35,2017-04-11,WMT-US
1,710104362,1999,7,867174169,0,2017-06-23 12:46:35,2017-04-11,WMT-US
2,710101771,17377,6,738523462,0,2017-06-23 12:46:35,2017-04-11,WMT-US
3,710103327,9700,10,892230403,0,2017-06-23 12:46:35,2017-04-11,WMT-US
4,710101361,8737,4,851258830,0,2017-06-23 12:46:35,2017-04-11,WMT-US


---
#### Reading the store_visit table from `/data/store_visit_mask.parquet` stored in parquet format

In [14]:
%%time
vst = read_parquet('/data/store_visit_mask.parquet')
vst['visit_dt'] = vst['visit_dt'].astype('datetime64[ms]')
vst['visit_hr'] = (vst.visit_tm / 100).astype('i2')
vst['visit_min'] = (vst.visit_tm%100).astype('i2')
vst = vst.drop(['visit_tm'], axis=1)

CPU times: user 3.98 s, sys: 137 ms, total: 4.12 s
Wall time: 4.12 s


In [15]:
# Report the (rows, columns) shape of the visit table and preview its first rows.
print("shape : ", vst.shape)
vst.head()

shape :  (7189981, 6)


Unnamed: 0,store_nbr,visit_nbr,visit_dt,op_cmpny_cd,visit_hr,visit_min
0,11680,710100004,2017-04-11,WMT-US,0,1
1,16237,710100004,2017-04-11,WMT-US,0,1
2,17365,710100004,2017-04-11,WMT-US,0,2
3,5071,710100005,2017-04-11,WMT-US,0,3
4,5443,710100005,2017-04-11,WMT-US,0,3


---
### Performing inner join between store_visit_scan and store_visit

In [16]:
%%time
# merging the scan and visite tables
df = svs.merge(vst, on=['visit_dt', 'store_nbr', 'visit_nbr'], how='inner')
df = df.drop_duplicates()

CPU times: user 28.1 ms, sys: 105 ms, total: 133 ms
Wall time: 130 ms


In [17]:
# Report the (rows, columns) shape of the scan/visit join and preview its first rows.
print("shape : ", df.shape)
df.head()

shape :  (2338031, 11)


Unnamed: 0,visit_nbr,store_nbr,scan_seq_nbr,scan_id,scan_type,load_ts,visit_dt,op_cmpny_cd_x,op_cmpny_cd_y,visit_hr,visit_min
1372954,710100013,31,1,926596021,0,2017-06-23 12:10:28,2017-04-11,WMT-US,WMT-US,0,11
1372953,710100013,31,2,885408808,0,2017-06-23 12:10:27,2017-04-11,WMT-US,WMT-US,0,11
1372952,710100013,31,3,247820944,0,2017-06-23 12:10:27,2017-04-11,WMT-US,WMT-US,0,11
620456,710100013,106,1,399520165,0,2017-06-23 12:09:44,2017-04-11,WMT-US,WMT-US,0,21
1000526,710100013,115,1,846065128,0,2017-06-23 12:10:10,2017-04-11,WMT-US,WMT-US,0,8


---
#### Reading the item_dim table from `/data/item_dim_mask.orc` stored in orc format

In [18]:
%%time
# Load the item dimension from its ORC file.
item = read_orc('/data/item_dim_mask.orc')

CPU times: user 203 ms, sys: 233 ms, total: 436 ms
Wall time: 435 ms


In [19]:
# Report the (rows, columns) shape of the item dimension and preview its first rows.
print("shape : ", item.shape)
item.head()

shape :  (34110184, 4)


Unnamed: 0,mds_fam_id,item_nbr,upc_nbr,dept_nbr
0,-905,-6989,-1994.0,8
1,-1787,-6989,-1994.0,400
2,-1778,-6989,-1994.0,396
3,-1769,-6989,-1994.0,392
4,-1760,-6989,-1994.0,388


---
### Performing inner join with item_dim

In [20]:
%%time
df = df.rename(columns = {'scan_id':'mds_fam_id'})
df = df.merge(item, on=['mds_fam_id'], how='inner')

CPU times: user 379 ms, sys: 1.63 s, total: 2.01 s
Wall time: 2.02 s


In [21]:
# Report the (rows, columns) shape after the item join and preview its first rows.
print("shape : ", df.shape)
df.head()

shape :  (2319138, 14)


Unnamed: 0,visit_nbr,store_nbr,scan_seq_nbr,mds_fam_id,scan_type,load_ts,visit_dt,op_cmpny_cd_x,op_cmpny_cd_y,visit_hr,visit_min,item_nbr,upc_nbr,dept_nbr
0,710101582,16159,1,22138933,3,2017-06-23 12:13:08,2017-04-11,WMT-US,WMT-US,13,36,1810579880,4.0,104
1,710103603,238,16,30839692,3,2017-06-23 12:18:45,2017-04-11,WMT-US,WMT-US,19,10,19138949,8809889000.0,112
2,710100470,154,3,30492499,3,2017-06-23 12:32:01,2017-04-11,WMT-US,WMT-US,10,38,4309323,7200820000.0,28
3,710100470,6694,5,31950688,3,2017-06-23 12:31:56,2017-04-11,WMT-US,WMT-US,9,25,14200407,8661226000.0,84
4,710101285,2227,2,47745895,3,2017-06-23 13:03:41,2017-04-11,WMT-US,WMT-US,11,41,2184790409,4.0,128


In [22]:
%%time
df = df.rename(columns = {'visit_dt':'cal_dt'})
df = df.merge(join_dates, on=['cal_dt'], how='inner')

CPU times: user 19.7 ms, sys: 25.8 ms, total: 45.5 ms
Wall time: 42.6 ms


In [23]:
# Preview the joined frame, now carrying the cal_ind and join_date columns.
df.head()

Unnamed: 0,visit_nbr,store_nbr,scan_seq_nbr,mds_fam_id,scan_type,load_ts,cal_dt,op_cmpny_cd_x,op_cmpny_cd_y,visit_hr,visit_min,item_nbr,upc_nbr,dept_nbr,cal_ind,join_date
0,710101189,16495,52,942517732,0,2017-06-23 12:11:04,2017-04-11,WMT-US,WMT-US,12,10,3893860863,4499845000.0,32,C,2017-04-11
1,710101189,16495,53,942517732,0,2017-06-23 12:11:04,2017-04-11,WMT-US,WMT-US,12,10,3893860863,4499845000.0,32,C,2017-04-11
2,710101189,16495,54,942517732,0,2017-06-23 12:11:05,2017-04-11,WMT-US,WMT-US,12,10,3893860863,4499845000.0,32,C,2017-04-11
3,710101252,1093,5,942517732,0,2017-06-23 12:49:06,2017-04-11,WMT-US,WMT-US,12,4,3893860863,4499845000.0,32,C,2017-04-11
4,710101252,1093,6,942517732,0,2017-06-23 12:49:07,2017-04-11,WMT-US,WMT-US,12,4,3893860863,4499845000.0,32,C,2017-04-11


### Performing a groupby on store number to get the item counts

In [24]:
%%time
df_grpd = df.groupby('store_nbr').agg({'mds_fam_id':'count'})

CPU times: user 14 ms, sys: 8.03 ms, total: 22.1 ms
Wall time: 19.4 ms


In [25]:
# Give the aggregated column a name that says it holds a count, then display the
# per-store result (indexed by store_nbr).
df_grpd = df_grpd.rename(columns = {'mds_fam_id':'mds_fam_id_count'})
df_grpd

Unnamed: 0_level_0,mds_fam_id_count
store_nbr,Unnamed: 1_level_1
7,2307
10,2826
13,2100
16,2193
19,1587
...,...
22051,606
22075,540
22087,450
22090,477


---
### Top 5 stores by number of items sold on the given day

In [26]:
# Pick the five stores with the highest counts and show them as a one-column frame.
store_counts = df_grpd['mds_fam_id_count']
store_counts.nlargest(5).to_frame()

Unnamed: 0_level_0,mds_fam_id_count
store_nbr,Unnamed: 1_level_1
15883,5466
5221,5100
6277,5061
11389,4929
15901,4479


---
#### Total execution time (sum of the wall times reported by the `%%time` cells above):  
 = 22ms + 128ms + 4.12s + 130ms + 435ms + 2.02s + 42.6ms + 19.4ms  
 ≈ 6.9s