# Transformers4Rec: preprocessing 

## Installazione delle librerie cuDF e Transformers4Rec

In [None]:
# Print the Python version of the runtime (shell command).
!python --version

Python 3.8.10


In [None]:
pip install -U pip

In [None]:
!pip install cudf-cu11==22.12 dask-cudf-cu11==22.12 --extra-index-url=https://pypi.nvidia.com/.
!rm -rf /usr/local/lib/python3.8/dist-packages/cupy*
!pip install cuml-cu11 --extra-index-url=https://pypi.nvidia.com
!pip install cugraph-cu11 --extra-index-url=https://pypi.nvidia.com

In [None]:
!pip install transformers4rec[pytorch,nvtabular]

## Preprocessing




In questa fase di preprocessing si utilizza un dataset eCommerce proveniente da REES46 Marketing Platform. Ogni riga rappresenta un evento, cioè un'interazione tra un utente e un prodotto: nel complesso gli eventi costituiscono un'associazione molti-a-molti tra prodotti e utenti.



In [None]:
import os
import numpy as np 
import gc
import shutil
import glob

import dask_cudf
import cudf
import nvtabular as nvt
from merlin.dag import ColumnSelector
from merlin.schema import Schema, Tags
import pandas as pd 
from nvtabular.ops import Operator

Lettura del file .csv tramite cuDF.

In [None]:
# Read the October 2019 event log with cuDF and preview the first rows.
csv_path = "/content/drive/MyDrive/2019-Oct.csv"
raw_df = cudf.read_csv(csv_path)
raw_df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


In [None]:
# Number of rows and columns of the raw event frame, as a (rows, columns) tuple.
raw_df.shape

(42448764, 9)

In [None]:
# Keep only the events that carry a session identifier, then show how many remain.
has_session = raw_df['user_session'].notnull()
raw_df = raw_df[has_session]
len(raw_df)

42448762

## Converting time stamp to datetime




In [None]:
# Parse the textual event time to second-resolution datetimes, and also keep it
# as an integer timestamp (cast of the datetime column to int).
event_dt = raw_df['event_time'].astype('datetime64[s]')
raw_df['event_time_dt'] = event_dt
raw_df['event_time_ts'] = event_dt.astype('int')

In [None]:
# The original string column is no longer needed once the two derived columns exist.
raw_df = raw_df.drop(columns=['event_time'])

In [None]:
# Preview the first ten rows, now including the derived datetime and timestamp columns.
raw_df.head(10)

Unnamed: 0,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_time_dt,event_time_ts
0,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c,2019-10-01 00:00:00,1569888000
1,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,2019-10-01 00:00:00,1569888000
2,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8,2019-10-01 00:00:01,1569888001
3,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713,2019-10-01 00:00:01,1569888001
4,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d,2019-10-01 00:00:04,1569888004
5,view,1480613,2053013561092866779,computers.desktop,pulser,908.62,512742880,0d0d91c2-c9c2-4e81-90a5-86594dec0db9,2019-10-01 00:00:05,1569888005
6,view,17300353,2053013553853497655,,creed,380.96,555447699,4fe811e9-91de-46da-90c3-bbd87ed3a65d,2019-10-01 00:00:08,1569888008
7,view,31500053,2053013558031024687,,luminarc,41.16,550978835,6280d577-25c8-4147-99a7-abc6048498d6,2019-10-01 00:00:08,1569888008
8,view,28719074,2053013565480109009,apparel.shoes.keds,baden,102.71,520571932,ac1cd4e5-a3ce-4224-a2d7-ff660a105880,2019-10-01 00:00:10,1569888010
9,view,1004545,2053013555631882655,electronics.smartphone,huawei,566.01,537918940,406c46ed-90a4-4787-a43b-59a410c1a5fb,2019-10-01 00:00:11,1569888011


In [None]:
# Earliest event datetime in the dataset (lower bound of the covered period).
raw_df['event_time_dt'].min()

numpy.datetime64('2019-10-01T00:00:00')

In [None]:
# Latest event datetime in the dataset (upper bound of the covered period).
raw_df['event_time_dt'].max()

numpy.datetime64('2019-10-31T23:59:59')

Al fine di poter evitare errori nella successiva fase di raggruppamento per sessione, si converte il campo user_session in int.

In [None]:
# Every column except `user_session`; these are passed through the workflow unchanged.
cols = [name for name in raw_df.columns if name != 'user_session']
cols

['event_type',
 'product_id',
 'category_id',
 'category_code',
 'brand',
 'price',
 'user_id',
 'event_time_dt',
 'event_time_ts']

Si utilizza nello specifico l'operazione **Categorify** di NVTabular, necessaria per trasformare le feature categoriche in valori interi univoci.
La classe **Workflow** applica un grafo di operazioni sul dataset, utile per effettuare feature engineering e operazioni di preprocessing. Questa classe espone API simili a quelle dei transformer di scikit-learn: con *fit* si calcolano le statistiche sul dataset e successivamente con *transform* si trasforma il dataset applicando tali statistiche.



In [None]:

# Workflow node that encodes `user_session` with Categorify (integer ids).
session_cols = ['user_session']
cat_feats = session_cols >> nvt.ops.Categorify()

# Wrap the cuDF frame in an NVTabular dataset so that a workflow can consume it.
df_event = nvt.Dataset(raw_df)


In [None]:
# Initialize the workflow and execute it
workflow = nvt.Workflow(cols + cat_feats)
workflow.fit(df_event)
df = workflow.transform(df_event).to_ddf().compute()

In [None]:
# Preview the transformed frame: `user_session` now holds integer ids.
df.head(10)

Unnamed: 0,user_session,event_type,product_id,category_id,category_code,brand,price,user_id,event_time_dt,event_time_ts
0,5126085,view,44600062,2103807459595387724,,shiseido,35.79,541312140,2019-10-01 00:00:00,1569888000
1,7854470,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,2019-10-01 00:00:00,1569888000
2,730655,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,2019-10-01 00:00:01,1569888001
3,1637332,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,2019-10-01 00:00:01,1569888001
4,4202155,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,2019-10-01 00:00:04,1569888004
5,1808164,view,1480613,2053013561092866779,computers.desktop,pulser,908.62,512742880,2019-10-01 00:00:05,1569888005
6,6995770,view,17300353,2053013553853497655,,creed,380.96,555447699,2019-10-01 00:00:08,1569888008
7,3794756,view,31500053,2053013558031024687,,luminarc,41.16,550978835,2019-10-01 00:00:08,1569888008
8,5470852,view,28719074,2053013565480109009,apparel.shoes.keds,baden,102.71,520571932,2019-10-01 00:00:10,1569888010
9,2858777,view,1004545,2053013555631882655,electronics.smartphone,huawei,566.01,537918940,2019-10-01 00:00:11,1569888011


In [None]:
# Release the notebook's reference to the raw frame, then run the garbage collector.
# Rebinding the name to None before `del` keeps the cell safe to re-run:
# `del` alone would raise NameError once the name is gone.
# NOTE(review): `df_event` was built from `raw_df` and may still reference the
# same data, in which case the memory is not actually freed here - confirm.
raw_df = None
del(raw_df)
gc.collect()


69

In [None]:
# Order the events by session and, within each session, by timestamp; renumber the rows.
sort_keys = ['user_session', 'event_time_ts']
df = df.sort_values(by=sort_keys).reset_index(drop=True)


In [None]:
# Only the integer timestamp is kept from here on; drop the datetime column.
df = df.drop(columns=['event_time_dt'])

In [None]:
# Show the concrete type of `df` (the result of `.compute()` on the workflow output).
type(df)

cudf.core.dataframe.DataFrame

## Rimozione delle interazioni consecutive


Si eliminano le interazioni consecutive ripetute sullo stesso prodotto all'interno di una sessione, dovute probabilmente all'aggiornamento della scheda del browser o a diversi tipi di interazione sullo stesso prodotto (es. click, add-to-cart, purchase).

In [None]:

print("Count with in-session repeated interactions: {}".format(len(df)))

# Product and session of the previous row; the first row has no predecessor and gets 0.
prev_product = df['product_id'].shift(1).fillna(0)
prev_session = df['user_session'].shift(1).fillna(0)

# A row is a repeat when both its session and its product equal those of the row before it.
is_repeat = (df['user_session'] == prev_session) & (df['product_id'] == prev_product)
df = df[~is_repeat]
print("Count after removed in-session repeated interactions: {}".format(len(df)))

# Drop the temporaries before collecting garbage.
del prev_product, prev_session, is_repeat

gc.collect()
df.head(10)


Count with in-session repeated interactions: 42448762
Count after removed in-session repeated interactions: 30733301


Unnamed: 0,user_session,event_type,product_id,category_id,category_code,brand,price,user_id,event_time_ts
0,1,view,12712529,2053013553559896355,,hankook,70.79,513605798,1569900208
1,1,view,12702204,2053013553559896355,,bridgestone,72.07,513605798,1569900523
2,1,view,12718922,2053013553559896355,,dunlop,72.59,513605798,1569900702
4,1,view,12711730,2053013553559896355,,goodride,45.45,513605798,1569902687
5,1,view,12708497,2053013553559896355,,nitto,68.21,513605798,1569902730
6,1,view,5801483,2053013553945772349,electronics.audio.subwoofer,pioneer,58.43,513605798,1569903169
8,1,view,5800140,2053013553945772349,electronics.audio.subwoofer,sony,41.19,513605798,1569903199
10,1,view,5801322,2053013553945772349,electronics.audio.subwoofer,pioneer,90.09,513605798,1569903246
12,1,view,5800827,2053013553945772349,electronics.audio.subwoofer,mystery,172.46,513605798,1569903266
14,1,view,5800380,2053013553945772349,electronics.audio.subwoofer,hertz,62.65,513605798,1569903298


### Feature temporale: product first time seen
Si calcola la feature temporale che indica il timestamp in cui un item è stato visto per la prima volta.

In [None]:
# Earliest event timestamp observed for each product id.
item_first_interaction_df = (
    df.groupby('product_id')
    .agg({'event_time_ts': 'min'})
    .reset_index()
    .rename(columns={'event_time_ts': 'prod_first_event_time_ts'})
)

# Run the collector before the preview: a notebook cell only displays its last
# expression, so the preview has to come last to be shown.
gc.collect()
item_first_interaction_df.head()

0

In [None]:
# Attach each product's first-seen timestamp to every event (left join on product_id).
# NOTE(review): the merged frame does not appear to keep the earlier session/time
# row order - confirm whether later steps rely on it.
df = (
    df.merge(item_first_interaction_df, how='left', on='product_id')
    .reset_index(drop=True)
)

In [None]:
# Preview the merged frame, which now carries `prod_first_event_time_ts`.
df.head()

Unnamed: 0,user_session,event_type,product_id,category_id,category_code,brand,price,user_id,event_time_ts,prod_first_event_time_ts
0,152,view,1801539,2053013554415534427,electronics.video.tv,lg,419.24,514971132,1570986064,1569898645
1,152,view,1801826,2053013554415534427,electronics.video.tv,toshiba,359.6,514971132,1570986081,1569898007
2,152,view,1801539,2053013554415534427,electronics.video.tv,lg,419.24,514971132,1570986090,1569898645
3,152,view,1801881,2053013554415534427,electronics.video.tv,samsung,496.4,514971132,1570986093,1569894423
4,152,view,1801539,2053013554415534427,electronics.video.tv,lg,419.24,514971132,1570986100,1569898645


In [None]:
# Row count after the merge; a left join on one-row-per-product data should leave it unchanged.
len(df)

30733301

Si salva il risultato del preprocessing in un file .parquet

In [None]:
# Save the preprocessed dataframe as a Parquet file on disk.
output_dir = "/content/drive/MyDrive/dataset_rees46"
# Create the destination folder when it is missing so the write does not fail.
os.makedirs(output_dir, exist_ok=True)
df.to_parquet(os.path.join(output_dir, '2019-Oct.parquet'))
