# Data Wrangling

In [None]:
# Import packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import helper
import importlib
_ = importlib.reload(helper)


Let's read the data into a [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) so that we can begin to understand it.

*Note, we'll set `error_bad_lines=False` when reading the file in as there appear to be a very small number of records which would create a problem otherwise.*

In [2]:
dataset_name = 'book-crossing'
df = helper.get_interactions(dataset_name)
df.head()

In [2]:
dataset_name = 'book-crossing'
df_cleaned = helper.get_interactions(dataset_name, filename="interactions_cleaned.csv.gz")
df_cleaned.head()

Unnamed: 0,USER_ID,ITEM_ID,RATING
0,276746,425115801,0.0
1,277427,425115801,0.0
2,1660,425115801,0.0
3,2288,425115801,5.0
4,4938,425115801,4.5


Next, we'll number each user and item, giving them their own sequential index.  This will allow us to hold the information in a sparse format where the sequential indices indicate the row and column in our ratings matrix.

In [3]:
users = df['USER_ID'].value_counts()
items = df['ITEM_ID'].value_counts()

In [4]:
user_index = pd.DataFrame({'USER_ID': users.index, 'user_idx': np.arange(users.shape[0])})

item_index = pd.DataFrame({'ITEM_ID': items.index, 
                              'item_idx': np.arange(items.shape[0])})


df_cleaned = df_cleaned.merge(user_index).merge(item_index)
df_cleaned

Unnamed: 0,USER_ID,ITEM_ID,RATING,user_idx,item_idx
0,276746,0425115801,0.0,18642,445
1,277427,0425115801,0.0,290,445
2,1660,0425115801,0.0,12603,445
3,2288,0425115801,5.0,4136,445
4,4938,0425115801,4.5,2493,445
...,...,...,...,...,...
582560,219386,2070415732,0.0,18326,34046
582561,40043,3423125195,4.0,6985,34462
582562,61114,8845912957,0.0,16625,26299
582563,105423,8881123320,0.0,13331,26803


### Prepare

Let's start by splitting in training and test sets.  This will allow us to estimate the model's accuracy on videos our customers rated, but wasn't included in our training.

In [5]:
df_test = df_cleaned.groupby('USER_ID').last().reset_index()
df_test


Unnamed: 0,USER_ID,ITEM_ID,RATING,user_idx,item_idx
0,8,1558746218,0.0,7988,22431
1,17,0684823802,0.0,16659,17487
2,44,0440223571,4.0,19851,1687
3,53,0060914068,1.5,16838,16741
4,69,0312970242,0.0,18071,26480
...,...,...,...,...,...
21891,278782,0553408453,3.5,21586,17983
21892,278838,0441735762,0.0,17144,4374
21893,278843,0670879835,0.0,2742,16291
21894,278851,0553277375,0.0,6474,10810


In [6]:
df_train = df_cleaned.merge(df_test[['USER_ID', 'ITEM_ID']], 
                            on=['USER_ID', 'ITEM_ID'], 
                            how='outer', 
                            indicator=True)
df_train = df_train[(df_train['_merge'] == 'left_only')]

df_train


Unnamed: 0,USER_ID,ITEM_ID,RATING,user_idx,item_idx,_merge
0,276746,0425115801,0.0,18642,445,left_only
1,277427,0425115801,0.0,290,445,left_only
2,1660,0425115801,0.0,12603,445,left_only
3,2288,0425115801,5.0,4136,445,left_only
4,4938,0425115801,4.5,2493,445,left_only
...,...,...,...,...,...,...
582485,159687,8845915913,0.0,8045,21980,left_only
582504,136908,8804497033,2.5,5219,26182,left_only
582508,20871,880450577X,3.5,2231,32935,left_only
582527,171286,8807813998,3.5,4400,34405,left_only


Now, we can convert our Pandas DataFrames into MXNet NDArrays, use those to create a member of the SparseMatrixDataset class, and add that to an MXNet Data Iterator.  This process is the same for both test and control.

In [7]:
%store user_index
%store item_index
%store df_train
%store df_test

Stored 'user_index' (DataFrame)
Stored 'item_index' (DataFrame)
Stored 'df_train' (DataFrame)
Stored 'df_test' (DataFrame)


In [None]:
users['age_desc'] = users['age'].apply(lambda x: AGES[x])
users['occ_desc'] = users['occupation'].apply(lambda x: OCCUPATIONS[x])
print(len(users), 'descriptions of', max_userid, 'users loaded.')

In [57]:
# Save into users.csv
df_users_reg.to_csv(USERS_REG_CSV_FILE, 
             sep=',', 
             header=True, 
             encoding='latin-1',
             columns=['USER_ID', 'jobfunc', 'country', 'role'])
print('Saved to', USERS_REG_CSV_FILE)

Saved to users_reg.csv


In [81]:
# Read the Ratings File
import numpy as np
df_interactions = pd.read_csv(os.path.join(RAW_DATA_DIR, INTERACT_DATA_FILE), 
                    sep=',', 
                    engine='python', 
                    encoding='latin-1',
                    header=0,
                    names=['USER_ID', 'ITEM_ID', 'TIMESTAMP', 'event_type', 'event_value'])

df_interactions = df_interactions.fillna(method='ffill')
df_interactions.TIMESTAMP = df_interactions.TIMESTAMP.astype('int32')
df_interactions

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,event_type,event_value
0,docu-173226,Prod-181146,1475248352,RFI,4
1,docu-173226,Prod-594273,1475248547,RFI,4
2,docu-173226,Prod-553244,1475248777,RFI,4
3,docu-173226,Prod-512376,1475248828,RFI,4
4,docu-173226,Prod-35859,1475248950,RFI,4
...,...,...,...,...,...
9450314,docu-201094,Prod-483061,1600787512,RFI,4
9450315,docu-201094,Prod-250002,1600787512,RFI,4
9450316,docu-201094,Prod-600148,1600787512,RFI,4
9450317,docu-110539,Prod-583985,1600787512,RFI,4


In [82]:
df_interactions = df_interactions.loc[df_interactions['event_type']=='Page_Visit']
df_interactions

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,event_type,event_value
157,06jc13b905hvwha820,Prod-781752,1588799001,Page_Visit,1
158,06aayj9805ap16yx20,Prod-58541,1588799043,Page_Visit,1
159,07u9qw38052qx39e20,Prod-976899,1588799067,Page_Visit,1
160,06gc50z205nyaxx220,Prod-1004637,1588799071,Page_Visit,1
161,07u9qw38052qx39e20,Prod-976899,1588799084,Page_Visit,1
...,...,...,...,...,...
9450249,221iliir09jo28ha20,Prod-245816,1600787450,Page_Visit,1
9450250,22eb9h1509rbbg8j20,Prod-1249083,1600787456,Page_Visit,1
9450251,22eb9h1509rbbg8j20,Prod-1254738,1600787474,Page_Visit,1
9450252,22c3o1tn09gauy5520,Prod-503268,1600787507,Page_Visit,1


In [83]:
# Read the Ratings File
df_users_unreg = df_interactions.loc[~df_interactions.USER_ID.isin(df_users_reg.USER_ID), ['USER_ID']]

df_users_unreg

Unnamed: 0,USER_ID
157,06jc13b905hvwha820
158,06aayj9805ap16yx20
159,07u9qw38052qx39e20
160,06gc50z205nyaxx220
161,07u9qw38052qx39e20
...,...
9450249,221iliir09jo28ha20
9450250,22eb9h1509rbbg8j20
9450251,22eb9h1509rbbg8j20
9450252,22c3o1tn09gauy5520


In [84]:
# Save into users.csv
df_users_unreg.to_csv(USERS_UNREG_CSV_FILE, 
             sep=',', 
             header=True, 
             encoding='latin-1',
             columns=['USER_ID'])
print('Saved to', USERS_UNREG_CSV_FILE)


Saved to users_unreg.csv


In [85]:
df_interactions_reg = df_interactions.loc[df_interactions.USER_ID.isin(df_users_reg.USER_ID)]
df_interactions_reg

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,event_type,event_value
982,docu-201466,Prod-909983,1588819197,Page_Visit,1
1051,docu-185788,Prod-962677,1588821731,Page_Visit,1
1056,docu-185788,Prod-1243937,1588821802,Page_Visit,1
1057,docu-185788,Prod-962677,1588821805,Page_Visit,1
1089,docu-195395,Prod-518868,1588822541,Page_Visit,1
...,...,...,...,...,...
9450068,docu-180179,Prod-1041055,1600784219,Page_Visit,1
9450133,docu-227020,Prod-817156,1600785021,Page_Visit,1
9450159,docu-227020,Prod-817156,1600785126,Page_Visit,1
9450196,docu-180179,Prod-971514,1600785536,Page_Visit,1


In [None]:
# Set max_userid to the maximum user_id in the ratings
max_userid = df_interactions['user_id'].drop_duplicates().max()
# Set max_movieid to the maximum movie_id in the ratings
max_movieid = df_interactions['item_id'].drop_duplicates().max()

# Process ratings dataframe for Keras Deep Learning model
# Add user_emb_id column whose values == user_id - 1
ratings['user_emb_id'] = ratings['user_id'] - 1
# Add movie_emb_id column whose values == movie_id - 1
ratings['movie_emb_id'] = ratings['movie_id'] - 1

print(len(ratings), 'ratings loaded')

In [86]:
# Save into ratings.csv
df_interactions_reg.to_csv(INTERACT_REG_CSV_FILE, 
               sep=',', 
               header=True, 
               encoding='latin-1', 
               columns=['USER_ID', 'ITEM_ID', 'TIMESTAMP', 'event_type', 'event_value'])
print('Saved to', INTERACT_REG_CSV_FILE)

Saved to interactions_reg.csv


In [87]:
df_interactions_unreg = df_interactions.loc[df_interactions.USER_ID.isin(df_users_unreg.USER_ID)]
df_interactions_unreg

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,event_type,event_value
157,06jc13b905hvwha820,Prod-781752,1588799001,Page_Visit,1
158,06aayj9805ap16yx20,Prod-58541,1588799043,Page_Visit,1
159,07u9qw38052qx39e20,Prod-976899,1588799067,Page_Visit,1
160,06gc50z205nyaxx220,Prod-1004637,1588799071,Page_Visit,1
161,07u9qw38052qx39e20,Prod-976899,1588799084,Page_Visit,1
...,...,...,...,...,...
9450249,221iliir09jo28ha20,Prod-245816,1600787450,Page_Visit,1
9450250,22eb9h1509rbbg8j20,Prod-1249083,1600787456,Page_Visit,1
9450251,22eb9h1509rbbg8j20,Prod-1254738,1600787474,Page_Visit,1
9450252,22c3o1tn09gauy5520,Prod-503268,1600787507,Page_Visit,1


In [88]:
# Save into ratings.csv
df_interactions_unreg.to_csv(INTERACT_UNREG_CSV_FILE, 
               sep=',', 
               header=True, 
               encoding='latin-1', 
               columns=['USER_ID', 'ITEM_ID', 'TIMESTAMP', 'event_type', 'event_value'])
print('Saved to', INTERACT_UNREG_CSV_FILE)

Saved to interactions_unreg.csv


In [89]:
# Read the Movies File
df_items = pd.read_csv(os.path.join(RAW_DATA_DIR, ITEM_DATA_FILE), 
                    sep=',', 
                    engine='python', 
                    encoding='latin-1',
                    header=0,
                    names=['ITEM_ID', 'desc', 'TIMESTAMP', 'client_status'])

df_items = df_items.fillna(method='bfill')
df_items.TIMESTAMP = df_items.TIMESTAMP.astype('int32')
df_items

Unnamed: 0,ITEM_ID,desc,TIMESTAMP,client_status
0,prod-100069,Plant Extracts,1440547200,Informa Basic
1,prod-100070,Plant Extracts,1440547200,Informa Basic
2,prod-100071,Plant Extracts,1440547200,Informa Basic
3,prod-100072,Plant Extracts,1440547200,Informa Basic
4,prod-100073,Plant Extracts,1440547200,Informa Basic
...,...,...,...,...
552770,prod-99959,,1440547200,Informa Basic
552771,prod-99960,,1440547200,Informa Basic
552772,prod-99961,,1440547200,Informa Basic
552773,prod-99962,,1440547200,Informa Basic


In [90]:
print(len(df_items), 'descriptions of item loaded.')

552775 descriptions of item loaded.


In [91]:
# Save into movies.csv
df_items.to_csv(ITEMS_CSV_FILE, 
              sep=',', 
              header=True, 
              columns=['ITEM_ID'])
print('Saved to', ITEMS_CSV_FILE)

Saved to items.csv
