# Data Wrangling

In [104]:
# Import packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Project-local helper module; its get_csv / put_csv functions are used below.
import helper
import importlib
# Reload helper so that edits to the module are picked up without restarting
# the kernel. Binding the result to `_` keeps the module repr out of the output.
_ = importlib.reload(helper)


Let's read the data into a [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) so that we can begin to understand it.

*Note: a very small number of records in the raw file are malformed and would otherwise abort the read, so the file has to be loaded with `error_bad_lines=False` (spelled `on_bad_lines='skip'` in pandas 1.3 and later).*

In [105]:
# Dataset identifier passed to the helper.get_csv / helper.put_csv calls below.
dataset_name = 'book-crossing'
# Load the raw interactions table through the project helper.
df_interactions = helper.get_csv(dataset_name, filename="interactions.csv.gz")
# Preview the first rows.
df_interactions.head()

Unnamed: 0,USER_ID,ITEM_ID,RATING
0,276725,034545104X,0.0
1,276726,0155061224,2.5
2,276727,0446520802,0.0
3,276729,052165615X,1.5
4,276729,0521795028,3.0


In [106]:
# Load the cleaned interactions table through the same helper; this is the
# frame the train/test split below is built from.
df_interactions_cleaned = helper.get_csv(dataset_name, filename="interactions_cleaned.csv.gz")
# Preview the first rows.
df_interactions_cleaned.head()

Unnamed: 0,USER_ID,ITEM_ID,RATING
0,276746,425115801,0.0
1,277427,425115801,0.0
2,1660,425115801,0.0
3,2288,425115801,5.0
4,4938,425115801,4.5


Next, we'll number each user and item, giving them their own sequential index.  This will allow us to hold the information in a sparse format where the sequential indices indicate the row and column in our ratings matrix.

In [107]:
# Interaction counts per user and per item, taken from the raw interactions.
# value_counts() orders the ids from most to least frequent.
users, items = (
    df_interactions[id_column].value_counts()
    for id_column in ('USER_ID', 'ITEM_ID')
)

In [110]:
# Give every id a sequential integer index (0 .. n-1), following the order of
# the value_counts() results held in `users` / `items`.
df_user_index = pd.DataFrame({
    'USER_ID': users.index,
    '_USER_IDX': np.arange(len(users)),
})
df_item_index = pd.DataFrame({
    'ITEM_ID': items.index,
    '_ITEM_IDX': np.arange(len(items)),
})

# Save both lookup tables through the project helper.
helper.put_csv(df_user_index, dataset_name, "user_index.csv.gz")
helper.put_csv(df_item_index, dataset_name, "item_index.csv.gz")


In [111]:
# Attach the sequential indices. With no `on=` given, each merge joins on the
# columns the two frames share, and the default inner join drops interactions
# whose user or item is missing from the index table.
df_interactions_cleaned = pd.merge(
    pd.merge(df_interactions_cleaned, df_user_index),
    df_item_index,
)
df_interactions_cleaned


Unnamed: 0,USER_ID,ITEM_ID,RATING,_USER_IDX,_ITEM_IDX
0,276746,0425115801,0.0,18642,445
1,277427,0425115801,0.0,290,445
2,1660,0425115801,0.0,12603,445
3,2288,0425115801,5.0,4136,445
4,4938,0425115801,4.5,2493,445
...,...,...,...,...,...
582560,264339,880781563X,5.0,8478,33328
582561,66085,880781563X,3.5,21004,33328
582562,61114,8845912957,0.0,16625,27179
582563,105423,8881123320,0.0,13331,26495


### Prepare

Let's start by splitting the data into training and test sets. We hold out one interaction per user as the test set, which will allow us to estimate the model's accuracy on books our users rated but that weren't included in training.

In [112]:
# Hold out one record per user as the test set: the last (non-null) values of
# each USER_ID group, in the frame's current row order. USER_ID stays a regular
# column and the result is ordered by USER_ID.
df_test = df_interactions_cleaned.groupby('USER_ID', as_index=False).last()
df_test


Unnamed: 0,USER_ID,ITEM_ID,RATING,_USER_IDX,_ITEM_IDX
0,8,1558746218,0.0,7988,23742
1,17,0684823802,0.0,16659,17526
2,44,0440223571,4.0,19851,1711
3,53,0156047624,5.0,16838,19175
4,69,0312970242,0.0,18071,25776
...,...,...,...,...,...
21891,278782,0553408453,3.5,21586,17077
21892,278838,0441735762,0.0,17144,4273
21893,278843,0670879835,0.0,2742,16185
21894,278851,0553277375,0.0,6474,10784


In [113]:
# Persist the held-out test interactions through the project helper.
helper.put_csv(df_test, dataset_name, "interactions_test.csv.gz")



In [114]:
# Training set = every interaction whose (USER_ID, ITEM_ID) pair is not in the
# test set. The merge indicator marks those rows 'left_only'; the `_merge`
# column itself is kept in the result.
df_train = (
    df_interactions_cleaned
    .merge(df_test[['USER_ID', 'ITEM_ID']],
           how='outer',
           on=['USER_ID', 'ITEM_ID'],
           indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
)

df_train


Unnamed: 0,USER_ID,ITEM_ID,RATING,_USER_IDX,_ITEM_IDX,_merge
0,276746,0425115801,0.0,18642,445,left_only
1,277427,0425115801,0.0,290,445,left_only
2,1660,0425115801,0.0,12603,445,left_only
3,2288,0425115801,5.0,4136,445,left_only
4,4938,0425115801,4.5,2493,445,left_only
...,...,...,...,...,...,...
582468,160414,387024674X,0.0,1842,13398,left_only
582469,160415,387024674X,0.0,1846,13398,left_only
582470,160416,387024674X,0.0,1827,13398,left_only
582531,171286,8817202266,0.0,4400,32077,left_only


In [115]:
# Persist the training interactions (including the `_merge` indicator column)
# through the project helper.
helper.put_csv(df_train, dataset_name, "interactions_train.csv.gz")



Now we persist the index tables and the train/test DataFrames with `%store`, so that they can be converted into MXNet NDArrays, used to create a member of the SparseMatrixDataset class, and added to an MXNet Data Iterator.  This process is the same for both the training and test sets.

In [14]:
%store user_index
%store item_index
%store df_train
%store df_test

Stored 'user_index' (DataFrame)
Stored 'item_index' (DataFrame)
Stored 'df_train' (DataFrame)
Stored 'df_test' (DataFrame)


In [None]:
# NOTE(review): this cell has no execution count and does not fit the cells
# above: `users` is assigned from value_counts() earlier, AGES and OCCUPATIONS
# are not defined in any visible cell, and max_userid is only assigned in a
# later cell. It looks like a leftover from another notebook — confirm before
# running.
# As written: look up each 'age' / 'occupation' value in AGES / OCCUPATIONS and
# store the results as 'age_desc' / 'occ_desc'.
users['age_desc'] = users['age'].apply(lambda x: AGES[x])
users['occ_desc'] = users['occupation'].apply(lambda x: OCCUPATIONS[x])
print(len(users), 'descriptions of', max_userid, 'users loaded.')

In [None]:
# Write the selected columns of df_users_reg to USERS_REG_CSV_FILE.
# NOTE(review): df_users_reg and USERS_REG_CSV_FILE are not defined in any
# visible cell — confirm where they come from.
df_users_reg.to_csv(
    USERS_REG_CSV_FILE,
    columns=['USER_ID', 'jobfunc', 'country', 'role'],
    header=True,
    sep=',',
    encoding='latin-1',
)
print('Saved to', USERS_REG_CSV_FILE)

In [None]:
# Read the raw interactions (event log) file.
# header=0 combined with names=[...] discards the file's own header row and
# applies the column names listed below.
import numpy as np
df_interactions = pd.read_csv(os.path.join(RAW_DATA_DIR, INTERACT_DATA_FILE), 
                    sep=',', 
                    engine='python', 
                    encoding='latin-1',
                    header=0,
                    names=['USER_ID', 'ITEM_ID', 'TIMESTAMP', 'event_type', 'event_value'])

# Forward-fill missing values. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling and produces the same result.
df_interactions = df_interactions.ffill()
# NOTE(review): int32 only fits second-resolution epoch values (up to 2038);
# confirm the unit of TIMESTAMP. A missing value in the first row survives the
# forward fill and would make this cast fail.
df_interactions.TIMESTAMP = df_interactions.TIMESTAMP.astype('int32')
df_interactions

In [None]:
# Keep only the rows whose event_type is exactly 'Page_Visit'.
df_interactions = df_interactions[df_interactions['event_type'].eq('Page_Visit')]
df_interactions

In [None]:
# USER_IDs that occur in the interactions but not in df_users_reg.
# The selection is made row by row on the interactions, so a USER_ID appears
# once per matching interaction (no de-duplication is done here).
df_users_unreg = df_interactions.loc[
    ~df_interactions['USER_ID'].isin(df_users_reg['USER_ID']),
    ['USER_ID'],
]

df_users_unreg

In [None]:
# Write the USER_ID column of df_users_unreg to USERS_UNREG_CSV_FILE.
df_users_unreg.to_csv(
    USERS_UNREG_CSV_FILE,
    columns=['USER_ID'],
    header=True,
    sep=',',
    encoding='latin-1',
)
print('Saved to', USERS_UNREG_CSV_FILE)


In [None]:
# Interactions whose USER_ID is present in df_users_reg.
df_interactions_reg = df_interactions[
    df_interactions['USER_ID'].isin(df_users_reg['USER_ID'])
]
df_interactions_reg

In [None]:
# NOTE(review): this cell does not match the rest of the notebook. It reads
# lowercase 'user_id' / 'item_id' / 'movie_id' columns, whereas the interaction
# frames above use 'USER_ID' / 'ITEM_ID', and `ratings` is not defined in any
# visible cell. It looks like a leftover from another notebook — confirm before
# running.
# Set max_userid to the maximum user_id in the ratings
max_userid = df_interactions['user_id'].drop_duplicates().max()
# Set max_movieid to the maximum movie_id in the ratings
max_movieid = df_interactions['item_id'].drop_duplicates().max()

# Process ratings dataframe for Keras Deep Learning model
# Add user_emb_id column whose values == user_id - 1
ratings['user_emb_id'] = ratings['user_id'] - 1
# Add movie_emb_id column whose values == movie_id - 1
ratings['movie_emb_id'] = ratings['movie_id'] - 1

# Report how many rows `ratings` holds.
print(len(ratings), 'ratings loaded')

In [None]:
# Write df_interactions_reg (interactions of users found in df_users_reg) to
# INTERACT_REG_CSV_FILE.
df_interactions_reg.to_csv(
    INTERACT_REG_CSV_FILE,
    columns=['USER_ID', 'ITEM_ID', 'TIMESTAMP', 'event_type', 'event_value'],
    header=True,
    sep=',',
    encoding='latin-1',
)
print('Saved to', INTERACT_REG_CSV_FILE)

In [None]:
# Interactions whose USER_ID is present in df_users_unreg.
df_interactions_unreg = df_interactions[
    df_interactions['USER_ID'].isin(df_users_unreg['USER_ID'])
]
df_interactions_unreg

In [None]:
# Write df_interactions_unreg (interactions of users found in df_users_unreg)
# to INTERACT_UNREG_CSV_FILE.
df_interactions_unreg.to_csv(
    INTERACT_UNREG_CSV_FILE,
    columns=['USER_ID', 'ITEM_ID', 'TIMESTAMP', 'event_type', 'event_value'],
    header=True,
    sep=',',
    encoding='latin-1',
)
print('Saved to', INTERACT_UNREG_CSV_FILE)

In [None]:
# Read the raw items file.
# header=0 combined with names=[...] discards the file's own header row and
# applies the column names listed below.
df_items = pd.read_csv(os.path.join(RAW_DATA_DIR, ITEM_DATA_FILE), 
                    sep=',', 
                    engine='python', 
                    encoding='latin-1',
                    header=0,
                    names=['ITEM_ID', 'desc', 'TIMESTAMP', 'client_status'])

# Backward-fill missing values. DataFrame.bfill() replaces the deprecated
# fillna(method='bfill') spelling and produces the same result.
df_items = df_items.bfill()
# NOTE(review): int32 only fits second-resolution epoch values (up to 2038);
# confirm the unit of TIMESTAMP. A missing value in the last row survives the
# backward fill and would make this cast fail.
df_items.TIMESTAMP = df_items.TIMESTAMP.astype('int32')
df_items

In [None]:
# Report how many rows the items frame holds.
print(len(df_items), 'descriptions of item loaded.')

In [None]:
# Write only the ITEM_ID column of df_items to ITEMS_CSV_FILE.
df_items.to_csv(
    ITEMS_CSV_FILE,
    columns=['ITEM_ID'],
    header=True,
    sep=',',
)
print('Saved to', ITEMS_CSV_FILE)