# Data Wrangling

In [1]:
# Import packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import helper
import importlib
_ = importlib.reload(helper)


Let's read the data into a [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) so that we can begin to understand it.

*Note, we'll set `error_bad_lines=False` when reading the file in as there appear to be a very small number of records which would create a problem otherwise.*

In [2]:
dataset_name = 'ml-latest-small'
df_interactions = helper.get_csv(dataset_name, filename="interactions.csv.gz")
df_interactions.head()


Unnamed: 0,USER_ID,ITEM_ID,RATING,TIMESTAMP
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
df_interactions_cleaned = helper.get_csv(dataset_name, filename="interactions_cleaned.csv.gz")
df_interactions_cleaned.head()


Unnamed: 0,USER_ID,ITEM_ID,RATING,TIMESTAMP
0,1,1,4.0,964982703
1,5,1,4.0,847434962
2,7,1,4.5,1106635946
3,15,1,2.5,1510577970
4,17,1,4.5,1305696483


Next, we'll number each user and item, giving them their own sequential index.  This will allow us to hold the information in a sparse format where the sequential indices indicate the row and column in our ratings matrix.

In [4]:
users = df_interactions['USER_ID'].value_counts()
items = df_interactions['ITEM_ID'].value_counts()


In [5]:
df_user_index = pd.DataFrame({'USER_ID': users.index, '_USER_IDX': np.arange(users.shape[0])})

df_item_index = pd.DataFrame({'ITEM_ID': items.index, 
                              '_ITEM_IDX': np.arange(items.shape[0])})
helper.put_csv(df_user_index, dataset_name, "user_index.csv.gz")
helper.put_csv(df_item_index, dataset_name, "item_index.csv.gz")


In [6]:
df_interactions_cleaned = df_interactions_cleaned.merge(df_user_index).merge(df_item_index)
df_interactions_cleaned


Unnamed: 0,USER_ID,ITEM_ID,RATING,TIMESTAMP,_USER_IDX,_ITEM_IDX
0,1,1,4.0,964982703,111,11
1,5,1,4.0,847434962,408,11
2,7,1,4.5,1106635946,172,11
3,15,1,2.5,1510577970,190,11
4,17,1,4.5,1305696483,238,11
...,...,...,...,...,...,...
94789,111,161580,4.5,1516153346,28,4233
94790,306,161580,3.5,1518380859,226,4233
94791,143,78316,1.0,1444881335,304,4238
94792,563,78316,3.5,1441846114,124,4238


### Prepare

Let's start by splitting in training and test sets.  This will allow us to estimate the model's accuracy on videos our customers rated, but wasn't included in our training.

In [7]:
df_test = df_interactions_cleaned.groupby('USER_ID').last().reset_index()
df_test


Unnamed: 0,USER_ID,ITEM_ID,RATING,TIMESTAMP,_USER_IDX,_ITEM_IDX
0,1,5060,5.0,964984002,111,499
1,2,86345,4.0,1445715166,507,2394
2,3,4518,5.0,1306463770,428,3709
3,4,1733,5.0,945079143,121,4024
4,5,597,3.0,847434962,408,61
...,...,...,...,...,...,...
605,606,4794,3.5,1171500721,8,4771
606,607,3109,4.0,964744724,140,3723
607,608,2107,2.5,1117506680,20,4820
608,609,731,4.0,847221025,442,4197


In [8]:
helper.put_csv(df_test, dataset_name, "interactions_test.csv.gz")



In [9]:
df_train = df_interactions_cleaned.merge(df_test[['USER_ID', 'ITEM_ID']], 
                            on=['USER_ID', 'ITEM_ID'], 
                            how='outer', 
                            indicator=True)
df_train = df_train[(df_train['_merge'] == 'left_only')]

df_train


Unnamed: 0,USER_ID,ITEM_ID,RATING,TIMESTAMP,_USER_IDX,_ITEM_IDX,_merge
0,1,1,4.0,964982703,111,11,left_only
1,5,1,4.0,847434962,408,11,left_only
2,7,1,4.5,1106635946,172,11,left_only
3,15,1,2.5,1510577970,190,11,left_only
4,17,1,4.5,1305696483,238,11,left_only
...,...,...,...,...,...,...,...
94780,111,71668,4.5,1516152773,28,4301,left_only
94781,563,71668,3.0,1446856757,124,4301,left_only
94783,125,156387,3.5,1476903871,72,4604,left_only
94785,125,155509,4.0,1476313973,72,4696,left_only


In [10]:
helper.put_csv(df_train, dataset_name, "interactions_train.csv.gz")

