In [1]:
import pandas as pd
import pickle
import numpy as np

In [2]:
# Locations of the three '::'-delimited input files.
USERS_PATH = 'dataset/users.dat'
ITEMS_PATH = 'dataset/movies.dat'
RATINGS_PATH = 'dataset/ratings.dat'

# Process users---------------------------------------------------------------------
print("Processing users")
user_columns = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
users = (
    pd.read_csv(USERS_PATH, sep='::', engine='python', names=user_columns)
    # Treat the raw id as a categorical so each distinct id gets an integer code.
    .assign(user_id=lambda frame: frame['user_id'].astype('category'))
    .assign(user_id_num=lambda frame: frame['user_id'].cat.codes)
    # Only the raw id and its integer code are needed from here on.
    .loc[:, ['user_id', 'user_id_num']]
)
# Lookup table: raw user id -> integer code.
user_id_to_num = dict(zip(users['user_id'], users['user_id_num']))
print(users)

# Process items---------------------------------------------------------------
print("Processing items")
item_columns = ['movie_id', 'title', 'genres']
items = (
    # The movies file is read as latin1.
    pd.read_csv(ITEMS_PATH, sep='::', engine='python', encoding='latin1', names=item_columns)
    # Treat the raw id as a categorical so each distinct id gets an integer code.
    .assign(movie_id=lambda frame: frame['movie_id'].astype('category'))
    .assign(movie_id_num=lambda frame: frame['movie_id'].cat.codes)
    # Only the raw id and its integer code are needed from here on.
    .loc[:, ['movie_id', 'movie_id_num']]
)
# Lookup table: raw movie id -> integer code.
item_id_to_num = dict(zip(items['movie_id'], items['movie_id_num']))
print(items)

# Process ratings--------------------------------------------------------------------
print("Processing ratings")
rating_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(RATINGS_PATH, sep='::', engine='python', names=rating_columns)
# Attach the integer user/movie codes with inner joins, then discard the raw ids
# so each rating is identified only by user_id_num / movie_id_num.
ratings = (
    ratings
    .merge(users, how='inner', on='user_id')
    .merge(items, how='inner', on='movie_id')
    .drop(columns=['user_id', 'movie_id'])
)
print(ratings)

Processing users
     user_id  user_id_num
0          1            0
1          2            1
2          3            2
3          4            3
4          5            4
...      ...          ...
6035    6036         6035
6036    6037         6036
6037    6038         6037
6038    6039         6038
6039    6040         6039

[6040 rows x 2 columns]
Processing items
     movie_id  movie_id_num
0           1             0
1           2             1
2           3             2
3           4             3
4           5             4
...       ...           ...
3878     3948          3878
3879     3949          3879
3880     3950          3880
3881     3951          3881
3882     3952          3882

[3883 rows x 2 columns]
Processing ratings
         rating  timestamp  user_id_num  movie_id_num
0             5  978300760            0          1176
1             3  978302109            0           655
2             3  978301968            0           902
3             4  978300275       

In [3]:
# Save data----------------------------------------------------------------------------
# Persist the raw-id -> integer-code lookup tables. The context managers guarantee
# each file handle is flushed and closed, even if pickling raises part-way through.
with open('./dataset/user_id_to_num.pkl', 'wb') as user_map_file:
    pickle.dump(user_id_to_num, user_map_file)
with open('./dataset/item_id_to_num.pkl', 'wb') as item_map_file:
    pickle.dump(item_id_to_num, item_map_file)
# The ratings table is stored as a plain array, so column names are not kept;
# columns appear in the DataFrame's current column order.
np.save('./dataset/data.npy', ratings.values)
