In [30]:
import numpy as np
import pandas as pd 

In [31]:
df = pd.read_csv('archive/rating.csv')

In [32]:
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [33]:
df.drop(['timestamp'], axis=1,inplace = True)

In [34]:
df.head(5)

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [35]:
unique_movie = list(df['movieId'].unique())

In [36]:
movieid_idx_dict = {}

count = 0

for movie in unique_movie:
    movieid_idx_dict[movie] = count
    count+=1
    

In [37]:
df['movie_idx'] = df.apply(lambda x: movieid_idx_dict[x.movieId], axis = 1)

In [38]:
df.userId = df.userId - 1

In [39]:
df.head()

Unnamed: 0,userId,movieId,rating,movie_idx
0,0,2,3.5,0
1,0,29,3.5,1
2,0,32,3.5,2
3,0,47,3.5,3
4,0,50,3.5,4


# Shrinking Dataset

In [40]:
from collections import Counter

In [41]:
m = 10000
n = 2000

In [42]:
common_user = Counter(df.userId).most_common(m)
common_movies = Counter(df.movie_idx).most_common(n)

In [43]:
user_ids = [u for u, count in common_user] 
movie_ids = [m for m, count in common_movies] 

In [44]:
df_shrink = df[df.userId.isin(user_ids) & df.movie_idx.isin(movie_ids)].copy()

In [45]:
df_shrink.size

21568100

In [46]:
df_shrink.head()

Unnamed: 0,userId,movieId,rating,movie_idx
960,10,1,4.5,227
961,10,10,2.5,365
962,10,19,3.5,366
963,10,32,5.0,2
964,10,39,4.5,631


In [47]:
unique_user_ids = df_shrink.userId.unique()
unique_movie_ids = df_shrink.movie_idx.unique()

In [48]:
shrink_user_ids_dict = {} 
shrink_movie_ids_dict = {}

In [49]:
count = 0
for user in unique_user_ids:
    shrink_user_ids_dict[user] = count
    count+=1
count1= 0
for movie in unique_movie_ids:
    shrink_movie_ids_dict[movie] = count1
    count1+=1


In [50]:
df_shrink['userId'] = df_shrink.apply(lambda x: shrink_user_ids_dict[x.userId], axis = 1)


In [51]:

df_shrink['movie_idx'] = df_shrink.apply(lambda x: shrink_movie_ids_dict[x.movie_idx], axis = 1)

In [52]:
df_shrink.head()

Unnamed: 0,userId,movieId,rating,movie_idx
960,0,1,4.5,0
961,0,10,2.5,1
962,0,19,3.5,2
963,0,32,5.0,3
964,0,39,4.5,4


In [53]:
df_shrink.userId.max()

9999

In [54]:
df_shrink.to_csv("shrinked_rating.csv", index=False)

In [55]:
shuffled_df = df_shrink.sample(frac=1, random_state=42)


split_index = int(0.8 * len(shuffled_df))


train_df = shuffled_df[:split_index]
test_df = shuffled_df[split_index:]

In [56]:
print('Test df size:',test_df.size,'Train df size:', train_df.size)

Test df size: 4313620 Train df size: 17254480


In [58]:
user2movie = {}
movie2user = {}
usermovie2rating = {}


def update_train_df(row):
    i = int(row.userId)
    j = int(row.movie_idx)
    
    if i not in user2movie:
        user2movie[i] = [j]
    else:
        user2movie[i].append(j)
    
    if j not in movie2user:
        movie2user[j] = [i]
    else:
        movie2user[j].append(i)
        
    usermovie2rating[(i, j)] = row.rating

train_df.apply(update_train_df, axis = 1)
    

2935096     None
13306793    None
9685656     None
15050184    None
9475231     None
            ... 
13934804    None
2503053     None
312879      None
16131476    None
2809147     None
Length: 4313620, dtype: object

In [60]:
usermovie2rating_test = {}

def update_test_df(row):
    i = int(row.userId)
    j = int(row.movie_idx)
        
    usermovie2rating_test[(i, j)] = row.rating

test_df.apply(update_test_df, axis = 1)
    

6913277     None
18145020    None
11150328    None
3175533     None
681507      None
            ... 
5784152     None
8310303     None
18202208    None
15891449    None
6240549     None
Length: 1078405, dtype: object

In [61]:
import pickle

In [62]:
# user2movie = {}
# movie2user = {}
# usermovie2rating = {}

with open('user2movie.json', 'wb') as f:
    pickle.dump(user2movie, f)
with open('movie2user.json', 'wb') as f:
    pickle.dump(movie2user, f)
with open('usermovie2rating.json', 'wb') as f:
    pickle.dump(usermovie2rating, f)
with open('usermovie2rating_test.json', 'wb') as f:
    pickle.dump(usermovie2rating_test, f)