In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

%matplotlib inline
plt.rcParams['figure.figsize'] = (6, 4)
plt.style.use('ggplot')
%config InlineBackend.figure_formats = {'png', 'retina'}


In [2]:
movies = pd.read_csv('movies.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
print(movies.shape)

(62423, 3)


In [5]:
ratings = pd.read_csv('ratings.csv')

In [6]:
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
5,1,1088,4.0,1147868495
6,1,1175,3.5,1147868826
7,1,1217,3.5,1147878326
8,1,1237,5.0,1147868839
9,1,1250,4.0,1147868414


In [7]:
print(ratings.shape)

(25000095, 4)


In [8]:
# there are many user, and many differences criteria for rating a movie. 
# I decided to find the mean rating of each user.
#Movie which got rating higher than user rating mean will assign as like

In [9]:
ratings[ratings['userId']==1].rating.mean()

3.8142857142857145

In [10]:
ratings[ratings['userId']==2].rating.mean()

3.630434782608696

In [11]:
ratings[ratings['userId']==1000].rating.mean()

3.5236686390532546

In [12]:
# We can calculate rating per user

In [13]:
MRPU = ratings.groupby(['userId']).mean().reset_index()
MRPU['mean_rating']= MRPU['rating']
MRPU.drop(['movieId','rating','timestamp'],axis=1,inplace=True)

In [14]:
MRPU.head(10)

Unnamed: 0,userId,mean_rating
0,1,3.814286
1,2,3.630435
2,3,3.697409
3,4,3.378099
4,5,3.752475
5,6,4.153846
6,7,3.64
7,8,3.612903
8,9,3.865169
9,10,3.45283


In [15]:
ratings = pd.merge(ratings, MRPU, on=['userId','userId'])

In [16]:
ratings.head(100)

Unnamed: 0,userId,movieId,rating,timestamp,mean_rating
0,1,296,5.0,1147880044,3.814286
1,1,306,3.5,1147868817,3.814286
2,1,307,5.0,1147868828,3.814286
3,1,665,5.0,1147878820,3.814286
4,1,899,3.5,1147868510,3.814286
...,...,...,...,...,...
95,2,733,4.5,1141415905,3.630435
96,2,858,3.5,1141416926,3.630435
97,2,914,4.0,1141417642,3.630435
98,2,953,4.5,1141417487,3.630435


In [17]:
# filter out all movies that have rating > than the mean rating for each user.
# This means all movies in this table can be considered as favorite movie of each user
ratings = ratings.drop(ratings[ratings.rating < ratings.mean_rating].index)

In [18]:
ratings[ratings['userId']==1].head(10)

Unnamed: 0,userId,movieId,rating,timestamp,mean_rating
0,1,296,5.0,1147880044,3.814286
2,1,307,5.0,1147868828,3.814286
3,1,665,5.0,1147878820,3.814286
5,1,1088,4.0,1147868495,3.814286
8,1,1237,5.0,1147868839,3.814286
9,1,1250,4.0,1147868414,3.814286
11,1,1653,4.0,1147868097,3.814286
16,1,2351,4.5,1147877957,3.814286
17,1,2573,4.0,1147878923,3.814286
18,1,2632,5.0,1147878248,3.814286


In [19]:
ratings[ratings['userId']==2].head(10)

Unnamed: 0,userId,movieId,rating,timestamp,mean_rating
72,2,110,5.0,1141416589,3.630435
73,2,150,4.0,1141415790,3.630435
74,2,151,4.5,1141415643,3.630435
75,2,236,4.0,1141415659,3.630435
76,2,260,5.0,1141417172,3.630435
79,2,318,5.0,1141417181,3.630435
80,2,333,5.0,1141415931,3.630435
81,2,349,4.5,1141417045,3.630435
82,2,356,4.5,1141416637,3.630435
83,2,364,4.5,1141417077,3.630435


In [20]:

print(ratings.shape)

(13616282, 5)


In [21]:
ratings['userId'].unique()

array([     1,      2,      3, ..., 162539, 162540, 162541])

In [22]:
ratings = ratings.rename({'rating':'useRating'},axis='columns')

In [23]:
#Merge 2 dataset movies and rating

mergedata = pd.merge(movies,ratings,on=['movieId','movieId'])
mergedata.head(10)

Unnamed: 0,movieId,title,genres,userId,useRating,timestamp,mean_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1439472215,3.697409
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,858625949,3.752475
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,890492517,3.612903
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,3.5,1227571347,3.45283
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,12,4.0,1167582601,3.296196
5,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,13,4.0,1265223970,3.559466
6,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,36,5.0,857131378,3.546296
7,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,43,4.0,1170491388,3.52619
8,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,51,4.0,1510742879,3.625
9,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,57,4.0,1142403282,3.826761


In [24]:
# number of favorite movies by all users
len(mergedata['movieId'].unique())

47094

In [25]:
#number of movies in original movie dataset
len(movies['movieId'].unique())

62423

In [29]:
# Create a crosstable to show the movie which each user like

movie_user = pd.crosstab(mergedata['userId'],mergedata['title'])

ValueError: Unstacked DataFrame is too big, causing int32 overflow

In [None]:
movie_user.head(10)