# Content-based

In [18]:
from __future__ import print_function
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

ratings = pd.read_csv('ml-latest-small/ratings.csv')
#number of user
n_users = ratings.userId.nunique()

# Shuffle the ratings table
data = shuffle(ratings)

# Init ratio to split the data: 80%, 20%
ratio = int(0.8*len(data))

# Split into train_set and test_set
train_set = data[:ratio].reset_index().drop('index', axis=1)
test_set = data[ratio:].reset_index().drop('index', axis=1)

train_set = train_set.astype(int).to_numpy()
test_set = test_set.astype(int).to_numpy()

print("Training set size:", train_set.shape[0])
print("Test set size:", test_set.shape[0])
print("Number of user: ", n_users ,"\n")

ratings.head()

Training set size: 80668
Test set size: 20168
Number of user:  610 



Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [2]:
#Reading items file:
movies = pd.read_csv('ml-latest-small/movies.csv') 
dummies = movies['genres'].str.get_dummies(sep='|')
movies.drop('genres',inplace=True, axis = 1)
movies = pd.concat([movies, dummies], axis=1)
movies.rename(columns={'(no genres listed)':'Unknow'}, inplace=True)

n_movies = movies.shape[0]
print('Number of movies:', n_movies)

Number of movies: 9742


Build feature vector by tf-idf

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer

X0 = movies.to_numpy()
X_train_counts = X0[:, -20:]
transformer = TfidfTransformer(smooth_idf=True, norm ='l2')
#feature vector of each movies
X = transformer.fit_transform(X_train_counts.tolist()).toarray()

a matrix get items that rated by user

In [5]:
movies_idList = movies.movieId.astype(int).to_numpy()

In [6]:
def get_items_rated_by_user(rate_matrix, user_id):
  
    y = rate_matrix[:,0] # all users
    
    #list of user in rate matrix
    ids = np.where(y == user_id +1)[0] 
    
    #list of movie that user 'ids' rate
    movie_ids = rate_matrix[ids, 1] 
    #return index of movie
    
    movie_index = np.empty(len(movie_ids),dtype = int)
    for i in range (len(movie_ids)):
        movie_index[i] = (np.where(movies_idList == movie_ids[i])[0][0]) 
    #and score for each movie that user 'ids' rate
    scores = rate_matrix[ids, 2]
    
    return (movie_index, scores)

build model

In [7]:
from sklearn.linear_model import Ridge
from sklearn import linear_model

d = X.shape[1] # data dimension
W = np.zeros((d, n_users))
b = np.zeros(n_users)

for n in range(n_users):    
    ids, scores = get_items_rated_by_user(train_set, n)
    model = Ridge(alpha=0.01, fit_intercept  = True)
    Xhat = X[ids, :]
    model.fit(Xhat, scores) 
    W[:, n] = model.coef_
    b[n] = model.intercept_


In [9]:
# predicted scores
Yhat = X.dot(W) + b

In [14]:
#test for a user
n = 103
np.set_printoptions(precision=2) # 2 digits after . 
ids, scores = get_items_rated_by_user(test_set, 10)
print('Rated movies ids :', ids )
print('True ratings     :', scores)
print('Predicted ratings:', Yhat[ids, n])

Rated movies ids : [   5  277 1209   97 1084 1071    9  176 1066  142]
True ratings     : [5 4 4 5 5 1 3 3 3 4]
Predicted ratings: [2.88 3.32 2.53 2.94 2.78 2.84 2.91 2.86 2.9  3.15]


In [17]:
def evaluate(Yhat, rates, W, b):
    se = 0
    cnt = 0
    for n in range(n_users):
        ids, scores_truth = get_items_rated_by_user(rates, n)
        scores_pred = Yhat[ids, n]
        e = scores_truth - scores_pred 
        se += (e*e).sum(axis = 0)
        cnt += e.size 
    return np.sqrt(se/cnt)

print('RMSE for training: %.2f' %evaluate(Yhat, train_set, W, b))
print('RMSE for test    : %.2f' %evaluate(Yhat, test_set, W, b))

RMSE for training: 0.86
RMSE for test    : 1.00
