In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd

In [6]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv("ml-100k/u.user", sep="|", names=u_cols)
users.head(5)

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [16]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
rating_train = pd.read_csv("ml-100k/ua.base", names=r_cols, sep="\t") 
rating_test = pd.read_csv("ml-100k/ua.test", names=r_cols, sep='\t')
rating_train.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [17]:
rating_train = rating_train.values
rating_test = rating_test.values
rating_test[:5]

array([[        1,        20,         4, 887431883],
       [        1,        33,         4, 878542699],
       [        1,        61,         4, 878542420],
       [        1,       117,         3, 874965739],
       [        1,       155,         2, 878542201]], dtype=int64)

In [52]:
m_cols = ['movie_id', 'name', 'release_date', "Unknown", "Url", 'bit_1', 'bit_2', 'bit_3', 'bit_4', 'bit_5', 'bit_6', 'bit_7', 'bit_8', 'bit_9', 'bit_10', 'bit_11', 'bit_12', 'bit_13', 'bit_14', 'bit_15', 'bit_16', 'bit_17', 'bit_18', 'bit_19']
items = pd.read_csv("ml-100k/u.item", sep="|",  encoding='latin-1', names=m_cols)
items = items.drop('Unknown', axis=1)
items.head()

Unnamed: 0,movie_id,name,release_date,Url,bit_1,bit_2,bit_3,bit_4,bit_5,bit_6,...,bit_10,bit_11,bit_12,bit_13,bit_14,bit_15,bit_16,bit_17,bit_18,bit_19
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [53]:
X = items.values
X_train = X[:, -19:]
X_train.shape

(1682, 19)

In [54]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_idf = TfidfTransformer(smooth_idf=True, norm='l2')
X = tf_idf.fit_transform(X_train).toarray()

In [89]:
def get_items_by_user(rating, u_id):
    list_uid = rating[:,0]
    idx = np.where(list_uid == u_id + 1)[0]
    item_idx = rating[idx, 1] - 1
    scores = rating[idx, 2]
    return (item_idx, scores)

In [90]:
from sklearn.linear_model import Ridge
d = X.shape[1]
W = np.zeros((d, users.shape[0]))
b = np.zeros(users.shape[0])
for n in range(0, users.shape[0]):
    idxs, scores = get_items_by_user(rating_train, n)
    model = Ridge(alpha=0.01, fit_intercept=True)
    Xhat = X[idxs, :]
    model.fit(Xhat, scores)
    W[:, n] = model.coef_
    b[n] = model.intercept_
    

In [91]:
yhat = X.dot(W) + b

In [93]:
n = 10
idxs, scores = get_items_by_user(rating_test, 100)
print('Rated movies ids :', idxs )
print('True ratings :', scores)
print('Predicted ratings:', yhat[idxs, n].round(2))

Rated movies ids : [221 251 280 281 303 368 404 470 595 828]
True ratings : [3 3 2 3 3 2 4 3 3 3]
Predicted ratings: [3.09 3.06 2.76 3.95 2.48 3.35 3.12 3.5  1.69 2.69]


In [103]:
def evaluate(Yhat, rates, W, b):
    se = cnt = 0
    for n in range(users.shape[0]):
        ids, scores_truth = get_items_by_user(rates, n)
        scores_pred = Yhat[ids, n]
        e = scores_truth - scores_pred
        se += (e*e).sum(axis = 0)
        cnt += e.size
        
    return np.sqrt(se/cnt)
print('RMSE for test : %.2f' %evaluate(yhat, rating_train, W, b))

RMSE for test : 0.91
