In [1]:
import datetime

import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import Ridge

# Reading the user file
* `users = (#user, user features)`

In [2]:
# Load the user table from ml-100k/u.user: pipe-separated, column names supplied here,
# and the user_id column used as the index.
user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=user_columns, index_col='user_id')
n_users = len(users)  # one row per user
print('Number of users:', n_users)
users.head()

Number of users: 943


Unnamed: 0_level_0,age,gender,occupation,zip code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


# Reading ratings file
* `rating = (user_id, item_id, rating)`

In [3]:
# The base and test rating files are both tab-separated and read with the same column names.
rating_columns = ['user_id', 'item id', 'rating', 'timestamp']
rating_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=rating_columns)
rating_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=rating_columns)
rating_base.head()

Unnamed: 0,user_id,item id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [4]:
def _format_timestamp(stamp):
    """Render an epoch-seconds value as a 'YYYY-MM-DD HH:MM:SS' string.

    datetime.fromtimestamp converts to the machine's local timezone, so the
    rendered text depends on where the notebook is executed.
    """
    moment = datetime.datetime.fromtimestamp(int(stamp))
    return moment.strftime('%Y-%m-%d %H:%M:%S')


# NOTE: the 'timestamp' columns are overwritten with strings, so re-running this
# cell without reloading the rating files makes int(stamp) raise ValueError.
rating_base['timestamp'] = rating_base['timestamp'].apply(_format_timestamp)
rating_test['timestamp'] = rating_test['timestamp'].apply(_format_timestamp)
rating_base.head()

Unnamed: 0,user_id,item id,rating,timestamp
0,1,1,5,1997-09-23 05:02:38
1,1,2,3,1997-10-15 12:26:11
2,1,3,4,1997-11-03 14:42:40
3,1,4,3,1997-10-15 12:25:19
4,1,5,3,1998-03-13 08:15:12


In [5]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0;
# the .values attribute yields the same NumPy array and exists on every pandas version.
rating_train = rating_base.values
type(rating_train)

numpy.ndarray

In [6]:
# np.asarray replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
# This cell rebinds rating_test from a DataFrame to an ndarray; np.asarray accepts
# either, so running the cell a second time is harmless instead of raising AttributeError.
rating_test = np.asarray(rating_test)
print('Number of training rate:', rating_train.shape[0])
print('Number of test rate:', rating_test.shape[0])

Number of training rate: 90570
Number of test rate: 9430


# Construct Items profile

In [7]:
# Column names for ml-100k/u.item: five descriptive fields followed by 19 genre
# columns ('unknown' through 'Western').
item_columns = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
items = pd.read_csv('ml-100k/u.item', sep='|', names=item_columns,
                    index_col='movie id', encoding='latin-1')
items.head()

Unnamed: 0_level_0,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [8]:
n_items = len(items)  # number of rows in the item table
print('Number of items:', n_items)

Number of items: 1682


* We only consider the last 19 binary features to construct (**items profile ~ X**)
* `X = (# of items, 19 item features)`

In [12]:
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same array
# and is available on every pandas version.
X0 = items.values
# Keep only the trailing 19 columns, i.e. the genre columns 'unknown' ... 'Western'.
X_train_counts = X0[:, -19:]

In [13]:
# TF-IDF: re-weight the 19 genre columns with a default-configured TfidfTransformer
# and densify the sparse result into a plain ndarray.
transformer = TfidfTransformer()
X = transformer.fit_transform(X_train_counts.tolist()).toarray()

# Model
* users = (#user, user features)
* rating = (user_id, item_id, rating)
* X = (# of items, 19 item features)
* y_hat = (# of items, #user)
* W = (19 item features, #user)
* For each user, we first need to find the movies they rated and the corresponding rating values

In [14]:
def get_items_rated_by_user(rate_matrix, user_id):
    """
    Collect everything a single user rated.

    rate_matrix: 2-D array whose columns 0, 1 and 2 hold the user id, the item id
        and the rating; both ids are treated as 1-based.
    user_id: 0-based user index; it is shifted by one before being compared
        with the ids stored in column 0.

    return (item_ids, scores): 0-based item indices and the matching ratings,
        in the order the rows appear in rate_matrix
    """
    rated_by_user = rate_matrix[:, 0] == user_id + 1  # boolean mask over the rows
    item_ids = rate_matrix[rated_by_user, 1] - 1  # shift to 0-based item indices
    scores = rate_matrix[rated_by_user, 2]
    return item_ids, scores

In [15]:
# Fit one ridge regression per user on the profiles of the items that user rated.
n_features = X.shape[1]
W = np.zeros((n_features, n_users))  # column u receives user u's coefficient vector
b = np.zeros((1, n_users))  # entry u receives user u's intercept
model = Ridge(alpha=0.01, fit_intercept=True)
for user in range(n_users):
    # `user` is the 0-based index expected by get_items_rated_by_user
    item_ids, scores = get_items_rated_by_user(rating_train, user)
    X_hat = X[item_ids.tolist()]  # rows of X for the items this user rated
    model.fit(X_hat, scores)
    W[:, user], b[0, user] = model.coef_, model.intercept_

In [16]:
# Predicted rating of every item (rows) for every user (columns); the single-row
# intercept array b is broadcast down the rows.
y_hat = X.dot(W) + b

In [22]:
# Compare true and predicted ratings, on both splits, for the user at 0-based
# index 23 (get_items_rated_by_user adds 1, so this is user_id 24 in the data).
np.set_printoptions(precision=2)
user = 23
item_ids, scores = get_items_rated_by_user(rating_train, user)
print('True ratings in training set:', scores[:10])
print('Predicted ratings in training set:', y_hat[item_ids[:10].tolist(), user])

# Same comparison on the held-out split; the labels now say "test set"
# (they previously repeated "training set").
item_ids, scores = get_items_rated_by_user(rating_test, user)
print('True ratings in test set:', scores[:10])
print('Predicted ratings in test set:', y_hat[item_ids[:10].tolist(), user])

True ratings in training set: [4 5 5 5 5 4 5 5 4 3]
Predicted ratings in training set: [3.5  5.05 4.37 4.73 4.73 4.25 4.25 4.87 4.92 4.37]
True ratings in training set: [5 3 4 5 2 4 5 5 4 5]
Predicted ratings in training set: [4.37 4.87 4.25 5.18 4.25 4.25 4.37 4.2  4.38 4.25]


# Evaluate by RMSE

In [28]:
def evaluate(y_hat, rate_matrix, W, b):
    """
    Root-mean-squared error of the predictions over every rating in rate_matrix.

    y_hat: 2-D array of predictions indexed as [item index, user index].
    rate_matrix: rating rows in the layout expected by get_items_rated_by_user.
    W, b: accepted but not used by this function.

    Iterates over the module-level n_users, so that name must already be defined.
    """
    squared_error_sum = 0
    n_ratings = 0
    for user_idx in range(n_users):
        rated_items, true_scores = get_items_rated_by_user(rate_matrix, user_idx)
        residuals = y_hat[rated_items.tolist(), user_idx] - true_scores
        squared_error_sum += np.sum(residuals * residuals)
        n_ratings += len(residuals)
    return np.sqrt(squared_error_sum / n_ratings)

In [30]:
# RMSE of the per-user models on the training ratings and on the test ratings.
rmse_train = evaluate(y_hat, rating_train, W, b)
rmse_test = evaluate(y_hat, rating_test, W, b)
print('RMSE for training: {:.2f}'.format(rmse_train))
print('RMSE for test    : {:.2f}'.format(rmse_test))

RMSE for training: 0.91
RMSE for test    : 1.27
