In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from prob2utils_skeleton import train_model, get_err
from surprise import AlgoBase
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split

## Import data

In [42]:
movies = pd.read_table('data/movies.txt', header=None, names=["Movie Id", "Movie Title", "Unknown", "Action", "Adventure", "Animation", "Childrens", "Comedy", "Crime", "Documentary","Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"])
np.save("movie", movies)

data = pd.read_table('data/data.txt', header=None, names=["user", "movie", "rating"])
np.save("data", data)

train = pd.read_table('data/train.txt',  header=None, names=["user", "movie", "rating"])
np.save("train", train)

test = pd.read_table('data/test.txt', header=None, names=["user", "movie", "rating"])
np.save("test", test)

In [43]:
reader = Reader(rating_scale=(1, 5))
#traindata = Dataset.load_from_df(train, reader=reader)
#trainset = traindata.build_full_trainset()
ydata = Dataset.load_from_df(data, reader=reader)
fullset = ydata.build_full_trainset()

## Instantiate SVD

In [51]:
filterer = SVD()

## Do cross-validation

In [52]:
cross_validate(filterer, ydata, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9313  0.9365  0.9317  0.9409  0.9408  0.9362  0.0042  
Fit time          8.03    9.63    10.38   9.03    7.14    8.84    1.15    
Test time         0.47    0.64    0.35    0.21    0.20    0.37    0.17    


{u'fit_time': (8.026655912399292,
  9.62839412689209,
  10.377238988876343,
  9.03115701675415,
  7.139965057373047),
 u'test_rmse': array([0.93128872, 0.93654601, 0.93165136, 0.94086391, 0.94079329]),
 u'test_time': (0.4723508358001709,
  0.636134147644043,
  0.3485431671142578,
  0.2054460048675537,
  0.20148110389709473)}

## Run factorization on full set

In [44]:
filterer.fit(fullset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10ba45f50>

## Calculate training and test errors. Lambda = 0.02 .

In [45]:
u = filterer.pu
v = filterer.qi
ubias = filterer.bu
vbias = filterer.bi
print(u.shape, ubias.shape, v.shape, vbias.shape)
print(len(testset))

((943, 100), (943,), (1682, 100), (1682,))
10000


In [46]:
def get_err(U, V, BU, BV, Y, reg=0.0):
    """
    Takes as input a matrix Y of triples (i, j, Y_ij) where i is the index of a user,
    j is the index of a movie, and Y_ij is user i's rating of movie j and
    user/movie matrices U and V.

    Returns the mean regularized squared-error of predictions made by
    estimating Y_{ij} as the dot product of the ith row of U and the jth column of V^T.
    """
    ratings = np.zeros(len(Y))
    for n in np.arange(len(Y)):
        ratings[n] = Y[n][2]
    mu = np.mean(ratings)
    err = 0.0    
    
    err += 0.5*reg*(np.sum(U**2) + np.sum(V**2) + np.sum(BU**2) + np.sum(BV**2))
    for n in np.arange(len(Y)):
        i = Y[n][0] - 1 #Columns start at 1, not zero.
        j = Y[n][1] - 1 #Columns start at 1, not zero.
        #print(U.shape, V.shape)
        err += 0.5*((ratings[n] - mu) - (np.dot(U[i,:],V.T[:,j]) + BU[i] + BV[j]))**2

    return err

In [47]:
testerr = get_err(u, v, ubias, vbias, testset, reg=0.0)

In [48]:
np.savetxt('U_shelf.txt', u)
np.savetxt('V_shelf.txt', v)
np.savetxt('Ubias_shelf.txt', ubias)
np.savetxt('Vbias_shelf.txt', vbias)

In [49]:
print testerr

8550.170417778732


In [50]:
print u.shape, v.shape

(943, 100) (1682, 100)
