In [11]:
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds

In [None]:
# Load the three raw MovieLens 1M tables. The files have no header row and use
# '::' as the field delimiter; a multi-character separator is interpreted as a
# regular expression, which is why the (slower) python parsing engine is requested.
#
# The encoding is passed explicitly: ml-1m movie titles contain Latin-1 bytes
# that are not valid UTF-8, so the default decoding can raise UnicodeDecodeError
# on Python 3. 'iso-8859-1' matches the encoding used for the movielens_*.csv reads.
users_list = pd.read_table('data/ml-1m/users.dat',
                           sep='::', header=None,
                           names=['user_id', 'gender', 'age', 'occupation', 'zip'],
                           engine='python', encoding='iso-8859-1')

ratings_list = pd.read_table('data/ml-1m/ratings.dat',
                             sep='::', header=None,
                             names=['user_id', 'movie_id', 'rating', 'timestamp'],
                             engine='python', encoding='iso-8859-1')

movies_list = pd.read_table('data/ml-1m/movies.dat',
                            sep='::', header=None,
                            names=['movie_id', 'title', 'genres'],
                            engine='python', encoding='iso-8859-1')

In [5]:
# Materialise each raw MovieLens table as a plain NumPy array
# (ratings, users, movies — same targets and same source frames as before).
ratings, users, movies = (
    np.array(frame) for frame in (ratings_list, users_list, movies_list)
)

In [12]:
# Read the pre-split train / test CSVs; the first column of each file is used
# as the row index and both files are decoded as ISO-8859-1.
movielens_train, movielens_test = (
    pd.read_csv(csv_path, index_col=0, encoding='iso-8859-1')
    for csv_path in ('data/movielens_train.csv', 'data/movielens_test.csv')
)
# Cell output: (rows, columns) of the held-out test split.
movielens_test.shape

(2668, 11)

In [13]:
# Build the user × movie rating matrix from the training split.
# pivot_table averages duplicate (user_id, movie_id) pairs by default, and every
# pair without a rating is filled with 0.
ratings_df = (
    movielens_train
    .pivot_table(index='user_id', columns='movie_id', values='rating')
    .fillna(0)
)

In [14]:
# Preview the first rows of the user × movie matrix (rows: user_id, columns: movie_id).
ratings_df.head()

movie_id,1,2,4,5,6,7,10,11,12,13,...,3928,3929,3930,3932,3943,3945,3947,3948,3949,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same 2-D ndarray and works on both old and new pandas.
R = ratings_df.values
# Mean of each row (one value per user), taken over every column of the matrix.
user_ratings_mean = np.mean(R, axis=1)
# Centre every row on its own mean; the reshape to a column vector makes the
# subtraction broadcast across the movie axis.
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [16]:
# Sanity check: dimensions of the mean-centred matrix (rows = users, columns = movies).
R_demeaned.shape

(2170, 1934)

In [35]:
# Truncated SVD of the mean-centred rating matrix, keeping k = 20 singular
# triplets. `svds` returns the singular values as a 1-D vector in `sigma`.
# (`svds` is imported alongside the other imports at the top of the notebook.)
U, sigma, Vt = svds(R_demeaned, k=20)

In [36]:
# Expand the 1-D vector of singular values into a square diagonal matrix so it
# can take part in the U · Σ · Vt product.
# np.diag is its own "inverse" on 2-D input (it extracts the diagonal), so the
# guard keeps this cell idempotent: re-running it no longer flips sigma back
# into a vector and breaks the reconstruction that follows.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)

In [37]:
# Rebuild the low-rank approximation (U · sigma) · Vt, then add each user's
# mean back as a column vector so it broadcasts across all movies.
all_user_predicted_ratings = (
    U.dot(sigma).dot(Vt)
    + user_ratings_mean[:, np.newaxis]
)

In [38]:
# Display the dense matrix of predicted ratings (one row per user, one column per movie).
all_user_predicted_ratings

array([[ -2.95477958e-02,   8.23312267e-03,  -9.91989965e-04, ...,
          5.02948788e-04,   2.03763732e-03,   9.40109324e-06],
       [ -2.19897925e-03,   1.79235264e-03,   1.42732637e-03, ...,
          1.30569858e-03,   2.12597998e-03,   1.92877548e-03],
       [ -6.03683283e-02,   1.65201991e-02,   8.32579860e-04, ...,
         -1.17582260e-04,   1.45668916e-02,   1.07312869e-02],
       ..., 
       [ -7.65608290e-04,   1.05410038e-03,   1.18593819e-03, ...,
          1.18003902e-03,   1.18365999e-03,   1.14921115e-03],
       [ -3.18135755e-02,   1.80113459e-02,   3.48258314e-03, ...,
          2.23755735e-03,   2.51686471e-02,   1.38604584e-02],
       [ -3.63201878e-03,   1.93849209e-03,   2.34730862e-03, ...,
          2.45937123e-03,   1.95837866e-03,   2.70907064e-03]])

In [39]:
# Wrap the predictions in a DataFrame whose columns carry the movie_id labels
# of ratings_df.
# NOTE(review): no index is passed, so rows are labelled 0..n-1 rather than by
# ratings_df's user_id index — rows must be matched to users by position.
# Consider index=ratings_df.index if label-based lookup is wanted; confirm
# against downstream usage first.
preds_df = pd.DataFrame(
    data=all_user_predicted_ratings,
    columns=ratings_df.columns,
)
preds_df.head()

movie_id,1,2,4,5,6,7,10,11,12,13,...,3928,3929,3930,3932,3943,3945,3947,3948,3949,3952
0,-0.029548,0.008233,-0.000992,0.000254,0.007313,-0.000144,0.014709,0.00222,-0.000115,0.000286,...,-0.001032,-0.001011,-0.00029,-0.000136,-0.001071,-0.001122,0.000306,0.000503,0.002038,9e-06
1,-0.002199,0.001792,0.001427,0.001615,0.000503,0.001706,0.000743,0.000595,0.001414,0.001612,...,0.001433,0.002657,0.001724,0.001707,0.001255,0.00144,0.00161,0.001306,0.002126,0.001929
2,-0.060368,0.01652,0.000833,0.004158,-0.025833,0.008057,0.300786,-0.002735,0.002036,0.004107,...,0.010217,0.02348,0.028319,0.023253,-0.006786,0.000237,0.004189,-0.000118,0.014567,0.010731
3,-0.018465,0.002192,0.003631,0.003121,0.00617,0.002497,0.002163,0.00443,0.00358,0.003131,...,0.003281,0.000753,0.002806,0.001097,0.004283,0.003974,0.003131,0.003876,0.000525,0.001868
4,-0.004239,0.001559,0.001949,0.00179,0.001248,0.001738,0.001664,0.001846,0.001724,0.001779,...,0.001684,0.001604,0.001603,0.00128,0.001262,0.001884,0.001773,0.002027,0.001605,0.001548
