In [21]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import svds
import pickle

In [22]:
# Load the '::'-delimited ratings file; it has no header row, so columns are
# labelled 0..3. The python engine is needed for the multi-character separator.
data_file = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    header=None,
    engine='python',
)

In [23]:
# Quick size check of the loaded table: (number of rows, number of columns).
data_file.shape

(1000209, 4)

In [24]:
# Sorted distinct ids taken from column 0 (users) and column 1 (movies).
users = np.unique(data_file[0])
movies = np.unique(data_file[1])

# The rating matrix will have one row per user and one column per movie.
number_of_rows = len(users)
number_of_columns = len(movies)

# Lookup tables translating a raw id into its zero-based matrix position.
movie_indices = {movie_id: position for position, movie_id in enumerate(movies)}
user_indices = {user_id: position for position, user_id in enumerate(users)}

In [25]:
# Display the sorted distinct values of column 0 (the same array stored in `users`).
np.unique(data_file[0])

array([   1,    2,    3, ..., 6038, 6039, 6040])

In [26]:
# Build the sparse user x movie rating matrix in one vectorised step rather
# than assigning roughly one million cells one at a time from a Python loop.
# Column layout of data_file: 0 = user id, 1 = movie id, 2 = rating
# (column 3 is not needed to fill the matrix).

# COO construction sums repeated (row, col) entries, whereas cell-by-cell
# assignment lets the last value win; keep only the last occurrence of any
# repeated (user, movie) pair so the result matches last-write-wins.
ratings = data_file.drop_duplicates(subset=[0, 1], keep='last')

# Translate raw ids into zero-based matrix coordinates via the lookup tables.
row_idx = ratings[0].map(user_indices).values
col_idx = ratings[1].map(movie_indices).values

# Store ratings as float64, the dtype the matrix displays its values in.
rating_values = ratings[2].values.astype(np.float64)

# Convert back to LIL so V keeps the sparse format it had before.
V = sp.coo_matrix(
    (rating_values, (row_idx, col_idx)),
    shape=(number_of_rows, number_of_columns),
).tolil()

In [27]:
# Building V is slow, so cache the finished matrix on disk for later reuse.
with open('movielens_1M.pickle', 'wb') as matrix_file:
    pickle.dump(V, matrix_file)

In [28]:
# Dimensions of V: (number_of_rows, number_of_columns), i.e. distinct users x distinct movies.
V.shape

(6040, 3706)

In [41]:
# NOTE: this converts the entire sparse matrix to a dense array just to display it;
# acceptable for a quick look, but it allocates every cell of the user x movie grid.
V.toarray()

array([[ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 3.,  0.,  0., ...,  0.,  0.,  0.]])

In [42]:
# Truncated SVD of the 1M rating matrix, keeping k = 32 components.
# The factorisation is slow, so each factor is pickled for later reuse.
u, s, vt = svds(V, k=32)

svd_outputs = (
    ('movielens_1M_svd_u.pickle', u),
    ('movielens_1M_svd_s.pickle', s),
    ('movielens_1M_svd_vt.pickle', vt),
)
for pickle_path, factor in svd_outputs:
    with open(pickle_path, 'wb') as factor_file:
        pickle.dump(factor, factor_file)

In [9]:
# u has one row per row of V and one column per requested component (k = 32).
u.shape

(6040, 32)

In [10]:
# Display the first factor returned by svds (left singular vectors, one column per component).
u

array([[ 0.00410462,  0.00412278,  0.00573711, ...,  0.00267141,
        -0.00164551,  0.00471786],
       [-0.0069305 ,  0.00495312,  0.00630195, ...,  0.00038215,
         0.00269782,  0.00928856],
       [-0.02099482,  0.00757615, -0.00446269, ..., -0.00334366,
         0.00334292,  0.00501018],
       ..., 
       [ 0.00254681,  0.00562312, -0.00725292, ..., -0.00011879,
        -0.00181339,  0.00138885],
       [ 0.00730304,  0.00601646,  0.00030571, ..., -0.01071225,
        -0.0187647 ,  0.00700793],
       [ 0.00687984, -0.02834141, -0.0201742 , ..., -0.00304316,
        -0.04080244,  0.01896102]])

In [11]:
# One singular value per requested component (k = 32).
s.shape

(32,)

In [44]:
# Display the singular values; in the output below they are listed in ascending order.
s

array([  170.39056208,   173.4545534 ,   177.40814501,   180.00054896,
         182.54076131,   184.35736998,   187.23251225,   192.34874592,
         196.85673474,   199.89552804,   202.22382818,   211.99396056,
         213.28383445,   216.29756551,   221.72978787,   231.39268055,
         239.65266106,   245.51816079,   253.3454035 ,   256.36631123,
         276.37005962,   289.28673298,   316.08860249,   335.46238645,
         346.70684534,   398.74614105,   426.16540262,   444.85478082,
         518.08422502,   574.85275997,   671.34356538,  1893.21055869])

In [45]:
# vt has one row per requested component (k = 32) and one column per column of V.
vt.shape

(32, 3706)

In [46]:
# Display the third factor returned by svds (right singular vectors, one row per component).
vt

array([[ 0.02594449, -0.01260792, -0.00431814, ...,  0.00077875,
         0.00507803, -0.00444861],
       [ 0.03941802, -0.00317296, -0.01265863, ...,  0.00414554,
        -0.00361951, -0.00092356],
       [ 0.01233547, -0.00091809, -0.03974107, ...,  0.00424986,
         0.00469391,  0.02537816],
       ..., 
       [-0.03016472,  0.01018907, -0.01257242, ..., -0.00178319,
        -0.00352092, -0.02235768],
       [ 0.02094015,  0.02979245,  0.0167039 , ..., -0.0018744 ,
        -0.00226511, -0.00502213],
       [-0.07013714, -0.02354382, -0.01376584, ..., -0.00261526,
        -0.00116636, -0.01325659]])

In [13]:
# Square matrix with the singular values on its main diagonal and zeros elsewhere
# (s is one-dimensional, so np.diag builds the 2-D diagonal matrix from it).
s_diag_matrix = np.diag(s)

In [14]:
# Low-rank reconstruction from the SVD factors: (u . diag(s)) . vt,
# multiplied left to right.
X_lr = u.dot(s_diag_matrix).dot(vt)

In [15]:
# Display the dense matrix rebuilt from u, s_diag_matrix and vt.
X_lr

array([[  3.46082409e+00,   3.79645341e-01,  -1.00658366e-01, ...,
          4.54694446e-03,   8.59065574e-04,   1.26139332e-01],
       [  1.14993182e+00,   2.98553553e-01,   2.17074964e-01, ...,
         -2.95004384e-02,  -1.43338851e-02,  -3.66208889e-02],
       [  1.64867269e+00,   7.20365220e-01,   1.02267102e-01, ...,
         -1.03005964e-02,   1.66668728e-02,  -1.14509969e-01],
       ..., 
       [  5.32095794e-01,  -2.66320882e-03,  -2.73637940e-02, ...,
         -2.92053697e-02,  -3.05362056e-02,  -1.38650678e-01],
       [  8.95395251e-01,  -1.20133177e-01,   6.31031568e-02, ...,
         -3.35518359e-02,  -8.57036877e-02,  -2.74991620e-01],
       [  2.46577408e+00,  -2.28213659e-01,  -3.05154451e-01, ...,
          3.87987595e-02,   1.48801896e-01,   1.88615556e-01]])

In [16]:
# The reconstruction has the same dimensions as V.
X_lr.shape

(6040, 3706)

In [17]:
# Total number of entries in the dense reconstruction (rows * columns).
X_lr.size

22384240