In [81]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy.sparse.linalg import svds


%matplotlib inline

In [102]:
#Importing the ratings data 
ratings = pd.read_csv('ratings.dat', sep='::')
ratings.columns = ['User_ID','Movie_ID','Rating','timeStamp']
ratings.head()
filtered = ratings[ratings.User_ID==1]
filtered

  


Unnamed: 0,User_ID,Movie_ID,Rating,timeStamp
0,1,661,3,978302109
1,1,914,3,978301968
2,1,3408,4,978300275
3,1,2355,5,978824291
4,1,1197,3,978302268
5,1,1287,5,978302039
6,1,2804,5,978300719
7,1,594,4,978302268
8,1,919,4,978301368
9,1,595,5,978824268


In [103]:
#Importing the users data
users = pd.read_csv('users.dat',sep = '::')
users.columns = ['User_ID', 'Gender', 'Age', 'Occupation', 'zipCode']
users.head()

  


Unnamed: 0,User_ID,Gender,Age,Occupation,zipCode
0,2,M,56,16,70072
1,3,M,25,15,55117
2,4,M,45,7,2460
3,5,M,25,20,55455
4,6,F,50,9,55117


In [134]:
#Importing the movies data
movies = pd.read_csv('movies.dat', sep='::')
movies.columns = ['Movie_ID','Title','Genres']
movies.head()


  


Unnamed: 0,Movie_ID,Title,Genres
0,2,Jumanji (1995),Adventure|Children's|Fantasy
1,3,Grumpier Old Men (1995),Comedy|Romance
2,4,Waiting to Exhale (1995),Comedy|Drama
3,5,Father of the Bride Part II (1995),Comedy
4,6,Heat (1995),Action|Crime|Thriller


In [61]:
#Shuffling the data to make sure that there is no any arrangement
#Used for the training, validation and testing
shuffled_ratings = ratings.sample(frac=1)
shuffled_users = shuffled_ratings['User_ID'].values
shuffled_movies = shuffled_ratings['Movie_ID'].values
shuffled_ratings = shuffled_ratings['Rating'].values
print("User:", shuffled_users)
print("Movies:", shuffled_movies)
print("Ratings:", shuffled_ratings)

User: [1207 3270 4454 ... 1552 2486 1520]
Movies: [1975 1036   81 ... 2312 2716  480]
Ratings: [2 4 1 ... 4 3 5]


In [69]:
#Obtaining the number of unique users and movies
n_users = ratings.User_ID.nunique()
n_movies = ratings.Movie_ID.nunique()
print("Number of users is: ", n_users," and the number of movies is: ", n_movies)

Number of users is:  6040  and the number of movies is:  3706


In [72]:
#Now creating the rating matrix (users x items)
rating_mat = ratings.pivot (index = 'User_ID', columns = 'Movie_ID', values = 'Rating')
rating_mat = rating_mat.fillna(0)
rating_

Movie_ID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
#Now we normalize in very basic way
R = rating_mat.as_matrix()
user_rating_mean = np.mean(R,axis =1)
user_rating_normal = R - user_rating_mean.reshape(-1,1)
user_rating_normal

array([[ 4.9414463 , -0.0585537 , -0.0585537 , ..., -0.0585537 ,
        -0.0585537 , -0.0585537 ],
       [-0.12924987, -0.12924987, -0.12924987, ..., -0.12924987,
        -0.12924987, -0.12924987],
       [-0.05369671, -0.05369671, -0.05369671, ..., -0.05369671,
        -0.05369671, -0.05369671],
       ...,
       [-0.02050729, -0.02050729, -0.02050729, ..., -0.02050729,
        -0.02050729, -0.02050729],
       [-0.1287102 , -0.1287102 , -0.1287102 , ..., -0.1287102 ,
        -0.1287102 , -0.1287102 ],
       [ 2.6708041 , -0.3291959 , -0.3291959 , ..., -0.3291959 ,
        -0.3291959 , -0.3291959 ]])

In [82]:
U, sigma, M = svds(user_rating_normal, k=20)
sigma = np.diag(sigma)

In [86]:
#Filling the whole table 
users_predicted_ratings = np.dot(np.dot(U,sigma),M)
users_predictions = users_predicted_ratings + user_rating_mean.reshape(-1,1)
users_predictions

array([[ 3.19857025e+00,  6.38541268e-01, -3.47007414e-02, ...,
         1.38771811e-04,  7.05075443e-03,  4.27050290e-02],
       [ 1.33299698e+00,  4.64541203e-01,  1.94509769e-01, ...,
        -4.55202432e-02, -2.53603690e-02, -1.93740473e-02],
       [ 1.37118031e+00,  1.30284112e-01,  8.08668827e-02, ...,
        -1.72951262e-02,  4.49393118e-03, -1.54237755e-01],
       ...,
       [ 6.67697109e-01,  7.79444275e-04, -4.25792074e-02, ...,
        -5.01064075e-03,  5.33901642e-03, -7.01891748e-02],
       [ 1.20207454e+00,  3.64280843e-01,  2.74994291e-01, ...,
         5.90302871e-04, -1.32893048e-02, -1.71804639e-01],
       [ 1.94536002e+00,  9.85058685e-02, -5.37335603e-01, ...,
         8.16449100e-02,  1.00659719e-01,  3.56833166e-01]])

In [118]:
preds = pd.DataFrame(users_predictions, columns = rating_mat.columns)
preds.head()

Movie_ID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,3.19857,0.638541,-0.034701,-0.021458,0.076028,-0.221926,-0.066784,0.171268,-0.104106,-0.019997,...,-0.003451,0.017852,0.054395,0.002055,-0.05271,0.285666,-0.087522,0.000139,0.007051,0.042705
1,1.332997,0.464541,0.19451,0.069194,0.082928,0.81418,0.091039,0.031088,0.165862,1.294305,...,-0.054974,-0.023483,-0.018093,0.031399,-0.036435,0.135554,-0.193275,-0.04552,-0.02536,-0.019374
2,1.37118,0.130284,0.080867,-0.06142,-0.018566,0.155906,-0.121957,0.035303,0.037654,0.550767,...,-0.008815,0.023164,0.038411,0.024095,-0.007709,0.20496,-0.149019,-0.017295,0.004494,-0.154238
3,0.220915,-0.145702,-0.038958,0.049731,0.015739,0.268959,-0.00969,0.009443,-0.010747,0.15296,...,0.025489,0.016085,0.010708,-0.016086,-0.022127,-0.005389,0.052847,-0.003026,0.03023,-0.069364
4,1.006277,0.104324,-0.241007,0.138286,-0.211032,1.384833,-0.264925,0.00116,-0.047953,0.324213,...,0.085926,0.005881,-0.010044,-0.031761,0.007052,0.010148,0.510049,0.027555,0.084839,0.210094


In [212]:
def recommendations(predictions, user_id, movies, original_ratings, num_recommendations):
    user_rownum = user_id -1
    sorted_user_predictions = preds.iloc[user_rownum].sort_values(ascending=False)
    sorted_user_predictions = pd.DataFrame(sorted_user_predictions)
    sorted_user_predictions.columns = ['Movie_ID']
    
    user_data = original_ratings[original_ratings.User_ID==user_id]
    user_full = user_data.merge(movies, how = 'left', left_on = 'Movie_ID', right_on = 'Movie_ID')
    user_full = user_full.sort_values(['Rating'],ascending = False)
    user_full = pd.DataFrame(user_full)
    print ('User {0} has already rated {1} movies'.format(user_id, user_full.shape[0]))
    print('Now recommending the top {0} movies for the user {1}'.format(num_recommendations, user_id))
    
    rated_movies = movies['Movie_ID'].isin(user_full['Movie_ID'])
    unrated_movies = movies[~rated_movies]
    unrated_movies = unrated_movies.merge(sorted_user_predictions, how = 'left', left_on = 'Movie_ID', right_on = 'Movie_ID')
    unrated_n_movies = unrated_movies[:num_recommendations] 
    unrated_n_movies = pd.DataFrame(unrated_n_movies)
    print(sorted_user_predictions)

    #return user_full, unrated_n_movies

In [213]:
recommendations(preds, 1310, movies, ratings,10)

User 1310 has already rated 24 movies
Now recommending the top 10 movies for the user 1310
          Movie_ID
Movie_ID          
1961      0.924878
1225      0.907930
1097      0.833244
1674      0.818788
1090      0.788183
1246      0.781243
1231      0.772472
1293      0.743013
1302      0.700690
1270      0.688830
1307      0.680166
1962      0.676422
1957      0.666252
1299      0.661506
1196      0.658871
2020      0.657834
2352      0.649189
1259      0.642741
1968      0.635420
3360      0.623387
2243      0.596766
1186      0.596305
3098      0.590197
1228      0.582305
1956      0.564089
2918      0.563503
1960      0.555300
541       0.548450
3424      0.546868
1185      0.546467
...            ...
3174     -0.136118
466      -0.136209
1371     -0.137294
2454     -0.137749
940      -0.139142
3638     -0.140091
3635     -0.142267
1372     -0.144575
1269     -0.144599
1377     -0.145024
2300     -0.146397
1032     -0.150146
2991     -0.150624
3052     -0.157185
2993     -0.1667