In [81]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy.sparse.linalg import svds


%matplotlib inline

In [102]:
#Importing the ratings data 
ratings = pd.read_csv('ratings.dat', sep='::')
ratings.columns = ['User_ID','Movie_ID','Rating','timeStamp']
ratings.head()
filtered = ratings[ratings.User_ID==1]
filtered

  


Unnamed: 0,User_ID,Movie_ID,Rating,timeStamp
0,1,661,3,978302109
1,1,914,3,978301968
2,1,3408,4,978300275
3,1,2355,5,978824291
4,1,1197,3,978302268
5,1,1287,5,978302039
6,1,2804,5,978300719
7,1,594,4,978302268
8,1,919,4,978301368
9,1,595,5,978824268


In [103]:
#Importing the users data
users = pd.read_csv('users.dat',sep = '::')
users.columns = ['User_ID', 'Gender', 'Age', 'Occupation', 'zipCode']
users.head()

  


Unnamed: 0,User_ID,Gender,Age,Occupation,zipCode
0,2,M,56,16,70072
1,3,M,25,15,55117
2,4,M,45,7,2460
3,5,M,25,20,55455
4,6,F,50,9,55117


In [134]:
#Importing the movies data
movies = pd.read_csv('movies.dat', sep='::')
movies.columns = ['Movie_ID','Title','Genres']
movies.head()


  


Unnamed: 0,Movie_ID,Title,Genres
0,2,Jumanji (1995),Adventure|Children's|Fantasy
1,3,Grumpier Old Men (1995),Comedy|Romance
2,4,Waiting to Exhale (1995),Comedy|Drama
3,5,Father of the Bride Part II (1995),Comedy
4,6,Heat (1995),Action|Crime|Thriller


In [61]:
#Shuffling the data to make sure that there is no any arrangement
#Used for the training, validation and testing
shuffled_ratings = ratings.sample(frac=1)
shuffled_users = shuffled_ratings['User_ID'].values
shuffled_movies = shuffled_ratings['Movie_ID'].values
shuffled_ratings = shuffled_ratings['Rating'].values
print("User:", shuffled_users)
print("Movies:", shuffled_movies)
print("Ratings:", shuffled_ratings)

User: [1207 3270 4454 ... 1552 2486 1520]
Movies: [1975 1036   81 ... 2312 2716  480]
Ratings: [2 4 1 ... 4 3 5]


In [69]:
#Obtaining the number of unique users and movies
n_users = ratings.User_ID.nunique()
n_movies = ratings.Movie_ID.nunique()
print("Number of users is: ", n_users," and the number of movies is: ", n_movies)

Number of users is:  6040  and the number of movies is:  3706


In [72]:
#Now creating the rating matrix (users x items)
rating_mat = ratings.pivot (index = 'User_ID', columns = 'Movie_ID', values = 'Rating')
rating_mat = rating_mat.fillna(0)
rating_

Movie_ID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
#Now we normalize in very basic way
R = rating_mat.as_matrix()
user_rating_mean = np.mean(R,axis =1)
user_rating_normal = R - user_rating_mean.reshape(-1,1)
user_rating_normal

array([[ 4.9414463 , -0.0585537 , -0.0585537 , ..., -0.0585537 ,
        -0.0585537 , -0.0585537 ],
       [-0.12924987, -0.12924987, -0.12924987, ..., -0.12924987,
        -0.12924987, -0.12924987],
       [-0.05369671, -0.05369671, -0.05369671, ..., -0.05369671,
        -0.05369671, -0.05369671],
       ...,
       [-0.02050729, -0.02050729, -0.02050729, ..., -0.02050729,
        -0.02050729, -0.02050729],
       [-0.1287102 , -0.1287102 , -0.1287102 , ..., -0.1287102 ,
        -0.1287102 , -0.1287102 ],
       [ 2.6708041 , -0.3291959 , -0.3291959 , ..., -0.3291959 ,
        -0.3291959 , -0.3291959 ]])

In [245]:
U, sigma, M = svds(user_rating_normal, k=50)
sigma = np.diag(sigma)

In [246]:
#Filling the whole table 
users_predicted_ratings = np.dot(np.dot(U,sigma),M)
users_predictions = users_predicted_ratings + user_rating_mean.reshape(-1,1)
users_predictions

array([[ 4.34341552,  0.17609446, -0.22251787, ...,  0.02874148,
         0.04421855,  0.08061713],
       [ 0.74240161,  0.16952189,  0.33504243, ..., -0.10095573,
        -0.05411154, -0.13986882],
       [ 1.81925069,  0.45609283,  0.0905703 , ...,  0.01241187,
         0.01514886, -0.10982633],
       ...,
       [ 0.61902947, -0.16202893,  0.10676395, ..., -0.0133475 ,
        -0.03032223, -0.11487962],
       [ 1.50362034, -0.03631897, -0.16114051, ..., -0.01093524,
        -0.03863418, -0.1682341 ],
       [ 1.99538251, -0.18595327, -0.15600224, ..., -0.00663861,
         0.12706735,  0.28490808]])

In [247]:
preds = pd.DataFrame(users_predictions, columns = rating_mat.columns)
preds.head()

Movie_ID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.343416,0.176094,-0.222518,0.008381,0.012784,-0.225923,-0.075993,0.127179,-0.060585,-0.247791,...,0.015335,-0.003491,0.021153,-0.032425,-0.082901,0.359719,0.057426,0.028741,0.044219,0.080617
1,0.742402,0.169522,0.335042,0.000414,0.022431,1.352972,0.051335,0.071544,0.161474,1.56744,...,-0.056455,-0.013677,-0.010562,0.06269,-0.016251,0.156251,-0.418387,-0.100956,-0.054112,-0.139869
2,1.819251,0.456093,0.09057,-0.043048,-0.025732,-0.158879,-0.131815,0.099061,0.030494,0.73522,...,0.040515,-0.005301,0.012819,0.029367,0.020835,0.121648,0.076373,0.012412,0.015149,-0.109826
3,0.408268,-0.072707,0.039615,0.089507,0.041976,0.237459,-0.049474,0.009442,0.04549,-0.11151,...,0.008571,-0.005443,-0.008491,-0.003442,-0.084006,0.094446,0.057402,-0.02611,0.014817,-0.034223
4,1.574264,0.021322,-0.051054,0.246894,-0.032349,1.552466,-0.19966,-0.014985,-0.060449,0.450666,...,0.110088,0.046016,0.006935,-0.015966,-0.050051,-0.052738,0.507027,0.033798,0.125695,0.199119


In [323]:
def recommendations(predictions, user_id, movies, original_ratings, num_recommendations):
    user_rownum = user_id -1
    sorted_user_predictions = preds.iloc[user_rownum].sort_values(ascending=False)
    
    user_data = original_ratings[original_ratings.User_ID==user_id]
    user_full = user_data.merge(movies, how = 'left', left_on = 'Movie_ID', right_on = 'Movie_ID')
    user_full = user_full.sort_values(['Rating'],ascending = False)
    #user_full = pd.DataFrame(user_full)
    print ('User {0} has already rated {1} movies'.format(user_id, user_full.shape[0]))
    print('Now recommending the top {0} movies for the user {1}'.format(num_recommendations, user_id))
    
    rated_movies = movies['Movie_ID'].isin(user_full['Movie_ID'])
    unrated_movies = movies[~rated_movies]
    unrated_movies = unrated_movies.merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left', left_on = 'Movie_ID', right_on = 'Movie_ID').rename(columns = {user_rownum: 'predictions'})
    unrated_n_movies = pd.DataFrame(unrated_movies).sort_values('predictions',ascending = False)
    unrated_n_movies = unrated_n_movies[:num_recommendations] 
    print (unrated_n_movies)
    #return user_full, unrated_n_movies

In [324]:
recommendations(preds, 1310, movies, ratings,10)

User 1310 has already rated 24 movies
Now recommending the top 10 movies for the user 1310
      Movie_ID                                              Title  \
1617      1674                                     Witness (1985)   
1879      1961                                    Rain Man (1988)   
1186      1210  Star Wars: Episode VI - Return of the Jedi (1983)   
1215      1242                                       Glory (1989)   
1201      1225                                     Amadeus (1984)   
1272      1302                             Field of Dreams (1989)   
1219      1246                          Dead Poets Society (1989)   
1880      1962                          Driving Miss Daisy (1989)   
1876      1957                            Chariots of Fire (1981)   
1937      2020                          Dangerous Liaisons (1988)   

                                   Genres  predictions  
1617               Drama|Romance|Thriller     1.292197  
1879                               

In [291]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # User ID starts at 1, not 0
    sorted_user_predictions = preds.iloc[user_row_number].sort_values(ascending=False) # User ID starts at 1
    #print(sorted_user_predictions)
    # Get the user's data and merge in the movie information.
    user_data = original_ratings[original_ratings.User_ID == (userID)]
    user_full = (user_data.merge(movies, how = 'left', left_on = 'Movie_ID', right_on = 'Movie_ID').sort_values(['Rating'], ascending=False))

    #print ('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    #print ('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies[~movies['Movie_ID'].isin(user_full['Movie_ID'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'Movie_ID',
               right_on = 'Movie_ID').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )
    print(user_full)
    #return user_full, recommendations

In [292]:
recommend_movies(preds, 1310, movies, ratings,20)


    User_ID  Movie_ID  Rating  timeStamp  \
5      1310      2248       5  974781573   
6      1310      2620       5  974781573   
7      1310      3683       5  974781935   
15     1310      1704       5  974781573   
1      1310      1293       5  974781839   
12     1310      3101       4  974781573   
11     1310      1343       4  974781534   
20     1310      2000       4  974781892   
18     1310      3526       4  974781892   
17     1310      3360       4  974781935   
13     1310      3111       4  974782001   
23     1310      1097       4  974781534   
10     1310      1196       4  974781701   
9      1310      1185       4  974781839   
8      1310      3685       4  974781935   
4      1310      2243       4  974782001   
3      1310      1299       4  974781701   
16     1310       144       3  974781573   
19     1310      1960       3  974782001   
0      1310      2988       3  974781935   
14     1310      2313       2  974781839   
2      1310      1295       2  9