In [1]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.linear_model import LinearRegression

In [2]:
# Read movies.csv and ratings.csv files
ratings=pd.read_csv('./ml-latest/ratings.csv')
movies=pd.read_csv('./ml-latest/movies.csv')
print("ratings dimension:", ratings.shape)
print("movies dimension:",movies.shape)

ratings dimension: (27753444, 4)
movies dimension: (58098, 3)


In [3]:
# descriptive statistics
ratings['rating'].describe()

count    2.775344e+07
mean     3.530445e+00
std      1.066353e+00
min      5.000000e-01
25%      3.000000e+00
50%      3.500000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Data cleaning
del movies['genres']

In [6]:
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [8]:
# Data cleaning
del ratings['timestamp']

In [9]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5


In [10]:
print("Number of unique users:",len(ratings.userId.unique()))
print("Number of unique movies:",len(ratings.movieId.unique()))

Number of unique users: 283228
Number of unique movies: 53889


In [11]:
# Creating a DataFrame for showing the number of ratings per movie
ratings_per_movie=pd.DataFrame({'count':ratings.groupby('movieId').size()})
ratings_per_movie.index.names=['index']
print(ratings_per_movie.shape)
ratings_per_movie.head(10)

(53889, 1)


Unnamed: 0_level_0,count
index,Unnamed: 1_level_1
1,68469
2,27143
3,15585
4,2989
5,15474
6,28683
7,15301
8,1539
9,4449
10,33086


In [12]:
#Obtaining the most popular movies: step I
ratings_per_movie['movie title']= movies['title']
ratings_per_movie['movieId']= movies['movieId']
sorted_ratings_per_movie= ratings_per_movie.sort_values(by='count',ascending=False)
sorted_ratings_per_movie.head(10)

Unnamed: 0_level_0,count,movie title,movieId
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
318,97999,Strawberry and Chocolate (Fresa y chocolate) (...,321.0
356,97040,I Love Trouble (1994),360.0
296,92406,Priest (1994),299.0
593,87899,"Wild Bunch, The (1969)",599.0
2571,84545,Tarantula (1955),2656.0
260,81815,Ladybird Ladybird (1994),263.0
480,76451,Lassie (1994),484.0
527,71516,"Secret Garden, The (1993)",531.0
110,68803,Rumble in the Bronx (Hont faan kui) (1995),112.0
1,68469,Jumanji (1995),2.0


In [13]:
#Obtaining the most popular movies: step II
min_num_ratings=60
popular_movies = list(set(sorted_ratings_per_movie.query('count >= @min_num_ratings').index))
ratings_after_dropping_movies = ratings[ratings.movieId.isin(popular_movies)]
print('shape of original ratings data: ', ratings.shape)
print('shape of ratings data after dropping unpopular movies: ', ratings_after_dropping_movies.shape)
ratings_after_dropping_movies.head(10)

shape of original ratings data:  (27753444, 3)
shape of ratings data after dropping unpopular movies:  (27383603, 3)


Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5
5,1,1590,2.5
6,1,1591,1.5
7,1,2134,4.5
8,1,2478,4.0
9,1,2840,3.0


In [14]:
#print(popular_movies,type(popular_movies),len(popular_movies))

In [15]:
#Obtaining the most active users:step I
ratings_by_each_user=pd.DataFrame({'count':ratings.groupby('userId').size()})
sorted_ratings_by_each_user=ratings_by_each_user.sort_values(by='count',ascending=False)
sorted_ratings_by_each_user.head(10)

Unnamed: 0_level_0,count
userId,Unnamed: 1_level_1
123100,23715
117490,9279
134596,8381
212343,7884
242683,7515
111908,6645
77609,6398
63783,6346
172357,5868
141955,5810


In [16]:
#Obtaining the most active users:step II
min_num_ratings=50
active_users = list(set(sorted_ratings_by_each_user.query('count >= @min_num_ratings').index))
ratings_after_dropping_users = ratings_after_dropping_movies[ratings_after_dropping_movies.userId.isin(active_users)]
print('shape of original ratings data: ', ratings.shape)
print('shape of ratings data after dropping inactive users and unpopular movies: ', ratings_after_dropping_users.shape)
ratings_after_dropping_users.head(10)

shape of original ratings data:  (27753444, 3)
shape of ratings data after dropping inactive users and unpopular movies:  (24142558, 3)


Unnamed: 0,userId,movieId,rating
42,4,1,4.0
43,4,2,4.0
44,4,5,2.0
45,4,6,4.5
46,4,10,4.0
47,4,11,3.5
48,4,16,4.0
49,4,19,2.0
50,4,20,2.5
51,4,23,3.0


In [17]:
ratings_after_dropping_users.shape

(24142558, 3)

In [18]:
movies[movies['movieId'].isin(ratings_after_dropping_users['movieId'].unique())].reset_index().drop(columns=['index']).to_csv('./movies_MF.csv')

In [19]:
#pivot ratings into movie features
user_item_map=ratings_after_dropping_users.pivot(index='userId',columns='movieId', values='rating').fillna(0)

In [20]:
user_item_map

movieId,1,2,3,4,5,6,7,8,9,10,...,188797,189041,189203,189333,189363,189713,191351,191799,192283,192307
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,4.0,4.0,0.0,0.0,2.0,4.5,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,4.5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283215,4.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283219,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283224,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Dimensions of user-item map
user_item_map.shape

(109672, 12492)

In [22]:
# Preprocessing the matrix
# Convert the DataFrame to np.array
user_item_matrix= np.array(user_item_map)

In [23]:
# Obtaining a baseline. In our case, the baseline is the average ratings given by each user.
sum_of_rows= np.sum(user_item_matrix,axis=1).reshape(-1,1)
binary_filter=user_item_matrix!=0
denom=np.sum(binary_filter,axis=1).reshape(-1,1)
baseline=sum_of_rows/denom
#subtracting the baseline(average ratings given by each user) from all ratings given by that user (in each row)
user_item=user_item_matrix-baseline

In [24]:
print(user_item, user_item.shape)

[[ 0.60258152  0.60258152 -3.39741848 ... -3.39741848 -3.39741848
  -3.39741848]
 [-4.26388889 -4.26388889 -4.26388889 ... -4.26388889 -4.26388889
  -4.26388889]
 [ 0.80991736 -4.19008264 -4.19008264 ... -4.19008264 -4.19008264
  -4.19008264]
 ...
 [-3.47826087 -3.47826087 -3.47826087 ... -3.47826087 -3.47826087
  -3.47826087]
 [-3.7325228  -3.7325228   0.2674772  ... -3.7325228  -3.7325228
  -3.7325228 ]
 [ 0.12946429 -4.37053571 -4.37053571 ... -4.37053571 -4.37053571
  -4.37053571]] (109672, 12492)


In [25]:
#SVD Decomposition (Matrix Factorization Implementation)
# The user_item matrix can be decomposed in the form of multiplication of three matrices.
U, sigma, Vt = svds(user_item, k = 20) # k: the number of sigular values and vectors to compute

In [26]:
print("User-Feature Matrix:",U)
print("Singular Values:",sigma)
print("Item-Feature Matrix:",Vt)
print("User-Feature Matrix Dimension:",U.shape)
print("Singular Values Dimension:",sigma.shape)
print("Item-Feature Matrix Dimension:",Vt.shape)

User-Feature Matrix: [[-2.53614963e-03 -6.48509988e-04  1.04214193e-02 ...  1.20789945e-03
  -9.01120317e-03 -2.67907689e-03]
 [-4.27517274e-03 -2.72802837e-03  3.52730904e-03 ... -1.61292385e-03
   2.10467256e-03 -3.53240139e-03]
 [-1.51092992e-03 -4.63059803e-03  3.58378377e-03 ...  2.44027455e-03
   1.03571980e-03 -3.45929820e-03]
 ...
 [ 1.14930797e-03  6.77910809e-04  2.56663593e-03 ... -2.82999833e-05
   1.47959079e-03 -2.88238865e-03]
 [ 9.48977113e-03  4.24002265e-03  3.89595307e-03 ...  6.31441118e-03
  -1.45494105e-03 -3.03411228e-03]
 [ 1.97129788e-03 -4.49670391e-04 -7.76470759e-04 ...  1.27524824e-03
   2.04278697e-03 -3.60903798e-03]]
Singular Values: [  1020.09130717   1029.16646319   1037.25960102   1072.87579714
   1101.56491124   1165.54986344   1204.26263144   1232.30752631
   1271.07738074   1342.65355433   1490.22007913   1619.01328473
   1684.79095293   1970.38612045   2061.21352287   2127.44939441
   2626.61645334   3642.88151532   4800.46596051 134137.14821833]


In [27]:
# converting Singular Values to a diagonal matrix
sigma=np.diag(sigma)

In [28]:
# Prediction of ratings given by users to movies
predicted_ratings=np.dot(np.dot(U, sigma), Vt)+ baseline.reshape(-1,1)
print(predicted_ratings)

[[ 2.19969940e+00  2.34170383e+00  1.34381408e+00 ...  2.84922961e-03
  -2.74120817e-02 -1.71213048e-02]
 [ 3.73453979e-01 -2.31126424e-01 -7.02425566e-03 ... -3.96404353e-03
  -3.45294652e-03 -8.09271604e-03]
 [ 2.60891855e+00  3.83556954e-01 -2.85922893e-02 ... -5.07882958e-03
  -1.01949175e-02 -5.61703451e-03]
 ...
 [ 3.99959362e-01  1.67875802e-01  1.11789406e-01 ...  3.51738780e-03
  -2.75025986e-03  1.73119583e-03]
 [ 1.11621623e+00  1.59494121e+00  1.52210439e+00 ...  5.80776493e-03
  -7.84943996e-03  3.98986063e-03]
 [ 1.46940659e+00  2.14604875e-01  1.41099681e-01 ... -1.74789685e-02
  -6.14205218e-03 -1.16231423e-02]]


In [29]:
# DataFrame of the predicted ratings
predicted_ratings_df=pd.DataFrame(predicted_ratings,columns=user_item_map.columns)
predicted_ratings_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,188797,189041,189203,189333,189363,189713,191351,191799,192283,192307
0,2.199699,2.341704,1.343814,0.11865,0.97469,4.649554,1.021188,0.155206,0.720663,5.158198,...,-0.01762,-0.000431,-0.006543,-0.003339,-0.025859,-0.041787,-0.037447,0.002849,-0.027412,-0.017121
1,0.373454,-0.231126,-0.007024,0.003822,-0.030527,0.841827,-0.101487,0.026615,0.014583,0.067268,...,-0.017051,-0.001289,-0.013448,-0.041477,-0.031017,-0.003812,-0.009919,-0.003964,-0.003453,-0.008093
2,2.608919,0.383557,-0.028592,0.016269,0.011271,0.390003,0.192016,0.090569,-0.047459,0.00696,...,-0.00244,-0.007544,-0.00091,-0.00351,0.00427,-0.007628,0.001,-0.005079,-0.010195,-0.005617
3,2.872752,0.830177,-0.240863,-0.022046,-0.111348,0.397076,-0.275578,0.009555,-0.080811,0.177993,...,0.023363,0.010247,0.015578,0.061729,0.055714,0.027308,0.034076,0.004625,0.020014,0.013011
4,1.627505,0.166608,-0.067832,0.032684,-0.069847,0.783049,-0.013327,-0.023777,-0.044435,0.107534,...,-0.00733,-0.005669,-0.015123,-0.008737,-0.011968,-0.004934,-0.002106,-0.004269,-0.003729,-0.003424


In [30]:
predicted_ratings_df.index=user_item_map.index

In [31]:
predicted_ratings_df

movieId,1,2,3,4,5,6,7,8,9,10,...,188797,189041,189203,189333,189363,189713,191351,191799,192283,192307
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,2.199699,2.341704,1.343814,0.118650,0.974690,4.649554,1.021188,0.155206,0.720663,5.158198,...,-0.017620,-0.000431,-0.006543,-0.003339,-0.025859,-0.041787,-0.037447,0.002849,-0.027412,-0.017121
5,0.373454,-0.231126,-0.007024,0.003822,-0.030527,0.841827,-0.101487,0.026615,0.014583,0.067268,...,-0.017051,-0.001289,-0.013448,-0.041477,-0.031017,-0.003812,-0.009919,-0.003964,-0.003453,-0.008093
10,2.608919,0.383557,-0.028592,0.016269,0.011271,0.390003,0.192016,0.090569,-0.047459,0.006960,...,-0.002440,-0.007544,-0.000910,-0.003510,0.004270,-0.007628,0.001000,-0.005079,-0.010195,-0.005617
14,2.872752,0.830177,-0.240863,-0.022046,-0.111348,0.397076,-0.275578,0.009555,-0.080811,0.177993,...,0.023363,0.010247,0.015578,0.061729,0.055714,0.027308,0.034076,0.004625,0.020014,0.013011
15,1.627505,0.166608,-0.067832,0.032684,-0.069847,0.783049,-0.013327,-0.023777,-0.044435,0.107534,...,-0.007330,-0.005669,-0.015123,-0.008737,-0.011968,-0.004934,-0.002106,-0.004269,-0.003729,-0.003424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283215,1.363716,0.491086,1.342480,0.317990,1.287002,0.849581,1.655354,0.166215,0.586232,0.550200,...,-0.006398,-0.005078,-0.014071,-0.004886,-0.002674,0.002916,-0.001772,-0.007531,0.001970,-0.004257
283219,2.416781,0.039833,0.044170,0.004910,-0.002349,0.505088,0.087673,-0.028864,0.094970,0.184848,...,0.004059,-0.005007,-0.010580,0.023605,0.003113,-0.005529,-0.000465,-0.003699,-0.000336,0.003259
283222,0.399959,0.167876,0.111789,0.003918,0.093284,0.083512,0.057657,0.006906,0.025065,0.328329,...,0.001706,-0.001075,-0.001608,-0.005292,-0.000483,-0.001039,0.003411,0.003517,-0.002750,0.001731
283224,1.116216,1.594941,1.522104,0.390764,1.378236,1.349007,1.833072,0.248292,0.547499,1.514119,...,0.009013,0.001867,0.015572,0.014643,0.002198,-0.004514,-0.005489,0.005808,-0.007849,0.003990


In [32]:
def recommend_movies(predicted_ratings_df, userID, movies, ratings, num_recommendations=5):
    
    # Get the userId and obtain the row number 
    user_row_number = userID - 1 # UserID starts at 1, not 0(userID=1 in row=0)
    #Sort the predicted ratings of user in user_row_number in a descending order
    sorted_user_predictions = predicted_ratings_df.iloc[user_row_number].sort_values(ascending=False) # UserID starts at 1
    #obtain the actual ratings that user has given to different movies
    initial_id = predicted_ratings_df.iloc[user_row_number].name # actual user ID in the ratings DataFrame
    user_data = ratings[ratings.userId == (initial_id)]
#     user_data = ratings[ratings.userId == (userID)]
   # obtain an m*4 data frame :userID,movieID,rating,title which shows all the movies rated by a user with their movieID and title
   # the dataframe is sorted based on rating in a descending way
    user_full = (user_data.merge(movies, how = 'left', left_on = 'movieId', right_on = 'movieId').sort_values(['rating'], ascending=False))
   
    recommendations = (movies[~(movies.movieId).isin(user_full.movieId)]).merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left', left_on = 'movieId',
               right_on = 'movieId').rename(columns = {initial_id: 'Predictions'}).sort_values('Predictions', ascending = False).iloc[:num_recommendations, :-1]
                      

    return user_full, recommendations

In [33]:
already_rated, predictions = recommend_movies(predicted_ratings_df, 1, movies,ratings, 10)

In [34]:
already_rated

Unnamed: 0,userId,movieId,rating,title
550,4,5010,5.0,Black Hawk Down (2001)
175,4,1136,5.0,Monty Python and the Holy Grail (1975)
508,4,4306,5.0,Shrek (2001)
541,4,4901,5.0,Spy Game (2001)
552,4,5064,5.0,The Count of Monte Cristo (2002)
...,...,...,...,...
502,4,4232,0.5,Spy Kids (2001)
185,4,1220,0.5,"Blues Brothers, The (1980)"
489,4,4025,0.5,Miss Congeniality (2000)
75,4,339,0.5,While You Were Sleeping (1995)


In [35]:
predictions

Unnamed: 0,movieId,title
1299,1573,Face/Off (1997)
152,208,Waterworld (1995)
3444,4027,"O Brother, Where Art Thou? (2000)"
1930,2329,American History X (1998)
2678,3175,Galaxy Quest (1999)
2652,3147,"Green Mile, The (1999)"
11,21,Get Shorty (1995)
10748,49272,Casino Royale (2006)
430,553,Tombstone (1993)
3088,3624,Shanghai Noon (2000)


**calculating the error for peredictions:**

In [36]:
#Error evaluation
import random
random_numbers=np.random.permutation(predicted_ratings_df.shape[0])
rand=random_numbers[:36]
# taking 36 samples of the user-item matrix
samples=user_item_map.iloc[rand]
predicted_ratings_of_samples=predicted_ratings_df.iloc[rand]

In [37]:
samples

movieId,1,2,3,4,5,6,7,8,9,10,...,188797,189041,189203,189333,189363,189713,191351,191799,192283,192307
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
66194,4.5,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
280776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89354,5.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
273423,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270368,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
179977,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
240426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
149986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
169863,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# keeping only the ratings that the users have given to movies
samples[samples!=0]

movieId,1,2,3,4,5,6,7,8,9,10,...,188797,189041,189203,189333,189363,189713,191351,191799,192283,192307
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
66194,4.5,4.0,,,,4.0,,,,5.0,...,,,,,,,,,,
9902,,,,,,,,,,,...,,,,,,,,,,
280776,,,,,,,,,,,...,,,,,,,,,,
89354,5.0,,,,,4.0,,,,,...,,,,,,,,,,
273423,3.5,,,,,,,,,3.0,...,,,,,,,,,,
270368,,3.0,,,,,,,,,...,,,,,,,,,,
179977,,,,,,4.0,,,,,...,,,,,,,,,,
240426,,,,,,,,,,,...,,,,,,,,,,
149986,,,,,,,,,,,...,,,,,,,,,,
169863,,,,4.0,,,,,,,...,,,,,,,,,,


In [39]:
# making the index of predicted ratings exactly the same as actual ratings' index
predicted_ratings_of_samples.index=samples.index

In [40]:
predicted_ratings_of_samples

movieId,1,2,3,4,5,6,7,8,9,10,...,188797,189041,189203,189333,189363,189713,191351,191799,192283,192307
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
66194,4.268108,2.039987,0.291584,0.014852,0.279724,2.153234,0.367855,0.073187,0.065665,3.20693,...,0.01673,0.020694,0.032432,0.02543,0.008418,0.01741,0.005739,0.016894,0.009572,0.01439
9902,0.518596,-0.061814,-0.165454,0.024545,-0.084617,-0.051124,-0.058214,-0.066865,0.029827,-0.217087,...,0.046363,0.025538,0.063266,0.119085,0.072806,0.012853,0.016599,0.018134,0.011203,0.016029
280776,0.197295,0.156148,-0.038284,0.030597,0.014616,-0.187582,-0.010935,0.017051,-0.048646,-0.225353,...,-0.009964,-0.006976,-0.007765,-0.019591,-0.011311,-0.004874,-0.003934,-0.004632,-0.005199,-0.006432
89354,3.323796,-0.24724,-0.213053,0.147828,-0.289583,1.075949,0.028145,-0.125602,-0.243933,-0.895739,...,-0.024213,-0.031699,-0.018542,-0.01197,-0.025262,-0.016309,-0.029566,-0.034344,-0.023375,-0.027458
273423,2.431568,1.396874,1.158121,0.124157,0.992335,1.314075,0.279098,0.118158,0.227904,1.761801,...,0.045085,0.031423,0.035932,0.045049,0.030533,0.033689,0.022539,0.023647,0.025066,0.024226
270368,0.653422,1.21666,-0.313871,-0.022129,-0.256694,-0.178594,-0.502603,0.078059,-0.09516,0.905936,...,0.102108,0.087857,0.156122,0.146337,0.093476,0.044439,0.049822,0.101143,0.046788,0.074488
179977,0.747134,1.443455,0.419157,0.249647,0.506468,1.452756,0.361558,0.10808,0.209387,2.278366,...,0.004884,0.005114,0.009811,0.009117,0.001578,7e-06,-0.003337,0.002077,-0.000175,0.001317
240426,0.60975,0.069849,0.154474,-0.054309,0.07061,0.021465,-0.042998,-0.023184,-0.014929,0.32137,...,0.007,0.002496,0.005655,0.006429,0.005573,0.009303,0.004802,0.003263,0.004744,0.00229
149986,-0.250678,0.517182,-0.361496,0.037678,-0.015406,0.049758,0.354737,-0.045992,-0.117797,-0.584032,...,0.020778,0.029311,0.077796,0.103532,0.074309,0.033063,0.016271,0.01811,0.023899,0.007125
169863,0.851934,0.713461,0.413346,0.299659,0.525958,0.834019,0.723988,0.106708,0.197264,0.7804,...,-0.008537,-0.007417,-0.011254,-0.014987,-0.009052,-0.011961,-0.00585,-0.004868,-0.005615,-0.005681


In [41]:
# Calculating RMSE
RMSE_for_each_sample=np.sqrt( (( samples.subtract(predicted_ratings_of_samples) ) **2).mean(axis=1))
RMSE=RMSE_for_each_sample.sum()/36

In [42]:
print(RMSE_for_each_sample, RMSE)

userId
66194     0.387594
9902      0.419067
280776    0.193888
89354     0.541511
273423    0.474199
270368    0.643444
179977    0.202287
240426    0.290293
149986    0.609606
169863    0.264909
159971    0.420822
8862      0.641932
179082    0.370528
277714    0.186236
246496    0.241731
208240    0.216442
98582     0.298671
164074    0.483401
177431    0.323794
238934    0.590638
25160     0.248958
205539    0.542510
68671     0.337701
11885     0.405622
2631      0.312978
198293    0.375600
24533     0.458829
91635     0.631552
184966    0.453681
65541     0.506522
94757     0.215170
199522    0.343877
156741    0.397214
262741    0.480649
58000     0.248351
53209     0.379503
dtype: float64 0.39276964642269063


In [43]:
#MAE
MAE_for_each_user=((samples.subtract(predicted_ratings_of_samples)).abs()).mean(axis=1)
MAE=(MAE_for_each_user.sum())/36
print(MAE_for_each_user,MAE)


userId
66194     0.099101
9902      0.106513
280776    0.036230
89354     0.185219
273423    0.156141
270368    0.267930
179977    0.042253
240426    0.064889
149986    0.226561
169863    0.055044
159971    0.123157
8862      0.250090
179082    0.088498
277714    0.037414
246496    0.040220
208240    0.035698
98582     0.056673
164074    0.149114
177431    0.075495
238934    0.227081
25160     0.059076
205539    0.215152
68671     0.072670
11885     0.123839
2631      0.073046
198293    0.111346
24533     0.122992
91635     0.250825
184966    0.163331
65541     0.169041
94757     0.049208
199522    0.084967
156741    0.118123
262741    0.194059
58000     0.057560
53209     0.099819
dtype: float64 0.11912162278139966


**using the values for recommending movies to a new user that is not in the data set:**

In [44]:
movies_MF=pd.read_csv('./movies_MF.csv')

In [45]:
new_user_movies = np.random.randint(1,12492, 50)

In [46]:
new_user_movies

array([ 7263, 10612,  7422,  5835, 10619,  1170,  7307, 10867,  4131,
        9735,  5318,   227,  7025,  1297,  7576,  4594,  3133,  6814,
       12221, 11911,   965,  8118,  4856,  7987,  2006,  8515,  8130,
        4898,  5330, 11793,  9283,  5902,  7357,    35, 12060,  1500,
        4552,    89,  9621,  2408, 12048,  2876,  2992,  9133,  3053,
       11437,  5659,  4784, 12019, 11967])

In [47]:
selected_movies_features = Vt[:,new_user_movies].T  #[n_movies(10) X n_features(20)]

In [48]:
all_other_movies = Vt[:,~np.isin(np.arange(12492), new_user_movies)].T

In [49]:
selected_movies_scores = np.random.randint(1,6,len(new_user_movies))
selected_movies_scores

array([2, 2, 2, 4, 1, 3, 5, 5, 3, 5, 1, 4, 5, 2, 2, 2, 2, 3, 2, 1, 2, 4,
       3, 5, 4, 1, 3, 2, 2, 2, 2, 5, 2, 5, 5, 3, 2, 1, 3, 4, 3, 2, 3, 3,
       5, 5, 5, 5, 1, 4])

In [50]:
selected_movies_scores.shape, selected_movies_features.shape

((50,), (50, 20))

In [51]:
reg = LinearRegression()

In [52]:
reg.fit(selected_movies_features, selected_movies_scores)

LinearRegression()

In [53]:
sorted_recommended_movies = pd.Series(reg.predict(all_other_movies)).sort_values(ascending= False)

In [54]:
sorted_recommended_movies

482      30.342623
5783     27.309351
551      27.258234
1690     25.654577
428      24.864639
           ...    
4401    -97.811474
1067   -104.356887
2262   -109.118592
1054   -112.364294
243    -135.886611
Length: 12442, dtype: float64

In [55]:
sorted_recommended_movies_indices =sorted_recommended_movies
min_val = sorted_recommended_movies_indices.min()
max_val = sorted_recommended_movies_indices.max()
normalized_scores = (sorted_recommended_movies_indices - min_val) / (max_val - min_val)
# print("normalized_scores:", normalized_scores)

normalized_scores, max_val, min_val

(482     1.000000
 5783    0.981752
 551     0.981445
 1690    0.971798
 428     0.967046
           ...   
 4401    0.229052
 1067    0.189676
 2262    0.161031
 1054    0.141505
 243     0.000000
 Length: 12442, dtype: float64,
 30.342622752811906,
 -135.8866112263658)

In [56]:
final_recommended_movies = movies_MF.iloc[list(normalized_scores.iloc[0:10].index)]
final_recommended_movies['% Match'] = (normalized_scores.iloc[0:10]*100).apply(lambda x: str(x))
final_recommended_movies = final_recommended_movies.loc[:,['title', '% Match']]
final_recommended_movies.columns = ['Movie Title', '% Match']
final_recommended_movies

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_recommended_movies['% Match'] = (normalized_scores.iloc[0:10]*100).apply(lambda x: str(x))


Unnamed: 0,Movie Title,% Match
482,North (1994),100.0
5783,All I Want (Try Seventeen) (2002),98.17524771051583
551,Dear Diary (Caro Diario) (1994),98.1444969635008
1690,Rocky (1976),97.17977028437652
428,Flesh and Bone (1993),96.7045607514912
560,Snow White and the Seven Dwarfs (1937),96.5250327114233
215,Death and the Maiden (1994),96.39430016810604
1726,Prom Night IV: Deliver Us From Evil (1992),95.59442671972072
4161,"Cage aux Folles, La (1978)",95.41890086136571
3554,Before Night Falls (2000),95.37543778501608
