https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-recommendation-engine-python/

In [7]:
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os

In [92]:
# Column-name lists for the three tables. Only i_cols is passed to read_csv below;
# u_cols and r_cols are kept as plain module-level names.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Users: the CSV's first row is used as the header; the "Unnamed: 0" column
# (presumably a saved index -- confirm against the file) is dropped.
users = pd.read_csv(
    '../matrix_factorization/data/ml-100k/includes_team_users.csv'
).drop(columns="Unnamed: 0")

# Ratings: same treatment as the users file.
ratings = pd.read_csv(
    '../matrix_factorization/data/ml-100k/includes_team_ratings.csv'
).drop(columns="Unnamed: 0")

# Items: pipe-separated file without a header row, so column names come from i_cols.
items = pd.read_csv(
    '../matrix_factorization/data/ml-100k/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)

### Users

In [93]:
print(users.shape)
users.head()

(948, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [94]:
users.loc[users["user_id"]>940]

Unnamed: 0,user_id,age,sex,occupation,zip_code
940,941,20,M,student,97229
941,942,48,F,librarian,78209
942,943,22,M,student,77841
943,944,33,F,other,78744
944,945,43,M,engineer,78739
945,946,27,M,healthcare,78613
946,947,25,M,scientist,78726
947,948,28,F,student,78748


### Ratings

In [95]:
print(ratings.shape)
ratings.head()

(100105, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949.0
1,186,302,3,891717742.0
2,22,377,1,878887116.0
3,244,51,2,880606923.0
4,166,346,1,886397596.0


### Items

In [96]:
print(items.shape)
items.head()

(1682, 24)


Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Loading in train and test provided by GroupLens where the test data has 10 ratings for each user

In [97]:
# Column layout applied to both tab-separated split files (they have no header row).
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_train = pd.read_csv(
    '../matrix_factorization/data/ml-100k/ua.base',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)
ratings_test = pd.read_csv(
    '../matrix_factorization/data/ml-100k/ua.test',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# Show both shapes as the cell output.
(ratings_train.shape, ratings_test.shape)

((90570, 4), (9430, 4))

### Building collaborative filtering model from scratch

We will recommend movies based on user-user similarity and item-item similarity. For that, first we need to calculate the number of unique users and movies

In [98]:
# Number of distinct user ids and movie ids present in the ratings table.
n_users = len(ratings["user_id"].unique())
n_items = len(ratings["movie_id"].unique())

In [99]:
n_users

948

In [100]:
n_items

1682

Now, we will create a user-item matrix which can be used to calculate the similarity between users and items

In [101]:
# Dense (n_users x n_items) matrix; cells that never receive a rating stay 0.
data_matrix = np.zeros((n_users, n_items))
for row in ratings.itertuples(index=False):
    # Ids are shifted by one to obtain 0-based matrix positions.
    data_matrix[row.user_id - 1, row.movie_id - 1] = row.rating

Now, we will calculate the similarity. We can use the `pairwise_distances` function from sklearn with `metric='cosine'`. Note that this function returns the cosine *distance*, i.e. 1 − cosine similarity.

In [102]:
from sklearn.metrics.pairwise import pairwise_distances 
# pairwise_distances(metric='cosine') returns the cosine *distance* (1 - cosine similarity).
# Convert it to a similarity so that more alike users/items get the larger weight
# when these matrices are used as neighbour weights.
user_similarity = 1 - pairwise_distances(data_matrix, metric='cosine')
item_similarity = 1 - pairwise_distances(data_matrix.T, metric='cosine')

This gives us the item-item and user-user similarity in an array form. The next step is to make predictions based on these similarities. Let’s define a function to do just that.

In [103]:
def predict(ratings, similarity, type='user'):
    """Predict a full rating matrix from neighbour similarities.

    Parameters
    ----------
    ratings : 2-D numpy array, one row per user and one column per item.
    similarity : square 2-D numpy array of neighbour weights; user-by-user when
        ``type='user'``, item-by-item when ``type='item'``.
    type : ``'user'`` or ``'item'``.

    Returns
    -------
    numpy array with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``.

    Notes
    -----
    Each prediction is divided by the row sums of ``|similarity|``; a row whose
    absolute weights sum to zero therefore yields nan/inf entries.
    """
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # np.newaxis turns the per-user means into a column so they broadcast across items
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        weight_totals = np.abs(similarity).sum(axis=1)[:, np.newaxis]
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / weight_totals
    elif type == 'item':
        # Row vector of per-item weight totals, broadcast across users
        weight_totals = np.abs(similarity).sum(axis=1)[np.newaxis, :]
        pred = ratings.dot(similarity) / weight_totals
    else:
        # Previously this fell through and failed with an unbound local variable
        raise ValueError("type must be 'user' or 'item', got %r" % (type,))
    return pred

Finally, we will make predictions based on user similarity and item similarity

In [104]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')


### Building a recommendation engine using matrix factorization

Let us define a function to predict the ratings given by the user to all the movies which are not rated by him/her

In [105]:
class MF():
    """Biased matrix factorization fitted with stochastic gradient descent.

    A rating is modelled as ``b + b_u[i] + b_i[j] + P[i] . Q[j]``: ``b`` is the mean
    of the non-zero entries of ``R``, ``b_u`` / ``b_i`` are per-user / per-item biases
    and ``P`` / ``Q`` are the user / item latent-factor matrices with ``K`` columns.
    Only strictly positive entries of ``R`` are used as training samples.
    """

    def __init__(self, R, K, alpha, beta, iterations):
        """Store the inputs; no training happens here.

        R          -- 2-D numpy array of ratings (users x items)
        K          -- number of latent features
        alpha      -- SGD learning rate
        beta       -- regularization strength (applied to biases and latent factors)
        iterations -- number of passes over the training samples
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        """Fit the model and return the error history.

        Returns a list of ``(iteration_index, error)`` tuples where ``error`` is the
        value of ``mse()`` after that pass. Progress is printed every 20 iterations.
        """
        # Latent factors start as small normal noise with standard deviation 1/K
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Biases start at zero; the global bias is the mean of the non-zero ratings
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # Training samples (user, item, rating) for every positive entry, in row-major
        # order. np.nonzero replaces a Python double loop over the whole matrix.
        rows, cols = np.nonzero(self.R > 0)
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, cols)]

        # One shuffled SGD pass per iteration, recording the error after each pass
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            mse = self.mse()
            training_process.append((i, mse))
            if (i+1) % 20 == 0:
                print("Iteration: %d ; error = %.4f" % (i+1, mse))

        return training_process

    def mse(self):
        """Return the training error over the non-zero entries of ``R``.

        NOTE: despite the name, this is the square root of the *summed* squared
        error (it is not divided by the number of ratings). The value is kept
        unchanged so reported numbers stay comparable between runs.
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        residuals = self.R[xs, ys] - predicted[xs, ys]
        return np.sqrt(np.sum(residuals ** 2))

    def sgd(self):
        """Run one SGD pass over ``self.samples``, updating biases and factors in place."""
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Keep the pre-update user factors: the gradient for Q[j] must use the
            # same P[i] that produced `e`, not the row that was just modified.
            p_i_old = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * p_i_old - self.beta * self.Q[j, :])

    def get_rating(self, i, j):
        """Return the predicted rating of user index ``i`` for item index ``j``."""
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :])
        return prediction

    def full_matrix(self):
        """Return the predicted rating for every (user, item) pair as a 2-D array."""
        # Uses this instance's parameters (the earlier version read a global `mf`).
        # b_u is broadcast as a column, b_i as a row.
        return self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)

#### Now we have a class that can predict the ratings. Its inputs are:

* R – The user-movie rating matrix
* K – Number of latent features
* alpha – Learning rate for stochastic gradient descent
* beta – Regularization parameter (applied to both the bias terms and the latent factors)
* iterations – Number of iterations to perform stochastic gradient descent


We have to convert the user-item ratings to matrix form. This can be done using the pandas `pivot` method.

In [106]:
# Wide user x movie table as a plain numpy array; (user, movie) pairs without a rating become 0.
R = (
    ratings
    .pivot(index='user_id', columns='movie_id', values='rating')
    .fillna(0)
    .values
)

In [107]:
R

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 4., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [108]:
R.shape



(948, 1682)

fillna(0) will fill all the missing ratings with 0. Now we have the R matrix. We can initialize the number of latent features, but the number of these features must be less than or equal to the number of original features.

Now let us predict all the missing ratings. Let’s take K=20, alpha=0.001, beta=0.01 and iterations=800

In [109]:
# Fit the factorization on R, then print the reconstructed rating matrix.
mf = MF(R, K=20, alpha=0.001, beta=0.01, iterations=800)
training_process = mf.train()
print("\nP x Q:")
print(mf.full_matrix(), end="\n\n")


Iteration: 20 ; error = 296.3795
Iteration: 40 ; error = 291.3158
Iteration: 60 ; error = 287.8814
Iteration: 80 ; error = 282.4162
Iteration: 100 ; error = 273.7857
Iteration: 120 ; error = 263.3715
Iteration: 140 ; error = 252.2104
Iteration: 160 ; error = 241.7021
Iteration: 180 ; error = 232.5719
Iteration: 200 ; error = 224.9426
Iteration: 220 ; error = 218.6685
Iteration: 240 ; error = 213.4999
Iteration: 260 ; error = 209.1943
Iteration: 280 ; error = 205.5582
Iteration: 300 ; error = 202.4522
Iteration: 320 ; error = 199.7718
Iteration: 340 ; error = 197.4400
Iteration: 360 ; error = 195.3953
Iteration: 380 ; error = 193.5901
Iteration: 400 ; error = 191.9852
Iteration: 420 ; error = 190.5504
Iteration: 440 ; error = 189.2594
Iteration: 460 ; error = 188.0914
Iteration: 480 ; error = 187.0295
Iteration: 500 ; error = 186.0594
Iteration: 520 ; error = 185.1689
Iteration: 540 ; error = 184.3497
Iteration: 560 ; error = 183.5914
Iteration: 580 ; error = 182.8881
Iteration: 600 ; e

TODO: having made the predictions for the missing ratings, we ought to backfill these values into the ratings dataframe

TODO: We have created our recommendation engine. Next we have to evaluate our recommendation engine

In [110]:
prediction_matrix = mf.full_matrix()

In [132]:
# prediction_matrix = np.loadtxt("team_prediction_matrix.txt")

In [112]:
prediction_matrix.shape

(948, 1682)

In [113]:
np.savetxt("team_prediction_matrix.txt", prediction_matrix)

In [114]:
# Work on a copy: plain assignment would make prediction_df an alias of `ratings`,
# so every column added below would silently appear on `ratings` as well.
prediction_df = ratings.copy()
# Placeholder column (object dtype because of the empty string); it is overwritten
# with the model's predictions in a later cell.
prediction_df["predicted_rating"] = ""
prediction_df.head()



Unnamed: 0,user_id,movie_id,rating,unix_timestamp,predicted_rating
0,196,242,3,881250949.0,
1,186,302,3,891717742.0,
2,22,377,1,878887116.0,
3,244,51,2,880606923.0,
4,166,346,1,886397596.0,


In [115]:
def getRatingPrediction(user_id, movie_id, matrix=None):
    """Return the predicted rating for a 1-based (user_id, movie_id) pair.

    Parameters
    ----------
    user_id, movie_id : ids starting at 1; integral floats such as 196.0 are
        accepted (they appear when a DataFrame row is upcast to float).
    matrix : optional numpy array of predictions (users x movies). Defaults to
        the module-level ``prediction_matrix``.

    Raises
    ------
    IndexError
        If either id is below 1 (which would otherwise wrap around to the end
        of the matrix) or beyond the matrix bounds.
    """
    if matrix is None:
        matrix = prediction_matrix
    row, col = int(user_id) - 1, int(movie_id) - 1
    if row < 0 or col < 0:
        raise IndexError("user_id and movie_id must be >= 1, got (%r, %r)" % (user_id, movie_id))
    return matrix.item(row, col)


In [116]:
print(prediction_matrix.item(196-1,242-1))
print(getRatingPrediction(196,242))

3.452013453560657
3.452013453560657


In [117]:
prediction_matrix.item(0,0)

4.378953879620689

In [118]:
print(getRatingPrediction(1,1))

4.378953879620689


In [119]:
# Look up all (user, movie) pairs with one array-indexing operation instead of a
# Python call per row; ids are shifted by one to obtain 0-based matrix positions.
prediction_df["predicted_rating"] = prediction_matrix[
    prediction_df["user_id"].astype(int).values - 1,
    prediction_df["movie_id"].astype(int).values - 1,
]

In [120]:
prediction_df["rating_diff"] = prediction_df["predicted_rating"] - prediction_df["rating"]
prediction_df["ratings_diff_sqrd"] = prediction_df["rating_diff"]**2

In [121]:
prediction_df.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,predicted_rating,rating_diff,ratings_diff_sqrd
0,196,242,3,881250949.0,3.452013,0.452013,0.204316
1,186,302,3,891717742.0,3.326804,0.326804,0.106801
2,22,377,1,878887116.0,1.210723,0.210723,0.044404
3,244,51,2,880606923.0,3.375187,1.375187,1.89114
4,166,346,1,886397596.0,1.176911,0.176911,0.031297


In [122]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [131]:
# prediction_df.to_csv("team_predicted_ratings.csv")



In [124]:
# Users joined (inner join on user_id) with the number of ratings each one gave,
# exposed as the "#_ratings" column.
user_rating_predictions = (
    users
    .merge(ratings.groupby("user_id")["rating"].count(), on="user_id")
    .rename(columns={"rating": "#_ratings"})
)




In [125]:
user_rating_predictions.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,#_ratings
0,1,24,M,technician,85711,272
1,2,53,F,other,94043,62
2,3,23,M,writer,32067,54
3,4,24,M,technician,43537,24
4,5,33,F,other,15213,175


In [126]:
def getRMSE(user_id, predictions=None):
    """Return the root-mean-squared prediction error for one user.

    Parameters
    ----------
    user_id : value matched against the ``user_id`` column.
    predictions : optional DataFrame with ``user_id`` and ``ratings_diff_sqrd``
        columns. Defaults to the module-level ``prediction_df``.

    Returns
    -------
    float -- sqrt of the mean of the user's squared errors, or NaN when the user
    has no rows (previously this case raised ZeroDivisionError).
    """
    if predictions is None:
        predictions = prediction_df
    squared_errors = predictions.loc[predictions["user_id"] == user_id, "ratings_diff_sqrd"]
    if len(squared_errors) == 0:
        return np.nan
    return np.sqrt(squared_errors.sum() / len(squared_errors))

In [127]:
# Per-user RMSE, computed by calling getRMSE once for each user_id.
user_rating_predictions["RMSE"] = user_rating_predictions["user_id"].map(getRMSE)

In [128]:
user_rating_predictions.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,#_ratings,RMSE
0,1,24,M,technician,85711,272,0.55617
1,2,53,F,other,94043,62,0.503705
2,3,23,M,writer,32067,54,0.604995
3,4,24,M,technician,43537,24,0.420339
4,5,33,F,other,15213,175,0.526288


In [136]:
# user_rating_predictions.sort_values("RMSE", ascending=False)

In [133]:
# user_rating_predictions.to_csv("user_data.csv")

In [129]:
prediction_matrix

array([[4.37895388, 3.41669637, 3.98394345, ..., 2.99200517, 3.5830733 ,
        3.3673083 ],
       [3.6734386 , 2.32103231, 3.14710705, ..., 3.29084506, 3.71790979,
        3.42980644],
       [2.79722513, 3.85942424, 2.60657082, ..., 2.38859982, 2.89745054,
        2.90802144],
       ...,
       [2.53084144, 3.95186772, 5.66505383, ..., 3.71433605, 4.12191136,
        4.00202215],
       [4.78910937, 3.31031068, 3.83030791, ..., 3.24637025, 3.33495889,
        3.12234058],
       [4.83863312, 3.85218198, 4.51181628, ..., 3.93211138, 3.94055163,
        3.88172064]])

In [130]:
prediction_matrix[1,]

array([3.6734386 , 2.32103231, 3.14710705, ..., 3.29084506, 3.71790979,
       3.42980644])

In [87]:
def top5rec(userId):
    """Return the five movies with the highest predicted rating for a user.

    Reads the module-level ``prediction_matrix`` (row ``userId - 1``) and ``items``.
    Movies the user has already rated are not excluded.

    Returns a list of dicts with keys ``movie_id``, ``movie_title``,
    ``release_date`` and ``IMDb_URL``, ordered from highest to lowest prediction.
    """
    # argsort of the negated row gives column positions in descending score order
    top_positions = np.argsort(-prediction_matrix[userId - 1,])[0:5]
    movie_list = []
    for position in top_positions:
        # Column position p is looked up as movie id p + 1
        matches = items.loc[items["movie id"] == position + 1]
        # Build the records directly; DataFrame.append (used before) is removed in
        # pandas 2.0 and copied the whole frame on every call.
        for _, row in matches.iterrows():
            movie_list.append({
                "movie_id": row["movie id"],
                "movie_title": row["movie title"],
                "release_date": row["release date"],
                "IMDb_URL": row["IMDb URL"],
            })
    return movie_list

In [78]:
top5rec(944)

[{'movie_id': 924,
  'movie_title': 'White Squall (1996)',
  'release_date': '01-Jan-1996',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?White%20Squall%20(1996)'},
 {'movie_id': 289,
  'movie_title': 'Evita (1996)',
  'release_date': '25-Dec-1996',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Evita%20(1996)'},
 {'movie_id': 155,
  'movie_title': 'Dirty Dancing (1987)',
  'release_date': '01-Jan-1987',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Dirty%20Dancing%20(1987)'},
 {'movie_id': 143,
  'movie_title': 'Sound of Music, The (1965)',
  'release_date': '01-Jan-1965',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Sound%20of%20Music,%20The%20(1965)'},
 {'movie_id': 966,
  'movie_title': 'Affair to Remember, An (1957)',
  'release_date': '01-Jan-1957',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Affair%20to%20Remember,%20An%20(1957)'}]

In [110]:
def top5rec1(userId):
    """Return the five best-predicted movies the user has not rated yet.

    Reads the module-level ``ratings``, ``prediction_matrix`` (row ``userId - 1``)
    and ``items``.

    Returns a list of dicts with keys ``movie_id``, ``movie_title``,
    ``release_date`` and ``IMDb_URL``, ordered from highest to lowest prediction.
    """
    # 0-based column positions of the movies this user already rated
    rated_positions = ratings.loc[ratings["user_id"] == userId, "movie_id"].values - 1
    # Column positions in descending order of predicted rating
    ranked_positions = np.argsort(-prediction_matrix[userId - 1,])
    # Drop the rated positions. The previous loop tested for `movie - 1` but removed
    # `movie`, which discarded the neighbouring title instead of the rated one.
    unseen_positions = ranked_positions[~np.isin(ranked_positions, rated_positions)]

    movie_list = []
    for position in unseen_positions[0:5]:
        # Column position p is looked up as movie id p + 1
        matches = items.loc[items["movie id"] == position + 1]
        # Build the records directly; DataFrame.append (used before) is removed in
        # pandas 2.0 and copied the whole frame on every call.
        for _, row in matches.iterrows():
            movie_list.append({
                "movie_id": row["movie id"],
                "movie_title": row["movie title"],
                "release_date": row["release date"],
                "IMDb_URL": row["IMDb URL"],
            })
    return movie_list