https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-recommendation-engine-python/

In [21]:
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os

In [22]:
# All raw inputs live under the same MovieLens-100k directory.
_data_dir = '../matrix_factorization/data/ml-100k/'

# Users: the CSV carries its own header row, so `u_cols` is not passed to
# read_csv here. The "Unnamed: 0" column is discarded (presumably a saved
# index from an earlier to_csv -- confirm).
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(_data_dir + 'includes_team_users.csv').drop(columns="Unnamed: 0")

# Ratings: same treatment as the users file; `r_cols` is likewise not passed
# to read_csv in this cell.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(_data_dir + 'includes_team_ratings.csv').drop(columns="Unnamed: 0")

# Items: pipe-separated file read with explicit column names
# (id, title, dates, URL, then one flag column per genre).
i_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
items = pd.read_csv(_data_dir + 'u.item', sep='|', names=i_cols, encoding='latin-1')

### Users

In [23]:
# Print the (rows, columns) shape of the users table, then display its first rows.
print(users.shape)
users.head()

(948, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [24]:
# Display only the users whose user_id is greater than 940.
users[users["user_id"].gt(940)]

Unnamed: 0,user_id,age,sex,occupation,zip_code
940,941,20,M,student,97229
941,942,48,F,librarian,78209
942,943,22,M,student,77841
943,944,33,F,other,78744
944,945,43,M,engineer,78739
945,946,27,M,healthcare,78613
946,947,25,M,scientist,78726
947,948,28,F,student,78748


### Ratings

In [25]:
# Print the (rows, columns) shape of the ratings table, then display its first rows.
print(ratings.shape)
ratings.head()

(100105, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949.0
1,186,302,3,891717742.0
2,22,377,1,878887116.0
3,244,51,2,880606923.0
4,166,346,1,886397596.0


### Items

In [26]:
# Print the (rows, columns) shape of the items table, then display its first rows.
print(items.shape)
items.head()

(1682, 24)


Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Loading in train and test provided by GroupLens where the test data has 10 ratings for each user

In [27]:
# Column names are supplied explicitly to read_csv for both split files.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

# Both files are tab-separated and read with identical options.
_split_options = dict(sep='\t', names=r_cols, encoding='latin-1')
ratings_train = pd.read_csv('../matrix_factorization/data/ml-100k/ua.base', **_split_options)
ratings_test = pd.read_csv('../matrix_factorization/data/ml-100k/ua.test', **_split_options)

# Cell output: the shapes of the train and test frames as a tuple.
(ratings_train.shape, ratings_test.shape)

((90570, 4), (9430, 4))

### Building collaborative filtering model from scratch

We will recommend movies based on user-user similarity and item-item similarity. For that, we first need to calculate the number of unique users and movies.

In [28]:
# Number of distinct user ids and movie ids that appear in the ratings table.
n_users = len(ratings['user_id'].unique())
n_items = len(ratings['movie_id'].unique())

In [29]:
# Display the number of distinct users found in `ratings`.
n_users

948

In [30]:
# Display the number of distinct movies found in `ratings`.
n_items

1682

Now, we will create a user-item matrix which can be used to calculate the similarity between users and items

In [31]:
# Dense user x item matrix; cells stay 0 where no rating is written.
data_matrix = np.zeros((n_users, n_items))

# itertuples() yields (index, <columns in order>...). The first three columns are
# used as user id, movie id and rating; ids are shifted by one to become
# zero-based row/column positions.
for _idx, user, movie, score, *_extra in ratings.itertuples():
    data_matrix[user - 1, movie - 1] = score

Now, we will calculate the similarity. We can use the `pairwise_distances` function from sklearn with the cosine metric.

In [32]:
from sklearn.metrics.pairwise import pairwise_distances

# pairwise_distances(metric='cosine') returns the cosine *distance*
# (1 - cosine similarity). Convert it so that a larger value really means
# "more similar" before the matrices are used as neighbour weights.
user_similarity = 1 - pairwise_distances(data_matrix, metric='cosine')    # user x user
item_similarity = 1 - pairwise_distances(data_matrix.T, metric='cosine')  # item x item

This gives us the item-item and user-user similarity in an array form. The next step is to make predictions based on these similarities. Let’s define a function to do just that.

In [33]:
def predict(ratings, similarity, type='user'):
    """Predict a full rating matrix from a similarity matrix.

    Parameters
    ----------
    ratings : 2-D numpy array of shape (users, items).
    similarity : square 2-D numpy array; (users, users) when ``type='user'``,
        (items, items) when ``type='item'``.
    type : ``'user'`` or ``'item'``.

    Returns
    -------
    2-D numpy array with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``.

    Notes
    -----
    A row of ``similarity`` whose absolute values sum to zero leads to a
    division by zero (NumPy yields nan/inf); that case is not handled here.
    """
    if type == 'user':
        # Row means are taken over every column of `ratings`, zeros included.
        mean_user_rating = ratings.mean(axis=1)
        # np.newaxis turns the means into a column so they broadcast across items.
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        # Per-user normaliser: sum of absolute similarities, as a column vector.
        weight_totals = np.abs(similarity).sum(axis=1)[:, np.newaxis]
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / weight_totals
    elif type == 'item':
        # Per-item normaliser as a row vector so it broadcasts across users.
        weight_totals = np.abs(similarity).sum(axis=1)[np.newaxis, :]
        pred = ratings.dot(similarity) / weight_totals
    else:
        # Previously this fell through to `return pred` with `pred` unbound.
        raise ValueError("type must be 'user' or 'item', got %r" % (type,))
    return pred

Finally, we will make predictions based on user similarity and item similarity

In [34]:
# User-based prediction: weights come from the user x user matrix.
user_prediction = predict(ratings=data_matrix, similarity=user_similarity, type='user')
# Item-based prediction: weights come from the item x item matrix.
item_prediction = predict(ratings=data_matrix, similarity=item_similarity, type='item')


### Building a recommendation engine using matrix factorization

Let us define a class that predicts the ratings a user would give to all the movies they have not yet rated.

In [35]:
class MF():
    """Biased matrix factorization trained with stochastic gradient descent.

    A rating is modelled as ``b + b_u[i] + b_i[j] + P[i] . Q[j]`` where ``b`` is
    the mean of the non-zero entries of ``R``, ``b_u`` / ``b_i`` are per-user and
    per-item biases and ``P`` / ``Q`` are the latent factor matrices.
    Entries of ``R`` equal to 0 are treated as "not rated".
    """

    def __init__(self, R, K, alpha, beta, iterations):
        """Store the rating matrix and the hyper-parameters.

        Parameters
        ----------
        R : 2-D numpy array (users x items); 0 marks a missing rating.
        K : number of latent features.
        alpha : learning rate (step size of every SGD update).
        beta : regularization strength applied to biases and latent factors.
        iterations : number of passes over the observed ratings.
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        """Initialise the parameters and run SGD for ``self.iterations`` passes.

        Returns
        -------
        list of ``(iteration_index, error)`` tuples, one per pass, where
        ``error`` is the value returned by :meth:`mse` after that pass.
        """
        # Latent factors start as small Gaussian noise (std = 1/K).
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Biases start at zero; the global bias is the mean of the observed ratings.
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # Training samples: one (user, item, rating) triple per positive entry,
        # collected in row-major order without a Python loop over every cell.
        rows, cols = np.nonzero(self.R > 0)
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, cols)]

        training_process = []
        for epoch in range(self.iterations):
            # Visit the samples in a fresh random order on every pass.
            np.random.shuffle(self.samples)
            self.sgd()
            mse = self.mse()
            training_process.append((epoch, mse))
            if (epoch + 1) % 20 == 0:
                print("Iteration: %d ; error = %.4f" % (epoch + 1, mse))

        return training_process

    def mse(self):
        """Return the training error over the non-zero entries of ``R``.

        Despite the name this is the square root of the *summed* squared error
        (no division by the number of ratings); the value is kept as-is so that
        reported numbers stay comparable with earlier runs.
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        residuals = self.R[xs, ys] - predicted[xs, ys]
        return np.sqrt(np.sum(residuals ** 2))

    def sgd(self):
        """Run one SGD pass over ``self.samples``, updating biases, P and Q in place."""
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            # Bias updates: move along the error, shrink towards zero by beta.
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Keep the pre-update user factors: the gradient for Q[j] must be
            # evaluated with the same P[i] that produced `e`.
            p_i = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * p_i - self.beta * self.Q[j, :])

    def get_rating(self, i, j):
        """Return the predicted rating of user row ``i`` for item column ``j``."""
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    def full_matrix(self):
        """Return the (num_users x num_items) matrix of predicted ratings.

        Uses this instance's own parameters (not a module-level variable), so
        it works for any ``MF`` object regardless of the name it is bound to.
        """
        # b_u is broadcast down the rows, b_i across the columns.
        return self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)

#### Now we have a class that can predict the ratings. Its inputs are:

* R – The user-movie rating matrix
* K – Number of latent features
* alpha – Learning rate for stochastic gradient descent
* beta – Regularization parameter (applied to the bias terms and to the latent factor matrices P and Q)
* iterations – Number of iterations to perform stochastic gradient descent


We have to convert the user-item ratings to matrix form. This can be done using the pandas `pivot` method.

In [36]:
# Reshape the long ratings table into a user x movie grid, one row per user_id
# and one column per movie_id.
_rating_pivot = ratings.pivot(index='user_id', columns='movie_id', values='rating')

# Missing (user, movie) combinations become 0 before converting to a NumPy array.
R = np.array(_rating_pivot.fillna(0))

In [37]:
# Display the user x movie rating matrix (NumPy abbreviates large arrays).
R

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 4., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [38]:
# Display the (users, movies) dimensions of R.
R.shape



(948, 1682)

fillna(0) will fill all the missing ratings with 0. Now we have the R matrix. We can initialize the number of latent features, but the number of these features must be less than or equal to the number of original features.

Now let us predict all the missing ratings. Let’s take K=20, alpha=0.001, beta=0.01 and iterations=800.

In [39]:
# Seed NumPy's global RNG so the random initialisation of P/Q and the per-pass
# shuffling inside MF.train() give the same result on every Restart & Run All.
np.random.seed(0)

# Train the factorization; train() prints the error every 20 iterations and
# returns the per-iteration (index, error) history.
mf = MF(R, K=20, alpha=0.001, beta=0.01, iterations=800)
training_process = mf.train()
print()
print("P x Q:")
# The printed matrix is the full prediction returned by full_matrix().
print(mf.full_matrix())
print()


Iteration: 20 ; error = 296.3682
Iteration: 40 ; error = 291.2907
Iteration: 60 ; error = 287.8241
Iteration: 80 ; error = 282.1847
Iteration: 100 ; error = 272.7453
Iteration: 120 ; error = 261.8550
Iteration: 140 ; error = 250.9267
Iteration: 160 ; error = 240.6152
Iteration: 180 ; error = 231.5714
Iteration: 200 ; error = 224.0079
Iteration: 220 ; error = 217.7992
Iteration: 240 ; error = 212.6964
Iteration: 260 ; error = 208.4595
Iteration: 280 ; error = 204.8929
Iteration: 300 ; error = 201.8524
Iteration: 320 ; error = 199.2288
Iteration: 340 ; error = 196.9426
Iteration: 360 ; error = 194.9335
Iteration: 380 ; error = 193.1541
Iteration: 400 ; error = 191.5690
Iteration: 420 ; error = 190.1483
Iteration: 440 ; error = 188.8672
Iteration: 460 ; error = 187.7076
Iteration: 480 ; error = 186.6529
Iteration: 500 ; error = 185.6879
Iteration: 520 ; error = 184.8031
Iteration: 540 ; error = 183.9877
Iteration: 560 ; error = 183.2345
Iteration: 580 ; error = 182.5350
Iteration: 600 ; e

In [42]:
# Keep the trained model's full predicted-rating matrix for the lookups below.
prediction_matrix = mf.full_matrix()

In [43]:
# Optional shortcut (disabled): load a prediction matrix from
# "team_prediction_matrix.txt" instead of using the freshly trained one.
# prediction_matrix = np.loadtxt("team_prediction_matrix.txt")

In [44]:
# Display the dimensions of the prediction matrix.
prediction_matrix.shape

(948, 1682)

In [45]:
# Write the prediction matrix to a text file in the current working directory.
np.savetxt("team_prediction_matrix.txt", prediction_matrix)

In [46]:
# Work on an independent copy: a plain `prediction_df = ratings` would only
# create a second name for the same DataFrame, so every column added below
# would silently appear in `ratings` as well.
prediction_df = ratings.copy()

# Placeholder column; it is overwritten with the model's predictions later on.
prediction_df["predicted_rating"] = ""
prediction_df.head()



Unnamed: 0,user_id,movie_id,rating,unix_timestamp,predicted_rating
0,196,242,3,881250949.0,
1,186,302,3,891717742.0,
2,22,377,1,878887116.0,
3,244,51,2,880606923.0,
4,166,346,1,886397596.0,


In [47]:
def getRatingPrediction(user_id, movie_id):
    """Return the predicted rating for a 1-based (user_id, movie_id) pair.

    Reads the module-level ``prediction_matrix``; row ``user_id - 1`` and
    column ``movie_id - 1`` are looked up.

    The ids may be ints or integer-valued floats.
    """
    # ndarray.item() only accepts integer indices. A row-wise DataFrame.apply can
    # hand the ids over as floats when the row is upcast, so coerce them first.
    return prediction_matrix.item(int(user_id) - 1, int(movie_id) - 1)


In [51]:
# Look up every (user, movie) prediction with one vectorised indexing step
# instead of a Python-level function call per row. Ids are shifted by one to
# become zero-based positions, exactly as getRatingPrediction does.
_user_pos = prediction_df["user_id"].astype(int).values - 1
_movie_pos = prediction_df["movie_id"].astype(int).values - 1
prediction_df["predicted_rating"] = prediction_matrix[_user_pos, _movie_pos]

In [52]:
# Signed error: prediction minus the actual rating.
prediction_df["rating_diff"] = prediction_df["predicted_rating"].sub(prediction_df["rating"])
# Squared error, used later for the per-user RMSE.
prediction_df["ratings_diff_sqrd"] = prediction_df["rating_diff"].pow(2)

In [53]:
# Preview the ratings together with the predicted rating and error columns.
prediction_df.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,predicted_rating,rating_diff,ratings_diff_sqrd
0,196,242,3,881250949.0,3.196622,0.196622,0.03866
1,186,302,3,891717742.0,3.252037,0.252037,0.063522
2,22,377,1,878887116.0,0.938344,-0.061656,0.003801
3,244,51,2,880606923.0,2.968029,0.968029,0.93708
4,166,346,1,886397596.0,1.894466,0.894466,0.80007


In [54]:
# Display the first rows of the users table again before the per-user summary is built.
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [55]:
# Write the per-rating predictions to CSV in the current working directory
# (the DataFrame index is written as the first column).
prediction_df.to_csv("team_predicted_ratings.csv")



In [56]:
# Attach to every user the number of ratings they have in `ratings`.
# The per-user count Series is renamed up front so the merged column is
# already called "#_ratings".
user_rating_predictions = users.merge(
    ratings.groupby("user_id")["rating"].count().rename("#_ratings"),
    on="user_id",
)


In [57]:
# Preview the users table with the added "#_ratings" column.
user_rating_predictions.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,#_ratings
0,1,24,M,technician,85711,272
1,2,53,F,other,94043,62
2,3,23,M,writer,32067,54
3,4,24,M,technician,43537,24
4,5,33,F,other,15213,175


In [58]:
def getRMSE(user_id):
    """Return the root-mean-squared prediction error for one user.

    Reads the module-level ``prediction_df`` and averages the
    ``ratings_diff_sqrd`` values of the rows whose ``user_id`` matches.

    Returns NaN when the user has no rows (the previous implementation
    divided by zero in that case).
    """
    # Single .loc call selects rows and column together (no chained indexing).
    squared_errors = prediction_df.loc[prediction_df["user_id"] == user_id, "ratings_diff_sqrd"]
    # mean() of an empty selection is NaN, and sqrt(NaN) stays NaN.
    return np.sqrt(squared_errors.mean())

In [59]:
# Compute each user's RMSE by mapping getRMSE over the user_id column.
user_rating_predictions["RMSE"] = user_rating_predictions["user_id"].map(getRMSE)

In [60]:
# Preview the per-user summary, now including the RMSE column.
user_rating_predictions.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,#_ratings,RMSE
0,1,24,M,technician,85711,272,0.64385
1,2,53,F,other,94043,62,0.544951
2,3,23,M,writer,32067,54,0.489931
3,4,24,M,technician,43537,24,0.334425
4,5,33,F,other,15213,175,0.55262


In [62]:
# Write the per-user summary (rating count and RMSE) to CSV in the current
# working directory (the DataFrame index is written as the first column).
user_rating_predictions.to_csv("user_data.csv")