https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-recommendation-engine-python/

In [29]:
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os

In [30]:
# pass in column names for each CSV as the column name is not given in the file and read them using pandas.
# You can check the column names from the readme file

#Reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('data/ml-100k/includes_team_users.csv').drop("Unnamed: 0",axis=1)

#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('data/ml-100k/includes_team_ratings.csv').drop("Unnamed: 0",axis=1)

#Reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('data/ml-100k/u.item', sep='|', names=i_cols,
encoding='latin-1')

### Users

In [31]:
print(users.shape)
users.head()

(948, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### Ratings

In [32]:
print(ratings.shape)
ratings.head()

(100105, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949.0
1,186,302,3,891717742.0
2,22,377,1,878887116.0
3,244,51,2,880606923.0
4,166,346,1,886397596.0


### Items

In [33]:
print(items.shape)
items.head()

(1682, 24)


Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Loading in train and test provided by GroupLens where the test data has 10 ratings for each user

In [34]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('data/ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('data/ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape

((90570, 4), (9430, 4))

### Building collaborative filtering model from scratch

We will recommend movies based on user-user similarity and item-item similarity. For that, first we need to calculate the number of unique users and movies

In [35]:
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]

In [36]:
n_users

948

In [37]:
n_items

1695

Now, we will create a user-item matrix which can be used to calculate the similarity between users and items

In [38]:
data_matrix = np.zeros((n_users, n_items))
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

IndexError: index 3226 is out of bounds for axis 1 with size 1695

Now, we will calculate the similarity. We can use the pairwise_distance function from sklearn to calculate the cosine similarity.

In [39]:
from sklearn.metrics.pairwise import pairwise_distances 
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

This gives us the item-item and user-user similarity in an array form. The next step is to make predictions based on these similarities. Let’s define a function to do just that.

In [40]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

Finally, we will make predictions based on user similarity and item similarity

In [41]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')


### Building a recommendation engine using matrix factorization

Let us define a function to predict the ratings given by the user to all the movies which are not rated by him/her

In [42]:
class MF():

    # Initializing the user-movie rating matrix, no. of latent features, alpha and beta.
    def __init__(self, R, K, alpha, beta, iterations):
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    # Initializing user-feature and movie-feature matrix 
    def train(self):
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initializing the bias terms
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # List of training samples
        self.samples = [
        (i, j, self.R[i, j])
        for i in range(self.num_users)
        for j in range(self.num_items)
        if self.R[i, j] > 0
        ]

        # Stochastic gradient descent for given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            mse = self.mse()
            training_process.append((i, mse))
            if (i+1) % 20 == 0:
                print("Iteration: %d ; error = %.4f" % (i+1, mse))

        return training_process

    # Computing total mean squared error
    def mse(self):
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    # Stochastic gradient descent to get optimized P and Q matrix
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])

    # Ratings for user i and movie j
    def get_rating(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    # Full user-movie rating matrix
    def full_matrix(self):
        return mf.b + mf.b_u[:,np.newaxis] + mf.b_i[np.newaxis:,] + mf.P.dot(mf.Q.T)

#### Now we have a function that can predict the ratings. The input for this function are:

* R – The user-movie rating matrix
* K – Number of latent features
* alpha – Learning rate for stochastic gradient descent
* beta – Regularization parameter for bias
* iterations – Number of iterations to perform stochastic gradient descent


We have to convert the user item ratings to matrix form. It can be done using the pivot function in python

In [43]:
R= np.array(ratings.pivot(index = 'user_id', columns ='movie_id', values = 'rating').fillna(0))

In [44]:
R

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 4., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 2., 5.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [45]:
R.shape



(948, 1695)

fillna(0) will fill all the missing ratings with 0. Now we have the R matrix. We can initialize the number of latent features, but the number of these features must be less than or equal to the number of original features.

Now let us predict all the missing ratings. Let’s take K=20, alpha=0.001, beta=0.01 and iterations=100

In [46]:
mf = MF(R, K=20, alpha=0.001, beta=0.01, iterations=800)
training_process = mf.train()
print()
print("P x Q:")
print(mf.full_matrix())
print()


Iteration: 20 ; error = 296.3803
Iteration: 40 ; error = 291.3372
Iteration: 60 ; error = 288.0480
Iteration: 80 ; error = 282.8872
Iteration: 100 ; error = 273.8418
Iteration: 120 ; error = 262.6104
Iteration: 140 ; error = 251.1417
Iteration: 160 ; error = 240.6600
Iteration: 180 ; error = 231.6590
Iteration: 200 ; error = 224.1449
Iteration: 220 ; error = 217.9366
Iteration: 240 ; error = 212.7957
Iteration: 260 ; error = 208.5007
Iteration: 280 ; error = 204.8731
Iteration: 300 ; error = 201.7748
Iteration: 320 ; error = 199.1015
Iteration: 340 ; error = 196.7761
Iteration: 360 ; error = 194.7356
Iteration: 380 ; error = 192.9336
Iteration: 400 ; error = 191.3322
Iteration: 420 ; error = 189.9012
Iteration: 440 ; error = 188.6170
Iteration: 460 ; error = 187.4571
Iteration: 480 ; error = 186.4067
Iteration: 500 ; error = 185.4493
Iteration: 520 ; error = 184.5750
Iteration: 540 ; error = 183.7722
Iteration: 560 ; error = 183.0329
Iteration: 580 ; error = 182.3495
Iteration: 600 ; e

TODO: having done the predicitons for the missing ratings we ought to backfill these values to the ratings dataframe

TODO: We have created our recommendation engine. Next we have to evaluate our recommendation engine

In [47]:
prediction_matrix = mf.full_matrix()

In [48]:
prediction_matrix.shape

(948, 1695)

In [49]:
prediction_df = ratings
prediction_df["predicted_rating"] = ""
prediction_df.head()



Unnamed: 0,user_id,movie_id,rating,unix_timestamp,predicted_rating
0,196,242,3,881250949.0,
1,186,302,3,891717742.0,
2,22,377,1,878887116.0,
3,244,51,2,880606923.0,
4,166,346,1,886397596.0,


In [None]:
def getRatingPrediction(user_id, movie_id):
    return prediction_matrix.item(prediction_df["user_id"]-1,prediction_df["movie_id"]-1)


In [53]:
for i in range(len(ratings)):
    prediction_df["predicted_rating"][i] = prediction_matrix.item(prediction_df["user_id"][i]-1,prediction_df["movie_id"][i]-1)

# prediction_df["predicted_rating"] = prediction_matrix.item(prediction_df["user_id"]-1,prediction_df["movie_id"]-1)
   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


KeyboardInterrupt: 

In [None]:
prediction_df.head()

In [None]:
# prediction_df.to_csv("predicted_ratings.csv")



In [54]:
np.savetxt("team_prediction_matrix.txt", prediction_matrix)

In [55]:
prediction_matrix

array([[4.50046382, 3.1119234 , 2.80799172, ..., 3.03346072, 3.24541784,
        4.04033442],
       [3.85269325, 3.66662468, 2.79114341, ..., 2.74633775, 3.17873173,
        4.55348124],
       [3.58383525, 0.99841762, 1.73723293, ..., 2.95626832, 2.92236336,
        2.46934363],
       ...,
       [2.36791588, 4.19804856, 5.33103597, ..., 2.77731791, 3.2569473 ,
        5.04015737],
       [2.43576462, 3.80167733, 3.32695992, ..., 1.14392729, 2.06664469,
        4.8962139 ],
       [4.79422898, 4.28894489, 3.00608056, ..., 2.99347067, 3.45557369,
        5.11957518]])

In [56]:
prediction_matrix[1,]

array([3.85269325, 3.66662468, 2.79114341, ..., 2.74633775, 3.17873173,
       4.55348124])

In [77]:
def top5rec(userId):
#     movie_recs = 
    orderedRecs = list(np.argsort(-prediction_matrix[userId - 1,]))[0:5]
#     print(orderedRecs)
    rec_df = pd.DataFrame()
#     print("Top 5 movie recommendations:")
    for movie in orderedRecs:
        rec_df = rec_df.append(items.loc[items["movie id"]==movie+1])
    movie_list = []
    for i,row in enumerate(rec_df.values):
        movie_data = {
            "movie_id": row[0],
            "movie_title": row[1],
            "release_date": row[2],
            "IMDb_URL": row[4]
        }
        movie_list.append(movie_data)
    return(movie_list)

In [None]:
prediction_df["rating_diff"] = prediction_df["predicted_rating"] - prediction_df["rating"]

In [78]:
top5rec(944)

[{'movie_id': 924,
  'movie_title': 'White Squall (1996)',
  'release_date': '01-Jan-1996',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?White%20Squall%20(1996)'},
 {'movie_id': 289,
  'movie_title': 'Evita (1996)',
  'release_date': '25-Dec-1996',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Evita%20(1996)'},
 {'movie_id': 155,
  'movie_title': 'Dirty Dancing (1987)',
  'release_date': '01-Jan-1987',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Dirty%20Dancing%20(1987)'},
 {'movie_id': 143,
  'movie_title': 'Sound of Music, The (1965)',
  'release_date': '01-Jan-1965',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Sound%20of%20Music,%20The%20(1965)'},
 {'movie_id': 966,
  'movie_title': 'Affair to Remember, An (1957)',
  'release_date': '01-Jan-1957',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Affair%20to%20Remember,%20An%20(1957)'}]

In [75]:
test_list = []
for i,row in enumerate(top5rec(944).values):
    movie_data = {
        "movie_id": row[0],
        "movie_title": row[1],
        "release_date": row[2],
        "IMDb_URL": row[4]
    }
    test_list.append(movie_data)

In [76]:
test_list

[{'movie_id': 924,
  'movie_title': 'White Squall (1996)',
  'release_date': '01-Jan-1996',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?White%20Squall%20(1996)'},
 {'movie_id': 289,
  'movie_title': 'Evita (1996)',
  'release_date': '25-Dec-1996',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Evita%20(1996)'},
 {'movie_id': 155,
  'movie_title': 'Dirty Dancing (1987)',
  'release_date': '01-Jan-1987',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Dirty%20Dancing%20(1987)'},
 {'movie_id': 143,
  'movie_title': 'Sound of Music, The (1965)',
  'release_date': '01-Jan-1965',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Sound%20of%20Music,%20The%20(1965)'},
 {'movie_id': 966,
  'movie_title': 'Affair to Remember, An (1957)',
  'release_date': '01-Jan-1957',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Affair%20to%20Remember,%20An%20(1957)'}]

In [110]:
def top5rec1(userId):
    rated_movies = list(ratings.loc[ratings["user_id"] ==userId,"movie_id"].values)
    print(len(rated_movies))
    orderedRecs = list(np.argsort(-prediction_matrix[userId - 1,]))
    print(len(orderedRecs))
    for movie in rated_movies:
        if movie-1 in orderedRecs:
            orderedRecs.remove(movie)
    print(len(orderedRecs))
    top5 = orderedRecs[0:5]
    rec_df = pd.DataFrame()
    for movie in top5:
        rec_df = rec_df.append(items.loc[items["movie id"]==movie+1])
    movie_list = []
    for i,row in enumerate(rec_df.values):
        movie_data = {
            "movie_id": row[0],
            "movie_title": row[1],
            "release_date": row[2],
            "IMDb_URL": row[4]
        }
        movie_list.append(movie_data)
    return(movie_list)

In [112]:
prediction_matrix[944-1,]

array([4.16673801, 2.37740139, 3.04021611, ..., 2.40054238, 2.62942993,
       3.63923996])

In [115]:
len(np.argsort(-prediction_matrix[944-1,]))

1695

In [130]:
test = list(ratings.movie_id.unique())
len(test)

1695

In [131]:
test1 = list(items["movie id"])
len(test1)

1682

In [132]:
for x in test1:
    if x in test:
        test.remove(x)

In [134]:
test

[3227, 1723, 2915, 2490, 3069, 3173, 2945, 2263, 3394, 2942, 3647, 3717, 2434]

In [136]:
items.loc[items["movie id"] == 1723]

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western


In [138]:
top5rec(948)

[{'movie_id': 304,
  'movie_title': 'Fly Away Home (1996)',
  'release_date': '13-Sep-1996',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Fly%20Away%20Home%20(1996)'},
 {'movie_id': 516,
  'movie_title': 'Local Hero (1983)',
  'release_date': '01-Jan-1983',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Local%20Hero%20(1983)'},
 {'movie_id': 735,
  'movie_title': 'Philadelphia (1993)',
  'release_date': '01-Jan-1993',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Philadelphia%20(1993)'},
 {'movie_id': 59,
  'movie_title': 'Three Colors: Red (1994)',
  'release_date': '01-Jan-1994',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Trzy%20kolory:%20Czerwony%20(1994)'},
 {'movie_id': 1153,
  'movie_title': 'Backbeat (1993)',
  'release_date': '01-Jan-1993',
  'IMDb_URL': 'http://us.imdb.com/M/title-exact?Backbeat%20(1993)'}]

In [141]:
ratings.loc[ratings["user_id"] ==1,"movie_id"].values

array([ 61, 189,  33, 160,  20, 202, 171, 265, 155, 117,  47, 222, 253,
       113, 227,  17,  90,  64,  92, 228, 266, 121, 114, 132,  74, 134,
        98, 186, 221,  84,  31,  70,  60, 177,  27, 260, 145, 174, 159,
        82,  56, 272,  80, 229, 140, 225, 235, 120, 125, 215,   6, 104,
        49, 206,  76,  72, 185,  96, 213, 233, 258,  81,  78, 212, 143,
       151,  51, 175, 107, 218, 209, 259, 108, 262,  12,  14,  97,  44,
        53, 163, 210, 184, 157, 201, 150, 183, 248, 208, 128, 242, 148,
       112, 193, 264, 219, 232, 236, 252, 200, 180, 250,  85,  91,  10,
       254, 129, 241, 130, 255, 103, 118,  54, 267,  24,  86, 196,  39,
       164, 230,  36,  23, 224,  73,  67,  65, 190, 100, 226, 243, 154,
       214, 161,  62, 188, 102,  69, 170,  38,   9, 246,  22,  21, 179,
       187, 135,  68, 146, 176, 166, 138, 247,  89,   2,  30,  63, 249,
       269,  32, 141, 211,  40, 270, 133, 239, 194, 256, 220,  93,   8,
       205, 234, 105, 147,  99,   1, 197, 173,  75, 268,  34, 14