### Matrix Factorization

Data Sources:
* https://grouplens.org/datasets/movielens/100k/
* https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-recommendation-engine-python/

In [None]:
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
os.chdir('../decision_tree/resources/raw/ml-latest-small')

In [2]:
os.listdir()

['links.csv',
 'movies.csv',
 'movie_recommender_100k_trained.h5',
 'ratings.csv',
 'README.txt',
 'tags.csv',
 'weights.h5']

In [3]:
# pass in column names for each CSV as the column name is not given in the file and read them using pandas.
# You can check the column names from the readme file

#Reading users file:
# u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
# users = pd.read_csv('data/ml-100k/u.user', sep='|', names=u_cols,encoding='latin-1')

#Reading ratings file:
# r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
# ratings = pd.read_csv('data/ml-100k/u.data', sep='\t', names=r_cols,encoding='latin-1')
ratings = pd.read_csv('ratings.csv',encoding='latin-1')

#Reading items file:
# i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
# 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
# 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
# items = pd.read_csv('data/ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')
items = pd.read_csv('movies.csv', encoding='latin-1').reset_index()

users = pd.DataFrame(ratings.userId.unique()).rename(columns={0: 'userId'})

### Users

In [4]:
print(users.shape)
users['userId'].max()

(610, 1)


610

### Ratings

In [5]:
print(ratings.shape)
ratings.head()

(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Items

In [6]:
print(items.shape)
items.head()

(9742, 4)


Unnamed: 0,index,movieId,title,genres
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,2,Jumanji (1995),Adventure|Children|Fantasy
2,2,3,Grumpier Old Men (1995),Comedy|Romance
3,3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# merge users and ratings
df = pd.merge(ratings, items, on="movieId")
df.sort_values("userId").head()
genres = pd.get_dummies(df.genres.str.split('|',expand=True).stack()).sum(level=0)
# merge genres back to the original df
df = pd.concat([df,genres], axis=1)
# use drop to get remaining features
data = df.drop(['title', 'genres'],axis=1, inplace=True)
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,index,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,964982703,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,5,1,4.0,847434962,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,7,1,4.5,1106635946,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,15,1,2.5,1510577970,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,17,1,4.5,1305696483,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


### Loading in train and test provided by GroupLens where the test data has 10 ratings for each user

In [8]:
# os.chdir('../../../data/ml-100k')

In [9]:
r_cols = ['userId', 'movieId', 'rating', 'timestamp']
ratings_train = pd.read_csv('../../../data/ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('../../../data/ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape

((90570, 4), (9430, 4))

### Building collaborative filtering model from scratch

We will recommend movies based on user-user similarity and item-item similarity. For that, first we need to calculate the number of unique users and movies

In [10]:
n_users = df.userId.unique().shape[0]
n_items = df.index.unique().shape[0]

In [11]:
n_users

610

In [12]:
n_items

100836

Now, we will create a user-item matrix which can be used to calculate the similarity between users and items

In [13]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [14]:
ratings1 = df[['userId', 'movieId', 'index', 'rating', 'timestamp']]
ratings1.head()

Unnamed: 0,userId,movieId,index,rating,timestamp
0,1,1,0,4.0,964982703
1,5,1,0,4.0,847434962
2,7,1,0,4.5,1106635946
3,15,1,0,2.5,1510577970
4,17,1,0,4.5,1305696483


In [15]:
data_matrix = np.zeros((n_users, n_items))
for line in ratings1.itertuples():
#     print(line[1]-1, line[3], line[4])
#     data_matrix[line[1]-1, line[2]-1] = line[3]
    data_matrix[line[1]-1, line[3]] = line[4]

Now, we will calculate the similarity. We can use the pairwise_distance function from sklearn to calculate the cosine similarity.

In [None]:
from sklearn.metrics.pairwise import pairwise_distances 
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

This gives us the item-item and user-user similarity in an array form. The next step is to make predictions based on these similarities. Let’s define a function to do just that.

In [None]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

Finally, we will make predictions based on user similarity and item similarity

In [None]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')


### Building a recommendation engine using matrix factorization

Let us define a function to predict the ratings given by the user to all the movies which are not rated by him/her

In [None]:
class MF():

    # Initializing the user-movie rating matrix, no. of latent features, alpha and beta.
    def __init__(self, R, K, alpha, beta, iterations):
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    # Initializing user-feature and movie-feature matrix 
    def train(self):
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initializing the bias terms
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # List of training samples
        self.samples = [
        (i, j, self.R[i, j])
        for i in range(self.num_users)
        for j in range(self.num_items)
        if self.R[i, j] > 0
        ]

        # Stochastic gradient descent for given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            mse = self.mse()
            training_process.append((i, mse))
            if (i+1) % 20 == 0:
                print("Iteration: %d ; error = %.4f" % (i+1, mse))

        return training_process

    # Computing total mean squared error
    def mse(self):
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    # Stochastic gradient descent to get optimized P and Q matrix
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])

    # Ratings for user i and movie j
    def get_rating(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    # Full user-movie rating matrix
    def full_matrix(self):
        return mf.b + mf.b_u[:,np.newaxis] + mf.b_i[np.newaxis:,] + mf.P.dot(mf.Q.T)

#### Now we have a function that can predict the ratings. The input for this function are:

* R – The user-movie rating matrix
* K – Number of latent features
* alpha – Learning rate for stochastic gradient descent
* beta – Regularization parameter for bias
* iterations – Number of iterations to perform stochastic gradient descent


We have to convert the user item ratings to matrix form. It can be done using the pivot function in python

In [None]:
R= np.array(ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0))

In [None]:
R

In [None]:
R.shape

fillna(0) will fill all the missing ratings with 0. Now we have the R matrix. We can initialize the number of latent features, but the number of these features must be less than or equal to the number of original features.

Now let us predict all the missing ratings. Let’s take K=20, alpha=0.001, beta=0.01 and iterations=100

In [None]:
mf = MF(R, K=20, alpha=0.001, beta=0.01, iterations=800)
training_process = mf.train()
print()
print("P x Q:")
print(mf.full_matrix())
print()


TODO: having done the predicitons for the missing ratings we ought to backfill these values to the ratings dataframe

TODO: We have created our recommendation engine. Next we have to evaluate our recommendation engine

In [None]:
prediction_matrix = mf.full_matrix()

In [None]:
prediction_matrix.shape

Create a new dataframe "prediction_df" with the ratings column and add a column for predicted ratings. 

In [None]:
prediction_df = ratings
prediction_df["predicted_rating"] = ""
prediction_df.head()

In [None]:
for i in range(len(ratings)):
    prediction_df["predicted_rating"][i] = prediction_matrix.item(prediction_df["user_id"][i]-1,prediction_df["movie_id"][i]-1)
   

In [None]:
prediction_df.to_csv("predicted_ratings.csv")

In [None]:
np.savetxt("prediction_matrix.txt", prediction_matrix)
np.save('prediction_matrix.npy', prediction_matrix)

In [None]:
prediction_matrix

In [None]:
prediction_matrix[1,]

Create a function that recommends 5 movies to a user based on how likely they are to enjoy the movie.

In [None]:
def top5rec(userId):
    orderedRecs = list(np.argsort(-prediction_matrix[userId - 1,]))[0:5]
    print(orderedRecs)
    rec_df = pd.DataFrame()
    print("Top 5 movie recommendations:")
    for movie in orderedRecs:
        rec_df = rec_df.append(items.loc[items["movie id"]==movie+1])
    return(rec_df)

In [None]:
top5rec(2)