In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
# Read Data and store on dataframes
movie_database = pd.read_csv('Small_Data/movies.csv')
users_ratings = pd.read_csv('Small_Data/ratings.csv')
tags_database = pd.read_csv('Small_Data/tags.csv')
links_database = pd.read_csv('Small_Data/links.csv')

In [3]:
# Display the first 10 items from the movies data set
movie_database[0:5]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Display the first 5 items from the rating data set
users_ratings[0:5]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Shift users and movies ID by 1, so start at 0
users_ratings['userId'] = users_ratings['userId'] - 1
users_ratings['movieId'] = users_ratings['movieId'] - 1
movie_database['movieId'] = movie_database['movieId'] - 1

In [6]:
# Print ratings statistics describing numeric features
users_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,325.127564,19434.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,0.0,0.0,0.5,828124600.0
25%,176.0,1198.0,3.0,1019124000.0
50%,324.0,2990.0,3.5,1186087000.0
75%,476.0,8121.0,4.0,1435994000.0
max,609.0,193608.0,5.0,1537799000.0


In [7]:
"""
def build_feedback_matrix(ratings,movies_list,users_list):
    # Initiate feedback matrix with all zero ratings
    # Get all unique users ids
    feedback_matrix = np.zeros((len(users_list),len(movies_list)))
    #users = pd.unique(users_ratings['userId'])
    for user_id in users_list:
        # Get the ratings for especific user id
        ratings = users_ratings[users_ratings['userId'] == user_id]
        num_ratings = len(ratings)
        # Set the rating of all the movies that the user Id has give rating
        for j in ratings.index:
            movie_id = ratings['movieId'].loc[j]
            movie_index = movie_database[movie_database['movieId'] == movie_id].index.values
            feedback_matrix[user_id,movie_index] = ratings['rating'].loc[j]

    return feedback_matrix

"""
indices = users_ratings[['userId','movieId']].values


In [8]:
'''
Alg:
1. Get userId list list_user
2. Get movieId list list_movie
3. For each user:
    a. Get ratings by user
    b. For each index of ratings in users_ratings dataframe:
        i. Get movieid index (column index) in ratings['movieId']
        ii. Append (user, movie, rating) to output list
4. Returns 3-tuple (output_list, list_user, list_movie)
'''
def get_user_item_pairs():
    list_userId = np.array(pd.unique(users_ratings['userId']))
    list_movieId = np.array(pd.unique(movie_database['movieId']))
    movie_index_list = pd.Index(movie_database['movieId'])
    output_list = []
    # index by user
    for user_index in np.arange(0, len(list_userId)):
        # associated user_id with user
        user_id = list_userId[user_index]
        # ratings by user
        ratings = users_ratings[users_ratings['userId'] == user_id]
        # for each row index associated with user
        for index in ratings.index:
            # get movie id for finding movie index
            movie_id = users_ratings['movieId'].iloc[index]
            # get movie index to place in list
            movie_index = movie_index_list.get_loc(movie_id)#movie_database['movieId'].loc[movie_id]
            # get rating
            rating = users_ratings['rating'].iloc[index]
            #rating = ratings['rating'].loc[index]
            #output_list.append((user_index, movie_index, rating))
            output_list.append([user_index,movie_index,rating])
    
    #output_list = np.array(output_list,dtype='i4,i4,i4')
    data_set = pd.DataFrame(output_list,columns=['user_idx','movie_idx','rating'])
    
    return(data_set,list_userId,list_movieId)
    #return (np.array(output_list, dtype='i4,i4,i4'), list_userId, list_movieId)
#     return (np.array(output_list, dtype=[('user_index', '<i4'), ('movie_index', '<i4'), ('rating', '<i4')]), list_userId, list_movieId)
'''
Input:
    S is a collection of 3-tuples (user, movie, rating)
        user is the index of the user in list_userId
        movie is the index of the movie in list_movieId
        rating is the corresponding rating
    Output: mxn feedback matrix
        m is the length of list_userId
        n is the length of list_movieId
        Entry (i, j) is nonzero if list_userId[i] reviewed list_movieId[j]
'''
def build_feedback_matrix(data_set, list_userId_len, list_movieId_len):
    """
    m = list_userId.shape[0]
    n = list_movieId.shape[0]
    F = np.zeros((m, n))
    for (user_index, movie_index, rating) in S:
        F[user_index, movie_index] = rating
    return F
    """
    feedback_matrix = np.zeros((list_userId_len,list_movieId_len))
    for index in data_set.index:
        user_index = data_set['user_idx'].loc[index]
        movie_index = data_set['movie_idx'].loc[index]
        rating = data_set['rating'].loc[index]
        feedback_matrix[user_index,movie_index] = rating
    
    return feedback_matrix


# Break the dataset into test and train sets
# Keep a 0.1 holdout fraction to construct test
def break_data(data,holdout_fraction=0.1):
    #data_copy = data.copy()
    #data_copy = data_copy.sample(frac=1)
    #test_set = data_copy.sample(frac=holdout_fraction)
    #train_set = data_copy[~data_copy.index.isin(test_set.index)]
    test_set = data.sample(frac=holdout_fraction)
    train_set = data[~data.index.isin(test_set.index)]
    return train_set, test_set

In [9]:
# Construct the data set based on the index of user, movies bind with the respective rating
# Build the list of usersIds
# Build the list of movieIds
(user_item_rating, list_userId, list_movieId)  = get_user_item_pairs()

In [10]:
# Break the data set into train and test 
train_data, test_data = break_data(user_item_rating)

In [11]:
# Objetive function
# Implement Mean Square Error(MSE)
def calculate_loss(feedback_matrix,P,V):
    #Calculate the ratings matrix approximation
    predictions = np.dot(P,V.T)
    # Calculate the square error for each reating
    error_matrix = np.square((feedback_matrix - predictions))
    # Return the mean square error over the last dimension
    return np.mean(error_matrix)
    #return np.sum(error_matrix)

In [12]:
class CF_Model(object):
    def __init__(self,users,movies):
        self.embedding_vars = {}
        self.train_loss = []
        self.test_loss = []
        self.list_userId = users
        self.list_movieId = movies
        self.num_users = users.shape[0]
        self.num_movies = movies.shape[0]
        self.num_features = 0

    def embeddings(self):
        return self.embedding_vars
    
    def build_model(self,num_features=30,mu=0,std=1):
        """
            P: associative matrix btw Users and Features
            V: associative matrix btw Movies and Features
        """
        self.num_features = num_features
        P = np.random.normal(loc=mu,scale=std,size=(self.num_users,num_features))
        V = np.random.normal(loc=mu,scale=std,size=(self.num_movies,num_features))
        self.embedding_vars['P'] = P
        self.embedding_vars['V'] = V
    
    def train(self,data_set,num_iter=100,learning_rate=0.00002):
        train_data, test_data =  break_data(data_set)
        train_matrix = build_feedback_matrix(train_data,self.num_users,self.num_movies)
        test_matrix = build_feedback_matrix(test_data,self.num_users,self.num_movies)
        P = self.embedding_vars['P']
        V = self.embedding_vars['V']

        for epoch in range(num_iter):
            """
            for i in range(self.num_users):
                for j in range(self.num_movies):
                    if train_matrix[i,j] > 0:
                        eij = train_matrix[i,j] - np.dot(P[i,:],V[:,j])
                        
                        for k in range(self.num_features):
                            P[i,k] = P[i,k] + 2*learning_rate * eij * V[k,j]
                            V[j,k] = V[j,k] + 2*learning_rate * eij * P[k,j]
            """
            E = train_matrix - np.dot(P,V.T)
            P = P + learning_rate * (2 * E @ V)
            V = V + learning_rate * (2 * E.T @ P)
            train_error = calculate_loss(train_matrix,P,V)
            test_error = calculate_loss(test_matrix,P,V)
            print("i: " + str(epoch) + " Train Loss: " + str(train_error) + " - Test Loss: " + str(test_error))
            self.train_loss.append(train_error)
            self.test_loss.append(test_error)
        self.embedding_vars['P'] = P
        self.embedding_vars['V'] = V
    
    def plot_loss():
        fig = plt.figure()
        fig.suptitle('Loss vs Number of Epochs')
        ax = fig.add_subplot(111)
        x1 = np.arange(len(self.train_loss))
        x2 = np.arange(len(self.test_loss))
        ax.plot(x1, self.train_loss, label='Train Data')
        ax.plot(x2, self.test_loss, label='Test Data')
        ax.set_ylabel('MSE Loss')
        ax.set_xlabel('Epoch Number')
        plt.legend()
        plt.show()

In [13]:
model = CF_Model(list_userId, list_movieId)

In [14]:
model.build_model()

In [15]:
model.train(user_item_rating,num_iter=100,learning_rate=0.0001)

i: 0 Train Loss: 36.024946881393866 - Test Loss: 35.85338535526406
i: 1 Train Loss: 133.34063197123535 - Test Loss: 133.15110164198924
i: 2 Train Loss: 14597.738039573645 - Test Loss: 14597.717633862016
i: 3 Train Loss: 272839088046881.5 - Test Loss: 272839088035904.3
i: 4 Train Loss: 1.2461576918847275e+59 - Test Loss: 1.2461576918847275e+59
i: 5 Train Loss: 2.0748891385000706e+248 - Test Loss: 2.0748891385000706e+248
i: 6 Train Loss: inf - Test Loss: inf
i: 7 Train Loss: inf - Test Loss: inf
i: 8 Train Loss: nan - Test Loss: nan
i: 9 Train Loss: nan - Test Loss: nan
i: 10 Train Loss: nan - Test Loss: nan
i: 11 Train Loss: nan - Test Loss: nan
i: 12 Train Loss: nan - Test Loss: nan
i: 13 Train Loss: nan - Test Loss: nan
i: 14 Train Loss: nan - Test Loss: nan
i: 15 Train Loss: nan - Test Loss: nan
i: 16 Train Loss: nan - Test Loss: nan
i: 17 Train Loss: nan - Test Loss: nan
i: 18 Train Loss: nan - Test Loss: nan
i: 19 Train Loss: nan - Test Loss: nan
i: 20 Train Loss: nan - Test Loss: 