## Libraries

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import math
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
import pickle

## Data Preprocessing 
* Data load
* Train/Test dataset splitting (for the evaluation part)
* Data cleaning (if needed)

There may be some other needsm, like listed below, when it comes to different algorithms. For popularity recommendation and collaborative recommendation, there is no need to merge data. For content-based recommendation, we need more information about movies to provide the similarity. For that part processing, it will be done later when it's needed. 
* Data integrating / transforming

### Load Data

In [2]:
def isfloat(value):
    try:
        float(value)
        return True
    except ValueError:
        return False

In [3]:
def load(dataset):
    with open(dataset) as file:
        data = file.read().splitlines()
    file.close()
    columns = data[0].strip('\n').split("\t")
    rows = []
    rows_num = len(data)
    for i in range(1,rows_num):
        row = [int(value) if value.isdigit() else float(value) if isfloat(value) else value 
               for value in data[i].strip('\n').split("\t")]
        rows.append(row)
    df = pd.DataFrame(rows,columns=columns)
    return df 

In [4]:
# Another load function --  if there is any issue related with unicoding problem, use this function instead
def load2(dataset):
    with open(dataset,'rb') as file:
        data = [l.decode('utf8', 'ignore') for l in file.readlines()]
    file.close()
    columns = data[0].strip('\r\n').split("\t")
    rows = []
    rows_num = len(data)
    for i in range(1,rows_num):
        #row = [int(value) if value.isdigit() else float(value) if isfloat(value) else value 
               #for value in data[i].strip('\n').split("\t")]
        row = [int(value) if value.isdigit() else float(value) if isfloat(value) else value 
               for value in data[i].strip('\r\n').split("\t")]
        rows.append(row)
    df = pd.DataFrame(rows,columns=columns)
    return df 

In [5]:
def dataset_basic_info(data):
    print ("The shape of dataset:")
    print (data.shape)
    print ("\nAttribute names and types:")
    with pd.option_context('display.max_rows', None):
        print (data.dtypes)
    print ("\nAny missing values:")
    print (data.isnull().sum().sum())
    
    missing = pd.DataFrame(data.isnull().sum())
    missing.columns = ['Missing count']
    missing['Missing percentage'] = (missing['Missing count']/data.shape[0])*100
    print ("\nAny missing values:")
    print (missing)
    
    print ("\nExample:")
    print(data.head(1).T)

In [6]:
%%time
movies = load2('movies.dat')
user_ratedmovies = load2("user_ratedmovies.dat")

CPU times: user 8.17 s, sys: 592 ms, total: 8.76 s
Wall time: 9.12 s


In [8]:
dataset_basic_info(movies)

The shape of dataset:
(10197, 21)

Attribute names and types:
id                         int64
title                     object
imdbID                     int64
spanishTitle              object
imdbPictureURL            object
year                       int64
rtID                      object
rtAllCriticsRating        object
rtAllCriticsNumReviews    object
rtAllCriticsNumFresh      object
rtAllCriticsNumRotten     object
rtAllCriticsScore         object
rtTopCriticsRating        object
rtTopCriticsNumReviews    object
rtTopCriticsNumFresh      object
rtTopCriticsNumRotten     object
rtTopCriticsScore         object
rtAudienceRating          object
rtAudienceNumRatings      object
rtAudienceScore           object
rtPictureURL              object
dtype: object

Any missing values:
0

Any missing values:
                        Missing count  Missing percentage
id                                  0                 0.0
title                               0                 0.0
imdbID       

In [9]:
dataset_basic_info(user_ratedmovies)

The shape of dataset:
(855598, 9)

Attribute names and types:
userID           int64
movieID          int64
rating         float64
date_day         int64
date_month       int64
date_year        int64
date_hour        int64
date_minute      int64
date_second      int64
dtype: object

Any missing values:
0

Any missing values:
             Missing count  Missing percentage
userID                   0                 0.0
movieID                  0                 0.0
rating                   0                 0.0
date_day                 0                 0.0
date_month               0                 0.0
date_year                0                 0.0
date_hour                0                 0.0
date_minute              0                 0.0
date_second              0                 0.0

Example:
                  0
userID         75.0
movieID         3.0
rating          1.0
date_day       29.0
date_month     10.0
date_year    2006.0
date_hour      23.0
date_minute    17.0
date_second  

### Train/Test Split

In [7]:
um_train_df, umtest_df = train_test_split(user_ratedmovies,
                                          stratify=user_ratedmovies['userID'],
                                          train_size=0.8, test_size=0.2, random_state=99)
print ("Size of train set: {}\nSize of test set: {}".format(um_train_df.shape[0],umtest_df.shape[0]))

Size of train set: 684478
Size of test set: 171120


## Basic Function 
These are functions needed whichever algorithms will be used

### Metrics

In [127]:
# Similarity for Collabrative item-based recommender 

def cosine(X, Y):
    X_norm = (sum(map(lambda x:x*x, X)))**0.5
    Y_norm = (sum(map(lambda y:y*y, Y)))**0.5
    XdotY = sum(map(lambda x,y:x*y, X, Y))
    cosine = XdotY/(X_norm*Y_norm)
    return cosine

### Evaluations

In [8]:
def RMSE(orginal, predited):
    '''orginal: the original user rating
    predicted: new ratings based on prediction'''
    original = np.array(orginal)
    predicted = np.array(predited)
    rmse = np.sqrt(((orginal - predited) ** 2).mean())
    return rmse

def MAE(original, predicted):
    '''orginal: the original user rating
    predicted: new ratings based on prediction'''
    original = np.array(original)
    predicted = np.array(predicted)
    mae = sum(abs(original-predicted)).mean()
    return mae

def Evaluations(orginal, predited):
    '''Function evaluations is created to compare the evaluation from different approaches
    MAE: metric used in class
    RMSE: metric used in Netflix challenging
    '''
    rmse = RMSE(orginal, predited)
    mae = MAE(original, predicted)
    return rmse, mae

## Create Classes
Classes stands for different recommendation approaches

### Popularity Recommendation
Recommend the most popular items

In [113]:
def popularity_table(user_ratedmovies):
    table = user_ratedmovies.groupby('movieID')['rating'].agg(['mean','count']).sort_values(by=['mean','count'],ascending=False)
    better_table = table.loc[table['count']>=100]
    count_table = user_ratedmovies.groupby('movieID')['rating'].agg(['mean','count']).sort_values(by=['count','mean'],ascending=False)
    return table, better_table, count_table

In [114]:
popularity_df, better_popularity_df, count_popularity_df = popularity_table(um_train_df)

In [110]:
popularity_df.head(20)

Unnamed: 0_level_0,mean,count
movieID,Unnamed: 1_level_1,Unnamed: 2_level_1
47117,5.0,2
320,5.0,1
404,5.0,1
701,5.0,1
1098,5.0,1
1575,5.0,1
1819,5.0,1
2063,5.0,1
3630,5.0,1
4044,5.0,1


In [107]:
better_popularity_df.head(20)

Unnamed: 0_level_0,mean,count
movieID,Unnamed: 1_level_1,Unnamed: 2_level_1
318,4.36911,1146
858,4.323831,877
50,4.289779,949
44555,4.275424,236
1203,4.249304,359
2959,4.249126,1144
296,4.248172,1231
3435,4.246114,193
912,4.245543,617
750,4.241742,666


In [115]:
count_popularity_df.head(20)

Unnamed: 0_level_0,mean,count
movieID,Unnamed: 1_level_1,Unnamed: 2_level_1
2571,4.164906,1325
356,3.927835,1261
4993,4.083533,1251
296,4.248172,1231
5952,4.019024,1209
7153,4.089916,1190
2858,4.113946,1176
593,4.062608,1158
318,4.36911,1146
2959,4.249126,1144


In [9]:
class PopularityRecommender:
    '''popularity-based recommendation algorithm: 
    Given a user U and an item I, 
    compute the predicted rating of U on I as 
    the mean rating for I among all users who have rated I.
    In other words, this method is to recommend most popular movies that the user hasn't watched'''
    
    model_name= 'Popularity'
    
    def __init__(self, movie_df, popularity_kind):
        ''':param: 
        movie_df: the movie information table, at least include movieID and movieName
        
        popularity_kind: Based on different defination of "popularity",diffferent tables are used
        1: popularity means the highest mean value, secondary consideration is count value (how many people rated the movie),
        2: popularity based on the description in "1", but add 100 as count threshold
        3: popularity means most people rated the movie, secondary consideration is the mean rating of the movie
        '''
        self.popularity_kind = popularity_kind
        self.moviedf = movie_df
    
    
    def get_model_name(self):
        return self.model_name
            
    
    def fit(self, user_ratedmovies_df):
        '''Fit for this approach is to create the popularity table
        Fit with training dataset
        
        Based on different defination of "popularity",
        diffferent tables are used
        
        :param: 
        user_ratedmovies_df: whole user_ratedmoives dataset
        
        :output: the popularity table based on different popularity defination'''
        
        self.umdf = user_ratedmovies_df
        self.ptable_overall = self.umdf.groupby('movieID')['rating'].agg(['mean','count']).sort_values(by=['mean','count'],
                                                                                                       ascending=False)
        
        if self.popularity_kind == 1:
            self.ptable = self.ptable_overall
        
        elif self.popularity_kind == 2:
            self.ptable = self.ptable_overall.loc[self.ptable_overall['count']>=100]
        
        elif self.popularity_kind == 3:
            self.ptable = self.umdf.groupby('movieID')['rating'].agg(['mean','count']).sort_values(by=['count','mean'],
                                                                                                   ascending=False)
        else:
            print("Error: popularity_kind not exist.")
    
    
    def predict(self, data_topredict):
        '''Prediction unknown data (test dataset)
        
        :param: a dataset including userID and movieID
        
        :return: the list of movie ratings, same order as the input
        '''
        movie_predict_list  = data_topredict['movieID'].tolist()
        popularity_predict_list = [self.ptable_overall.ix[movie]['mean'] 
                                   for movie in movie_predict_list]
        return popularity_predict_list

    
    def recommend(self, userIDlist, topn=20):
        '''Print the recommendations out'''
        recommend_id = []
        recommend_name = []
        
        users = self.umdf['userID'].unique().tolist()
        
        for user in userIDlist:
            if user in users:
                user_table = self.umdf.loc[self.umdf['userID']==user]
                movies_watched = user_table['movieID'].tolist()
            else:
            # if the user not exit in the known datset, it's taken as new user, the watchlist is none
                movies_watched = []
            new_table = self.ptable.reset_index()
            recommend_movie_list = new_table[~new_table['movieID'].isin(movies_watched)]['movieID'].head(topn).tolist()
            recommend_movie_list_name = [self.moviedf.loc[self.moviedf['id']==movie]['title'] 
                                         for movie in recommend_movie_list]
            recommend_id.append(recommend_movie_list)
            recommend_name.append(recommend_movie_list_name)
        
        recommend_out = {'userID':userIDlist,
                         'recommend_movieID':recommend_id,
                         'recommend_movieName':recommend_name}
        
        recommend_df = pd.DataFrame(recommend_out)
        recommend_df.set_index('userID', inplace=True)
        
        return recommend_df

In [10]:
popularity_model = PopularityRecommender(movies,popularity_kind=3)

In [11]:
%%time
popularity_model.get_model_name()

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 8.82 µs


'Popularity'

In [12]:
%%time
popularity_model.fit(user_ratedmovies)

CPU times: user 144 ms, sys: 68.2 ms, total: 212 ms
Wall time: 222 ms


In [13]:
%%time
popularity_model_predict = popularity_model.predict(umtest_df)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


CPU times: user 35.6 s, sys: 471 ms, total: 36.1 s
Wall time: 38.4 s


In [14]:
%%time
user_list = [75, 78, 71534]
popularity_model_recommend = popularity_model.recommend(user_list)

CPU times: user 80.7 ms, sys: 8.88 ms, total: 89.6 ms
Wall time: 90.7 ms


### Collaborative Recommendation
Simiarity measurement: Pearson Correlation (works well based on rating); Manhattan distance, Euclidean distance, Cosine similarity

User-based collaborative -- pearson correlation
Recommendations based on ratings from other (similar) users

Item-based collaborative -- cosine similarity (prediction of item I for user a is based on the past ratings of user a on items similar to i)
Recommendations based on (similar) contents -- in this case, movies. 

DOESN'T REQUIRE ANY INFO ABOUT ITEMS

In [23]:
class CollaborativeRecommender:
    model_name = "Collaborative"

    def __init__(self, base, movie_df, user_ratedmovies):
        """:param:
        base: which collaborative recommender used
        base = 1: user-based collaborative
        base = 2: item-based collaborative
        base = 3: combination of user_based and iten_based collaborative

        movie_df: the movie information table, at least include movieID and movieName

        user_ratedmovies: whole user_ratedmoives dataset
        """
        self.base = base
        self.moviedf = movie_df
        self.umdf = user_ratedmovies

    def get_model_name(self):
        if self.base == 1:
            b_name = "User-based"
        elif self.base == 2:
            b_name = "Item-based"
        elif self.base == 3:
            b_name = "User-based and Item-based Combined"
        else:
            b_name = ""
        name = b_name + ' ' + self.model_name
        return name

    def create_table(self):
        movie_list = self.moviedf['id'].tolist()
        user_list = self.umdf['userID'].unique().tolist()
        table = pd.DataFrame(np.nan, index=user_list, columns=movie_list)

        user_ratings_num = self.umdf.shape[0]
        for i in range(user_ratings_num):
            user_info = self.umdf.iloc[i]
            user_info_id = user_info['userID']
            user_info_movie = user_info['movieID']
            user_info_rating = user_info['rating']
            table.loc[user_info_id, user_info_movie] = user_info_rating

        user_item_table = table
        term_user_table = table.T

        return user_item_table, term_user_table

    def fit(self, user_item_table, term_user_table):
        """process to create the similarity matrix
        :output: the similarity matrix"""

        self.user_item_table = user_item_table
        self.term_user_table = term_user_table

        if self.base == 1:  # similarity for users -- pearson correlation
            index_list = self.user_item_table.index.values.tolist()
            similarity_table = pd.DataFrame(0.0, index=index_list, columns=index_list)
            for user in index_list:
                user_info = self.user_item_table.loc[user]
                other_users = index_list[:]
                other_users.remove(user)
                for other_user in other_users:
                    other_user_info = self.user_item_table.loc[other_user]
                    nas = np.logical_or(np.isnan(user_info), np.isnan(other_user_info))
                    corr = pearsonr(user_info[~nas], other_user_info[~nas])
                    similarity_table.loc[user, other_user] = corr[0]

        elif self.base == 2:  # similarity for items -- cosine similarity
            index_list = self.term_user_table.index.values.tolist()
            similarity_table = pd.DataFrame(0.0, index=index_list, columns=index_list)
            for movie in index_list:
                movie_info = self.term_user_table.loc[movie]
                other_movies = index_list[:]
                other_movies.remove(movie)
                for other_movie in other_movies:
                    other_movie_info = self.term_user_table.loc[other_movie]
                    nas = np.logical_or(np.isnan(movie_info), np.isnan(other_movie_info))
                    if len(movie_info[~nas]) == 0 or len(movie_info[~nas]) == 0:
                        cosin = 0
                    else:
                        cosin = cosine_similarity(movie_info[~nas], other_movie_info[~nas]).item(0)
                    similarity_table.loc[movie, other_movie] = cosin

        elif self.base == 3:
            similarity_algorithms = ['user_based', 'item_based']
            similarity_table_dict = {}
            for algorithm in similarity_algorithms:
                if algorithm == 'user_based':
                    index_list = self.user_item_table.index.values.tolist()
                    similarity_table = pd.DataFrame(0.0, index=index_list, columns=index_list)
                    for user in index_list:
                        user_info = self.user_item_table.loc[user]
                        other_users = index_list[:]
                        other_users.remove(user)
                        for other_user in other_users:
                            other_user_info = self.user_item_table.loc[other_user]
                            nas = np.logical_or(np.isnan(user_info), np.isnan(other_user_info))
                            corr = pearsonr(user_info[~nas], other_user_info[~nas])
                            similarity_table.loc[user, other_user] = corr[0]
                    similarity_table_dict[algorithm] = similarity_table
                else:
                    index_list = self.term_user_table.index.values.tolist()
                    similarity_table = pd.DataFrame(0.0, index=index_list, columns=index_list)
                    for movie in index_list:
                        movie_info = self.term_user_table.loc[movie]
                        other_movies = index_list[:]
                        other_movies.remove(movie)
                        for other_movie in other_movies:
                            other_movie_info = self.term_user_table.loc[other_movie]
                            nas = np.logical_or(np.isnan(movie_info), np.isnan(other_movie_info))
                            if len(movie_info[~nas]) == 0 or len(movie_info[~nas]) == 0:
                                cosin = 0
                            else:
                                cosin = cosine_similarity(movie_info[~nas], other_movie_info[~nas]).item(0)
                            similarity_table.loc[movie, other_movie] = cosin
                    similarity_table_dict[algorithm] = similarity_table
            # even though the output for the base 3 is a dictionary
            # to keep it consistant with base 1 and 2, the dictionary is assigned to same name object "similarity_table"
            similarity_table = similarity_table_dict

        else:
            print("Error: the base selection is not available.")

        self.similarity_table = similarity_table

    def predict(self, data_topredict, topn=20):
        """Prediction unknown data (test dataset)
        :param:
        data_topredict: a dataset including userID and movieID
        topn: the top n most similar objects
        :return:
        the list of movie ratings, same order as the input"""

        predict_num = data_topredict.shape[0]
        output = []

        if self.base == 1:
            for num in range(predict_num):
                movie_id = data_topredict.iloc[num]['movieID']
                user_id = data_topredict.iloc[num]['userID']
                similarity_to_user = self.similarity_table.sort_values(by=user_id, ascending=False).index.values

                n = 1
                i = 0
                ratings = 0
                while n <= topn:
                    try:
                        similar_user_id = similarity_to_user[i]
                        similar_user_rating = self.user_item_table.loc[similar_user_id, movie_id]
                        i += 1
                        if similar_user_rating > 0:
                            ratings += similar_user_rating
                            n += 1
                    except IndexError:
                        n = n-1
                        break
                rating = ratings / n
                output.append(rating)

        elif self.base == 2:
            for num in range(predict_num):
                movie_id = data_topredict.iloc[num]['movieID']
                user_id = data_topredict.iloc[num]['userID']
                similarity_to_movie = self.similarity_table.sort_values(by=movie_id, ascending=False).index.values

                n = 1
                i = 0
                ratings = 0
                while n <= topn:
                    try:
                        similar_movie_id = similarity_to_movie[i]
                        similar_movie_rating = self.term_user_table.loc[similar_movie_id, user_id]
                        i += 1
                        if similar_movie_rating > 0:
                            ratings += similar_movie_rating
                            n += 1
                    except IndexError:
                        n = n-1
                        break
                rating = ratings / n
                output.append(rating)

        elif self.base == 3:
            similarity_algorithms = ['user_based', 'item_based']
            output_dict = {key: [] for key in similarity_algorithms}
            for algorithm in similarity_algorithms:
                if algorithm == 'user_based':
                    output = output_dict[algorithm]
                    for num in range(predict_num):
                        similarity_table = self.similarity_table[algorithm]
                        movie_id = data_topredict.iloc[num]['movieID']
                        user_id = data_topredict.iloc[num]['userID']
                        similarity_to_user = similarity_table.sort_values(by=user_id, ascending=False).index.values

                        n = 1
                        i = 0
                        ratings = 0
                        while n <= topn:
                            try:
                                similar_user_id = similarity_to_user[i]
                                similar_user_rating = self.user_item_table.loc[similar_user_id, movie_id]
                                i += 1
                                if similar_user_rating > 0:
                                    ratings += similar_user_rating
                                    n += 1
                            except IndexError:
                                n = n-1
                                break
                        rating = ratings / n
                        output.append(rating)
                    output_dict[algorithm] = output
                else:
                    output = output_dict[algorithm]
                    for num in range(predict_num):
                        similarity_table = self.similarity_table[algorithm]
                        movie_id = data_topredict.iloc[num]['movieID']
                        user_id = data_topredict.iloc[num]['userID']
                        similarity_to_movie = similarity_table.sort_values(by=movie_id, ascending=False).index.values

                        n = 1
                        i = 0
                        ratings = 0
                        while n <= topn:
                            try:
                                similar_movie_id = similarity_to_movie[i]
                                similar_movie_rating = self.term_user_table.loc[similar_movie_id, user_id]
                                i += 1
                                if similar_movie_rating > 0:
                                    ratings += similar_movie_rating
                                    n += 1
                            except IndexError:
                                n = n-1
                                break
                        rating = ratings / n
                        output.append(rating)
                    output_dict[algorithm] = output

            output1_array = np.array(output_dict['user_based'])
            output2_array = np.array(output_dict['item_based'])
            output = list(0.5 * output1_array + 0.5 * output2_array)

        return output

    def recommend(self, useridlist, topn_similarity=20, topn_recommends=20):
        """Print the recommendations out

        :param:
        userIDlist: list of users needing recommendation
        topn: the top n recommendations for the user

        :output:
        return a dataframe including userID,
        recommended movie id list,
        recommended movie name list
        """
        recommend_id = []
        recommend_name = []

        users = self.umdf['userID'].unique().tolist()
        movies = set(self.umdf['movieID'].unique())

        if self.base == 1:
            for user in useridlist:
                if user in users:
                    user_table = self.umdf.loc[self.umdf['userID'] == user]
                    movies_watched = set(user_table['movieID'])
                else:
                    movies_watched = set()
                movies_to_predict_id = list(movies - movies_watched)

                movies_to_predict_rating = []

                for movie in movies_to_predict_id:
                    similarity_to_user = self.similarity_table.sort_values(by=user, ascending=False).index.values
                    n = 1
                    i = 0
                    ratings = 0
                    while n <= topn_similarity:
                        try:
                            similar_user_id = similarity_to_user[i]
                            similar_user_rating = self.user_item_table.loc[similar_user_id, movie]
                            i += 1
                            if similar_user_rating > 0:
                                ratings += similar_user_rating
                                n += 1
                        except IndexError:
                            n = n-1
                            break
                    rating = ratings / n
                    movies_to_predict_rating.append(rating)

                topn_movies_index = sorted(range(len(movies_to_predict_rating)),
                                           key=lambda mi: movies_to_predict_rating[mi], reverse=True)[:topn_recommends]
                topn_movies_id = [movies_to_predict_id[index] for index in topn_movies_index]
                topn_movies_name = [self.moviedf.loc[self.moviedf['id'] == movie]['title']
                                    for movie in topn_movies_id]
                recommend_id.append(topn_movies_id)
                recommend_name.append(topn_movies_name)

        elif self.base == 2:
            for user in useridlist:
                if user in users:
                    user_table = self.umdf.loc[self.umdf['userID'] == user]
                    movies_watched = set(user_table['movieID'])
                else:
                    movies_watched = set()
                movies_to_predict_id = list(movies - movies_watched)

                movies_to_predict_rating = []

                for movie in movies_to_predict_id:
                    similarity_to_movie = self.similarity_table.sort_values(by=movie, ascending=False).index.values
                    n = 1
                    i = 0
                    ratings = 0
                    while n <= topn_similarity:
                        try:
                            similar_movie_id = similarity_to_movie[i]
                            similar_movie_rating = self.item_user_table.loc[similar_movie_id, user]
                            i += 1
                            if similar_movie_rating > 0:
                                ratings += similar_movie_rating
                                n += 1
                        except IndexError:
                            n = n-1
                            break
                    rating = ratings / n
                    movies_to_predict_rating.append(rating)

                topn_movies_index = sorted(range(len(movies_to_predict_rating)),
                                           key=lambda mi: movies_to_predict_rating[mi], reverse=True)[:topn_recommends]
                topn_movies_id = [movies_to_predict_id[index] for index in topn_movies_index]
                topn_movies_name = [self.moviedf.loc[self.moviedf['id'] == movie]['title']
                                    for movie in topn_movies_id]
                recommend_id.append(topn_movies_id)
                recommend_name.append(topn_movies_name)

        elif self.base == 3:
            similarity_algorithms = ['user_based', 'item_based']

            for user in useridlist:
                if user in users:
                    user_table = self.umdf.loc[self.umdf['userID'] == user]
                    movies_watched = set(user_table['movieID'])
                else:
                    movies_watched = set()
                movies_to_predict_id = list(movies - movies_watched)

                movies_to_predict_rating_dict = {key: [] for key in similarity_algorithms}

                for algorithm in similarity_algorithms:
                    if algorithm == 'user_based':
                        movies_to_predict_rating_user = movies_to_predict_rating_dict[algorithm]
                        similarity_table = self.similarity_table[algorithm]

                        for movie in movies_to_predict_id:
                            similarity_to_user = similarity_table.sort_values(by=user, ascending=False).index.values
                            
                            n = 1
                            i = 0
                            ratings = 0
                            while n <= topn_similarity:
                                try:
                                    similar_user_id = similarity_to_user[i]
                                    similar_user_rating = self.user_item_table.loc[similar_user_id, movie]
                                    i += 1
                                    if similar_user_rating > 0:
                                        ratings += similar_user_rating
                                        n += 1
                                except IndexError:
                                    n = n-1
                                    break
                            rating = ratings / n
                            movies_to_predict_rating_user.append(rating)

                        movies_to_predict_rating_dict[algorithm] = movies_to_predict_rating_user

                    else:
                        movies_to_predict_rating_item = movies_to_predict_rating_dict[algorithm]
                        similarity_table = self.similarity_table[algorithm]

                        for movie in movies_to_predict_id:
                            similarity_to_movie = similarity_table.sort_values(by=movie, ascending=False).index.values
                            n = 1
                            i = 0
                            ratings = 0
                            while n <= topn_similarity:
                                try:
                                    similar_movie_id = similarity_to_movie[i]
                                    similar_movie_rating = self.item_user_table.loc[similar_movie_id, user]
                                    i += 1
                                    if similar_movie_rating > 0:
                                        ratings += similar_movie_rating
                                        n += 1
                                except IndexError:
                                    n = n-1
                                    break
                            rating = ratings / n
                            movies_to_predict_rating_item.append(rating)

                        movies_to_predict_rating_dict[algorithm] = movies_to_predict_rating_item

                movies_to_predict_rating1_array = np.array(movies_to_predict_rating_dict['user_based'])
                movies_to_predict_rating2_array = np.array(movies_to_predict_rating_dict['item_based'])
                movies_to_predict_rating = list(movies_to_predict_rating1_array * 0.5 + movies_to_predict_rating2_array * 0.5)

                topn_movies_index = sorted(range(len(movies_to_predict_rating)),
                                           key=lambda mi: movies_to_predict_rating[mi], reverse=True)[:topn_recommends]
                topn_movies_id = [movies_to_predict_id[index] for index in topn_movies_index]
                topn_movies_name = [self.moviedf.loc[self.moviedf['id'] == movie]['title']
                                    for movie in topn_movies_id]
                recommend_id.append(topn_movies_id)
                recommend_name.append(topn_movies_name)

        recommend_out = {'userID': useridlist,
                         'recommend_movieID': recommend_id,
                         'recommend_movieName': recommend_name}

        recommend_df = pd.DataFrame(recommend_out)
        recommend_df.set_index('userID', inplace=True)

        return recommend_df


In [24]:
%%time
collaborativemodel_user = CollaborativeRecommender(base=1,
                                                  movie_df=movies,
                                                  user_ratedmovies=user_ratedmovies)

CPU times: user 15 µs, sys: 9 µs, total: 24 µs
Wall time: 30 µs


In [25]:
%%time
user_item_table,item_user_table = collaborativemodel_user.create_table()

CPU times: user 6min 40s, sys: 2.12 s, total: 6min 42s
Wall time: 6min 47s


In [26]:
%%time
collaborativemodel_user.fit(user_item_table, item_user_table)

  mx = x.mean()
  ret = ret.dtype.type(ret / rcount)
  my = y.mean()
  r = r_num / r_den


CPU times: user 1h 30min 17s, sys: 34.3 s, total: 1h 30min 52s
Wall time: 1h 32min 33s


In [27]:
%%time
collaborativemodel_user_predict = collaborativemodel_user.predict(umtest_df, topn=20)

CPU times: user 37min 49s, sys: 16min 55s, total: 54min 45s
Wall time: 55min 36s


In [28]:
%%time
user_list = [325, 71509]
collaborativemodel_user_recommend = collaborativemodel_user.recommend(user_list,topn_similarity=20, topn_recommends=20)
collaborativemodel_user_recommend

CPU times: user 10min 10s, sys: 2min 28s, total: 12min 39s
Wall time: 12min 57s


In [31]:
filename = "Collaborative_user_model_alldata.pickle"
with open(filename, "wb") as file:
    pickle.dump(collaborativemodel_user,file)