In [9]:
# for recommender
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pickle

In [34]:
def conv(val):
    try:
        return int(val)
    except:
        return -1

# convert to json to return the result through http
def to_json(recommend_list: list()):
    movie_dict = {}
    for i, movie_id in enumerate(recommend_list):
        movie_dict[str(i)] = str(movie_id) 
    return movie_dict

def loadTFIDF(savePath):
        """
            savePath (str): * do not include ".pickle"
        """
        file_to_read = open(savePath + ".pickle", "rb")
        tfidf = pickle.load(file_to_read)
        file_to_read.close()
        return tfidf    

class MOVIE_TFIDF:

    def __init__(self, dataPath=None, savePath=None, nrows=None):
        """
            savePath (str): path of the saved MOVIE_TFIDF, not includes '.pkl' 
        """
        # to load the existing tfidf
        if savePath is not None:
            self = loadTFIDF(savePath)
        else:
            self._data = self.__readData(dataPath, nrows=nrows)
            self._tfidf_matrix = self.__computeTFIDFmatrix(self._data['tfidf'])
            self.movieid_index = self._data['id']
            del self._data
            self._cosine_sim = self.__computeSimilarity(self._tfidf_matrix)
            del self._tfidf_matrix

    def __readData(self, dataPath, nrows):
        movies_tfidf = pd.read_csv(dataPath, 
                                   nrows=nrows, 
                                   dtype={'title': str, 'genres': str}, 
                                   converters={'id': conv})
        
        # drop na and dups
        movies_tfidf.dropna(how="any", inplace=True)
        movies_tfidf.drop_duplicates(inplace=True)
        
        # title + genre + overview
        genres = movies_tfidf['genres']
        title = movies_tfidf['title']
        overview = movies_tfidf['overview']
        movies_tfidf['tfidf'] = genres + '|' + title + "|" + overview
        
        # set index
        movies_tfidf.index = range(len(movies_tfidf))
        return movies_tfidf

    def __computeTFIDFmatrix(self, movieGenres):
        tf = TfidfVectorizer(analyzer='word', 
                             ngram_range=(1, 2),
                             min_df=0, 
                             stop_words='english')
        
        tfidf_matrix = tf.fit_transform(movieGenres)
        return tfidf_matrix

    def __computeSimilarity(self, tfidf_matrix):
        cosine_sim = cosine_similarity(X = tfidf_matrix,
                                       Y = tfidf_matrix,
                                       dense_output = False)
        return cosine_sim

    def recommend(self, target_id: int, rated_movie_id: list() = None, numRecommendation=10) -> list():
        """
            target_id (int): the movie's id that needs recommendation
            numRecommendation (int): number of recommend movies, default = 10
            return list(): list of recommend ids
        """
        if any(self.movieid_index.isin([target_id])) is True:
            print('Target id is exists')
        else:
            return ['Target id is not exists']
        
        recommendLists = []
        
        # get the index of the target movie
        movie_index = self.movieid_index[self.movieid_index == target_id].index
        print('The target index: ', movie_index)
        print('the target id in data: ', self.movieid_index.loc[movie_index])
        
        # get all similarities of the target movie and the others
        movie_sims = self._cosine_sim[movie_index, :].toarray()[0]
        
        # sort by sim but get indexes only
        sorted_indexes = np.argsort(movie_sims)
        
        # find most similar movies
        for i in sorted_indexes:
            if (movie_sims[i] != 1.0 and numRecommendation > 0):
                # append the movie's ids
                candidate_movie_id = self.movieid_index.loc[i]
                # avoid rated movies
                if (rated_movie_id is not None) and (candidate_movie_id in rated_movie_id):
                    continue
                recommendLists.append(candidate_movie_id)
                numRecommendation -= 1
            else:
                break
        return recommendLists

    def saveTFIDF(self, savePath):
        """
            savePath (str): * do not include ".pickle"
        """
        file_to_store = open(savePath + ".pkl", "wb")
        pickle.dump(self, file_to_store, protocol=4)
        file_to_store.close()


In [35]:
DATA_PATH = './data/movie_tfidf.csv'
N_ROWS = None # all rows

In [36]:
tfidf = MOVIE_TFIDF(dataPath=DATA_PATH, nrows=N_ROWS)

In [37]:
tfidf.recommend(target_id=44919)

['Target id is not exists']

In [38]:
tfidf.recommend(target_id=862)

Target id is exists
The target index:  Int64Index([0], dtype='int64')
the target id in data:  0    862
Name: id, dtype: int64


[1571, 31687, 111310, 94917, 107643, 44399, 32084, 42191, 1549, 26694]

In [39]:
movies = pd.read_csv('./data/Movies.csv', 
                     dtype={'movie_id': int})
movies.head()

Unnamed: 0,movie_id,imdb_id,title,overview,release_date,poster_paths
0,862,tt0114709,Toy Story,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg
1,8844,tt0113497,Jumanji,When siblings Judy and Peter discover an encha...,1995-12-15,
2,949,tt0113277,Heat,"Obsessive master thief, Neil McCauley leads a ...",1995-12-15,
3,710,tt0113189,GoldenEye,James Bond must unmask the mysterious head of ...,1995-11-16,/HORpg5CSkmeQlAolx3bKMrKgfi.jpg
4,1408,tt0112760,Cutthroat Island,"Morgan Adams and her slave, William Shaw, are ...",1995-12-22,


In [40]:
recommendation = tfidf.recommend(target_id=862)

Target id is exists
The target index:  Int64Index([0], dtype='int64')
the target id in data:  0    862
Name: id, dtype: int64


In [41]:
movies[movies['movie_id']==862]

Unnamed: 0,movie_id,imdb_id,title,overview,release_date,poster_paths
0,862,tt0114709,Toy Story,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg


In [42]:
movies[movies['movie_id'].isin(recommendation)]

Unnamed: 0,movie_id,imdb_id,title,overview,release_date,poster_paths
2721,1571,tt0337978,Live Free or Die Hard,"John McClane is back and badder than ever, and...",2007-06-20,/dQP1lu4tBtCiAMeCRcuTFpJiM7y.jpg
3226,31687,tt1520496,Circle of Eight,Jessica moves into a loft on the eighth floor ...,2009-10-27,
3229,111310,tt0025607,Operator 13,Union spy Gail Loveless impersonates a black m...,1934-06-08,
3230,94917,tt0055489,Summer and Smoke,"In a small Mississippi town in 1916, an eccent...",1961-11-16,
3231,107643,tt0418815,Bloody Territories,A once-powerful yakuza clan disbands as a resu...,1969-07-26,
3232,44399,tt0033405,Blood and Sand,Bullfighter Juan Gallardo falls for socialite ...,1941-05-30,
3233,32084,tt1483831,Lebanon,"June, 1982 - The First Lebanon War. A lone tan...",2009-10-10,
3234,42191,tt0037832,Johnny Angel,George Raft plays a sailor who sets out to sol...,1945-10-24,
3237,1549,tt0338977,Games of Love and Chance,The life of a band of teenager in a suburb nea...,2003-11-25,
3238,26694,tt0956101,Big River Man,Follows Martin Strel as he attempts to cover 3...,2009-01-15,


In [None]:
tfidf.saveTFIDF('./model/tfidf')

In [None]:
load_tfidf = loadTFIDF('./model/tfidf')

In [None]:
load_tfidf.recommend(44919)

['Target id is not exists']

In [None]:
recommendation = load_tfidf.recommend(target_id=107643)

Target id is exists
The target index:  Int64Index([3219], dtype='int64')
the target id in data:  3219    107643
Name: id, dtype: int64
