In [8]:
# for recommender
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pickle

In [20]:
def conv(val):
    try:
        return int(val)
    except:
        return -1

# convert to json to return the result through http
def to_json(recommend_list: list()):
    movie_dict = {}
    for i, movie_id in enumerate(recommend_list):
        movie_dict[str(i)] = str(movie_id) 
    return movie_dict

def loadTFIDF(savePath):
        """
            savePath (str): * do not include ".pkl"
        """
        file_to_read = open(savePath + ".pkl", "rb")
        tfidf = pickle.load(file_to_read)
        file_to_read.close()
        return tfidf    

class MOVIE_TFIDF:

    def __init__(self, dataPath=None, savePath=None, nrows=None):
        """
            savePath (str): path of the saved MOVIE_TFIDF, not includes '.pkl' 
        """
        # to load the existing tfidf
        if savePath is not None:
            self = loadTFIDF(savePath)
        else:
            self._data = self.__readData(dataPath, nrows=nrows)
            self._tfidf_matrix = self.__computeTFIDFmatrix(self._data['tfidf'])
            self.movieid_index = self._data['id']
            del self._data
            self._cosine_sim = self.__computeSimilarity(self._tfidf_matrix)
            del self._tfidf_matrix

    def __readData(self, dataPath, nrows):
        movies_tfidf = pd.read_csv(dataPath, 
                                   nrows=nrows, 
                                   dtype={'title': str, 'genres': str}, 
                                   converters={'id': conv})
        
        # drop na and dups
        movies_tfidf.dropna(how="any", inplace=True)
        movies_tfidf.drop_duplicates(inplace=True)
        
        # title + genre + overview
        genres = movies_tfidf['genres']
        title = movies_tfidf['title']
        overview = movies_tfidf['overview']
        movies_tfidf['tfidf'] = genres + '|' + title + "|" + overview
        
        # set index
        movies_tfidf.index = range(len(movies_tfidf))
        return movies_tfidf

    def __computeTFIDFmatrix(self, movieGenres):
        tf = TfidfVectorizer(analyzer='word', 
                             ngram_range=(1, 2),
                             min_df=0, 
                             stop_words='english')
        
        tfidf_matrix = tf.fit_transform(movieGenres)
        return tfidf_matrix

    def __computeSimilarity(self, tfidf_matrix):
        cosine_sim = cosine_similarity(X = tfidf_matrix,
                                       Y = tfidf_matrix,
                                       dense_output = False)
        return cosine_sim

    def recommend(self, target_id: int, rated_movie_id: list() = None, numRecommendation=10) -> list():
        """
            target_id (int): the movie's id that needs recommendation
            numRecommendation (int): number of recommend movies, default = 10
            return list(): list of recommend ids
        """
        if any(self.movieid_index.isin([target_id])) is True:
            print('Target id is exists')
        else:
            return ['Target id is not exists']
        
        recommendLists = []
        
        # get the index of the target movie
        movie_index = self.movieid_index[self.movieid_index == target_id].index
        print('The target index: ', movie_index)
        print('the target id in data: ', self.movieid_index.loc[movie_index])
        
        # get all similarities of the target movie and the others
        movie_sims = self._cosine_sim[movie_index, :].toarray()[0]
        
        # sort by sim but get indexes only
        sorted_indexes = np.argsort(movie_sims)
        
        # find most similar movies
        for i in sorted_indexes:
            if (movie_sims[i] != 1.0 and numRecommendation > 0):
                # append the movie's ids
                candidate_movie_id = self.movieid_index.loc[i]
                # avoid rated movies
                if (rated_movie_id is not None) and (candidate_movie_id in rated_movie_id):
                    continue
                recommendLists.append(candidate_movie_id)
                numRecommendation -= 1
            else:
                break
        return recommendLists

    def saveTFIDF(self, savePath):
        """
            savePath (str): * do not include ".pkl"
        """
        file_to_store = open(savePath + ".pkl", "wb")
        pickle.dump(self, file_to_store, protocol=4)
        file_to_store.close()


In [10]:
DATA_PATH = './data/movie_tfidf.csv'
N_ROWS = None # all rows

In [11]:
tfidf = MOVIE_TFIDF(dataPath=DATA_PATH, nrows=N_ROWS)

In [12]:
tfidf.recommend(target_id=44919)

['Target id is not exists']

In [13]:
tfidf.recommend(target_id=862)

Target id is exists
The target index:  Int64Index([0], dtype='int64')
the target id in data:  0    862
Name: id, dtype: int64


[111109, 4913, 67087, 130492, 1116, 1249, 171771, 1776, 42952, 31148]

In [14]:
movies = pd.read_csv('./data/Movies.csv', 
                     dtype={'movie_id': int})
movies.head()

Unnamed: 0,movie_id,imdb_id,title,overview,release_date,poster_path
0,862,tt0114709,Toy Story,"Led by Woody, Andy's toys live happily in his ...",10/30/1995,/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg
1,8844,tt0113497,Jumanji,When siblings Judy and Peter discover an encha...,12/15/1995,/6aGn2X51bahFoOI8wE1h2VGTgcH.jpg
2,949,tt0113277,Heat,"Obsessive master thief, Neil McCauley leads a ...",12/15/1995,/obpPQskaVpSiC9RcJRB6iWDTCXS.jpg
3,710,tt0113189,GoldenEye,James Bond must unmask the mysterious head of ...,11/16/1995,/bFzjdy6ucvNlXmJwoSoYfufV6lP.jpg
4,1408,tt0112760,Cutthroat Island,"Morgan Adams and her slave, William Shaw, are ...",12/22/1995,/hYdeBZ4BFXivdouxLfQGWNE6zRx.jpg


In [15]:
recommendation = tfidf.recommend(target_id=862)

Target id is exists
The target index:  Int64Index([0], dtype='int64')
the target id in data:  0    862
Name: id, dtype: int64


In [16]:
movies[movies['movie_id']==862]

Unnamed: 0,movie_id,imdb_id,title,overview,release_date,poster_path
0,862,tt0114709,Toy Story,"Led by Woody, Andy's toys live happily in his ...",10/30/1995,/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg


In [17]:
movies[movies['movie_id'].isin(recommendation)]

Unnamed: 0,movie_id,imdb_id,title,overview,release_date,poster_path
2542,4913,tt0425600,"Sorry, Haters",Against the anxieties and fears of post-9/11 A...,9/10/2005,/5jA1Ka8ZvjNbBDkhQ7LowcCi5Zh.jpg
2546,1116,tt0460989,The Wind That Shakes the Barley,In 1920s Ireland young doctor Damien O'Donovan...,6/23/2006,/9XquDdOGrlC0EAbPoOXALqS2dDh.jpg
2549,1249,tt0427969,Hollywoodland,The complicated life and controversial suicide...,8/31/2006,/fhLRUXpVGBsDL2dMWkOwuPNzlrq.jpg
2550,171771,tt2449612,The Scar,The Scar draws an intense psychological suspen...,3/3/2013,
2551,1776,tt0486358,Jesus Camp,A growing number of Evangelical Christians bel...,9/15/2006,/6ueSRBYaicEYvR2sFHVvptKNaxI.jpg
2555,31148,tt0032342,City for Conquest,The heartbreaking but hopeful tale of Danny Ke...,9/21/1940,/voqHla2OkukEyVgQBOuMjhzWUCu.jpg
4952,42952,tt0401089,72 Meters,The film begins in the 1980s Soviet Union. Two...,2/12/2004,/iMQetgfzrxeBjy61KJK31jaZG2o.jpg
4955,130492,tt0276594,Uno bianca,"Rimini, 1991. For more than a year, the uno bi...",2/5/2001,/dOH8DzZIieH9NOJxF5LXxBe0HaZ.jpg
4956,67087,tt0085385,Curtains,Six young actresses auditioning for a movie ro...,3/4/1983,/jpMClX0G1Gsia1Dn1HaFGPXqjd0.jpg
5541,111109,tt2028550,Century of Birthing,An artist struggles to finish his work while a...,11/17/2011,/Z0qIhiDpdm2Sd8V2M1Nk6oLKdP.jpg


In [18]:
tfidf.saveTFIDF('./model/tfidf')

In [21]:
load_tfidf = loadTFIDF('./model/tfidf')

In [22]:
load_tfidf.recommend(44919)

['Target id is not exists']

In [23]:
recommendation = load_tfidf.recommend(target_id=107643)

Target id is exists
The target index:  Int64Index([3219], dtype='int64')
the target id in data:  3219    107643
Name: id, dtype: int64
