In [2]:
# !pip3 install implicit
import implicit
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import pickle

from tqdm import tqdm
from glob import glob
from IPython.core.interactiveshell import InteractiveShell
from sklearn.neighbors import NearestNeighbors

InteractiveShell.ast_node_interactivity = 'all'

In [3]:
ratings = pd.read_csv('../data/ratings.csv')
features = pickle.load(open('../data/preprocess/processed_infos.p', 'rb'))
info = pickle.load(open( "../data/preprocess/infos_tmdb.p", "rb" ))
text = pickle.load(open('../data/preprocess/texts_gpt2.p', 'rb'))

In [19]:
id_features = set(features.keys())
id_info     = set(info.keys())
id_text     = set(text.keys())

id_all      = id_features.intersection(id_info).intersection(id_text)
id_all      = sorted(id_all)

In [23]:
ratings = ratings.loc[ratings.movieId.isin(id_all)]

In [24]:
counting = ratings.groupby(['movieId']).size().reset_index()
counting.columns = ['movieId', 'count']
counting.sort_values('count', ascending=False)['movieId'].head().values

array([ 356,  318,  296,  593, 2571])

In [55]:
id_features = set(features.keys())
id_info     = set(info.keys())
id_text     = set(text.keys())

id_all      = id_features.intersection(id_info).intersection(id_text)
id_all      = sorted(id_all)

In [56]:
dict_features = {}
for i in tqdm(id_all):
    tmp = {}
    tmp.update(info[i])
    tmp.update(features[i])
    tmp.update({"text": text[i]})
    dict_features[i] = tmp

100%|██████████| 26770/26770 [00:00<00:00, 123313.52it/s]


# Features
* popularity
* budget
* revenue
* runtime
* vote_average
* vote_count
* text

In [30]:
def getMovie(n=5):
    # random index
    _id = id_all.copy()
    np.random.shuffle(_id)
    _idx = _idx[:n]
    movies_info     = list(map(lambda i: dict_features[i], _id))
    
    return movies_info

In [40]:
def transform(obj):
    tmp = np.append(obj['text'], obj['popularity'])
    tmp = np.append(tmp, obj['budget'])
    tmp = np.append(tmp, obj['revenue'])
    tmp = np.append(tmp, obj['runtime'])
    tmp = np.append(tmp, obj['vote_average'])
    tmp = np.append(tmp, obj['vote_count'])
    return tmp

In [73]:
X = []
for i in tqdm(id_all):
    X.append(transform(dict_features[i]))

100%|██████████| 26770/26770 [00:00<00:00, 34170.86it/s]


In [74]:
X = np.array(X)
X[np.isnan(X)] = 0

In [75]:
model = NearestNeighbors().fit(X)

In [90]:
def getSimilar(model, obj, n=5):
    _id = model.kneighbors([transform(obj)], n, return_distance=False)
    _id = np.array(id_all)[_id]
    return _id

In [95]:
lst_id = getSimilar(model, dict_features[1])[0]

In [96]:
lst_id

array([     1,  55110,    437, 102943,  88005])

In [102]:
name_all = []
for i in id_all:
    name_all.append((dict_features[i]['original_title'], i))

In [103]:
name_all

[('Toy Story', 1),
 ('Jumanji', 2),
 ('Grumpier Old Men', 3),
 ('Waiting to Exhale', 4),
 ('Father of the Bride Part II', 5),
 ('Heat', 6),
 ('Sabrina', 7),
 ('Tom and Huck', 8),
 ('Sudden Death', 9),
 ('GoldenEye', 10),
 ('The American President', 11),
 ('Dracula: Dead and Loving It', 12),
 ('Balto', 13),
 ('Nixon', 14),
 ('Cutthroat Island', 15),
 ('Casino', 16),
 ('Sense and Sensibility', 17),
 ('Four Rooms', 18),
 ('Ace Ventura: When Nature Calls', 19),
 ('Money Train', 20),
 ('Get Shorty', 21),
 ('Copycat', 22),
 ('Assassins', 23),
 ('Powder', 24),
 ('Leaving Las Vegas', 25),
 ('Othello', 26),
 ('Now and Then', 27),
 ('Persuasion', 28),
 ('La Cité des Enfants Perdus', 29),
 ('摇啊摇，摇到外婆桥', 30),
 ('Dangerous Minds', 31),
 ('Twelve Monkeys', 32),
 ('Guillaumet, les ailes du courage', 33),
 ('Babe', 34),
 ('Carrington', 35),
 ('Dead Man Walking', 36),
 ('Across the Sea of Time', 37),
 ('It Takes Two', 38),
 ('Clueless', 39),
 ('Cry, the Beloved Country', 40),
 ('Richard III', 41),
 ('D