In [1]:
from gensim.models.doc2vec import Doc2Vec
import deepdish as dd
import numpy as np

In [2]:
def cosine_sim(v1, v2):
    """Return the cosine similarity between two numpy vectors.

    Computed as the sum of the element-wise product of ``v1`` and ``v2``
    divided by the product of their Euclidean norms.

    No special handling is done for zero-length vectors: a zero norm leads
    to a division by zero in numpy (NaN result plus a runtime warning).
    """
    dot_product = np.sum(v1 * v2)
    norm_v1 = np.sqrt(np.sum(np.square(v1)))
    norm_v2 = np.sqrt(np.sum(np.square(v2)))
    return dot_product / (norm_v1 * norm_v2)

## Compare DocVecs

In [3]:
# Load a previously trained Doc2Vec model from disk.
# NOTE(review): the file name suggests 20 epochs, PV-DBOW, 50-dim vectors,
# lr 0.025, window 8, 5 negative samples - confirm against the training script.
path = '../doc2vec-models/2016-04-14_17.36.08_20e_pv-dbow_size50_lr0.025_window8_neg5'
model = Doc2Vec.load(path)

In [4]:
# jurassic park
# imdb_id 107290
# movie_id 82
# 261 ratings in our ML-100k

# jurassic park 2 (Lost World: Jurassic Park, The (1997))
# imdb_id 119567
# movie_id 252
# 158 ratings in our ML-100k

In [5]:
# Movies whose doc vectors are most similar to Jurassic Park's.
# Doc tags are '<imdb_id>.txt'; '107290.txt' is Jurassic Park (imdb_id 107290).
model.docvecs.most_similar(positive=['107290.txt'])

[('119675.txt', 0.7279843091964722),
 ('119567.txt', 0.679550290107727),
 ('120004.txt', 0.6715301871299744),
 ('90605.txt', 0.6436644792556763),
 ('118928.txt', 0.633784294128418),
 ('97443.txt', 0.6167540550231934),
 ('117998.txt', 0.6146465539932251),
 ('56931.txt', 0.6136928200721741),
 ('88760.txt', 0.6116420030593872),
 ('118689.txt', 0.5930735468864441)]

In [6]:
# most similar movie to jurassic park in ML-100k:
# imdb_id 119675
# movie_id 264
# title Mimic

In [7]:
# Movies most dissimilar to Jurassic Park.
# With negative=[...], the reported scores have the opposite sign of the
# cosine similarity to Jurassic Park itself (compare 0.0764 for '113083.txt'
# here with the -0.0764 returned by cosine_sim for the same pair).
model.docvecs.most_similar(negative=['107290.txt']) # jurassic park

[('16630.txt', 0.11265313625335693),
 ('36868.txt', 0.09027586132287979),
 ('98635.txt', 0.08478951454162598),
 ('89853.txt', 0.08220337331295013),
 ('99334.txt', 0.0779241994023323),
 ('57710.txt', 0.07723505795001984),
 ('113083.txt', 0.07643953710794449),
 ('168740.txt', 0.07486752420663834),
 ('189142.txt', 0.06650149822235107),
 ('156887.txt', 0.04408472776412964)]

In [8]:
# most dissimilar to jurassic park in ML-100k:
# imdb_id 113083
# movie_id 927
# title "Flower of My Secret, The (Flor de mi secreto, La) (1995)"

In [9]:
# Doc vector of Jurassic Park (imdb_id 107290, movie_id 82).
jp_docvec = model.docvecs['107290.txt']

In [10]:
# Doc vector of The Lost World: Jurassic Park (imdb_id 119567, movie_id 252).
jp2_docvec = model.docvecs['119567.txt']

In [11]:
# Doc vector of The Flower of My Secret (imdb_id 113083, movie_id 927),
# the most dissimilar ML-100k movie to Jurassic Park found above.
flower_docvec = model.docvecs['113083.txt']

In [12]:
# Doc vector of The Hunt for Red October.
# imdb_id 99810
# movie_id 265
red_docvec = model.docvecs['99810.txt']

In [13]:
# Doc vector of Men in Black.
# imdb_id 119654
# movie_id 257
men_docvec = model.docvecs['119654.txt']

In [14]:
# Doc vectors: Jurassic Park vs. The Lost World: Jurassic Park.
cosine_sim(jp_docvec, jp2_docvec)

0.67955035

In [15]:
# Doc vectors: Jurassic Park vs. The Flower of My Secret.
cosine_sim(jp_docvec, flower_docvec)

-0.076439522

In [16]:
# Doc vectors: Jurassic Park vs. The Hunt for Red October.
cosine_sim(jp_docvec, red_docvec)

0.31112918

In [17]:
# Doc vectors: Jurassic Park vs. Men in Black.
cosine_sim(jp_docvec, men_docvec)

0.38330352

## Compare Item Factors

In [18]:
# Load a saved model; the cells below read loaded['items'] and loaded['params']['Q'].
# NOTE(review): the file name suggests an MPCF-SI model trained on ML-100k for
# 20 epochs with a 0.7 train/test split and lambda-item 0.1 - confirm.
loaded = dd.io.load('../models/mpcf-si/2016-06-20_18.58.44_si_ml-100k_e20_tt-0.7_lambda-item-0.1.h5')

In [19]:
# Translate ML-100k movie_ids into the model's 0-based item indices.
items_map = loaded['items']  # maps movie_id to 0-based index
# Order of movie_ids: Jurassic Park, The Lost World: Jurassic Park,
# The Flower of My Secret, The Hunt for Red October, Men in Black.
jp_id, jp2_id, flower_id, red_id, men_id = (
    items_map[movie_id] for movie_id in (82, 252, 927, 265, 257)
)

In [20]:
# Item factors of the loaded model; indexed below with the 0-based item
# indices obtained from items_map.
item_factors = loaded['params']['Q']

In [21]:
# Item factors: Jurassic Park vs. The Lost World: Jurassic Park.
cosine_sim(item_factors[jp_id], item_factors[jp2_id])

0.27288967817141402

In [22]:
# Item factors: Jurassic Park vs. The Flower of My Secret.
cosine_sim(item_factors[jp_id], item_factors[flower_id])

-0.50515176737951706

In [23]:
# Item factors: Jurassic Park vs. The Hunt for Red October.
cosine_sim(item_factors[jp_id], item_factors[red_id])

0.71786133202780744

In [24]:
# Item factors: Jurassic Park vs. Men in Black.
cosine_sim(item_factors[jp_id], item_factors[men_id])

0.18186173426419175

### Remove all but 10 ratings of Jurassic Park and Jurassic Park 2 from the training set

In [25]:
# Load a second saved model, replacing the one held in `loaded` so far.
# NOTE(review): per the file name, this variant was trained with only some of
# the Jurassic Park ratings kept - confirm against the training run.
loaded = dd.io.load('../models/mpcf-si/2016-06-21_14.22.39_si_ml-100k_e20_tt-0.7_only-some-jurassic-park.h5')

In [26]:
# Re-resolve the item indices against the newly loaded model's own mapping.
items_map = loaded['items']  # maps movie_id to 0-based index
# Order of movie_ids: Jurassic Park, The Lost World: Jurassic Park,
# The Flower of My Secret, The Hunt for Red October, Men in Black.
jp_id, jp2_id, flower_id, red_id, men_id = (
    items_map[movie_id] for movie_id in (82, 252, 927, 265, 257)
)

In [27]:
# Item factors of the most recently loaded model; indexed below with the
# 0-based item indices obtained from items_map.
item_factors = loaded['params']['Q']

In [28]:
# Item factors (reduced-ratings model): Jurassic Park vs. The Lost World: Jurassic Park.
cosine_sim(item_factors[jp_id], item_factors[jp2_id])

0.12227672813951915

In [29]:
# Item factors (reduced-ratings model): Jurassic Park vs. The Flower of My Secret.
cosine_sim(item_factors[jp_id], item_factors[flower_id])

-0.2175744707509619

In [30]:
# Item factors (reduced-ratings model): Jurassic Park vs. The Hunt for Red October.
cosine_sim(item_factors[jp_id], item_factors[red_id])

0.37273054348096568

In [31]:
# Item factors (reduced-ratings model): Jurassic Park vs. Men in Black.
cosine_sim(item_factors[jp_id], item_factors[men_id])

0.13628980628300397