In [47]:
from gensim.models.doc2vec import Doc2Vec
import deepdish as dd
import numpy as np
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity

In [52]:
def cosine_sim(v1, v2):
    """Return the cosine similarity of two vectors.

    The element-wise product of ``v1`` and ``v2`` is summed and divided by
    the product of their Euclidean norms.
    """
    # Alternative formulations kept from the original cell for reference:
    #return cosine_similarity(v1,v2)
    #return 1 - spatial.distance.cosine(v1, v2)
    dot = np.sum(v1 * v2)
    norm_v1 = np.sqrt(np.sum(np.square(v1)))
    norm_v2 = np.sqrt(np.sum(np.square(v2)))
    return dot / (norm_v1 * norm_v2)

def angular_sim(v1, v2):
    """Return the angular similarity of two vectors.

    Computed as ``1 - arccos(cosine_sim(v1, v2)) / pi``: 1 for vectors
    pointing in the same direction, 0.5 for orthogonal vectors and 0 for
    vectors pointing in opposite directions.
    """
    # Floating-point rounding can leave the cosine marginally outside
    # [-1, 1] (e.g. for two identical vectors); np.arccos returns NaN there,
    # so clamp to the valid domain first. NaN inputs still propagate as NaN.
    cos = np.clip(cosine_sim(v1, v2), -1.0, 1.0)
    return 1 - (np.arccos(cos) / np.pi)

## Compare DocVecs

In [34]:
# Trained Doc2Vec model whose document vectors are inspected below.
# NOTE(review): the file name suggests PV-DBOW, 50-dim vectors, 20 epochs,
# window 8 and 5 negative samples -- confirm against the training script.
path = '../doc2vec-models/2016-04-14_17.36.08_20e_pv-dbow_size50_lr0.025_window8_neg5'
model = Doc2Vec.load(path)

In [35]:
# jurassic park
# imdb_id 107290
# movie_id 82
# 261 ratings in our ML-100k

# jurassic park 2 (Lost World: Jurassic Park, The (1997))
# imdb_id 119567
# movie_id 252
# 158 ratings in our ML-100k

In [36]:
# Documents most similar to Jurassic Park (tag '107290.txt' = imdb_id 107290).
# The output lists (document tag, similarity score) pairs, best match first.
model.docvecs.most_similar(positive=['107290.txt'])

[('119675.txt', 0.7279843091964722),
 ('119567.txt', 0.679550290107727),
 ('120004.txt', 0.6715301871299744),
 ('90605.txt', 0.6436644792556763),
 ('118928.txt', 0.633784294128418),
 ('97443.txt', 0.6167540550231934),
 ('117998.txt', 0.6146465539932251),
 ('56931.txt', 0.6136928200721741),
 ('88760.txt', 0.6116420030593872),
 ('118689.txt', 0.5930735468864441)]

In [37]:
# most similar movie to jurassic park in ML-100k:
# imdb_id 119675
# movie_id 264
# title Mimic

In [38]:
# Documents most dissimilar to Jurassic Park, found by passing its tag as a
# negative example. NOTE(review): the scores look like negated cosines --
# '113083.txt' scores 0.0764 here, while cosine_sim(jp_docvec, flower_docvec)
# further down is -0.0764.
model.docvecs.most_similar(negative=['107290.txt']) # jurassic park

[('16630.txt', 0.11265313625335693),
 ('36868.txt', 0.09027586132287979),
 ('98635.txt', 0.08478951454162598),
 ('89853.txt', 0.08220337331295013),
 ('99334.txt', 0.0779241994023323),
 ('57710.txt', 0.07723505795001984),
 ('113083.txt', 0.07643953710794449),
 ('168740.txt', 0.07486752420663834),
 ('189142.txt', 0.06650149822235107),
 ('156887.txt', 0.04408472776412964)]

In [39]:
# most dissimilar to jurassic park in ML-100k:
# imdb_id 113083
# movie_id 927
# title "Flower of My Secret, The (Flor de mi secreto, La) (1995)"

In [40]:
# Document vector of Jurassic Park (imdb_id 107290).
jp_docvec = model.docvecs['107290.txt']

In [41]:
# Document vector of The Lost World: Jurassic Park (imdb_id 119567).
jp2_docvec = model.docvecs['119567.txt']

In [42]:
# Document vector of The Flower of My Secret (imdb_id 113083), the ML-100k
# movie most dissimilar to Jurassic Park according to the query above.
flower_docvec = model.docvecs['113083.txt']

In [43]:
# Document vector of The Hunt for Red October
# imdb_id 99810
# movie_id 265
red_docvec = model.docvecs['99810.txt']

In [44]:
# Document vector of Men in Black
# imdb_id 119654
# movie_id 257
men_docvec = model.docvecs['119654.txt']

In [105]:
# Document vector for tag '92099.txt'.
# NOTE(review): presumably Top Gun (movie_id 161), the item-factor nearest
# neighbour of Jurassic Park found further down -- confirm the imdb_id.
top_docvec = model.docvecs['92099.txt']

#### cosine similarity

In [45]:
# Doc vectors: Jurassic Park vs. The Lost World: Jurassic Park.
cosine_sim(jp_docvec, jp2_docvec)

0.67955038111142396

In [51]:
# Doc vectors: Jurassic Park vs. The Flower of My Secret.
cosine_sim(jp_docvec, flower_docvec)

-0.076439522

In [16]:
# Doc vectors: Jurassic Park vs. The Hunt for Red October.
cosine_sim(jp_docvec, red_docvec)

0.31112918

In [17]:
# Doc vectors: Jurassic Park vs. Men in Black.
cosine_sim(jp_docvec, men_docvec)

0.38330352

In [106]:
# Doc vectors: Jurassic Park vs. document '92099.txt' (presumably Top Gun -- confirm).
cosine_sim(jp_docvec, top_docvec)

0.31809315

#### angular similarity

In [58]:
# Doc vectors: Jurassic Park vs. The Lost World: Jurassic Park.
angular_sim(jp_docvec, jp2_docvec)

0.73782508598383001

In [59]:
# Doc vectors: Jurassic Park vs. The Flower of My Secret.
angular_sim(jp_docvec, flower_docvec)

0.47564478370731478

In [60]:
# Doc vectors: Jurassic Park vs. The Hunt for Red October.
angular_sim(jp_docvec, red_docvec)

0.60070717906654991

In [61]:
# Doc vectors: Jurassic Park vs. Men in Black.
angular_sim(jp_docvec, men_docvec)

0.62521367013558338

## Compare Item Factors

In [92]:
# Load the saved model whose item factors are compared in this section.
# NOTE(review): the path suggests an MPCF model with side information trained
# on ML-100k (20 epochs, item lambda 0.1) -- confirm.
loaded = dd.io.load('../models/mpcf-si/2016-06-20_18.58.44_si_ml-100k_e20_tt-0.7_lambda-item-0.1.h5')

In [93]:
# `loaded['items']` maps a movie_id to its 0-based index, which is used below
# to index the item-factor matrix. Look up the five movies under comparison.
items_map = loaded['items']
jp_id, jp2_id = items_map[82], items_map[252]    # Jurassic Park, The Lost World: Jurassic Park
flower_id = items_map[927]                       # The Flower of My Secret
red_id, men_id = items_map[265], items_map[257]  # The Hunt for Red October, Men in Black

In [94]:
# Parameter 'Q' of the loaded model; indexed below with the 0-based item
# indices from items_map to obtain one factor vector per movie.
item_factors = loaded['params']['Q']

#### cosine similarity

In [65]:
# Item factors: Jurassic Park vs. The Lost World: Jurassic Park.
cosine_sim(item_factors[jp_id], item_factors[jp2_id])

0.27288967817141402

In [66]:
# Item factors: Jurassic Park vs. The Flower of My Secret.
cosine_sim(item_factors[jp_id], item_factors[flower_id])

-0.50515176737951706

In [67]:
# Item factors: Jurassic Park vs. The Hunt for Red October.
cosine_sim(item_factors[jp_id], item_factors[red_id])

0.71786133202780744

In [68]:
# Item factors: Jurassic Park vs. Men in Black.
cosine_sim(item_factors[jp_id], item_factors[men_id])

0.18186173426419175

In [102]:
# Nearest neighbour of Jurassic Park in item-factor space: scan every item and
# keep the movie_id whose factor vector has the highest cosine similarity.
# Start below any possible cosine so that a best match is still reported when
# every similarity is <= 0 (starting at 0 would leave movie_id as None).
highest = -np.inf
movie_id = None
# .items() works on both Python 2 and 3; iteritems() exists on Python 2 only.
for m_id, idx in items_map.items():
    sim = cosine_sim(item_factors[jp_id], item_factors[idx])
    # Exclude Jurassic Park itself, which would otherwise win trivially.
    if idx != jp_id and sim > highest:
        highest = sim
        movie_id = m_id

In [103]:
# Highest cosine similarity to Jurassic Park found by the scan in the previous cell.
highest

0.8503261497977197

In [104]:
# movie_id of the item whose factors are closest to Jurassic Park's.
movie_id # top gun

161

#### angular similarity

In [69]:
# Item factors: Jurassic Park vs. The Lost World: Jurassic Park.
angular_sim(item_factors[jp_id], item_factors[jp2_id])

0.58797940085575473

In [70]:
# Item factors: Jurassic Park vs. The Flower of My Secret.
angular_sim(item_factors[jp_id], item_factors[flower_id])

0.33143651368090354

In [71]:
# Item factors: Jurassic Park vs. The Hunt for Red October.
angular_sim(item_factors[jp_id], item_factors[red_id])

0.75487882870779344

In [72]:
# Item factors: Jurassic Park vs. Men in Black.
angular_sim(item_factors[jp_id], item_factors[men_id])

0.55821232986426428

### Remove all but 10 ratings of Jurassic Park and Jurassic Park 2 from the training set

In [73]:
# Rebinds `loaded` to a different saved model; the cells that follow refer to
# this one. NOTE(review): the path suggests a side-information model trained
# with only some Jurassic Park ratings kept -- confirm.
loaded = dd.io.load('../models/mpcf-si/2016-06-21_14.22.39_si_ml-100k_e20_tt-0.7_only-some-jurassic-park.h5')

In [74]:
# `loaded['items']` maps a movie_id to its 0-based index, which is used below
# to index the item-factor matrix. Look up the five movies under comparison.
items_map = loaded['items']
jp_id, jp2_id = items_map[82], items_map[252]    # Jurassic Park, The Lost World: Jurassic Park
flower_id = items_map[927]                       # The Flower of My Secret
red_id, men_id = items_map[265], items_map[257]  # The Hunt for Red October, Men in Black

In [75]:
# Parameter 'Q' of the currently loaded model (rebinds `item_factors`);
# indexed below with the 0-based item indices from items_map.
item_factors = loaded['params']['Q']

#### cosine similarity

In [76]:
# Item factors: Jurassic Park vs. The Lost World: Jurassic Park.
cosine_sim(item_factors[jp_id], item_factors[jp2_id])

0.12227672813951915

In [77]:
# Item factors: Jurassic Park vs. The Flower of My Secret.
cosine_sim(item_factors[jp_id], item_factors[flower_id])

-0.2175744707509619

In [78]:
# Item factors: Jurassic Park vs. The Hunt for Red October.
cosine_sim(item_factors[jp_id], item_factors[red_id])

0.37273054348096568

In [79]:
# Item factors: Jurassic Park vs. Men in Black.
cosine_sim(item_factors[jp_id], item_factors[men_id])

0.13628980628300397

#### angular similarity

In [80]:
# Item factors: Jurassic Park vs. The Lost World: Jurassic Park.
angular_sim(item_factors[jp_id], item_factors[jp2_id])

0.53901954060677193

In [81]:
# Item factors: Jurassic Park vs. The Flower of My Secret.
angular_sim(item_factors[jp_id], item_factors[flower_id])

0.43018550100343755

In [82]:
# Item factors: Jurassic Park vs. The Hunt for Red October.
angular_sim(item_factors[jp_id], item_factors[red_id])

0.62157842136986829

In [83]:
# Item factors: Jurassic Park vs. Men in Black.
angular_sim(item_factors[jp_id], item_factors[men_id])

0.54351783192160341

## No Side Information

In [85]:
# Rebinds `loaded` to a different saved model; the cells that follow refer to
# this one. NOTE(review): the path suggests the baseline MPCF model trained
# without side information -- confirm.
loaded = dd.io.load('../models/mpcf/2016-06-20_18.30.50_no-si_ml-100k_e20_tt-0.7_base.h5')

In [86]:
# `loaded['items']` maps a movie_id to its 0-based index, which is used below
# to index the item-factor matrix. Look up the five movies under comparison.
items_map = loaded['items']
jp_id, jp2_id = items_map[82], items_map[252]    # Jurassic Park, The Lost World: Jurassic Park
flower_id = items_map[927]                       # The Flower of My Secret
red_id, men_id = items_map[265], items_map[257]  # The Hunt for Red October, Men in Black

In [87]:
# Parameter 'Q' of the currently loaded model (rebinds `item_factors`);
# indexed below with the 0-based item indices from items_map.
item_factors = loaded['params']['Q']

In [88]:
# Item factors (cosine): Jurassic Park vs. The Lost World: Jurassic Park.
cosine_sim(item_factors[jp_id], item_factors[jp2_id])

0.30718885690267261

In [89]:
# Item factors (cosine): Jurassic Park vs. The Flower of My Secret.
cosine_sim(item_factors[jp_id], item_factors[flower_id])

-0.50946977230538903

In [90]:
# Item factors (cosine): Jurassic Park vs. The Hunt for Red October.
cosine_sim(item_factors[jp_id], item_factors[red_id])

0.70613019925859721

In [91]:
# Item factors (cosine): Jurassic Park vs. Men in Black.
cosine_sim(item_factors[jp_id], item_factors[men_id])

0.27512853062517278