In [1]:
from gensim.models.doc2vec import Doc2Vec
import deepdish as dd
import numpy as np
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def cosine_sim(v1, v2):
    """Return the cosine similarity of two numpy arrays.

    The value is sum(v1 * v2) divided by the product of the Euclidean
    norms of v1 and v2, where every sum runs over all elements.
    """
    dot_product = np.sum(v1 * v2)
    norm_v1 = np.sqrt(np.sum(np.square(v1)))
    norm_v2 = np.sqrt(np.sum(np.square(v2)))
    return dot_product / (norm_v1 * norm_v2)

def angular_sim(v1, v2):
    """Return the angular similarity of two vectors.

    Computed as 1 - arccos(cosine_sim(v1, v2)) / pi, so identical
    directions give 1, orthogonal vectors 0.5 and opposite directions 0.
    """
    # Floating-point rounding can push the cosine marginally outside
    # [-1, 1] (typically for parallel or anti-parallel vectors), in which
    # case arccos would return nan. Clip to the valid domain first; a nan
    # cosine (zero-norm input) still propagates as nan.
    cos = np.clip(cosine_sim(v1, v2), -1.0, 1.0)
    return 1 - (np.arccos(cos) / np.pi)

## Compare DocVecs

In [3]:
# Location of the saved Doc2Vec model; the training settings are encoded in
# the file name (timestamp, epochs, pv-dbow, size, lr, window, neg).
path = '../doc2vec-models/2016-04-14_17.36.08_20e_pv-dbow_size50_lr0.025_window8_neg5'
# Load the model; the cells below query its document vectors via model.docvecs.
model = Doc2Vec.load(path)

In [4]:
# jurassic park
jp_imdb_id = 107290
jp_movie_id = 82
# 261 ratings in our ML-100k

# jurassic park 2 (Lost World: Jurassic Park, The (1997))
jp2_imdb_id = 119567
jp2_movie_id = 252
# 158 ratings in our ML-100k

In [5]:
model.docvecs.similarity('{}.txt'.format(jp_imdb_id),'{}.txt'.format(jp2_imdb_id))

0.67955040301080083

In [6]:
# dissimilar movies to jurassic park
model.docvecs.most_similar(negative=['{}.txt'.format(jp_imdb_id)])

[('16630.txt', 0.11265313625335693),
 ('36868.txt', 0.09027586132287979),
 ('98635.txt', 0.08478951454162598),
 ('89853.txt', 0.08220337331295013),
 ('99334.txt', 0.0779241994023323),
 ('57710.txt', 0.07723505795001984),
 ('113083.txt', 0.07643953710794449),
 ('168740.txt', 0.07486752420663834),
 ('189142.txt', 0.06650149822235107),
 ('156887.txt', 0.04408472776412964)]

In [7]:
# among the most dissimilar to jurassic park (third entry in the list above)
harry_imdb_id = 98635
harry_movie_id = 216
# title "When Harry Met Sally"

In [8]:
# Scream
scr_imdb_id = 117571
scr_movie_id = 288

# Scream 2
scr2_imdb_id = 120082
scr2_movie_id = 895

In [9]:
model.docvecs.similarity('{}.txt'.format(scr_imdb_id),'{}.txt'.format(scr2_imdb_id))

0.75141428516212705

In [40]:
# dissimilar movies to scream
model.docvecs.most_similar(negative=['{}.txt'.format(scr_imdb_id)])

[('65938.txt', 0.0465000718832016),
 ('92048.txt', 0.03354673832654953),
 ('120633.txt', 0.027388297021389008),
 ('85859.txt', 0.026076868176460266),
 ('200071.txt', 0.0013969168066978455),
 ('66473.txt', -0.00011243671178817749),
 ('110167.txt', -0.0009681433439254761),
 ('98105.txt', -0.00425439327955246),
 ('181984.txt', -0.007469929754734039),
 ('62218.txt', -0.007664471864700317)]

In [42]:
# dissimilar to scream
# Local Hero (1983)
hero_imdb_id = 85859
hero_movie_id = 516

In [10]:
# GoldenEye (james bond)
ge_imdb_id = 113189
ge_movie_id = 2

# tomorrow never dies (james bond)
tom_imdb_id = 120347
tom_movie_id = 751

In [11]:
model.docvecs.similarity('{}.txt'.format(ge_imdb_id),'{}.txt'.format(tom_imdb_id))

0.62593487374729428

In [59]:
# dissimilar movies to golden eye
model.docvecs.most_similar(negative=['{}.txt'.format(ge_imdb_id)])

[('125664.txt', 0.09645520150661469),
 ('81150.txt', 0.06718134880065918),
 ('34492.txt', 0.062411241233348846),
 ('123209.txt', 0.05626177042722702),
 ('114354.txt', 0.0500088669359684),
 ('176422.txt', 0.047035086899995804),
 ('126604.txt', 0.039543554186820984),
 ('48473.txt', 0.0331258624792099),
 ('160338.txt', 0.032457709312438965),
 ('97940.txt', 0.03106977790594101)]

In [60]:
# dissimilar to golden eye
# title Pather Panchali (1955)
pather_imdb_id = 48473
pather_movie_id = 1449

# Item Factor Space

In [12]:
def find_most_similar(i_id, items_map, item_factors):
    """Find the item whose factor vector is closest (by cosine) to item i_id.

    Parameters
    ----------
    i_id : index into item_factors of the query item.
    items_map : dict mapping movie_id -> index into item_factors.
    item_factors : indexable collection of item factor vectors.

    Returns
    -------
    (movie_id, similarity) of the best match, excluding the query item
    itself. If items_map contains no other item, (None, 0) is returned.
    """
    # Start below any possible cosine value so that the nearest item is
    # still found when every similarity is zero or negative.
    best_sim = float('-inf')
    best_movie_id = None
    # .items() (rather than .iteritems()) works on both Python 2 and 3.
    for m_id, idx in items_map.items():
        if idx == i_id:
            # Never report the query item as its own neighbour.
            continue
        cs = cosine_sim(item_factors[i_id], item_factors[idx])
        # Strict '>' keeps the first item seen on ties; nan never wins.
        if cs > best_sim:
            best_sim = cs
            best_movie_id = m_id
    if best_movie_id is None:
        return None, 0
    return best_movie_id, best_sim

## MPCFs (no side information)

In [61]:
loaded = dd.io.load('../models/mpcf/2016-06-20_18.30.50_no-si_ml-100k_e20_tt-0.7_base.h5')

In [62]:
# Resolve every movie of interest to its index in the currently loaded model.
# The resulting *_id values are used below to index item_factors.
items_map = loaded['items'] # maps movie_id to 0-based index
jp_id = items_map[jp_movie_id]
jp2_id = items_map[jp2_movie_id]
harry_id = items_map[harry_movie_id]
scr_id = items_map[scr_movie_id]
scr2_id = items_map[scr2_movie_id]
hero_id = items_map[hero_movie_id]
ge_id = items_map[ge_movie_id]
tom_id = items_map[tom_movie_id]
pather_id = items_map[pather_movie_id]

In [63]:
item_factors = loaded['params']['Q']

#### cosine similarity

In [46]:
cosine_sim(item_factors[jp_id], item_factors[jp2_id])

0.30718885690267261

In [47]:
cosine_sim(item_factors[jp_id], item_factors[harry_id])

0.52941647777463263

In [48]:
cosine_sim(item_factors[scr_id], item_factors[scr2_id])

0.42924343717933044

In [49]:
cosine_sim(item_factors[scr_id], item_factors[hero_id])

-0.35885342322376074

In [50]:
cosine_sim(item_factors[ge_id], item_factors[tom_id])

0.074845562271428231

In [64]:
cosine_sim(item_factors[ge_id], item_factors[pather_id])

-0.10025061364996828

In [20]:
# Nearest neighbour of Jurassic Park in the item factor space.
m_id, sim = find_most_similar(jp_id, items_map, item_factors)
# Single-argument print(...) prints the same on Python 2 and runs on Python 3.
print(sim)
print(m_id)
# -> Speed

0.831198273746
568


In [21]:
# Nearest neighbour of Scream 2 in the item factor space.
m_id, sim = find_most_similar(scr2_id, items_map, item_factors)
# Single-argument print(...) prints the same on Python 2 and runs on Python 3.
print(sim)
print(m_id)
# -> I Know What You Did Last Summer

0.710174697955
682


In [22]:
# Nearest neighbour of GoldenEye in the item factor space.
m_id, sim = find_most_similar(ge_id, items_map, item_factors)
# Single-argument print(...) prints the same on Python 2 and runs on Python 3.
print(sim)
print(m_id)
# -> Under Siege

0.836484808542
233


## MPCFs-SI

In [65]:
loaded = dd.io.load('../models/mpcf-si/2016-06-27_08.23.04_si_ml-100k_e20_tt-0.7_task-1.h5')

In [66]:
# Re-resolve every movie of interest against the newly loaded model, since
# the indices are looked up from this model's own item map.
items_map = loaded['items'] # maps movie_id to 0-based index
jp_id = items_map[jp_movie_id]
jp2_id = items_map[jp2_movie_id]
harry_id = items_map[harry_movie_id]
scr_id = items_map[scr_movie_id]
scr2_id = items_map[scr2_movie_id]
hero_id = items_map[hero_movie_id]
ge_id = items_map[ge_movie_id]
tom_id = items_map[tom_movie_id]
pather_id = items_map[pather_movie_id]

In [67]:
item_factors = loaded['params']['Q']

In [54]:
cosine_sim(item_factors[jp_id], item_factors[jp2_id])

0.27337909566927959

In [55]:
cosine_sim(item_factors[jp_id], item_factors[harry_id])

0.4804453126265727

In [56]:
cosine_sim(item_factors[scr_id], item_factors[scr2_id])

0.5793326054088942

In [57]:
cosine_sim(item_factors[scr_id], item_factors[hero_id])

-0.40875048353675186

In [58]:
cosine_sim(item_factors[ge_id], item_factors[tom_id])

0.29936653845439964

In [68]:
cosine_sim(item_factors[ge_id], item_factors[pather_id])

-0.16131507567898917

In [30]:
# Nearest neighbour of Jurassic Park in the item factor space.
m_id, sim = find_most_similar(jp_id, items_map, item_factors)
# Single-argument print(...) prints the same on Python 2 and runs on Python 3.
print(sim)
print(m_id)
# -> Speed

0.775353358153
568


In [31]:
# Nearest neighbour of Scream 2 in the item factor space.
m_id, sim = find_most_similar(scr2_id, items_map, item_factors)
# Single-argument print(...) prints the same on Python 2 and runs on Python 3.
print(sim)
print(m_id)
# -> I Know What You Did Last Summer

0.62861637397
682


In [32]:
# Nearest neighbour of GoldenEye in the item factor space.
m_id, sim = find_most_similar(ge_id, items_map, item_factors)
# Single-argument print(...) prints the same on Python 2 and runs on Python 3.
print(sim)
print(m_id)
# -> Batman Returns

0.731727751178
231


### Remove all but 10 ratings of Jurassic Park and Jurassic Park 2 from the training set

In [33]:
loaded = dd.io.load('../models/mpcf-si/2016-06-21_14.22.39_si_ml-100k_e20_tt-0.7_only-some-jurassic-park.h5')

In [34]:
# Item factor matrix and item map of the newly loaded model.
item_factors = loaded['params']['Q']
items_map = loaded['items'] # maps movie_id to 0-based index
# Re-resolve the movies of interest against this model's item map.
# NOTE(review): hero_id is not refreshed in this cell, so it still holds the
# index derived from the previously loaded model; re-derive it from this
# items_map before using it with these item_factors.
jp_id = items_map[jp_movie_id]
jp2_id = items_map[jp2_movie_id]
harry_id = items_map[harry_movie_id]
scr_id = items_map[scr_movie_id]
scr2_id = items_map[scr2_movie_id]
ge_id = items_map[ge_movie_id]
tom_id = items_map[tom_movie_id]
pather_id = items_map[pather_movie_id]

In [35]:
cosine_sim(item_factors[jp_id], item_factors[jp2_id])

0.12227672813951915

In [37]:
cosine_sim(item_factors[jp_id], item_factors[harry_id])

0.25654223392175224

In [38]:
cosine_sim(item_factors[scr_id], item_factors[scr2_id])

0.40252329571319434

In [39]:
cosine_sim(item_factors[ge_id], item_factors[tom_id])

0.16058223408051639