In [1]:
from gensim.models.doc2vec import Doc2Vec
import deepdish as dd
import numpy as np
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def cosine_sim(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as sum(v1 * v2) / (||v1|| * ||v2||) with element-wise NumPy
    operations, so v1 and v2 are expected to be NumPy arrays of matching
    shape. A zero vector yields a division by zero (NaN result).
    """
    dot_product = np.sum(v1 * v2)
    norm_v1 = np.sqrt(np.sum(np.square(v1)))
    norm_v2 = np.sqrt(np.sum(np.square(v2)))
    return dot_product / (norm_v1 * norm_v2)

def angular_sim(v1, v2):
    """Return the angular similarity of two vectors, a value in [0, 1].

    Defined as 1 - arccos(cosine_sim(v1, v2)) / pi: 1 for vectors pointing
    in the same direction, 0.5 for orthogonal vectors, 0 for opposite ones.
    """
    # Floating-point rounding can leave the cosine of (near-)parallel vectors
    # marginally outside [-1, 1], where np.arccos returns NaN. Clamp first.
    # A NaN cosine (zero vector) still propagates as NaN.
    cos = np.clip(cosine_sim(v1, v2), -1.0, 1.0)
    return 1 - (np.arccos(cos) / np.pi)

# Document Vector Space

In [3]:
# Trained doc2vec model; its document vectors are tagged '<imdb_id>.txt'.
path = "../doc2vec-models/2016-04-14_17.36.08_20e_pv-dbow_size50_lr0.025_window8_neg5"
model = Doc2Vec.load(path)

In [4]:
# Free Willy and its sequel.
# The IMDb id forms the doc2vec tag ('<imdb_id>.txt'); the movie id is the
# key looked up in items_map for the item-factor models.
fw_movie_id = 455        # Free Willy
fw_imdb_id = 106965

fw2_movie_id = 169       # Free Willy 2
fw2_imdb_id = 113114

In [5]:
# Cosine similarity of the Free Willy and Free Willy 2 document vectors.
model.docvecs.similarity(
    '{}.txt'.format(fw_imdb_id),
    '{}.txt'.format(fw2_imdb_id),
)

0.86940241592695133

In [6]:
# Nearest neighbours of Free Willy in document-vector space.
# Free Willy 2 (113114) ranks first.
model.docvecs.most_similar(
    positive=['{}.txt'.format(fw_imdb_id)],
)

[('113114.txt', 0.8694024085998535),
 ('119152.txt', 0.8322009444236755),
 ('91557.txt', 0.6343326568603516),
 ('77766.txt', 0.629421591758728),
 ('113028.txt', 0.6236975789070129),
 ('116329.txt', 0.6219675540924072),
 ('134619.txt', 0.6181328892707825),
 ('98627.txt', 0.6160564422607422),
 ('117427.txt', 0.6138113141059875),
 ('97388.txt', 0.612464189529419)]

In [7]:
# Movies least similar to Free Willy.
# Querying with `negative=` ranks by similarity to the negated vector, so the
# scores shown carry the opposite sign of docvecs.similarity for the same pair.
model.docvecs.most_similar(
    negative=['{}.txt'.format(fw_imdb_id)],
)

[('119280.txt', 0.014068834483623505),
 ('116192.txt', 0.009390600025653839),
 ('110588.txt', 0.00923667848110199),
 ('113987.txt', 0.000695057213306427),
 ('59319.txt', -0.0006838738918304443),
 ('96332.txt', -0.0033706873655319214),
 ('117561.txt', -0.019551947712898254),
 ('144214.txt', -0.020484495908021927),
 ('112346.txt', -0.023167015984654427),
 ('125439.txt', -0.028610385954380035)]

In [8]:
# Mrs. Brown (drama/romance): the movie least similar to Free Willy.
brown_movie_id = 1643
brown_imdb_id = 119280

In [9]:
# Cosine similarity of the Free Willy and Mrs. Brown document vectors.
model.docvecs.similarity(
    '{}.txt'.format(fw_imdb_id),
    '{}.txt'.format(brown_imdb_id),
)

-0.014068827695707734

In [10]:
# Jurassic Park and its sequel.
# The IMDb id forms the doc2vec tag ('<imdb_id>.txt'); the movie id is the
# key looked up in items_map for the item-factor models.
jp_movie_id = 480        # Jurassic Park
jp_imdb_id = 107290

jp2_movie_id = 1544      # Jurassic Park 2
jp2_imdb_id = 119567

In [11]:
# Cosine similarity of the Jurassic Park and Jurassic Park 2 document vectors.
model.docvecs.similarity(
    '{}.txt'.format(jp_imdb_id),
    '{}.txt'.format(jp2_imdb_id),
)

0.67955040301080083

In [12]:
# Nearest neighbours of Jurassic Park in document-vector space.
# Jurassic Park 2 (119567) is the second most similar movie.
model.docvecs.most_similar(
    positive=['{}.txt'.format(jp_imdb_id)],
)

[('119675.txt', 0.7279843091964722),
 ('119567.txt', 0.679550290107727),
 ('120004.txt', 0.6715301871299744),
 ('90605.txt', 0.6436644792556763),
 ('118928.txt', 0.633784294128418),
 ('97443.txt', 0.6167540550231934),
 ('117998.txt', 0.6146465539932251),
 ('56931.txt', 0.6136928200721741),
 ('88760.txt', 0.6116420030593872),
 ('118689.txt', 0.5930735468864441)]

In [13]:
# Movies least similar to Jurassic Park.
# Querying with `negative=` ranks by similarity to the negated vector, so the
# scores shown carry the opposite sign of docvecs.similarity for the same pair.
model.docvecs.most_similar(
    negative=['{}.txt'.format(jp_imdb_id)],
)

[('16630.txt', 0.11265313625335693),
 ('36868.txt', 0.09027586132287979),
 ('98635.txt', 0.08478951454162598),
 ('89853.txt', 0.08220337331295013),
 ('99334.txt', 0.0779241994023323),
 ('57710.txt', 0.07723505795001984),
 ('113083.txt', 0.07643953710794449),
 ('168740.txt', 0.07486752420663834),
 ('189142.txt', 0.06650149822235107),
 ('156887.txt', 0.04408472776412964)]

In [14]:
# Battling Butler (comedy): the movie least similar to Jurassic Park.
butler_movie_id = 3012
butler_imdb_id = 16630

In [15]:
# Cosine similarity of the Jurassic Park and Battling Butler document vectors.
model.docvecs.similarity(
    '{}.txt'.format(jp_imdb_id),
    '{}.txt'.format(butler_imdb_id),
)

-0.11265313836867154

In [16]:
# Scream and its sequel.
# The IMDb id forms the doc2vec tag ('<imdb_id>.txt'); the movie id is the
# key looked up in items_map for the item-factor models.
scr_movie_id = 1407      # Scream
scr_imdb_id = 117571

scr2_movie_id = 1717     # Scream 2
scr2_imdb_id = 120082

In [17]:
# Cosine similarity of the Scream and Scream 2 document vectors.
model.docvecs.similarity(
    '{}.txt'.format(scr_imdb_id),
    '{}.txt'.format(scr2_imdb_id),
)

0.75141428516212705

In [18]:
# Nearest neighbours of Scream in document-vector space.
# Scream 2 (120082) is the second most similar movie.
model.docvecs.most_similar(
    positive=['{}.txt'.format(scr_imdb_id)],
)

[('134084.txt', 0.7863492369651794),
 ('120082.txt', 0.7514142394065857),
 ('91954.txt', 0.6668806076049805),
 ('155776.txt', 0.6493260264396667),
 ('111686.txt', 0.6400280594825745),
 ('146336.txt', 0.636536180973053),
 ('192731.txt', 0.6228945851325989),
 ('204626.txt', 0.616219699382782),
 ('110632.txt', 0.6104223728179932),
 ('103919.txt', 0.6087551712989807)]

In [19]:
# Movies least similar to Scream.
# Querying with `negative=` ranks by similarity to the negated vector, so the
# scores shown carry the opposite sign of docvecs.similarity for the same pair.
model.docvecs.most_similar(
    negative=['{}.txt'.format(scr_imdb_id)],
)

[('65938.txt', 0.0465000718832016),
 ('92048.txt', 0.03354673832654953),
 ('120633.txt', 0.027388297021389008),
 ('85859.txt', 0.026076868176460266),
 ('200071.txt', 0.0013969168066978455),
 ('66473.txt', -0.00011243671178817749),
 ('110167.txt', -0.0009681433439254761),
 ('98105.txt', -0.00425439327955246),
 ('181984.txt', -0.007469929754734039),
 ('62218.txt', -0.007664471864700317)]

In [20]:
# Kelly's Heroes: the movie least similar to Scream.
kelly_movie_id = 3836
kelly_imdb_id = 65938

In [21]:
# Cosine similarity of the Scream and Kelly's Heroes document vectors.
model.docvecs.similarity(
    '{}.txt'.format(scr_imdb_id),
    '{}.txt'.format(kelly_imdb_id),
)

-0.04650007406425781

In [22]:
# Species and its sequel.
# The IMDb id forms the doc2vec tag ('<imdb_id>.txt'); the movie id is the
# key looked up in items_map for the item-factor models.
sp_movie_id = 196        # Species
sp_imdb_id = 114508

sp2_movie_id = 1862      # Species 2
sp2_imdb_id = 120841

In [23]:
# Cosine similarity of the Species and Species 2 document vectors.
model.docvecs.similarity(
    '{}.txt'.format(sp_imdb_id),
    '{}.txt'.format(sp2_imdb_id),
)

0.68311800802591438

In [24]:
# Nearest neighbours of Species in document-vector space.
# Species 2 (120841) ranks first.
model.docvecs.most_similar(
    positive=['{}.txt'.format(sp_imdb_id)],
)

[('120841.txt', 0.6831179857254028),
 ('90583.txt', 0.6602948904037476),
 ('204626.txt', 0.6501474976539612),
 ('139239.txt', 0.6449568271636963),
 ('105226.txt', 0.6214965581893921),
 ('164052.txt', 0.6181344389915466),
 ('95179.txt', 0.6133553981781006),
 ('84783.txt', 0.6096972823143005),
 ('115710.txt', 0.6073347926139832),
 ('82533.txt', 0.6071861982345581)]

In [25]:
# Movies least similar to Species.
# Querying with `negative=` ranks by similarity to the negated vector, so the
# scores shown carry the opposite sign of docvecs.similarity for the same pair.
model.docvecs.most_similar(
    negative=['{}.txt'.format(sp_imdb_id)],
)

[('66206.txt', 0.07634155452251434),
 ('16630.txt', 0.0496206060051918),
 ('57546.txt', 0.04905722290277481),
 ('50825.txt', 0.04533801227807999),
 ('75232.txt', 0.040982794016599655),
 ('76137.txt', 0.03134193271398544),
 ('106226.txt', 0.029320180416107178),
 ('107501.txt', 0.020478807389736176),
 ('75704.txt', 0.018507108092308044),
 ('43274.txt', 0.011801136657595634)]

In [26]:
# Patton: the movie least similar to Species.
patton_movie_id = 1272
patton_imdb_id = 66206

In [27]:
# Cosine similarity of the Species and Patton document vectors.
model.docvecs.similarity(
    '{}.txt'.format(sp_imdb_id),
    '{}.txt'.format(patton_imdb_id),
)

-0.076341565258958299

In [28]:
# Star Wars Episode V and Episode VI.
# The IMDb id forms the doc2vec tag ('<imdb_id>.txt'); the movie id is the
# key looked up in items_map for the item-factor models.
swv_movie_id = 1196      # Star Wars Episode V
swv_imdb_id = 80684

swvi_movie_id = 1210     # Star Wars Episode VI
swvi_imdb_id = 86190

In [29]:
# Cosine similarity of the Star Wars Episode V and Episode VI document vectors.
model.docvecs.similarity(
    '{}.txt'.format(swv_imdb_id),
    '{}.txt'.format(swvi_imdb_id),
)

0.92307505174504856

In [30]:
# Nearest neighbours of Star Wars Episode V in document-vector space.
# Star Wars Episode VI (86190) ranks first.
model.docvecs.most_similar(
    positive=['{}.txt'.format(swv_imdb_id)],
)

[('86190.txt', 0.9230749607086182),
 ('120915.txt', 0.8119794130325317),
 ('76759.txt', 0.8067136406898499),
 ('84827.txt', 0.7332901954650879),
 ('120738.txt', 0.7259986400604248),
 ('117731.txt', 0.6888055801391602),
 ('119707.txt', 0.6791511178016663),
 ('70909.txt', 0.6599252820014954),
 ('97368.txt', 0.6569679379463196),
 ('84315.txt', 0.6349420547485352)]

In [31]:
# Movies least similar to Star Wars Episode V.
# Querying with `negative=` ranks by similarity to the negated vector, so the
# scores shown carry the opposite sign of docvecs.similarity for the same pair.
model.docvecs.most_similar(
    negative=['{}.txt'.format(swv_imdb_id)],
)

[('145653.txt', 0.01300012692809105),
 ('94155.txt', 0.010185103863477707),
 ('217630.txt', 0.0010292734950780869),
 ('104797.txt', 0.0009782575070858002),
 ('90863.txt', -0.009507261216640472),
 ('37884.txt', -0.010835811495780945),
 ('91024.txt', -0.011850625276565552),
 ('97322.txt', -0.016994968056678772),
 ('166396.txt', -0.02006503939628601),
 ('118556.txt', -0.02777113951742649)]

In [32]:
# Angela's Ashes: the movie least similar to Star Wars Episode V.
angela_movie_id = 3179
angela_imdb_id = 145653

In [33]:
# Cosine similarity of the Star Wars Episode V and Angela's Ashes document vectors.
model.docvecs.similarity(
    '{}.txt'.format(swv_imdb_id),
    '{}.txt'.format(angela_imdb_id),
)

-0.013000124588363082

In [34]:
# Toy Story and its sequel.
# The IMDb id forms the doc2vec tag ('<imdb_id>.txt'); the movie id is the
# key looked up in items_map for the item-factor models.
ts_movie_id = 1          # Toy Story
ts_imdb_id = 114709

ts2_movie_id = 3114      # Toy Story 2
ts2_imdb_id = 120363

In [35]:
# Cosine similarity of the Toy Story and Toy Story 2 document vectors.
model.docvecs.similarity(
    '{}.txt'.format(ts_imdb_id),
    '{}.txt'.format(ts2_imdb_id),
)

0.75291341575190129

In [36]:
# Nearest neighbours of Toy Story in document-vector space.
# Toy Story 2 (120363) ranks first.
model.docvecs.most_similar(
    positive=['{}.txt'.format(ts_imdb_id)],
)

[('120363.txt', 0.7529133558273315),
 ('115433.txt', 0.6592516303062439),
 ('89961.txt', 0.6522800922393799),
 ('33563.txt', 0.6440500617027283),
 ('120913.txt', 0.6346996426582336),
 ('55254.txt', 0.6268846988677979),
 ('122718.txt', 0.6225907206535339),
 ('101329.txt', 0.618421196937561),
 ('96787.txt', 0.6112702488899231),
 ('84649.txt', 0.6037213802337646)]

In [37]:
# Movies least similar to Toy Story.
# Querying with `negative=` ranks by similarity to the negated vector, so the
# scores shown carry the opposite sign of docvecs.similarity for the same pair.
model.docvecs.most_similar(
    negative=['{}.txt'.format(ts_imdb_id)],
)

[('101640.txt', 0.02567880228161812),
 ('112857.txt', 0.02182948589324951),
 ('118798.txt', 0.019348686560988426),
 ('74958.txt', 0.018655510619282722),
 ('112714.txt', 0.014902409166097641),
 ('109450.txt', 0.0139361172914505),
 ('48254.txt', 0.01313953846693039),
 ('123385.txt', 0.007183991372585297),
 ('96073.txt', 0.003403082489967346),
 ('119375.txt', -0.0002711638808250427)]

In [38]:
# Raise the Red Lantern: the movie least similar to Toy Story.
lantern_movie_id = 1280
lantern_imdb_id = 101640

In [39]:
# Cosine similarity of the Toy Story and Raise the Red Lantern document vectors.
model.docvecs.similarity(
    '{}.txt'.format(ts_imdb_id),
    '{}.txt'.format(lantern_imdb_id),
)

-0.025678801003992269

# Item Factor Space

## MPCFs (No side information)

In [40]:
# MPCF model trained without side information.
loaded = dd.io.load('../models/mpcf/2016-05-27_20.57.43_no-si_ml-1m_e20_tt-0.7_task-22.h5')
item_factors = loaded['params']['Q']
items_map = loaded['movies']  # maps movie_id to 0-based index

# Translate each movie id into its items_map index, which the cells below use
# to index item_factors. One group per case study: movie, sequel, and the
# movie found least similar to it in document-vector space.
fw_id, fw2_id, brown_id = (
    items_map[fw_movie_id], items_map[fw2_movie_id], items_map[brown_movie_id])

jp_id, jp2_id, butler_id = (
    items_map[jp_movie_id], items_map[jp2_movie_id], items_map[butler_movie_id])

scr_id, scr2_id, kelly_id = (
    items_map[scr_movie_id], items_map[scr2_movie_id], items_map[kelly_movie_id])

sp_id, sp2_id, patton_id = (
    items_map[sp_movie_id], items_map[sp2_movie_id], items_map[patton_movie_id])

swv_id, swvi_id, angela_id = (
    items_map[swv_movie_id], items_map[swvi_movie_id], items_map[angela_movie_id])

ts_id, ts2_id, lantern_id = (
    items_map[ts_movie_id], items_map[ts2_movie_id], items_map[lantern_movie_id])

In [41]:
print "similar:", cosine_sim(item_factors[fw_id], item_factors[fw2_id])
print "disimilar:", cosine_sim(item_factors[fw_id], item_factors[brown_id])

similar: 0.692309544564
disimilar: -0.0310242412934


In [42]:
print "similar:", cosine_sim(item_factors[jp_id], item_factors[jp2_id])
print "disimilar:", cosine_sim(item_factors[jp_id], item_factors[butler_id])

similar: 0.583736976915
disimilar: -0.121387548439


In [43]:
print "similar:", cosine_sim(item_factors[scr_id], item_factors[scr2_id])
print "disimilar:", cosine_sim(item_factors[scr_id], item_factors[kelly_id])

similar: 0.686800923547
disimilar: -0.197853290897


In [44]:
print "similar:", cosine_sim(item_factors[sp_id], item_factors[sp2_id])
print "disimilar:", cosine_sim(item_factors[sp_id], item_factors[patton_id])

similar: 0.616964580935
disimilar: -0.0549698772233


In [45]:
print "similar:", cosine_sim(item_factors[swv_id], item_factors[swvi_id])
print "disimilar:", cosine_sim(item_factors[swv_id], item_factors[angela_id])

similar: 0.690062467715
disimilar: -0.0734449486636


In [46]:
print "similar:", cosine_sim(item_factors[ts_id], item_factors[ts2_id])
print "disimilar:", cosine_sim(item_factors[ts_id], item_factors[lantern_id])

similar: 0.669595461581
disimilar: -0.0127149157715


## MPCFs-SI (with side information)

In [47]:
# MPCF-SI model trained with side information.
# NOTE: rebinds loaded, items_map, item_factors and every *_id name, so the
# comparison cells that follow report on this model from here on.
loaded = dd.io.load('../models/mpcf-si/2016-06-29_13.23.36_si_ml-1m_e20_tt-0.7_task-0.h5')
item_factors = loaded['params']['Q']
items_map = loaded['items']  # maps movie_id to 0-based index

# Translate each movie id into its items_map index, which the cells below use
# to index item_factors. One group per case study: movie, sequel, and the
# movie found least similar to it in document-vector space.
fw_id, fw2_id, brown_id = (
    items_map[fw_movie_id], items_map[fw2_movie_id], items_map[brown_movie_id])

jp_id, jp2_id, butler_id = (
    items_map[jp_movie_id], items_map[jp2_movie_id], items_map[butler_movie_id])

scr_id, scr2_id, kelly_id = (
    items_map[scr_movie_id], items_map[scr2_movie_id], items_map[kelly_movie_id])

sp_id, sp2_id, patton_id = (
    items_map[sp_movie_id], items_map[sp2_movie_id], items_map[patton_movie_id])

swv_id, swvi_id, angela_id = (
    items_map[swv_movie_id], items_map[swvi_movie_id], items_map[angela_movie_id])

ts_id, ts2_id, lantern_id = (
    items_map[ts_movie_id], items_map[ts2_movie_id], items_map[lantern_movie_id])

In [48]:
print "similar:", cosine_sim(item_factors[fw_id], item_factors[fw2_id])
print "disimilar:", cosine_sim(item_factors[fw_id], item_factors[brown_id])

similar: 0.705107367023
disimilar: -0.0675717988178


In [49]:
print "similar:", cosine_sim(item_factors[jp_id], item_factors[jp2_id])
print "disimilar:", cosine_sim(item_factors[jp_id], item_factors[butler_id])

similar: 0.617076550583
disimilar: -0.190130878462


In [50]:
print "similar:", cosine_sim(item_factors[scr_id], item_factors[scr2_id])
print "disimilar:", cosine_sim(item_factors[scr_id], item_factors[kelly_id])

similar: 0.723297044654
disimilar: -0.205559453259


In [51]:
print "similar:", cosine_sim(item_factors[sp_id], item_factors[sp2_id])
print "disimilar:", cosine_sim(item_factors[sp_id], item_factors[patton_id])

similar: 0.70643524738
disimilar: -0.0306956746373


In [52]:
print "similar:", cosine_sim(item_factors[swv_id], item_factors[swvi_id])
print "disimilar:", cosine_sim(item_factors[swv_id], item_factors[angela_id])

similar: 0.722052519549
disimilar: -0.16142731848


In [53]:
print "similar:", cosine_sim(item_factors[ts_id], item_factors[ts2_id])
print "disimilar:", cosine_sim(item_factors[ts_id], item_factors[lantern_id])

similar: 0.667803907944
disimilar: -0.0505249450573
