In [21]:
%config IPCompleter.greedy=True

In [22]:
import pandas as pd
import numpy as np
from spotlight.factorization.implicit import ImplicitFactorizationModel
from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

from collections import defaultdict
from itertools import count

In [23]:
ratings_df = pd.read_csv('C:/Users/papad/Desktop/ml-latest-small/ml-latest-small/ratings.csv')
movies_df = pd.read_csv('C:/Users/papad/Desktop/ml-latest-small/ml-latest-small/movies.csv')
tags_df = pd.read_csv('C:/Users/papad/Desktop/ml-latest-small/ml-latest-small/tags.csv')
links_df = pd.read_csv('C:/Users/papad/Desktop/ml-latest-small/ml-latest-small/links.csv')

In [24]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [25]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [26]:
uid_map = defaultdict(count().__next__)
iid_map = defaultdict(count().__next__)
uids = np.array([uid_map[uid] for uid in ratings_df["userId"].values ], dtype=np.int32)
iids = np.array([iid_map[iid] for iid in ratings_df["movieId"].values ], dtype=np.int32)

uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}

ratings = ratings_df["rating"].values.astype(np.float32)
timestamps = ratings_df["timestamp"].values.astype(np.int32)

print("userId %d got uid %d" % (556, uid_map[556]))
print("movieId %d got iid %d" % (54001, iid_map[54001]))

userId 556 got uid 555
movieId 54001 got iid 2518


In [27]:
from spotlight.interactions import Interactions
from spotlight.cross_validation import random_train_test_split

dataset = Interactions(user_ids=uids,
                                  item_ids=iids,
                                  ratings=ratings,
                                  timestamps=timestamps)

#lets initialise the seed, so that its repeatable and reproducible 
train, test = random_train_test_split(dataset, random_state=np.random.seed(42))

In [28]:
print(train)

<Interactions dataset (610 users x 9724 items x 80668 interactions)>


In [29]:
print(test)

<Interactions dataset (610 users x 9724 items x 20168 interactions)>


In [45]:
from spotlight.factorization.explicit import ExplicitFactorizationModel
import time  

emodel = ExplicitFactorizationModel(n_iter=10,
                                    embedding_dim=32, #this is Spotlight default
                                    use_cuda=False)
current = time.time()

emodel.fit(train, verbose=True)

end = time.time()
diff = end - current
print("Training took %d seconds "% (diff))

Epoch 0: loss 4.229930914466894
Epoch 1: loss 0.813165167653108
Epoch 2: loss 0.5230778297291526
Epoch 3: loss 0.3686225962223886
Epoch 4: loss 0.29229526367934444
Epoch 5: loss 0.2536848688710339
Epoch 6: loss 0.23266982621973073
Epoch 7: loss 0.21934477928318555
Epoch 8: loss 0.21006752616619762
Epoch 9: loss 0.20136033389953117
Training took 93 seconds 


In [46]:
print("One test item_id for userId 556 (uid %d) is " % uid_map.get(556))

testItemId = test.item_ids[np.where(test.user_ids == uid_map.get(556))[0][0]]
#this is a simpler version. Thanks to Hafsah!
#testItemId = test.item_ids[test.user_ids == uid_map.get(556)]

print("Test movieId is %d itemId %d " % (iid_rev_map.get(testItemId), testItemId))


#here 0 is a dummy item, which Spotlight needs for some reason...
#we discard its prediction using [1]
predicted = emodel.predict( np.array([555]), item_ids=np.array([0, testItemId]))[1]

#what was the actual score of the user for that movie?
#we can get the appropriate row from the ratings dataframe, then extract that value
actual = ratings_df[(ratings_df.movieId==iid_rev_map.get(testItemId)) & (ratings_df.userId==556)][["rating"]].values[0][0]

print("Predicted rating was %f, actual rating %0.1f, error was %f" % (predicted, actual, abs(predicted-actual) )) 

One test item_id for userId 556 (uid 555) is 
Test movieId is 69844 itemId 917 
Predicted rating was 4.835270, actual rating 3.5, error was 1.335270


In [47]:
allpreds = emodel.predict( np.array([555]) )

print(allpreds)
print(allpreds.size)

#we can recover the original rating for movieId 48394
print(allpreds[1132])

[4.8161016 3.4808502 5.671245  ... 2.0886903 3.0977912 0.76195  ]
9724
4.204029


In [49]:
emodel._net.item_embeddings.weight[0]

tensor([ 0.5073,  0.4764, -0.0731, -0.2266, -0.9056,  0.2152,  0.4169,  0.3048,
        -0.4653, -0.3311,  0.5881,  0.3747,  0.6738,  0.0093,  0.0768, -0.9656,
         0.0648,  0.2280,  0.0599, -0.5885, -0.0454, -0.2586, -0.0167,  0.6628,
        -0.1781,  0.2543, -0.2711, -0.0490, -0.1546, -0.4939, -0.0994, -0.6258],
       grad_fn=<SelectBackward>)

In [51]:
from spotlight.evaluation import mrr_score

print(mrr_score(emodel, test, train=train, k=100).mean())

0.09547624376091454


In [61]:
from sklearn import metrics
metrics.ndcg_score(allpreds, test)

AttributeError: module 'sklearn.metrics' has no attribute 'ndcg_score'