In [1]:
import numpy as np
import pandas

# Ratings file: read without a header row; the three columns are named
# User, Work and Choice.
ratings = pandas.read_csv(
    '../data/ratings.csv',
    header=None,
    names=['User', 'Work', 'Choice'],
)

# Works file: read without a header row; the first column becomes the index
# and the remaining column is named Title.
works = pandas.read_csv(
    '../data/works.csv',
    header=None,
    index_col=0,
    names=['Title'],
)

class BaseEmbedding:
    """Common scaffolding for work embeddings built from user ratings.

    Subclasses implement ``_get_most_similar``; this class stores the data
    and turns the raw neighbour lookup into a readable table.

    Attributes:
        ratings: DataFrame with columns ``User``, ``Work`` and ``Choice``.
        works: DataFrame indexed by work id with a ``Title`` column.
        nb_users: largest ``User`` value in ``ratings`` plus one.
        nb_works: largest ``Work`` value in ``ratings`` plus one.
    """

    def __init__(self, ratings=None, works=None):
        """Store the rating and work tables, loading the CSV files if omitted.

        Args:
            ratings: optional ratings DataFrame; when ``None`` it is read
                from ``../data/ratings.csv``.
            works: optional works DataFrame; when ``None`` it is read from
                ``../data/works.csv``.
        """
        if ratings is None:
            ratings = pandas.read_csv('../data/ratings.csv', header=None, names=['User', 'Work', 'Choice'])
        if works is None:
            works = pandas.read_csv('../data/works.csv', header=None, index_col=0, names=['Title'])

        self.ratings = ratings
        self.works = works
        # Sizes are derived from the largest id seen.
        # NOTE(review): presumably ids are 0-based integers -- confirm.
        self.nb_users = ratings['User'].max() + 1
        self.nb_works = ratings['Work'].max() + 1

    def _get_most_similar(self, work_id, n_neighbors):
        """Return ``(similarities, work_ids)`` for the neighbours of ``work_id``.

        Must be provided by subclasses; ``most_similar`` relies on it.

        Raises:
            NotImplementedError: always, on the base class.
        """
        raise NotImplementedError(
            '{} must implement _get_most_similar'.format(type(self).__name__)
        )

    def most_similar(self, work_id, topn=8):
        """Return the works most similar to ``work_id`` as a DataFrame.

        Args:
            work_id: id of the reference work.
            topn: number of neighbours requested from ``_get_most_similar``.

        Returns:
            The rows of ``self.works`` for the ids returned by
            ``_get_most_similar``, in that order, with an extra
            ``Similarity`` column holding the matching scores.
        """
        scores, work_ids = self._get_most_similar(work_id, topn)
        # ``assign`` builds a new frame instead of writing a column onto the
        # result of ``.loc``, which avoids SettingWithCopyWarning and never
        # touches ``self.works``.
        return self.works.loc[work_ids].assign(Similarity=np.asarray(scores))

# Embedding par SVD de la matrice objets-objets

In [2]:
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
import scipy.sparse as sp

class SVDEmbedding(BaseEmbedding):
    """Work embedding obtained by a truncated SVD of work-work co-occurrences.

    Each positional argument is a collection of ``Choice`` values. For every
    collection a user x work indicator matrix is built, turned into a
    work x work co-occurrence matrix, and the co-occurrence matrices are
    summed before being factorised.
    """

    def __init__(self, *choices_list, size=100, **kwargs):
        """Fit the SVD and the nearest-neighbour index.

        Args:
            *choices_list: one or more collections of ``Choice`` values.
            size: number of SVD components.
            **kwargs: forwarded to ``BaseEmbedding.__init__``.

        Raises:
            ValueError: if no collection of choices is given.
        """
        super().__init__(**kwargs)
        self._compute_svd(*choices_list, size=size)
        self._compute_nn()

    def _matrix_for(self, choices):
        """Build the sparse user x work matrix for ratings whose choice is in ``choices``."""
        # Keep only the two integer id columns: converting the whole frame
        # would also drag the string ``Choice`` column in and yield an
        # object-dtype array. ``.values`` replaces the removed ``as_matrix()``.
        selected = self.ratings.loc[self.ratings['Choice'].isin(choices), ['User', 'Work']]
        user_ids = selected['User'].values
        work_ids = selected['Work'].values
        return sp.csc_matrix(
            (np.ones(len(selected)), (user_ids, work_ids)),
            shape=(self.nb_users, self.nb_works),
        )

    def _cooccurrences_for(self, choices):
        """Return the work x work co-occurrence matrix (``M.T @ M``) for ``choices``."""
        matrix = self._matrix_for(choices)
        return matrix.T.dot(matrix)

    def _compute_svd(self, *choices_list, size):
        """Sum the co-occurrence matrices of every choice group and fit the SVD."""
        if not choices_list:
            raise ValueError('at least one collection of choices is required')
        # Accumulate explicitly: ``np.sum`` on a generator is deprecated and
        # merely falls back to the builtin ``sum``.
        cooccurrences = self._cooccurrences_for(choices_list[0])
        for choices in choices_list[1:]:
            cooccurrences = cooccurrences + self._cooccurrences_for(choices)
        self._svd = TruncatedSVD(n_components=size).fit(cooccurrences)

    def _compute_nn(self):
        """Index the work vectors (columns of the SVD components) by cosine distance."""
        assert hasattr(self, '_svd')
        self._nn = NearestNeighbors(algorithm='brute', metric='cosine').fit(self._svd.components_.T)

    def _get_most_similar(self, work_id, n_neighbors):
        """Return ``(similarities, work_ids)`` of the nearest works.

        ``n_neighbors + 1`` results are requested; presumably the extra one
        accounts for the queried work appearing among its own neighbours.
        Similarity is ``1 - cosine distance``.
        """
        assert hasattr(self, '_svd') and hasattr(self, '_nn')
        query = self._svd.components_[:, [work_id]].T
        distances, indices = self._nn.kneighbors(query, n_neighbors=n_neighbors + 1)
        return (1 - distances.ravel(), indices.ravel())

In [3]:
# Three groups of choices are passed separately: dislikes, neutrals, and
# likes together with favorites.
svd_embedding = SVDEmbedding(
    {'dislike'},
    {'neutral'},
    {'like', 'favorite'},
    ratings=ratings,
    works=works,
)

In [4]:
# Literal substring match; ``na=False`` treats a missing title as "no match"
# so the boolean mask never contains NaN.
works[works['Title'].str.contains('Madoka', regex=False, na=False)]

Unnamed: 0,Title
1184,Mahou Shoujo Madoka★Magica Movie 3: Hangyaku n...
1773,Mahou Shoujo Madoka★Magica
1890,Puella Magi Madoka Magica the Movie Part II: E...
2969,Puella Magi Madoka Magica the Movie Part III: ...
4985,Puella Magi Madoka Magica
7265,Puella Magi Madoka Magica - The different story
7753,Puella Magi Madoka Magica the Movie Part I: Be...


In [5]:
# Neighbours of work 1773 according to the SVD embedding.
svd_embedding.most_similar(work_id=1773)

Unnamed: 0,Title,Similarity
1773,Mahou Shoujo Madoka★Magica,1.0
7753,Puella Magi Madoka Magica the Movie Part I: Be...,0.515669
2969,Puella Magi Madoka Magica the Movie Part III: ...,0.470317
1890,Puella Magi Madoka Magica the Movie Part II: E...,0.463384
1916,Hayate the combat butler,0.397772
4466,Enzai,0.376058
616,Peaceful Times (F02) Petit Film,0.374256
711,Mottainai,0.374256
6651,Shinano Mainichi Shinbun,0.374256


# Embedding par `item2vec`

In [6]:
import gensim
import random

class Item2VecEmbedding(BaseEmbedding):
    """Work embedding learned with skip-gram Word2Vec over per-user work lists.

    For each collection of ``Choice`` values, every user yields one
    "document" listing (as strings) the works they rated with one of those
    choices. All documents are fed to Word2Vec with a window larger than the
    longest document.
    """

    def __init__(self, *choices_list, size=100, min_count=2, **kwargs):
        """Build the documents and train the Word2Vec model.

        Args:
            *choices_list: one or more collections of ``Choice`` values.
            size: dimensionality of the learned vectors.
            min_count: passed to Word2Vec as ``min_count``.
            **kwargs: forwarded to ``BaseEmbedding.__init__``.

        Raises:
            ValueError: if no collection of choices is given.
        """
        super().__init__(**kwargs)
        if not choices_list:
            raise ValueError('at least one collection of choices is required')

        self.docs = []
        for choices in choices_list:
            # ``.values`` replaces ``as_matrix()``, removed in pandas 1.0.
            selected = self.ratings.loc[self.ratings['Choice'].isin(choices), ['User', 'Work']]
            docs = [[] for _ in range(self.nb_users)]
            for user_id, work_id in zip(selected['User'].values, selected['Work'].values):
                docs[user_id].append(str(work_id))
            self.docs.extend(docs)

        max_len = max(len(doc) for doc in self.docs)
        params = dict(window=max_len + 1, min_count=min_count, sg=1)
        # gensim 4 renamed ``size`` -> ``vector_size`` and ``iter`` -> ``epochs``.
        if int(gensim.__version__.split('.')[0]) >= 4:
            params.update(vector_size=size, epochs=10)
        else:
            params.update(size=size, iter=10)
        self._word2vec = gensim.models.Word2Vec(self.docs, **params)

    def _get_most_similar(self, work_id, n_neighbors):
        """Return ``(similarities, work_ids)``, with ``work_id`` itself first at 1.0."""
        # ``model.most_similar`` was deprecated and later removed; query the
        # keyed vectors when the model exposes them, otherwise the model itself.
        vectors = getattr(self._word2vec, 'wv', self._word2vec)
        res = vectors.most_similar(str(work_id), topn=n_neighbors)
        return ([1.] + [x for _, x in res], [work_id] + [int(x) for x, _ in res])

In [7]:
# A single group of choices: likes together with favorites.
embedding = Item2VecEmbedding(
    {'like', 'favorite'},
    ratings=ratings,
    works=works,
)

In [8]:
# Neighbours of work 1773 according to the item2vec embedding.
embedding.most_similar(work_id=1773)

Unnamed: 0,Title,Similarity
1773,Mahou Shoujo Madoka★Magica,1.0
2969,Puella Magi Madoka Magica the Movie Part III: ...,0.845671
6508,Suzumiya Haruhi no Yuuutsu,0.80818
4846,Suzumiya Haruhi no Shoushitsu,0.80127
5596,Bakemonogatari,0.785766
7753,Puella Magi Madoka Magica the Movie Part I: Be...,0.76985
2874,5 centimètres par seconde,0.760987
2819,Tengen Toppa Gurren Lagann,0.733072
1890,Puella Magi Madoka Magica the Movie Part II: E...,0.730325
