Scenariusze:
- Istniejący użytkownik: szukamy rekomendacji za pomocą CBF i CF, następnie średnia ważona i wynik

In [13]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import implicit
from implicit.nearest_neighbours import bm25_weight
from implicit.evaluation import train_test_split
from scipy.sparse import csr_matrix, dok_matrix, load_npz
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

### CBF model functions

In [14]:
movies = pd.read_csv('../data/movies.csv')
ratings = pd.read_csv('../data/ratings.csv')
tags = pd.read_csv('../data/tags.csv')

In [84]:
def recommend_for_user_cbf(user_id, top_n=10):
    """
        top_n - returns top_n movies fitted for user

        returns: 
        
    """
    tfid_matrix = load_npz('../data/tfid_matrix_cbf.npz')

    # Build user profile
    threshold = 4.0
    well_rated = ratings[(ratings.userId == user_id) & (ratings.rating >= threshold)]
    movies_idxs = well_rated.movieId.map(lambda m: movies.index[movies.movieId == m][0]).values
    user_vec = np.asarray(tfid_matrix[movies_idxs].mean(axis=0))

    sim_scores = cosine_similarity(user_vec, tfid_matrix).flatten()
    
    seen = set(ratings[ratings.userId == user_id].movieId)
    unseen_idxs = [i for i, m in enumerate(movies.movieId) if m not in seen]
    
    # Choose only unseen movies
    unseen_scores = [(i, sim_scores[i]) for i in unseen_idxs]
    unseen_scores.sort(key=lambda x: x[1], reverse=True)
    
    top_n_scores = unseen_scores[:top_n]
    top_idxs = [i for i, score in top_n_scores]
    
    top_movies = movies.iloc[top_idxs][['movieId', 'title', 'genres']]
    top_movies['score'] = [score for i, score in top_n_scores]
    return top_movies

In [67]:
recommend_for_user_cbf(5)

Unnamed: 0,movieId,title,genres,score
3163,3256,Patriot Games (1992),Action|Crime|Drama|Thriller,0.405562
1168,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,0.320625
1656,1722,Tomorrow Never Dies (1997),Action|Adventure|Thriller,0.307747
1864,1953,"French Connection, The (1971)",Action|Crime|Thriller,0.304654
1013,1036,Die Hard (1988),Action|Crime|Thriller,0.302144
1550,1608,Air Force One (1997),Action|Thriller,0.301877
899,920,Gone with the Wind (1939),Drama|Romance|War,0.299508
1939,2028,Saving Private Ryan (1998),Action|Drama|War,0.296338
1869,1958,Terms of Endearment (1983),Comedy|Drama,0.28927
3525,3623,Mission: Impossible II (2000),Action|Adventure|Thriller,0.287832


### CF model functions

In [31]:
def min_max_scale(values):
    min_val = min(values)
    max_val = max(values)
    return [(x - min_val) / (max_val - min_val) for x in values]

In [85]:
def recommend_for_user_cf(user_id, top_n=10):
    user_id__to_index = {user_ind: i for i, user_ind in enumerate(pd.Categorical(ratings.userId).categories)}
    index__toitem_id = {i: item_ind for i, item_ind in enumerate(pd.Categorical(ratings.movieId).categories)}

    model = implicit.nearest_neighbours.CosineRecommender.load('../data/cf_model.npz')
    data_sparse = load_npz('../data/sparse_matrix_cf.npz')

    user_id = user_id__to_index[user_id]
    user_items = data_sparse.tocsr()
    recommendations = model.recommend(userid=user_id, user_items=user_items[user_id], N=top_n, filter_already_liked_items=True)
    return pd.DataFrame({'movieId': [index__toitem_id[i] for i in recommendations[0]], 'score': min_max_scale(recommendations[1])}).\
        merge(movies, on='movieId')[['movieId', 'title', 'genres', 'score']].\
        sort_values('score', ascending=False)



In [55]:
recommend_for_user_cf(21)

Unnamed: 0,movieId,title,genres,score
0,1974,Friday the 13th (1980),Horror|Mystery|Thriller,1.0
1,2003,Gremlins (1984),Comedy|Horror,0.614909
2,1261,Evil Dead II (Dead by Dawn) (1987),Action|Comedy|Fantasy|Horror,0.350704
3,2455,"Fly, The (1986)",Drama|Horror|Sci-Fi|Thriller,0.311424
4,1200,Aliens (1986),Action|Adventure|Horror|Sci-Fi,0.208938
5,1970,"Nightmare on Elm Street 3: Dream Warriors, A (...",Horror|Thriller,0.187957
6,1983,Halloween II (1981),Horror,0.134522
7,2288,"Thing, The (1982)",Action|Horror|Sci-Fi|Thriller,0.109271
8,2513,Pet Sematary (1989),Horror,0.045748
9,1240,"Terminator, The (1984)",Action|Sci-Fi|Thriller,0.0


### Hybrid model

In [86]:
id = 5

display(recommend_for_user_cf(id))
display(recommend_for_user_cbf(id))

FileNotFoundError: [Errno 2] No such file or directory: '../data/cf_model.npz'

In [None]:
def recommend_hybrid(user_id, alpha=0.4, top_n = 10):
    cf = recommend_for_user_cf(user_id, top_n)
    cbf = recommend_for_user_cbf(user_id, top_n)

    hybrid = pd.merge(
        cf, cbf,
        on='movieId',
        how='outer',
        suffixes=('_cf', '_cbf')
    )

    hybrid['title']  = hybrid['title_cf'].fillna(hybrid['title_cbf'])
    hybrid['genres'] = hybrid['genres_cf'].fillna(hybrid['genres_cbf'])

    hybrid['score_cf'] = hybrid['score_cf'].fillna(0.0)
    hybrid['score_cbf'] = hybrid['score_cbf'].fillna(0.0)

    # Hybrid score
    hybrid['score'] = alpha * hybrid['score_cf'] + (1 - alpha) * hybrid['score_cbf']

    result = (
        hybrid[['movieId', 'title', 'genres', 'score']]
        .sort_values('score', ascending=False)
        .head(top_n)
        .reset_index(drop=True)
    )
    return result

In [79]:
display(recommend_hybrid(user_id=5))

Unnamed: 0,movieId,title,genres,score
0,377,Speed (1994),Action|Romance|Thriller,0.5
1,367,"Mask, The (1994)",Action|Comedy|Crime|Fantasy,0.297975
2,500,Mrs. Doubtfire (1993),Comedy|Drama,0.283076
3,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,0.278953
4,587,Ghost (1990),Comedy|Drama|Fantasy|Romance|Thriller,0.211295
5,3256,Patriot Games (1992),Action|Crime|Drama|Thriller,0.202781
6,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,0.160312
7,1722,Tomorrow Never Dies (1997),Action|Adventure|Thriller,0.153873
8,1953,"French Connection, The (1971)",Action|Crime|Thriller,0.152327
9,1036,Die Hard (1988),Action|Crime|Thriller,0.151072
