Идея такая:
1. Выбираем жанр с максимальной оценкой у заданног пользователя;
2. С помощью NearestNeighbors выбираем 100 фильмов с похожими жанрами, которые пользователь ещё не смотрел;
3. Для этой выборки пресказываем оценку через KNNWithMeans;
4. Берем 10 с самой высокой оценкой и выдаем рекомендации.

In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv('../recsys4/movies.csv')
ratings = pd.read_csv('../recsys4/ratings.csv')

In [94]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
movieId    9742 non-null int64
title      9742 non-null object
genres     9742 non-null object
dtypes: int64(1), object(2)
memory usage: 228.4+ KB


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [165]:
current_user = 15

In [166]:
user_movies = movies_with_ratings[movies_with_ratings.userId == current_user]

In [167]:
best_genders = user_movies.loc[user_movies.rating.idxmax()]['genres']
print(best_genders)

Action|Adventure|Sci-Fi


In [168]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [169]:
rec_movies = movies_with_ratings[movies_with_ratings.userId != current_user]
rec_movies_t = rec_movies.movieId.unique()
rec_movies_t1 = movies[movies.movieId.isin(rec_movies_t)]
movie_genres = [change_string(g) for g in rec_movies_t1.genres.values]
rec_movies_t1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9724 entries, 0 to 9741
Data columns (total 3 columns):
movieId    9724 non-null int64
title      9724 non-null object
genres     9724 non-null object
dtypes: int64(1), object(2)
memory usage: 303.9+ KB


In [170]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [171]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [172]:
neigh = NearestNeighbors(n_neighbors=100, n_jobs=-1, metric='euclidean')
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=-1, n_neighbors=100, p=2, radius=1.0)

In [173]:
test = change_string(best_genders)
print(test)
predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)
res = neigh.kneighbors(X_tfidf2, n_neighbors=100, return_distance=True)
print(res)

Action Adventure SciFi
(array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.33138861, 0.33138861, 0.33138861, 0.33138861,
        0.33138861, 0.33

In [174]:
rec_movies_by_genres = rec_movies_t1.iloc[res[1][0]]

In [175]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

In [176]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [177]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0
1,5.0,Toy Story (1995),4.0
2,7.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),2.5
4,17.0,Toy Story (1995),4.5


In [178]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [179]:
trainset, testset = train_test_split(data, test_size=.15)

In [180]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'content_based': False})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x21544e48>

In [181]:
test_pred = algo.test(testset)

In [182]:
accuracy.rmse(test_pred)

RMSE: 0.8973


0.8972615985798436

In [183]:
recomed_set = {}
for t in rec_movies_by_genres.title.values:
    est = algo.predict(uid=current_user, iid=t).est
    recomed_set[t] = est

In [184]:
sorted(recomed_set.items(), key=lambda t: t[1], reverse=True)[:10]

[('Star Wars: Episode V - The Empire Strikes Back (1980)', 4.631055555434235),
 ('Star Wars: Episode IV - A New Hope (1977)', 4.563542135899371),
 ('Star Wars: Episode VI - Return of the Jedi (1983)', 4.52788285442872),
 ('Star Wars: Episode III - Revenge of the Sith (2005)', 4.403197677537721),
 ('THX 1138 (1971)', 4.324637382150056),
 ('Thor: Ragnarok (2017)', 4.206666701888649),
 ('Okja (2017)', 4.107787974103707),
 ('Serenity (2005)', 4.029521936726773),
 ('Total Recall (1990)', 4.015813728907528),
 ('Battlestar Galactica: The Plan (2009)', 3.9967548522884986)]