In [1]:
from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsRegressor

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
path = '/Users/mac/Desktop/Netology/Pychon/10. Продвинутый pandas/Python_2_join/ml-latest-small'

In [3]:
movies = pd.read_csv(path + '/movies.csv')
ratings = pd.read_csv(path + '/ratings.csv')

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [6]:
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,3.0,851866700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,9.0,4.0,938629200.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,13.0,5.0,1331380000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.0,997938300.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19.0,3.0,855190100.0


In [7]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [8]:
movie_genres = movies['genres'].apply(change_string)

In [9]:
CV = CountVectorizer()
X_train = CV.fit_transform(movie_genres)

Tf = TfidfTransformer()
X_train = Tf.fit_transform(X_train)

nn = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean') 
nn.fit(X_train)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=20)

In [10]:
title_genres = {}

for index, row in tqdm_notebook(movies.iterrows()):
    title_genres[row.title] = row.genres

0it [00:00, ?it/s]

In [11]:
def recommend(user_id):
    
    '''
    Функция рекомендации фильмов по жанру и рейтинга фильмов в порядке убывания.
    
    '''
    
    # получаем фильмы, которые смотрел юзер
    user_movies = movies_with_ratings[movies_with_ratings.userId == user_id].title.unique()
    
    # получаем жанр последнего просмотренного юзером фильма
    
    movie_genres = title_genres[user_movies[-1]]
    
    movie_genres = change_string(movie_genres)

    X = CV.transform([movie_genres])
    X = Tf.transform(X)

    predict = nn.kneighbors(X, return_distance=False)
    
    # Получаем список фильмов
    movies_to_score = movies.iloc[predict[0]].title.values
    
    movies_to_score = set(movies_to_score.tolist()) - set(user_movies.tolist())
            
    # Получаем рейтинги фильмов
    rating_movie = movies_with_ratings[movies_with_ratings['title'].isin(movies_to_score)]
    
    # рассчитываем средный рейтинг
    mean_rating_movie = rating_movie.groupby('title').mean().reset_index()
    
    # выводим средний рейтинг в порядке убывания
    mean_rating_ = mean_rating_movie[['title', 'rating']].sort_values('rating', ascending=False)
    return mean_rating_

In [14]:
recommend(5)

Unnamed: 0,title,rating
1,Caveman (1981),5.0
7,"Emperor's New Clothes, The (2001)",4.5
4,"Dog's Life, A (1918)",4.25
0,Beautiful People (1999),4.0
3,Different for Girls (1996),4.0
14,Ricky Gervais Live: Animals (2003),4.0
6,Eagle vs Shark (2007),4.0
18,Top Secret! (1984),3.958333
8,Every Which Way But Loose (1978),3.6
11,"Inbetweeners 2, The (2014)",3.5
