### Что делать?
1. Датасет ml-latest
2. Вспомнить подходы, которые мы разбирали
3. Выбрать понравившийся подход к гибридным системам
4. Написать свою

[Материалы здесь](https://github.com/ALKONDR/netology-recsys/blob/master/lecture-5/lecture-5-part-2.ipynb)

In [186]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

In [187]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

Попробуем осуществить алгоритм смешивания: выберем рекомендации исходя из тегов и жанров, а затем выберем лучшие рекомендации на базе двух алгоритмов.

In [188]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

Для начала выбираем наиболее похожие фильмы исходя из жанров:

In [189]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [190]:
genres = [change_string(g) for g in movies.genres.values]

In [191]:
genres[0]

'Adventure Animation Children Comedy Fantasy'

In [192]:
count_vect_genres = CountVectorizer()
X_train_counts_genres = count_vect_genres.fit_transform(genres)

tfidf_transformer_genres = TfidfTransformer()
X_train_tfidf_genres = tfidf_transformer_genres.fit_transform(X_train_counts_genres)

neigh_genres = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean') 
neigh_genres.fit(X_train_tfidf_genres)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [193]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect_genres.transform([test])
X_tfidf2 = tfidf_transformer_genres.transform(predict)

res = neigh_genres.kneighbors(X_tfidf2, return_distance=True)

In [194]:
res

(array([[0.42079615, 0.53300564, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608, 0.6188388 , 0.62682864, 0.62682864]]),
 array([[6774, 9096, 3576,  863, 2302, 2608, 7865, 3582, 8361, 3302, 5737,
         6723, 5636, 3376, 7496, 5627, 9717, 2206, 6133, 5832]],
       dtype=int64))

In [195]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
6774,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
9096,143559,L.A. Slasher (2015),Comedy|Crime|Fantasy
3576,4899,Black Knight (2001),Adventure|Comedy|Fantasy
863,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy
2302,3052,Dogma (1999),Adventure|Comedy|Fantasy
2608,3489,Hook (1991),Adventure|Comedy|Fantasy
7865,94015,Mirror Mirror (2012),Adventure|Comedy|Fantasy
3582,4911,Jabberwocky (1977),Adventure|Comedy|Fantasy
8361,109042,Knights of Badassdom (2013),Adventure|Comedy|Fantasy
3302,4467,"Adventures of Baron Munchausen, The (1988)",Adventure|Comedy|Fantasy


In [196]:
title_genres = {}

for index, row in tqdm_notebook(movies.iterrows()):
    title_genres[row.title] = row.genres

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




Теперь разберемся с тегами:

In [197]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId').reset_index(drop=True)

In [198]:
tag_strings = []
movies_tags = []

for movie, group in tqdm_notebook(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies_tags.append(movie)

HBox(children=(IntProgress(value=0, max=9737), HTML(value='')))




In [199]:
count_vect_tags = CountVectorizer()
X_train_counts_tags = count_vect_tags.fit_transform(tag_strings)

In [200]:
tfidf_transformer_tags = TfidfTransformer()
X_train_tfidf_tags = tfidf_transformer_tags.fit_transform(X_train_counts_tags)

In [201]:
neigh_tags = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='manhattan') 
neigh_tags.fit(X_train_tfidf_tags)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='manhattan',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

In [202]:
test = change_string('pixar pixar fun ')

predict = count_vect_tags.transform([test])
X_tfidf2 = tfidf_transformer_tags.transform(predict)

res = neigh_tags.kneighbors(X_tfidf2, return_distance=True)

In [203]:
for i in res[1][0]:
    print(movies_tags[i])

Magnolia (1999)
In a Lonely Place (1950)
Out of Time (2003)
Our Man Flint (1965)
Our Town (1940)
Out Cold (1989)
Out Cold (2001)
Out of Sight (1998)
Our Lady of the Assassins (Virgen de los sicarios, La) (2000)
Out of the Past (1947)


In [204]:
title_tags = {}

for index, row in tqdm_notebook(movies_with_tags.iterrows()):
    title_tags[row.title] = row.tag

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




Построим рекомендательную систему для пользователя на базе жанров и тегов:

In [205]:
def recommend_for_user(user_id):
    current_user_id = user_id
    user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()
    
    last_user_movie = user_movies[-1]
    
    movie_genres = title_genres[last_user_movie]
    movie_tags = str(title_tags[last_user_movie])
    
    movie_genres = change_string(movie_genres)
    movie_tags = change_string(movie_tags)

    predict_genres = count_vect_genres.transform([movie_genres])
    X_tfidf2_genres = tfidf_transformer_genres.transform(predict_genres)
    
    predict_tags = count_vect_tags.transform([movie_tags])
    X_tfidf2_tags = tfidf_transformer_tags.transform(predict_tags)

    res_genres = neigh_genres.kneighbors(X_tfidf2_genres, return_distance=True)
    res_tags = neigh_tags.kneighbors(X_tfidf2_tags, return_distance=True)
    
    print('Last User Movie: ', last_user_movie)
    print('Genres of Last Movie: ', movie_genres)
    print('Tags of Last Movie: ', movie_tags, '\n')
    
    print('10 films recommendations based on genres: \n')
    print(movies.iloc[res_genres[1][0]][:10].title.values)
    print('\n')
    print('10 films recommendations based on tags: \n')
    print(movies.iloc[res_tags[1][0]][:10].title.values)

In [208]:
recommend_for_user(5.0)

Last User Movie:  Fargo (1996)
Genres of Last Movie:  Comedy Crime Drama Thriller
Tags of Last Movie:  CoenBrothers 

10 films recommendations based on genres: 

['Pulp Fiction (1994)' 'Freeway (1996)' 'Fargo (1996)'
 "Man Bites Dog (C'est arrivé près de chez vous) (1992)"
 'Informant!, The (2009)' 'Leaves of Grass (2009)' 'Party Monster (2003)'
 'In Bruges (2008)' 'Beautiful Creatures (2000)'
 'Confessions of a Dangerous Mind (2002)']


10 films recommendations based on tags: 

["He's Just Not That Into You (2009)" 'Out to Sea (1997)' 'Greed (1924)'
 'Lilya 4-Ever (Lilja 4-ever) (2002)' 'Final Cut, The (2004)'
 'Blood and Chocolate (2007)' 'Catch and Release (2006)'
 'Seven Up! (1964)' 'Because I Said So (2007)' "Smokin' Aces (2006)"]


In [209]:
recommend_for_user(6.0)

Last User Movie:  Bonnie and Clyde (1967)
Genres of Last Movie:  Crime Drama
Tags of Last Movie:  gangsters 

10 films recommendations based on genres: 

['Animal Factory (2000)' 'Crash (2004)'
 'Three Billboards Outside Ebbing, Missouri (2017)'
 'Good Night, and Good Luck. (2005)' 'City by the Sea (2002)'
 'Papillon (1973)' "Who'll Stop the Rain (1978)"
 'Dog Day Afternoon (1975)' 'Tsotsi (2005)'
 'Shawshank Redemption, The (1994)']


10 films recommendations based on tags: 

['Lilya 4-Ever (Lilja 4-ever) (2002)' 'Final Cut, The (2004)'
 'Witness (1985)' 'Peanuts Movie, The (2015)' 'Epic Movie (2007)'
 'Because I Said So (2007)' 'Norbit (2007)' 'Blood and Chocolate (2007)'
 'Catch and Release (2006)' "Smokin' Aces (2006)"]
