## Домашнее задание «Гибридные рекомендательные системы»

In [1]:
from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np
%matplotlib inline

from implicit.als import AlternatingLeastSquares

In [3]:
# Read the four CSV tables (paths are relative to the working directory).
links, movies, ratings, tags = map(
    pd.read_csv,
    ('links.csv', 'movies.csv', 'ratings.csv', 'tags.csv'),
)

In [4]:
# Attach every rating to its movie row (left join on movieId), then discard
# rows containing any missing value -- e.g. movies that received no rating.
movies_with_ratings = (
    movies
    .join(ratings.set_index('movieId'), on='movieId')
    .reset_index(drop=True)
    .dropna()
)

In [5]:
# Preview the first rows of the joined movies/ratings frame.
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [6]:
# Three-column (user, item, rating) frame; the movie title serves as the item id.
dataset = (
    movies_with_ratings[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid', 'title': 'iid'})
)

In [7]:
# Show one row to confirm the (uid, iid, rating) column layout.
dataset.head(1)

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0


In [8]:
# Ratings range from 0.5 to 5.0; columns are passed in (user, item, rating) order.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset[['uid', 'iid', 'rating']], reader)

In [9]:
# Hold out 15% of the ratings for evaluation; the fixed seed keeps the split stable.
trainset, testset = train_test_split(
    data,
    test_size=0.15,
    random_state=42,
)

In [44]:
# The SVD++ algorithm, an extension of SVD taking into account implicit ratings.
# The seed is pinned so that the reported RMSE and the downstream recommendations
# are reproducible after Restart Kernel -> Run All (the split above is already seeded).
algo = SVDpp(n_factors=20, n_epochs=20, random_state=42)
algo.fit(trainset);

In [45]:
# Predict a rating for every held-out entry of the test set.
test_pred = algo.test(testset, verbose=False)

In [46]:
# Root-mean-square error on the hold-out predictions (printed and returned).
accuracy.rmse(predictions=test_pred, verbose=True)

RMSE: 0.8585


0.858467962847227

In [47]:
def change_string(s):
    """Turn a pipe-separated genre string into space-separated tokens.

    Spaces and hyphens are removed first so that each genre becomes a single
    token (e.g. ``'Sci-Fi'`` -> ``'SciFi'``); the ``|`` separators are then
    replaced with single spaces.
    """
    compact = s.replace(' ', '').replace('-', '')
    return compact.replace('|', ' ')

In [48]:
# One normalised genre string per movie, in the same order as the rows of `movies`.
movie_genres = list(map(change_string, movies['genres']))

In [49]:
# Sample of the normalised genre strings (first 10 movies).
movie_genres[:10]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller']

In [51]:
# Bag-of-words over genre tokens: one row per movie, one column per genre token.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=movie_genres)

In [52]:
# Re-weight the raw genre counts with TF-IDF; the fitted transformer is reused
# later to encode query movies.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

In [53]:
# Index every movie's TF-IDF genre vector; each query returns the 30 closest movies.
neigh = NearestNeighbors(
    n_neighbors=30,
    metric='euclidean',
    n_jobs=-1,
)
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=-1, n_neighbors=30, p=2, radius=1.0)

In [31]:
# Lookup table: movie title -> raw pipe-separated genre string.
# Built in one vectorised pass instead of a row-by-row `iterrows` loop; as with
# the loop, a later row overwrites an earlier one when two movies share a title.
title_genres = dict(zip(movies['title'], movies['genres']))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [None]:
# Write a function that recommends movies based on the last movie the user rated above 4.

In [112]:
def recommend_for_user(user_id):
    """Recommend up to 10 movies that share genres with a movie the user liked.

    Steps:
      1. Collect the titles the user rated strictly above 4 and take the last
         one (in ``movies_with_ratings`` row order) as the seed movie.
      2. Look up the nearest neighbours of the seed's TF-IDF genre vector.
      3. Score every neighbour that is not already among the user's liked
         titles with the fitted rating model and keep the 10 highest estimates.

    Parameters
    ----------
    user_id : int or float
        User identifier as stored in ``movies_with_ratings.userId``.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``user_id``; columns are movie titles ordered
        by descending predicted rating. The frame has no columns when the user
        has no rating above 4 or when every candidate was already liked.
    """
    liked_mask = (
        (movies_with_ratings.userId == user_id)
        & (movies_with_ratings.rating > 4)
    )
    user_movies = movies_with_ratings[liked_mask].title.unique()

    # Without a liked movie there is no seed to search around.
    if len(user_movies) == 0:
        return pd.DataFrame(index=[user_id])

    # NOTE(review): "last" means last in frame row order, not the most recent
    # rating by timestamp -- confirm this is the intended seed.
    last_user_movie = user_movies[-1]
    seed_genres = change_string(title_genres[last_user_movie])

    # Encode the seed with the already-fitted vectorizer / TF-IDF transformer.
    seed_counts = count_vect.transform([seed_genres])
    seed_tfidf = tfidf_transformer.transform(seed_counts)

    _, neighbor_idx = neigh.kneighbors(seed_tfidf, return_distance=True)
    movies_to_score = movies.iloc[neighbor_idx[0]].title.values

    already_liked = set(user_movies)
    scores = {}
    for movie in movies_to_score:
        if movie in already_liked:
            continue
        # Score for the requested user (previously read an undefined global).
        scores[movie] = algo.predict(uid=user_id, iid=movie).est

    sort_films = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True)[:10])
    return pd.DataFrame(sort_films, index=[user_id], columns=sort_films.keys())

In [114]:
# Recommendations for user 35, returned as a one-row DataFrame (titles as columns).
recommend_for_user(35.0)

Unnamed: 0,There Will Be Blood (2007),High Noon (1952),Unforgiven (1992),"Searchers, The (1956)","Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966)",True Grit (1969),"Treasure of the Sierra Madre, The (1948)","Magnificent Seven, The (1960)",Hud (1963),Lonesome Dove (1989)
35.0,4.445834,4.382674,4.307656,4.258279,4.171002,4.154269,4.11229,4.025637,3.887784,3.886765
