In [1]:
import pandas as pd
import numpy as np
import scipy

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [101]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso

In [2]:
% config IPCompleter.greedy = True

### Задание
1. Использовать dataset MovieLens


2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:

 -TF-IDF на тегах и жанрах
 
 -Средние оценки (+ median, variance, etc.) пользователя и фильма
 
 
3. Оценить RMSE на тестовой выборке

In [3]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [4]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [5]:
# добавим к рейтингам инфо по фильмам
movies_with_rating = ratings.join(movies.set_index('movieId'), on='movieId')

In [6]:
# отберем только фильмы рейтингом, по которым есть теги
movies_with_tags = tags.movieId.unique()
movies_with_rating_tags = movies_with_rating[movies_with_rating.movieId.isin(movies_with_tags)]

In [7]:
# создадим агрегированные фичи
movies_agg = movies_with_rating_tags.groupby(by='movieId').agg(['mean', 'count', 'var']).rating.reset_index()

In [8]:
movies_agg.head()

Unnamed: 0,movieId,mean,count,var
0,1,3.92093,215,0.69699
1,2,3.431818,110,0.777419
2,3,3.259615,52,1.112651
3,5,3.071429,49,0.822917
4,7,3.185185,54,0.955625


In [9]:
# var при 1 оценки NaN, заменим на 0
movies_agg.fillna(0, inplace = True)

In [10]:
movies_with_rating_agg = movies_agg.merge(movies, on='movieId', how='left')

In [11]:
movies_with_rating_agg.head()

Unnamed: 0,movieId,mean,count,var,title,genres
0,1,3.92093,215,0.69699,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,3.431818,110,0.777419,Jumanji (1995),Adventure|Children|Fantasy
2,3,3.259615,52,1.112651,Grumpier Old Men (1995),Comedy|Romance
3,5,3.071429,49,0.822917,Father of the Bride Part II (1995),Comedy
4,7,3.185185,54,0.955625,Sabrina (1995),Comedy|Romance


In [12]:
grouped_tags = tags.groupby(by='movieId')

film_tags = {}
for key, value in grouped_tags.groups.items():
    film_tags[key] = tags.loc[value.values].tag.tolist()

In [13]:
movies_with_rating_agg['tags'] = movies_with_rating_agg.apply(lambda x: film_tags[x.movieId], axis=1)

In [14]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [15]:
movies_with_rating_agg['genres'] = [change_string(g) for g in movies_with_rating_agg.genres.values]

In [16]:
movies_with_rating_agg['tags'] = movies_with_rating_agg.apply(lambda x: ' '.join(x.tags), axis=1)

In [17]:
# соберем вместе жанры и теги
movies_with_rating_agg['genres_tags'] = movies_with_rating_agg.apply(lambda x: x.genres + ' ' + x.tags, axis=1)

In [18]:
# уберем лишнее
movies_with_rating_agg.drop(columns=['genres', 'tags'], inplace=True)

In [19]:
movies_with_rating_agg.head()

Unnamed: 0,movieId,mean,count,var,title,genres_tags
0,1,3.92093,215,0.69699,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
1,2,3.431818,110,0.777419,Jumanji (1995),Adventure Children Fantasy fantasy magic board...
2,3,3.259615,52,1.112651,Grumpier Old Men (1995),Comedy Romance moldy old
3,5,3.071429,49,0.822917,Father of the Bride Part II (1995),Comedy pregnancy remake
4,7,3.185185,54,0.955625,Sabrina (1995),Comedy Romance remake


In [22]:
genres_tags = movies_with_rating_agg.genres_tags.tolist()

In [26]:
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(genres_tags)

In [27]:
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [30]:
X_tfidf = X_tfidf.toarray()

In [43]:
df_X_tfidf = pd.DataFrame(X_tfidf, index=movies_with_rating_agg.movieId)

In [44]:
df = movies_with_rating_agg.merge(df_X_tfidf, on='movieId')

In [52]:
# симпатичный свод по фильмам с фичами tfidf по жанрам и тегами
df.head()

Unnamed: 0,movieId,mean,count,var,title,genres_tags,0,1,2,3,...,1736,1737,1738,1739,1740,1741,1742,1743,1744,1745
0,1,3.92093,215,0.69699,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,3.431818,110,0.777419,Jumanji (1995),Adventure Children Fantasy fantasy magic board...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,3.259615,52,1.112651,Grumpier Old Men (1995),Comedy Romance moldy old,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,3.071429,49,0.822917,Father of the Bride Part II (1995),Comedy pregnancy remake,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,3.185185,54,0.955625,Sabrina (1995),Comedy Romance remake,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


переходим к рекомендациям для юзеров

In [77]:
# отберем TOP 10 юзеров, которые оценивали фильмы с тегами
ratings[ratings['movieId'].isin(movies_with_tags)].groupby('userId').count()['movieId'].sort_values()[-10:]

userId
480     399
274     413
387     421
288     422
606     447
448     448
68      501
599     685
414     827
474    1198
Name: movieId, dtype: int64

In [78]:
# построим датасет для юзера 606
user606_ratings = ratings[(ratings.userId == 606) & ratings.movieId.isin(movies_with_tags)]

In [92]:
df = df.set_index('movieId')

In [93]:
user606_ratings = user606_ratings.join(df, on = 'movieId')

In [97]:
user606_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,mean,count,var,title,genres_tags,0,...,1736,1737,1738,1739,1740,1741,1742,1743,1744,1745
97364,606,1,2.5,1349082950,3.92093,215,0.69699,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97365,606,7,2.5,1171754710,3.185185,54,0.955625,Sabrina (1995),Comedy Romance remake,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97366,606,11,2.5,1174349629,3.671429,70,0.810766,"American President, The (1995)",Comedy Drama Romance politics president,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97368,606,17,4.0,1171838026,3.776119,67,1.312754,Sense and Sensibility (1995),Drama Romance Jane Austen,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97371,606,28,3.5,1173049970,4.227273,11,0.618182,Persuasion (1995),Drama Romance In Netflix queue Jane Austen,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


можно переходить к регрессии, будем использовать Лассо, т.к. фич намного больше, чем наблюдений

In [105]:
X = user606_ratings.drop(columns=['userId', 'rating', 'timestamp', 'title','genres_tags']).set_index('movieId')
y = user606_ratings.loc[:, user606_ratings.columns.isin(['movieId', 'rating'])].set_index('movieId')

In [106]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [107]:
model = Lasso().fit(X_train, y_train)

In [109]:
y_test_predict = model.predict(X_test)

In [112]:
# посчитаем RMSE для модели
from sklearn.metrics import mean_squared_error
from math import sqrt

rmse = sqrt(mean_squared_error(y_test, y_test_predict))

print('RMSE: ', rmse)

RMSE:  0.6203634433101242


Перейдем к подготовке рекомендаций для юзера

In [144]:
# отберем фильмы, которые юзер 606 не оценивал
df_for_reco = df.iloc[~df.index.isin(user606_ratings.movieId.unique())]

In [145]:
df_for_reco.drop(columns=['title','genres_tags'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [146]:
df_for_reco['predicted_score'] = model.predict(df_for_reco)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [156]:
reco_TOP10_for_user = df_for_reco[['mean', 'predicted_score']].sort_values('predicted_score', ascending=False)[:10]

In [160]:
# предсказанные оценки оказались очень близки к друг другу, для финальной сортировки будем использовать среднюю оценку фильма
reco_TOP10_for_user.sort_values('mean',ascending = False)

Unnamed: 0_level_0,mean,predicted_score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
58559,4.238255,3.822759
608,4.116022,3.832573
79132,4.066434,3.820918
457,3.992105,3.835334
364,3.94186,3.829813
150,3.845771,3.838708
588,3.79235,3.833187
595,3.770548,3.821839
377,3.52924,3.829506
380,3.497191,3.831653


In [162]:
# мне нравится подборка :)
reco_TOP10_for_user.merge(movies.set_index('movieId'), on='movieId')

Unnamed: 0_level_0,mean,predicted_score,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
150,3.845771,3.838708,Apollo 13 (1995),Adventure|Drama|IMAX
457,3.992105,3.835334,"Fugitive, The (1993)",Thriller
588,3.79235,3.833187,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
608,4.116022,3.832573,Fargo (1996),Comedy|Crime|Drama|Thriller
380,3.497191,3.831653,True Lies (1994),Action|Adventure|Comedy|Romance|Thriller
364,3.94186,3.829813,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
377,3.52924,3.829506,Speed (1994),Action|Romance|Thriller
58559,4.238255,3.822759,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
595,3.770548,3.821839,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
79132,4.066434,3.820918,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
