## Домашнее задание

Для датасета *MovieLens* построить рекомендации (предсказать оценку) на фичах:
- TF-IDF на тегах и жанрах
- средние оценки пользователя и фильма

Оценить RMSE на тестовой выборке

In [236]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [61]:
links = pd.read_csv('MovieLens/links.csv')
movies = pd.read_csv('MovieLens/movies.csv')
ratings = pd.read_csv('MovieLens/ratings.csv')
tags = pd.read_csv('MovieLens/tags.csv')

In [62]:
movies.shape

(9742, 3)

In [63]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [64]:
ratings.shape

(100836, 4)

In [65]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [66]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [67]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [103]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')
movies_with_tags.drop(columns=['timestamp'], inplace=True)
movies_with_tags.dropna(inplace=True)

In [238]:
# получаем средний рейтинг для фильма
r = ratings.set_index('movieId').sort_index().reset_index().drop(columns=['timestamp'])

arrays = [r['movieId'].tolist(), r['userId'].tolist()]
index = pd.MultiIndex.from_arrays(arrays, names=('movieId', 'userId'))
rr = pd.DataFrame(r['rating'].tolist(), index=index, columns=['rating'])

rr = rr.groupby(level='movieId').mean().reset_index()
rr

Unnamed: 0,movieId,rating
0,1,3.920930
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429
...,...,...
9719,193581,4.000000
9720,193583,3.500000
9721,193585,3.500000
9722,193587,3.500000


In [120]:
# средний рейтинг для фильма другим, более лаконичным,способом
mean_ratings = ratings.groupby('movieId')[['rating']].mean().reset_index()
mean_ratings

Unnamed: 0,movieId,rating
0,1,3.920930
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429
...,...,...
9719,193581,4.000000
9720,193583,3.500000
9721,193585,3.500000
9722,193587,3.500000


In [240]:
# соединяем таблицу с фильмами и тегами с таблицей средних значений рейтингов 
# и убираем все строки, где есть пропуски
movies_tags_ratings = movies_with_tags.join(mean_ratings.set_index('movieId'), on='movieId')
movies_tags_ratings.dropna(inplace=True)
movies_tags_ratings.head()

Unnamed: 0,movieId,title,genres,userId,tag,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,3.92093
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,3.92093
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,3.92093
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,3.431818
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,3.431818


In [164]:
# преобразуем теги 
data = movies_tags_ratings.groupby(['title', 'genres', 'rating'])['tag'].apply(' '.join).reset_index()
data.head()

Unnamed: 0,title,genres,rating,tag
0,(500) Days of Summer (2009),Comedy|Drama|Romance,3.666667,artistic Funny humorous inspiring intelligent ...
1,...And Justice for All (1979),Drama|Thriller,3.166667,lawyers
2,10 Cloverfield Lane (2016),Thriller,3.678571,creepy suspense
3,10 Things I Hate About You (1999),Comedy|Romance,3.527778,Shakespeare sort of
4,101 Dalmatians (1996),Adventure|Children|Comedy,3.074468,dogs remake


In [165]:
# преобразуем жанры
data['genres'] = data['genres'].apply(change_string)
data.head()

Unnamed: 0,title,genres,rating,tag
0,(500) Days of Summer (2009),Comedy Drama Romance,3.666667,artistic Funny humorous inspiring intelligent ...
1,...And Justice for All (1979),Drama Thriller,3.166667,lawyers
2,10 Cloverfield Lane (2016),Thriller,3.678571,creepy suspense
3,10 Things I Hate About You (1999),Comedy Romance,3.527778,Shakespeare sort of
4,101 Dalmatians (1996),Adventure Children Comedy,3.074468,dogs remake


In [167]:
# строим модель
X = data[['genres', 'tag']]
y = data['rating']

In [233]:
def tfidf_transform(X):
    tfidf_vectorizer = TfidfVectorizer()
    X_tfidf_genres = tfidf_vectorizer.fit_transform(X['genres'])
    X_tfidf_tag = tfidf_vectorizer.fit_transform(X['tag'])
    return np.concatenate((X_tfidf_genres.toarray(), X_tfidf_tag.toarray()), axis=1)

In [235]:
X_out = tfidf_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_out, y, test_size = 0.2)

dtree = DecisionTreeRegressor()
dtree.fit(X_train, y_train)

y_predict = dtree.predict(X_test)

In [237]:
rmse = mean_squared_error(y_test, y_predict, squared=False)

0.5514307934004732

In [246]:
print(f'RMSE is: {rmse:.4f}')

RMSE is: 0.5514
