# Домашнее задание

1. Использовать dataset [MovieLens](https://grouplens.org/datasets/movielens/latest/)
 
 
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
    - TF-IDF на тегах и жанрах
    - Средние оценки (+ median, variance, etc.) пользователя и фильма
     
     
3. Оценить RMSE на тестовой выборке

In [45]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [111]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [40]:
# создать колонки в таблице movies:
    # средняя оценка фильма исходя из тега
    # средняя оценка фильма исходя из жанра
    # target: средняя оценка данного фильма по всем пользователям

In [188]:
avg_rating = ratings.groupby('movieId')[['rating']].mean().sort_values('rating', ascending=False)

In [189]:
avg_rating = avg_rating.reset_index()

In [203]:
avg_rating

Unnamed: 0,movieId,rating
0,88448,5.0
1,100556,5.0
2,143031,5.0
3,143511,5.0
4,143559,5.0
...,...,...
9719,157172,0.5
9720,85334,0.5
9721,53453,0.5
9722,8494,0.5


#### Считаем средний рейтинг фильмов исходя из жанра:

In [41]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [42]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [43]:
movie_genres[:10]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller']

In [46]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [47]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [48]:
neigh = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
                 metric_params=None, n_jobs=-1, n_neighbors=7, p=2, radius=1.0)

In [49]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [50]:
res

(array([[0.42079615, 0.53300564, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608]]),
 array([[6774, 9096, 5636, 6723, 3376, 7496, 9717]], dtype=int64))

In [61]:
similar_genres = movies.iloc[res[1][0]]

In [69]:
joined_genres = similar_genres.join(avg_rating.set_index('movieId'), on='movieId')

In [73]:
np.mean(joined_genres['rating'])

3.48111658456486

#### Считаем средний рейтинг фильмов уже исходя из тегов:

In [160]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [161]:
movies_with_tags.dropna(inplace=True)

In [162]:
tag_strings = []
movies_names = []

for movie, group in tqdm_notebook(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies_names.append(movie)

HBox(children=(IntProgress(value=0, max=1572), HTML(value='')))




In [163]:
d = {'title': movies_names, 'tag': tag_strings}

In [164]:
movies_tags = pd.DataFrame(d)

In [165]:
movies_tags_joined = movies_tags.join(movies.set_index('title'), on='title')

In [166]:
movies_tags_joined

Unnamed: 0,title,tag,movieId,genres
0,(500) Days of Summer (2009),artistic Funny humorous inspiring intelligent ...,69757,Comedy|Drama|Romance
1,...And Justice for All (1979),lawyers,3420,Drama|Thriller
2,10 Cloverfield Lane (2016),creepy suspense,152077,Thriller
3,10 Things I Hate About You (1999),Shakespearesortof,2572,Comedy|Romance
4,101 Dalmatians (1996),dogs remake,1367,Adventure|Children|Comedy
...,...,...,...,...
1567,Zero Dark Thirty (2012),Afghanistan Americanpropaganda assassination m...,98961,Action|Drama|Thriller
1568,Zombieland (2009),BillMurray darkcomedy EmmaStone funny JesseEis...,71535,Action|Comedy|Horror
1569,Zoolander (2001),benstiller comedy DavidBowie goofy mindlessone...,4816,Comedy
1570,Zulu (1964),Africa,5899,Action|Drama|War


In [116]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tag_strings)

In [117]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [118]:
neigh = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='manhattan') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='manhattan',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

In [119]:
test = change_string('pixar pixar fun')

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [121]:
similar_tags = movies.iloc[res[1][0]]

In [123]:
joined_tags = similar_tags.join(avg_rating.set_index('movieId'), on='movieId')

In [125]:
np.mean(joined_tags['rating'])

3.8447485575537015

#### Теперь создаем цикл для посчета среднего рейтинга похожих жанров для всего массива:

In [215]:
avg_genre_rating = []

In [216]:
for i in movies_tags_joined['genres']:
    
    test = change_string(i)

    predict = count_vect.transform([test])
    X_tfidf2 = tfidf_transformer.transform(predict)
    
    res = neigh.kneighbors(X_tfidf2, return_distance=True)
    
    similar_genres = movies.iloc[res[1][0]]
    
    joined_genres = similar_genres.join(avg_rating.set_index('movieId'), on='movieId')
    
    avg_genre_rating.append(np.mean(joined_genres['rating']))

In [217]:
avg_genre_rating = pd.DataFrame(avg_genre_rating, columns = ['avg_genre_rating'])

#### Теперь создаем цикл для посчета среднего рейтинга похожих тегов для всего массива:

In [226]:
avg_tag_rating = []

In [227]:
for i in movies_tags_joined['tag']:
    
    test = change_string(i)

    predict = count_vect.transform([test])
    X_tfidf2 = tfidf_transformer.transform(predict)
    
    res = neigh.kneighbors(X_tfidf2, return_distance=True)
    
    similar_tags = movies.iloc[res[1][0]]
    
    joined_tags = similar_tags.join(avg_rating.set_index('movieId'), on='movieId')
    
    avg_tag_rating.append(np.mean(joined_tags['rating']))

In [228]:
avg_tag_rating = pd.DataFrame(avg_tag_rating, columns = ['avg_tag_rating'])

#### Добавляем средние оценки к нашей таблице:

In [229]:
movies_tags_final = movies_tags_joined.join(avg_rating.set_index('movieId'), on='movieId')

In [230]:
del movies_tags_final['avg_rating']

In [231]:
movies_tags_final['avg_genre_rating'] = avg_genre_rating

In [232]:
movies_tags_final['avg_tag_rating'] = avg_tag_rating

In [233]:
movies_tags_final

Unnamed: 0,title,tag,movieId,genres,rating,avg_genre_rating,avg_tag_rating
0,(500) Days of Summer (2009),artistic Funny humorous inspiring intelligent ...,69757,Comedy|Drama|Romance,3.666667,3.774286,3.844749
1,...And Justice for All (1979),lawyers,3420,Drama|Thriller,3.166667,3.844749,3.559577
2,10 Cloverfield Lane (2016),creepy suspense,152077,Thriller,3.678571,3.844749,3.844749
3,10 Things I Hate About You (1999),Shakespearesortof,2572,Comedy|Romance,3.527778,3.721490,3.665387
4,101 Dalmatians (1996),dogs remake,1367,Adventure|Children|Comedy,3.074468,3.394653,3.844749
...,...,...,...,...,...,...,...
1567,Zero Dark Thirty (2012),Afghanistan Americanpropaganda assassination m...,98961,Action|Drama|Thriller,4.107143,3.553460,3.708913
1568,Zombieland (2009),BillMurray darkcomedy EmmaStone funny JesseEis...,71535,Action|Comedy|Horror,3.877358,3.629167,3.501492
1569,Zoolander (2001),benstiller comedy DavidBowie goofy mindlessone...,4816,Comedy,3.509259,3.639387,3.844749
1570,Zulu (1964),Africa,5899,Action|Drama|War,4.000000,3.524017,3.844749


#### Строим модель регрессии:

In [234]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [250]:
data = movies_tags_final.dropna()

In [251]:
X = data[['avg_genre_rating', 'avg_tag_rating']]
y = data['rating'].fillna('')

In [252]:
lin_reg = LinearRegression()

In [255]:
rmse_cv = np.sqrt(-cross_val_score(lin_reg, X, y,  scoring="neg_mean_squared_error", cv = 5))

In [256]:
print("Root Mean Squared Error: {}".format(np.mean(rmse_cv)))

Root Mean Squared Error: 0.5210759920442583
