### Домашнее задание по теме «Рекомендации на основе содержания»


### Задание
- Использовать dataset MovieLens
- Построить рекомендации (регрессия, предсказываем оценку) на фичах:
- TF-IDF на тегах и жанрах
- Средние оценки (+ median, variance, etc.) пользователя и фильма
- Оценить RMSE на тестовой выборке

In [46]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

%matplotlib inline

In [47]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [48]:
links = pd.read_csv('ml-latest-small/links.csv')
links.head(3)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0


In [49]:
movies = pd.read_csv('ml-latest-small/movies.csv')
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [50]:
movies.shape

(9742, 3)

In [51]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [52]:
ratings.shape

(100836, 4)

In [53]:
tags = pd.read_csv('ml-latest-small/tags.csv')
tags.head(3)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992


In [54]:
tags.shape

(3683, 4)

#### Вычислим среднюю и медианную оценку для каждого фильма:

In [55]:
avg_ratings = ratings[['movieId', 'rating']].copy()

In [56]:
avg_ratings = avg_ratings.groupby(['movieId'], as_index = False)['rating'].mean()

In [57]:
avg_ratings= avg_ratings.rename(columns={'rating': 'avg_rating'})
avg_ratings.head()

Unnamed: 0,movieId,avg_rating
0,1,3.92093
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429


In [58]:
med_ratings = ratings[['movieId', 'rating']].copy()
med_ratings = med_ratings.groupby(['movieId'], as_index = False)['rating'].median()

In [59]:
med_ratings= med_ratings.rename(columns={'rating': 'med_rating'})
med_ratings.head()

Unnamed: 0,movieId,med_rating
0,1,4.0
1,2,3.5
2,3,3.0
3,4,3.0
4,5,3.0


#### Преобразуем данные о жанрах в строки

In [60]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [61]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [62]:
movies['genres'] = [change_string(g) for g in movies.genres.values]

In [63]:
movies['genres'][:5]

0    Adventure Animation Children Comedy Fantasy
1                     Adventure Children Fantasy
2                                 Comedy Romance
3                           Comedy Drama Romance
4                                         Comedy
Name: genres, dtype: object

#### Добавим к данным о жанрах информацию о тегах

In [64]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [65]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [66]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game,1528844000.0


#### Добавим оценку:

In [67]:
movies_with_tags = movies_with_tags.join(avg_ratings.set_index('movieId'), on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,avg_rating
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar,1139046000.0,3.92093
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar,1137207000.0,3.92093
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun,1525286000.0,3.92093
1,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy,1528844000.0,3.431818
1,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game,1528844000.0,3.431818


In [68]:
movies_with_tags = movies_with_tags.join(med_ratings.set_index('movieId'), on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,avg_rating,med_rating
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar,1139046000.0,3.92093,4.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar,1137207000.0,3.92093,4.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun,1525286000.0,3.92093,4.0
1,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy,1528844000.0,3.431818,3.5
1,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game,1528844000.0,3.431818,3.5


In [69]:
movies_with_tags.shape

(11853, 8)

#### Объединим теги и жанры

In [70]:
movies_with_tags[movies_with_tags.title == 'Toy Story (1995)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,avg_rating,med_rating
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar,1139046000.0,3.92093,4.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar,1137207000.0,3.92093,4.0
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun,1525286000.0,3.92093,4.0


In [71]:
movies_with_tags = movies_with_tags.dropna()

In [72]:
movies_with_tags_unique = movies_with_tags.copy()

In [73]:
movies_with_tags_unique['all_tags'] = movies_with_tags_unique['title'].apply(lambda x: (' ').join(list(movies_with_tags_unique[movies_with_tags_unique['title'] == x]['tag'].unique()))) 

In [74]:
movies_with_tags_unique['all_tags'] = movies_with_tags_unique['genres'] + ' ' + movies_with_tags_unique['all_tags'] 

In [75]:
movies_with_tags_unique.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,avg_rating,med_rating,all_tags
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar,1139046000.0,3.92093,4.0,Adventure Animation Children Comedy Fantasy pi...
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar,1137207000.0,3.92093,4.0,Adventure Animation Children Comedy Fantasy pi...
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun,1525286000.0,3.92093,4.0,Adventure Animation Children Comedy Fantasy pi...
1,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy,1528844000.0,3.431818,3.5,Adventure Children Fantasy fantasy magic board...
1,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game,1528844000.0,3.431818,3.5,Adventure Children Fantasy fantasy magic board...


In [78]:
movies_reduced = movies_with_tags_unique.filter(['movieId', 'title','avg_rating','med_rating','all_tags'],axis = 1)

#### Удалим теги-дубликаты и отсутствующие значения

In [79]:
movies_reduced = movies_reduced.drop_duplicates()
movies_reduced = movies_reduced.dropna()
movies_reduced.head()

Unnamed: 0,movieId,title,avg_rating,med_rating,all_tags
0,1,Toy Story (1995),3.92093,4.0,Adventure Animation Children Comedy Fantasy pi...
1,2,Jumanji (1995),3.431818,3.5,Adventure Children Fantasy fantasy magic board...
2,3,Grumpier Old Men (1995),3.259615,3.0,Comedy Romance moldy old
4,5,Father of the Bride Part II (1995),3.071429,3.0,Comedy pregnancy remake
6,7,Sabrina (1995),3.185185,3.0,Comedy Romance remake


In [80]:
# Общее количество фильмов
movies_reduced.title.unique().shape

(1554,)

#### Преобразуем данные тегов и векторизуем их

In [83]:
def change_string(s):
    return str(s).replace('-', '').lower()

tag_strings = []
movies = []
med_ratings = []
avg_ratings = []

for movie, group in tqdm_notebook(movies_reduced.groupby(['title','med_rating','avg_rating'])):
    tag_strings.append(' '.join([change_string(s) for s in group.all_tags.values]))
    movies.append(group.title.values[0])
    med_ratings.append(group.med_rating.values[0])
    avg_ratings.append(group.avg_rating.values[0])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


  0%|          | 0/1554 [00:00<?, ?it/s]

In [84]:
tag_strings[:10]

['comedy drama romance artistic funny humorous inspiring intelligent quirky romance zooey deschanel',
 'drama thriller lawyers',
 'thriller creepy suspense',
 'comedy romance shakespeare sort of',
 'adventure children comedy dogs remake',
 'adventure animation children disney',
 'drama terrorism',
 'drama court claustrophobic confrontational earnest good dialogue great screenplay gritty motivational thoughtprovoking',
 'adventure drama thriller stranded',
 'comedy fantasy romance mark ruffalo']

In [85]:
len(tag_strings)

1554

In [86]:
movies[:5]

['(500) Days of Summer (2009)',
 '...And Justice for All (1979)',
 '10 Cloverfield Lane (2016)',
 '10 Things I Hate About You (1999)',
 '101 Dalmatians (1996)']

In [87]:
len(movies)

1554

In [88]:
avg_ratings[:5]

[3.6666666666666665,
 3.1666666666666665,
 3.6785714285714284,
 3.5277777777777777,
 3.074468085106383]

In [89]:
len(avg_ratings)

1554

In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(tag_strings)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray())
print(tfidf_df.shape)

(1554, 1680)


In [91]:
tfidf_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1670,1671,1672,1673,1674,1675,1676,1677,1678,1679
count,1554.0,1554.0,1554.0,1554.0,1554.0,1554.0,1554.0,1554.0,1554.0,1554.0,...,1554.0,1554.0,1554.0,1554.0,1554.0,1554.0,1554.0,1554.0,1554.0,1554.0
mean,0.000585,0.000573,0.001045,0.001,0.000352,0.001355,0.001003,3.5e-05,0.000366,0.001405,...,0.000267,0.000281,0.003421,0.000427,0.000286,0.000296,0.000317,0.000825,0.002492,0.000253
std,0.013598,0.022596,0.029486,0.027858,0.013878,0.032179,0.028345,0.001373,0.014416,0.018488,...,0.010519,0.011084,0.044582,0.016821,0.011257,0.011679,0.012497,0.023006,0.043923,0.009962
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.389609,0.890742,0.94059,0.786799,0.547077,0.975324,0.912031,0.05412,0.568288,0.325131,...,0.414667,0.43696,0.683894,0.663083,0.443755,0.460414,0.492627,0.66269,0.833245,0.392721


#### Подготовим тестовые и тренировочные данные

In [129]:
X = tfidf_df
avg_y = avg_ratings
med_y = med_ratings

In [130]:
#y = [round(number,1) for number in y]
#y

In [131]:
X_train, X_test, avg_y_train, avg_y_test = train_test_split(X, avg_y, test_size=0.3)
X_train, X_test, med_y_train, med_y_test = train_test_split(X, med_y, test_size=0.3)


#### Построим модель и предскажем рейтинг

In [132]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

rf = RandomForestRegressor(n_jobs= -1,max_depth = 50, random_state = 0,max_features = 'sqrt')
rf.fit(X_train, avg_y_train)
avg_predictions = rf.predict(X_test)
rf.fit(X_train, med_y_train)
med_predictions = rf.predict(X_test)

In [133]:
#med_predictions = [round(number * 2) / 2 for number in med_predictions]
#avg_predictions = [round(number * 2) / 2 for number in avg_predictions]
#predictions = [round(number,1) for number in predictions]

#### RMSE на тестовой выборке (для средней оценки рейтинга):

In [134]:
mean_squared_error(avg_y_test, avg_predictions)

0.27849635606096523

In [135]:
rf.score(X_test, avg_y_test)

-0.25560574192281216

In [136]:
avg_y_test[:10]

[3.8333333333333335,
 2.8,
 2.880952380952381,
 3.5833333333333335,
 4.018796992481203,
 3.425,
 3.5,
 3.75,
 2.6153846153846154,
 3.8333333333333335]

In [137]:
avg_predictions[:10]

array([3.71476978, 3.7162532 , 3.69583796, 3.70852942, 3.72750134,
       3.66481447, 3.83060299, 3.73005027, 3.70623175, 3.56843056])

#### RMSE на тестовой выборке (для медианной оценки рейтинга):

In [138]:
mean_squared_error(med_y_test, med_predictions)

0.2507936180078473

In [139]:
rf.score(X_test, med_y_test)

0.10066831874389603

In [140]:
med_y_test[:10]

[4.25, 3.5, 3.5, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 3.5]

In [141]:
med_predictions[:10]

array([3.50227053, 3.81159559, 3.89484292, 3.85945951, 3.44662832,
       3.85200409, 3.64206281, 3.40989301, 3.91360592, 3.83233328])

- думаю, что для улучшения результата можно попробовать учесть, что рейтинг изначально задается с шагом в 0.5
- можно использовать классификацию