Использовать dataset MovieLens

1. Построить рекомендации (регрессия, предсказываем оценку) на фичах:

1.1. TF-IDF на тегах и жанрах

1.2. Средние оценки (+ median, variance, etc.) пользователя и фильма

1.3. Оценить RMSE на тестовой выборке

In [169]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

%matplotlib inline

In [170]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [171]:
tables_dict = {'links':links, 'movies':movies, 'ratings':ratings, 'tags':tags}

In [172]:
for key, value in tables_dict.items():
    all_rows = value.shape[0]
    no_nan_rows = value.dropna(inplace=False).shape[0]
    print('Датасет {}'.format(key))
    print('Всего строк: {}, из них {} с отсутствующими значениями'.format(all_rows, all_rows - no_nan_rows))
    if all_rows - no_nan_rows > 0:
        print("Применить dropna?")
    print('-'*50)

Датасет links
Всего строк: 9742, из них 8 с отсутствующими значениями
Применить dropna?
--------------------------------------------------
Датасет movies
Всего строк: 9742, из них 0 с отсутствующими значениями
--------------------------------------------------
Датасет ratings
Всего строк: 100836, из них 0 с отсутствующими значениями
--------------------------------------------------
Датасет tags
Всего строк: 3683, из них 0 с отсутствующими значениями
--------------------------------------------------


In [173]:
# В описании к датасету сказано:
#tmdbId-это идентификатор фильмов, используемых https://www.themoviedb.org ... 
#Например, фильм История игрушек имеет ссылку https://www.themoviedb.org/movie/862 ...
#links нам не понадобится

In [174]:
for key, value in tables_dict.items():
    print('*' * 20, key, '*' * 20)
    print(value.head(4), '\n')

******************** links ********************
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0 

******************** movies ********************
   movieId                     title  \
0        1          Toy Story (1995)   
1        2            Jumanji (1995)   
2        3   Grumpier Old Men (1995)   
3        4  Waiting to Exhale (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance   

******************** ratings ********************
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815 

******************** tags ********************
   userId  movieId      

In [175]:
# тегов много и они разные, иногда поторяются
tags[['movieId', 'tag']][5:15]


Unnamed: 0,movieId,tag
5,89774,Tom Hardy
6,106782,drugs
7,106782,Leonardo DiCaprio
8,106782,Martin Scorsese
9,48516,way too long
10,431,Al Pacino
11,431,gangster
12,431,mafia
13,1221,Al Pacino
14,1221,Mafia


In [176]:
# приведем к нижнему регистру все теги, чтобы например 'drugs' и 'Drugs' считались одним и тем же тегом, а не разными
tags['tag'] = tags['tag'].str.lower()
tags[['movieId', 'tag']][5:15]

Unnamed: 0,movieId,tag
5,89774,tom hardy
6,106782,drugs
7,106782,leonardo dicaprio
8,106782,martin scorsese
9,48516,way too long
10,431,al pacino
11,431,gangster
12,431,mafia
13,1221,al pacino
14,1221,mafia


In [177]:
# агрегируем все теги по каждому фильму
tags = tags.groupby('movieId')[['tag']].agg(' '.join)
tags.head()

Unnamed: 0_level_0,tag
movieId,Unnamed: 1_level_1
1,pixar pixar fun
2,fantasy magic board game robin williams game
3,moldy old
5,pregnancy remake
7,remake


In [178]:
# готовим датафрейм путем объединения movies и tags
data = pd.merge(movies, tags, left_on='movieId', right_index=True)
data.head()

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar pixar fun
1,2,Jumanji (1995),Adventure|Children|Fantasy,fantasy magic board game robin williams game
2,3,Grumpier Old Men (1995),Comedy|Romance,moldy old
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake
6,7,Sabrina (1995),Comedy|Romance,remake


In [179]:
def change_string(s):
    """
    Функция превращает строку вида
    'Adventure|Animation|Children|Comedy|Fantasy'
    в строку вида
    'Adventure Animation Children Comedy Fantasy'
    """
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [180]:
# подготавливаем жанры
genres = [change_string(g) for g in data.genres.values];

In [181]:
data['genres'] = genres
data.head()

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game robin williams game
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake
6,7,Sabrina (1995),Comedy Romance,remake


In [182]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [183]:
# для работы с жанрами возьмем CountVectorizer
count_vect = CountVectorizer()
"""Модуль CountVectorizer в sklearn как раз подзволяет сконвертировать набор текстов в матрицу токенов, 
находящихся в тексте. Также имеется много полезных настроек, 
например можно задать минимальное количество необходимое для появления токена в матрице и даже получить статистику 
по n-граммам. Следует учитывать, что CountVectorizer по умолчанию сам производит токенизацию и выкидывает слова
с длиной меньшей чем два."""
X_train_counts = count_vect.fit_transform(data.genres)

In [184]:
count_vect.vocabulary_

{'adventure': 1,
 'animation': 2,
 'children': 3,
 'comedy': 4,
 'fantasy': 8,
 'romance': 15,
 'drama': 7,
 'crime': 5,
 'thriller': 17,
 'horror': 10,
 'mystery': 13,
 'scifi': 16,
 'war': 18,
 'musical': 12,
 'action': 0,
 'documentary': 6,
 'imax': 11,
 'western': 19,
 'filmnoir': 9,
 'nogenreslisted': 14}

In [185]:
"""
чтобы получить сгенерированный словарь, из приведенной структуры CountVectorizer,
стоит отметить что порядок совпадает с матрицей
"""
count_vect.get_feature_names()

['action',
 'adventure',
 'animation',
 'children',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'fantasy',
 'filmnoir',
 'horror',
 'imax',
 'musical',
 'mystery',
 'nogenreslisted',
 'romance',
 'scifi',
 'thriller',
 'war',
 'western']

In [186]:
# чтобы узнать индекс токена в словаре
count_vect.vocabulary_.get('children') # вернет 3

3

In [187]:
# показать матрицу
X_train_counts.toarray()

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0]], dtype=int64)

In [188]:
# матрица не нампи, а непривычная спарс матрица из scipy
type(X_train_counts)

scipy.sparse.csr.csr_matrix

In [189]:
X_train_counts.toarray().shape

(1572, 20)

In [190]:
data.shape

(1572, 4)

In [191]:
# можем также узнать количественное вхождение каждого слова:
matrix_freq = np.asarray(X_train_counts.sum(axis=0)).ravel()
final_matrix = np.array([np.array(count_vect.get_feature_names()), matrix_freq])

In [192]:
final_matrix

array([['action', 'adventure', 'animation', 'children', 'comedy',
        'crime', 'documentary', 'drama', 'fantasy', 'filmnoir', 'horror',
        'imax', 'musical', 'mystery', 'nogenreslisted', 'romance',
        'scifi', 'thriller', 'war', 'western'],
       ['235', '217', '86', '93', '519', '207', '68', '884', '120', '35',
        '91', '30', '85', '131', '1', '347', '160', '297', '77', '25']],
      dtype='<U21')

In [193]:
# для получения вектора фильма, учитывающего TF.IDF оценку жанров, возьмем TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [194]:
type(X_train_tfidf)

scipy.sparse.csr.csr_matrix

In [195]:
# не очень наглядно
X_train_tfidf

<1572x20 sparse matrix of type '<class 'numpy.float64'>'
	with 3708 stored elements in Compressed Sparse Row format>

In [196]:
# приведем матрицу scipy в массив numpy
X_train_tfidf.toarray()

array([[0.        , 0.39861329, 0.52164113, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.49508056, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.59650626, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.44396322, 0.4561219 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.46534105, 0.        , 0.62564122, ..., 0.        , 0.        ,
        0.        ]])

In [197]:
X_train_tfidf.shape

(1572, 20)

In [198]:
# Считаем, что для жанров мы получили векторы. Теперь то же самое для тегов

In [199]:
count_vect_2 = CountVectorizer()

In [200]:
X_train_counts_2 = count_vect_2.fit_transform(data.tag)

In [201]:
tfidf_transformer_2 = TfidfTransformer()
X_train_tfidf_2 = tfidf_transformer_2.fit_transform(X_train_counts_2)

In [202]:
X_train_tfidf_2.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [203]:
X_train_tfidf_2.shape

(1572, 1744)

In [204]:
matrix_freq_2 = np.asarray(X_train_counts_2.sum(axis=0)).ravel()
final_matrix_2 = np.array([np.array(count_vect_2.get_feature_names()), matrix_freq_2])

In [205]:
final_matrix_2

array([['06', '1900s', '1920s', ..., 'zombie', 'zombies', 'zooey'],
       ['3', '1', '2', ..., '2', '6', '1']], dtype='<U21')

In [206]:
# теперь сольем оба массива и получим вектор для каждого фильма, собранный из двух компонент:
# опиcания жанров и описания тегов
from scipy.sparse import coo_matrix, hstack
genr_tag_array = hstack([X_train_tfidf,X_train_tfidf_2])
genr_tag_array.toarray()

array([[0.        , 0.39861329, 0.52164113, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.49508056, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.59650626, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.44396322, 0.4561219 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.46534105, 0.        , 0.62564122, ..., 0.        , 0.        ,
        0.        ]])

In [207]:
# по измерениям все сходится
genr_tag_array.shape

(1572, 1764)

Попробуем просто найти похожие фильмы

In [208]:
# для поиска похожих фильмов будем использовать KNN
neigh = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
neigh.fit(genr_tag_array)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=-1, n_neighbors=7, p=2, radius=1.0)

In [209]:
# найдем какие-нибудь фильмы, у которых в названии есть 'war'
pattern = 'war'
inds = []
for i in range(data.shape[0]):
    try:
        if pattern in data['title'][i].lower():
            inds.append(i)
    except:
        continue

data.loc[inds]

Unnamed: 0,movieId,title,genres,tag
224,260,Star Wars: Episode IV - A New Hope (1977),Action Adventure SciFi,classic space action action sci-fi epic great ...
251,290,Once Were Warriors (1994),Crime Drama,in netflix queue
487,556,"War Room, The (1993)",Documentary,politics
511,594,Snow White and the Seven Dwarfs (1937),Animation Children Drama Fantasy Musical,disney
898,1196,Star Wars: Episode V - The Empire Strikes Back...,Action Adventure SciFi,i am your father space space opera classic geo...
911,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action Adventure SciFi,darth vader luke skywalker space opera
1078,1398,In Love and War (1996),Romance War,hemingway


In [210]:
# для теста модели возьмем Star Wars: Episode IV - A New Hope (1977)
data_test = data.loc[[224]]
data_test

Unnamed: 0,movieId,title,genres,tag
224,260,Star Wars: Episode IV - A New Hope (1977),Action Adventure SciFi,classic space action action sci-fi epic great ...


In [211]:
# тег весьма содержательный и большой
data_test.tag.values[0]

'classic space action action sci-fi epic great soundtrack classic sci-fi engrossing adventure epic classic nerd classic sci-fi space action space epic oldie but goodie sci-fi star wars robots and androids space classic sci-fi sci-fi space adventure star wars darth vader luke skywalker space opera'

In [212]:
# жанры выгядят для этого фильма так
data_test.genres.values[0]

'Action Adventure SciFi'

In [213]:
# по сути так выгядит объект, к которому мы ищем наиболее похожие
# жанры плюс теги
data_test.genres.values[0]  + ' ' +  data_test.tag.values[0]

'Action Adventure SciFi classic space action action sci-fi epic great soundtrack classic sci-fi engrossing adventure epic classic nerd classic sci-fi space action space epic oldie but goodie sci-fi star wars robots and androids space classic sci-fi sci-fi space adventure star wars darth vader luke skywalker space opera'

In [214]:
# используем уже обученный CountVectorizer для жанров - count_vect
X_train_counts_test = count_vect.transform(data_test.genres)
X_train_counts_test.shape

(1, 20)

In [215]:
# применяем уже обученный TfidfTransformer для жанров - tfidf_transformer
X_train_tfidf_test = tfidf_transformer.transform(X_train_counts_test) # получили X_train_tfidf_test
X_train_tfidf_test.shape

(1, 20)

In [216]:
# используем уже обученный CountVectorizer для жанров - count_vect_2
X_train_counts_2_test = count_vect_2.transform(data_test.tag)
X_train_counts_2_test.shape

(1, 1744)

In [217]:
# применяем уже обученный TfidfTransformer для жанров - tfidf_transformer_2
X_train_tfidf_2_test = tfidf_transformer_2.transform(X_train_counts_2_test) # получили X_train_tfidf_2_test
X_train_tfidf_2_test.shape

(1, 1744)

In [218]:
#сливаем оба получившихся массива
genr_tag_array_test = hstack([X_train_tfidf_test,X_train_tfidf_2_test])
genr_tag_array_test.toarray() # получили genr_tag_array_test, который уже можно скармливать

array([[0.54742359, 0.56241571, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [219]:
# форма исследуемого вектора правильная (1 строка, 1744 + 20 фичей)
genr_tag_array_test.shape

(1, 1764)

In [220]:
# ищем ближайших соседей (KNN)
res = neigh.kneighbors(genr_tag_array_test, return_distance=True)

In [221]:
# смотрим на результат
res

(array([[0.        , 1.06650093, 1.07827509, 1.13504273, 1.15004809,
         1.29683751, 1.32351182]]),
 array([[  65,  288, 1315,  291,  299,  593,  206]], dtype=int64))

In [222]:
# рекомендация выглядит весьма убедительно
data.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres,tag
224,260,Star Wars: Episode IV - A New Hope (1977),Action Adventure SciFi,classic space action action sci-fi epic great ...
898,1196,Star Wars: Episode V - The Empire Strikes Back...,Action Adventure SciFi,i am your father space space opera classic geo...
5896,33493,Star Wars: Episode III - Revenge of the Sith (...,Action Adventure SciFi,space space opera
902,1200,Aliens (1986),Action Adventure Horror SciFi,action aliens horror sci-fi space space craft ...
911,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action Adventure SciFi,darth vader luke skywalker space opera
2001,2662,"War of the Worlds, The (1953)",Action Drama SciFi,classic
706,924,2001: A Space Odyssey (1968),Adventure Drama SciFi,hal space aliens apes arthur c. clarke artific...


In [223]:
# Мы в этой модели используем сразу 2 характеристики фильмов - какие у него теги и какие жанры

In [224]:
# соберем названия колонок в список и далее создадим датафрейм с уже рассчитанными фичами TF.IDF
feature_cols = list(final_matrix[0]) + list(final_matrix_2[0])
len(feature_cols)

1764

In [225]:
genr_tag_array.toarray().shape

(1572, 1764)

In [226]:
data_tfidf = pd.DataFrame(genr_tag_array.toarray(), columns = feature_cols, index=None)
data_tfidf.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,0.0,0.398613,0.521641,0.511277,0.282182,0.0,0.0,0.0,0.477459,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.495081,0.0,0.635009,0.0,0.0,0.0,0.0,0.593008,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [227]:
# переименуем столбцы data, чтобы избежать риска, что названия колонок и тегов повторятся 
data.columns = ['movieId_data', 'title_data', 'genres_data','tag_data']
data.head()

Unnamed: 0,movieId_data,title_data,genres_data,tag_data
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game robin williams game
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake
6,7,Sabrina (1995),Comedy Romance,remake


In [228]:
# объединим два датафрейма
data_tfidf = pd.concat([data, data_tfidf], axis=1)
data_tfidf = data_tfidf.dropna().reset_index()


In [229]:
del data_tfidf['index']
data_tfidf.head()

Unnamed: 0,movieId_data,title_data,genres_data,tag_data,action,adventure,animation,children,comedy,crime,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,1.0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun,0.0,0.398613,0.521641,0.511277,0.282182,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game robin williams game,0.0,0.495081,0.0,0.635009,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,Grumpier Old Men (1995),Comedy Romance,moldy old,0.0,0.0,0.0,0.0,0.643145,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.0,Father of the Bride Part II (1995),Comedy,pregnancy remake,0.0,0.0,0.0,0.0,0.643145,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7.0,Sabrina (1995),Comedy Romance,remake,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [230]:
data_tfidf.shape

(504, 1768)

In [231]:
# следующим шагом вытащим год

In [232]:
def extract_year(s):
    try:
        return int(s[-5:-1])
    except:
        return 0

In [233]:
data_tfidf['title_data'] = data_tfidf['title_data'].apply(extract_year)

In [234]:
data_tfidf.head()

Unnamed: 0,movieId_data,title_data,genres_data,tag_data,action,adventure,animation,children,comedy,crime,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,1.0,1995,Adventure Animation Children Comedy Fantasy,pixar pixar fun,0.0,0.398613,0.521641,0.511277,0.282182,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,1995,Adventure Children Fantasy,fantasy magic board game robin williams game,0.0,0.495081,0.0,0.635009,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,1995,Comedy Romance,moldy old,0.0,0.0,0.0,0.0,0.643145,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.0,1995,Comedy,pregnancy remake,0.0,0.0,0.0,0.0,0.643145,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7.0,1995,Comedy Romance,remake,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [235]:
del data_tfidf['genres_data']
del data_tfidf['tag_data']

In [236]:
data_tfidf.head()

Unnamed: 0,movieId_data,title_data,action,adventure,animation,children,comedy,crime,documentary,drama,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
0,1.0,1995,0.0,0.398613,0.521641,0.511277,0.282182,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,1995,0.0,0.495081,0.0,0.635009,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,1995,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.0,1995,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7.0,1995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [237]:
# Все, у нас готов датафрейм, который можно скормить регрессионной модели
# в качестве целевой переменной используем средние оценки фильмов

In [238]:
views = ratings[['movieId', 'rating']].groupby('movieId').count()
# найдем средние оценки для всех фильмов
average_rate = ratings[['movieId', 'rating']].groupby('movieId').mean()#.sort_values(['rating'], ascending=False)
average_rate.head(8)

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.92093
2,3.431818
3,3.259615
4,2.357143
5,3.071429
6,3.946078
7,3.185185
8,2.875


In [239]:
data_tfidf.set_index('movieId_data').join(average_rate, on='movieId_data', how='left').head(7)

Unnamed: 0_level_0,title_data,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey,rating
movieId_data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1995,0.0,0.398613,0.521641,0.511277,0.282182,0.0,0.0,0.0,0.477459,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92093
2.0,1995,0.0,0.495081,0.0,0.635009,0.0,0.0,0.0,0.0,0.593008,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.431818
3.0,1995,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.259615
5.0,1995,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.071429
7.0,1995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.185185
11.0,1995,0.0,0.0,0.0,0.0,0.0,0.447946,0.0,0.233389,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.671429
14.0,1995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.531774,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.833333


In [240]:
# теперь соединим data_tfidf (предикторы) и average_rate (целевая переменная) по индексам
df = data_tfidf.set_index('movieId_data').join(average_rate, on='movieId_data', how='left')
df.dropna(inplace=True)
df.head()

Unnamed: 0_level_0,title_data,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey,rating
movieId_data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1995,0.0,0.398613,0.521641,0.511277,0.282182,0.0,0.0,0.0,0.477459,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92093
2.0,1995,0.0,0.495081,0.0,0.635009,0.0,0.0,0.0,0.0,0.593008,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.431818
3.0,1995,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.259615
5.0,1995,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.071429
7.0,1995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.185185


In [241]:
#df['rating'] = df['rating'].apply(round)

In [242]:
# посмотрим основные статистики по целевой переменной
df.rating.describe()

count    503.000000
mean       3.690446
std        0.470147
min        1.416667
25%        3.418155
50%        3.766667
75%        4.025000
max        4.750000
Name: rating, dtype: float64

In [243]:
df.head()

Unnamed: 0_level_0,title_data,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey,rating
movieId_data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1995,0.0,0.398613,0.521641,0.511277,0.282182,0.0,0.0,0.0,0.477459,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92093
2.0,1995,0.0,0.495081,0.0,0.635009,0.0,0.0,0.0,0.0,0.593008,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.431818
3.0,1995,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.259615
5.0,1995,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.071429
7.0,1995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.185185


In [244]:
# год фильма имеет размерность на несколько порядков выше, чем у остальных фичей
df.describe()

Unnamed: 0,title_data,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey,rating
count,503.0,503.0,503.0,503.0,503.0,503.0,503.0,503.0,503.0,503.0,...,503.0,503.0,503.0,503.0,503.0,503.0,503.0,503.0,503.0,503.0
mean,1982.023857,0.077636,0.083224,0.034267,0.044863,0.208179,0.083933,0.038777,0.299862,0.038089,...,0.0,0.001353,0.000958,0.0,0.000994,0.0,0.001406,0.001988,0.000777,3.690446
std,18.535081,0.195737,0.2008,0.144441,0.164915,0.323923,0.224513,0.191224,0.337284,0.146178,...,0.0,0.030348,0.021479,0.0,0.022294,0.0,0.031528,0.044588,0.017426,0.470147
min,1922.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.416667
25%,1974.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.418155
50%,1992.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.263392,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.766667
75%,1995.0,0.0,0.0,0.0,0.0,0.434504,0.0,0.0,0.464918,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.025
max,1998.0,0.87853,0.88385,1.0,0.9244,1.0,0.886846,1.0,1.0,0.860889,...,0.0,0.680634,0.481727,0.0,0.5,0.0,0.707107,1.0,0.390825,4.75


In [252]:
# нормируем признак

In [251]:
def old_fashion(year):
    year = int(year)
    if year < 1974:
        return 0.7
    elif year < 1992:
        return 0.8
    elif year < 1995:
        return 0.9
    else:
        return 1

In [253]:
df['title_data'] = df['title_data'].apply(old_fashion)
df.head()

Unnamed: 0_level_0,title_data,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey,rating
movieId_data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1.0,0.0,0.398613,0.521641,0.511277,0.282182,0.0,0.0,0.0,0.477459,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92093
2.0,1.0,0.0,0.495081,0.0,0.635009,0.0,0.0,0.0,0.0,0.593008,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.431818
3.0,1.0,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.259615
5.0,1.0,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.071429
7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.185185


In [257]:
# теперь найдем пользователя, для которого будем делать рекомендации
# он должен поставить много оценок и тегов
users = ratings[ratings.movieId.isin(df.index)].userId.unique()

In [258]:
# пользователей, которые поставили оценку
len(users)

608

In [259]:
# пользователей, которые поставили тег

len(pd.read_csv('tags.csv').userId.unique())

58

In [260]:
# пересечем их множества и получим кол-во пользователей, которые ставили и тег и оценку
active_users = set(users).intersection(set(pd.read_csv('tags.csv').userId.unique()))
len(active_users)

57

In [261]:
# TOP-20 самые активных пользователей по тегам
tag_s = pd.read_csv('tags.csv')

act_tag = tag_s[['userId', 'movieId', 'tag']][tag_s.userId.isin(active_users)].groupby('userId').count()\
                                                                                    .sort_values('tag', ascending=False)
act_tag.head(20)

Unnamed: 0_level_0,movieId,tag
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
474,1507,1507
567,432,432
62,370,370
599,323,323
477,280,280
424,273,273
537,100,100
125,48,48
357,45,45
318,41,41


In [262]:
# TOP-20 самые активных пользователей по рейтингам

act_rat = ratings[['userId', 'movieId', 'rating']][ratings\
                                                      .userId.isin(active_users)].groupby('userId').count()\
.sort_values('rating', ascending=False)
act_rat.head(20)

Unnamed: 0_level_0,movieId,rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
599,2478,2478
474,2108,2108
274,1346,1346
610,1302,1302
606,1115,1115
288,1055,1055
177,904,904
318,879,879
600,763,763
305,677,677


In [263]:
set(act_tag.head(20).index).intersection(set(act_rat.head(20).index))

{18, 62, 305, 318, 357, 474, 477, 567, 599, 606}

In [264]:
import random
random.seed(42)

In [270]:
# нашли номер пользователя, для которого будем делать рекомендации
current_user = random.choice(list(set(act_tag.head(20).index).intersection(set(act_rat.head(20).index))))
current_user

599

In [312]:
# найдем среди фильмов, имеющих теги и оценки, те, которые оценил наш пользователь
current_user_films = ratings[(ratings.userId == current_user)\
                             & ratings.movieId.isin(df.index)].drop(columns='timestamp')

In [313]:
current_user_films.shape

(299, 3)

In [314]:
# перввые 5 фильмов, отмеченных пользователем
current_user_films.head()

Unnamed: 0,userId,movieId,rating
92623,599,1,3.0
92624,599,2,2.5
92625,599,3,1.5
92627,599,7,2.5
92630,599,11,2.5


In [315]:
# фильтруем из датафрейма df фильмы, отмеченных пользователем

user_df = df[df.index.isin(current_user_films.movieId)]
user_df.head()

Unnamed: 0_level_0,title_data,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey,rating
movieId_data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1.0,0.0,0.398613,0.521641,0.511277,0.282182,0.0,0.0,0.0,0.477459,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92093
2.0,1.0,0.0,0.495081,0.0,0.635009,0.0,0.0,0.0,0.0,0.593008,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.431818
3.0,1.0,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.259615
7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.185185
11.0,1.0,0.0,0.0,0.0,0.0,0.0,0.447946,0.0,0.233389,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.671429


In [316]:
# разбиваем датафрейм на предикторы и целевую переменную
X = user_df.drop(columns=['rating'])
X.head()

Unnamed: 0_level_0,title_data,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
movieId_data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1.0,0.0,0.398613,0.521641,0.511277,0.282182,0.0,0.0,0.0,0.477459,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2.0,1.0,0.0,0.495081,0.0,0.635009,0.0,0.0,0.0,0.0,0.593008,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,1.0,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11.0,1.0,0.0,0.0,0.0,0.0,0.0,0.447946,0.0,0.233389,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [317]:
y = user_df.rating
y.head()

movieId_data
1.0     3.920930
2.0     3.431818
3.0     3.259615
7.0     3.185185
11.0    3.671429
Name: rating, dtype: float64

In [318]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [319]:
# в качестве модели используем Лассо-регрессию, т.к. она зануляет коэффициенты неважных фичей
# лассо может обнулять значения коэффициентов, тем самым полностью убирая признак из датасета 
# (так как при вычислении результирующей переменной соответствующий признак будет умножен на ноль). 
# Таким образом, с регрессией лассо модель может полностью избавиться от шумов в данных. 
from sklearn.linear_model import Lasso

In [325]:
lm = Lasso(alpha=0.001, max_iter=1000, normalize=True, tol=0.0001)

In [326]:
lm.fit(X_train, y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [327]:
# средне-квадратическая ошибка
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, lm.predict(X_test))

0.15882425891503038

In [330]:
# обучим модель на всех данных по пользователю

lm.fit(X, y)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [331]:
# теперь мы знаем оценки фильмов, которые смотрел пользователь и имеем модель, 
# которая может предсказать его вероятные оценки

In [344]:
# посмотрим, что любит наш пользователь (смотрел последнее время)
# ужасы, документалки, научные

ratings[ratings.userId == current_user].merge(movies.set_index('movieId'), on='movieId')\
.sort_values('timestamp', ascending=False)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
132,599,332,1.5,1519558046,Village of the Damned (1995),Horror|Sci-Fi
1553,599,6734,2.0,1519557945,Memoirs of an Invisible Man (1992),Comedy|Romance|Sci-Fi|Thriller
2073,599,66171,2.5,1519535068,Push (2009),Sci-Fi|Thriller
1477,599,6005,2.0,1519530214,Blue Collar Comedy Tour: The Movie (2003),Comedy|Documentary
2140,599,78959,3.0,1519528505,Endgame (2009),Drama
582,599,1719,3.5,1519475938,"Sweet Hereafter, The (1997)",Drama
818,599,2525,2.0,1519470824,Alligator (1980),Action|Horror|Sci-Fi
2441,599,167706,2.5,1519470313,Shakespeare Behind Bars (2005),Documentary
2346,599,114126,2.5,1519462490,Beautiful Losers (2008),Documentary
1858,599,33725,3.5,1519459381,It's All Gone Pete Tong (2004),Comedy|Drama|Musical


In [333]:
# предскажем оценки, которые мог бы поставить пользователь остальным фильмам (которые он не смотрел)
user_df.index


Float64Index([   1.0,    2.0,    3.0,    7.0,   11.0,   16.0,   17.0,   21.0,
                26.0,   29.0,
              ...
              2023.0, 2028.0, 2054.0, 2058.0, 2064.0, 2071.0, 2076.0, 2078.0,
              2080.0, 2100.0],
             dtype='float64', name='movieId_data', length=299)

In [334]:
df_for_rec = df.iloc[~df.index.isin(user_df.index)]
df_for_rec = df_for_rec.drop(columns=['rating'])
df_for_rec.head()

Unnamed: 0_level_0,title_data,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,york,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey
movieId_data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5.0,1.0,0.0,0.0,0.0,0.0,0.643145,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.531774,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25.0,1.0,0.0,0.0,0.0,0.0,0.56279,0.0,0.0,0.420749,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28.0,1.0,0.0,0.0,0.0,0.0,0.0,0.567986,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [335]:
df_for_rec['prediction_for_user'] = lm.predict(df_for_rec)

In [336]:
df_for_rec.prediction_for_user.describe()

count    204.000000
mean       3.765757
std        0.191672
min        3.195027
25%        3.593864
50%        3.812903
75%        3.951866
max        4.266818
Name: prediction_for_user, dtype: float64

In [341]:
rec = df_for_rec[['prediction_for_user']].sort_values('prediction_for_user', ascending=False).iloc[:20]
rec

Unnamed: 0_level_0,prediction_for_user
movieId_data,Unnamed: 1_level_1
1209.0,4.266818
1013.0,4.241502
952.0,4.141039
199.0,4.064852
938.0,4.057736
903.0,4.023416
1179.0,4.021093
1284.0,4.017534
1082.0,4.012815
1066.0,4.01075


In [343]:
movies.head()
movies[movies['movieId'].isin(rec.index)]

Unnamed: 0,movieId,title,genres
168,199,"Umbrellas of Cherbourg, The (Parapluies de Che...",Drama|Musical|Romance
684,902,Breakfast at Tiffany's (1961),Drama|Romance
685,903,Vertigo (1958),Drama|Mystery|Romance|Thriller
719,938,Gigi (1958),Musical
720,940,"Adventures of Robin Hood, The (1938)",Action|Adventure|Romance
721,941,"Mark of Zorro, The (1940)",Adventure
725,945,Top Hat (1935),Comedy|Musical|Romance
732,952,Around the World in 80 Days (1956),Adventure|Comedy
744,971,Cat on a Hot Tin Roof (1958),Drama
768,1010,"Love Bug, The (1969)",Children|Comedy


In [None]:
# мы все сделали правильно
# но почему наиболее подходящей рекоменацией являются Шербургские зонтики и Завтрак у Тиффани?