# Задание

1. Использовать датасет [MovieLens](https://grouplens.org/datasets/movielens/latest/).
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
* TF-IDF на тегах и жанрах;
* средние оценки (+ median, variance и т. д.) пользователя и фильма.

3. Оценить RMSE на тестовой выборке.



In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

### Загрузка данных

In [None]:
# Download the archive from Google Drive and save it locally as ml-latest-small.zip.
!wget 'https://drive.google.com/uc?id=1kOh-9U9r0U4e63OAUoB_8JHfrisNxq_-' -O ml-latest-small.zip

--2023-02-28 07:50:28--  https://drive.google.com/uc?id=1kOh-9U9r0U4e63OAUoB_8JHfrisNxq_-
Resolving drive.google.com (drive.google.com)... 74.125.26.113, 74.125.26.100, 74.125.26.102, ...
Connecting to drive.google.com (drive.google.com)|74.125.26.113|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-14-c4-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/1muj5f314pa1mlo4cgtaj5llh3o375gi/1677570600000/07609373658266146093/*/1kOh-9U9r0U4e63OAUoB_8JHfrisNxq_-?uuid=ceecd359-dbf0-4378-9eb5-587f908f795d [following]
--2023-02-28 07:50:29--  https://doc-14-c4-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/1muj5f314pa1mlo4cgtaj5llh3o375gi/1677570600000/07609373658266146093/*/1kOh-9U9r0U4e63OAUoB_8JHfrisNxq_-?uuid=ceecd359-dbf0-4378-9eb5-587f908f795d
Resolving doc-14-c4-docs.googleusercontent.com (doc-14-c4-docs.googleusercontent.com)... 74.125.31.132, 2607:f8b0:400c:c02::84
Connecting to doc-14-c4-d

In [None]:
!unzip ml-latest-small.zip

Archive:  ml-latest-small.zip
replace ml-latest-small/links.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ml-latest-small/links.csv  
replace ml-latest-small/tags.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ml-latest-small/tags.csv  
replace ml-latest-small/ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ml-latest-small/ratings.csv  
replace ml-latest-small/README.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ml-latest-small/README.txt  
replace ml-latest-small/movies.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ml-latest-small/movies.csv  


In [None]:
# Read the four CSV tables of the extracted archive into DataFrames.
links, movies, ratings, tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ml-latest-small/links.csv',
        'ml-latest-small/movies.csv',
        'ml-latest-small/ratings.csv',
        'ml-latest-small/tags.csv',
    )
)

In [None]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


#### Подготовка жанров

In [None]:
# Number of genres listed for each movie ('|' separates genres in the raw column,
# so the count is the number of separators plus one).
movies['num_generes'] = movies['genres'].map(lambda genre_field: genre_field.count('|') + 1)
movies.head()

Unnamed: 0,movieId,title,genres,num_generes
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5
1,2,Jumanji (1995),Adventure|Children|Fantasy,3
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3
4,5,Father of the Bride Part II (1995),Comedy,1


In [None]:
def change_string(s):
    """Turn a '|'-separated genre string into space-separated single-word tokens.

    Spaces and hyphens are stripped first, so a multi-word or hyphenated genre
    (e.g. 'Sci-Fi') collapses into one token ('SciFi').
    """
    compact = s.replace(' ', '').replace('-', '')
    genre_tokens = compact.split('|')
    return ' '.join(genre_tokens)

In [None]:
# One space-separated genre "document" per movie, ready to be vectorized.
movie_genres = list(map(change_string, movies.genres.values))
movie_genres[:10]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller']

In [None]:
movies.tail(10)

Unnamed: 0,movieId,title,genres,num_generes
9732,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,4
9733,193567,anohana: The Flower We Saw That Day - The Movi...,Animation|Drama,2
9734,193571,Silver Spoon (2014),Comedy|Drama,2
9735,193573,Love Live! The School Idol Movie (2015),Animation,1
9736,193579,Jon Stewart Has Left the Building (2015),Documentary,1
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,4
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,3
9739,193585,Flint (2017),Drama,1
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,2
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy,1


#### Преобразование жанров в векторы TfIdf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Learn the genre vocabulary and IDF weights, then encode every movie's genre string.
tfidf = TfidfVectorizer()
tfidf.fit(movie_genres)
X_train_genres_tfidf = tfidf.transform(movie_genres)
X_train_genres_tfidf

<9742x20 sparse matrix of type '<class 'numpy.float64'>'
	with 22084 stored elements in Compressed Sparse Row format>

In [None]:
X_train_genres_tfidf.shape

(9742, 20)

In [None]:
# Genre TF-IDF matrix as a dense DataFrame, one column per vocabulary token.
genre_feature_names = tfidf.get_feature_names_out()
df_1 = pd.DataFrame(data=X_train_genres_tfidf.toarray(), columns=genre_feature_names)

In [None]:
df_1

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,0.482990,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,0.593662,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,0.575034,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,0.638968,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [None]:
# Append the movieId column (values are aligned on the index shared with `movies`).
df_1 = df_1.assign(movieId=movies['movieId'])

In [None]:
df_1

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,...,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western,movieId
0,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,0.482990,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1
1,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,0.593662,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,2
2,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0,3
3,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0,4
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,0.575034,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,193581
9738,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,0.638968,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,193583
9739,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,193585
9740,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,193587


#### Подготовка тегов

In [None]:
# Inner join: keep only movies that have at least one tag, one row per (movie, tag) entry.
movies_with_tags = pd.merge(movies, tags, on='movieId')

In [None]:
movies_with_tags

Unnamed: 0,movieId,title,genres,num_generes,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,336,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,474,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,567,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,3,62,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,3,62,magic board game,1528843932
...,...,...,...,...,...,...,...
3678,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,4,62,star wars,1528934552
3679,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,4,184,anime,1537098582
3680,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,4,184,comedy,1537098587
3681,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,4,184,gintama,1537098603


In [None]:
movies_with_tags.shape

(3683, 7)

In [None]:
movies_with_tags.tag.unique().shape

(1589,)

In [None]:
# Drop rows that contain missing values.
movies_with_tags = movies_with_tags.dropna()

In [None]:
movies_with_tags.shape

(3683, 6)

In [None]:
movies_with_tags.title.unique().shape

(1572,)

In [None]:
# Helper that normalizes a single tag: spaces and hyphens are removed and the
# result is lowercased, so a multi-word tag becomes one token.
# NOTE: this rebinds the name `change_string` used earlier for genres.
def change_string(s):
    """Return tag `s` as one lowercase token with spaces and hyphens removed."""
    return str(s).replace(' ', '').replace('-', '').lower()

tag_strings = []
# Titles are collected in their own list; the original code reused the name
# `movies`, which overwrote the movies DataFrame that later cells still merge on.
tagged_titles = []

# groupby iterates groups in sorted title order, so tag_strings[i] holds the
# space-joined tags of tagged_titles[i].
for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([change_string(s) for s in group.tag.values]))
    tagged_titles.append(movie)

  0%|          | 0/1572 [00:00<?, ?it/s]

In [None]:
tag_strings[:10]

['artistic funny humorous inspiring intelligent quirky romance zooeydeschanel',
 'lawyers',
 'creepy suspense',
 'shakespearesortof',
 'dogs remake',
 'disney',
 'terrorism',
 'court claustrophobic confrontational earnest gooddialogue greatscreenplay gritty motivational thoughtprovoking',
 'stranded',
 'markruffalo']

#### Преобразование тегов в векторы TfIdf

In [None]:
# Learn the tag vocabulary and IDF weights, then encode every per-title tag string.
tfidf_tag = TfidfVectorizer()
tfidf_tag.fit(tag_strings)
X_train_tfidf_tag = tfidf_tag.transform(tag_strings)
X_train_tfidf_tag

<1572x1472 sparse matrix of type '<class 'numpy.float64'>'
	with 3598 stored elements in Compressed Sparse Row format>

In [None]:
# Tag TF-IDF matrix as a dense DataFrame, one column per vocabulary token.
tag_feature_names = tfidf_tag.get_feature_names_out()
df_2 = pd.DataFrame(data=X_train_tfidf_tag.toarray(), columns=tag_feature_names)

In [None]:
df_2

Unnamed: 0,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,2danimation,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [None]:
# Attach movieId to the tag features.
# The rows of df_2 follow the sorted-title order produced by
# movies_with_tags.groupby('title'), so the ids are derived with the same grouping
# to stay aligned row by row (the original referenced an undefined variable `a`).
# NOTE(review): if several movieIds share one title, only the first id is kept.
df_2['movieId'] = movies_with_tags.groupby('title')['movieId'].first().to_numpy()

In [None]:
df_2.tail(5)

Unnamed: 0,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,2danimation,...,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel,movieId
1567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,183611
1568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,184471
1569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187593
1570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187595
1571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,193565


In [None]:
# Combine genre and tag features; the left join keeps movies without tags
# (their tag columns come out as NaN). Column names present in both frames
# receive pandas' default _x / _y suffixes.
dataset = pd.merge(df_1, df_2, on='movieId', how='left')

In [None]:
dataset

Unnamed: 0,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,fantasy_x,filmnoir_x,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,0.482990,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342
1,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,0.593662,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,0.000000,0.0,...,,,,,,,,,,
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,0.575034,0.0,...,,,,,,,,,,
9738,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,0.638968,0.0,...,,,,,,,,,,
9739,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.0,...,,,,,,,,,,
9740,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,,,,,,,,,,


In [None]:
# Replace every NaN with zero.
dataset = dataset.fillna(value=0)

In [None]:
dataset

Unnamed: 0,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,fantasy_x,filmnoir_x,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,0.482990,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420342
1,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,0.593662,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,0.575034,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
9738,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,0.638968,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
9739,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
9740,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [None]:
# Join the ratings onto the genre/tag feature table: one row per rating in `ratings`.
df = pd.merge(dataset, ratings, on='movieId')

In [None]:
df.columns

Index(['action_x', 'adventure_x', 'animation_x', 'children_x', 'comedy_x',
       'crime_x', 'documentary_x', 'drama_x', 'fantasy_x', 'filmnoir_x',
       ...
       'wrongfulimprisonment', 'wry', 'youngermen', 'zither', 'zoekazan',
       'zombies', 'zooeydeschanel', 'userId', 'rating', 'timestamp'],
      dtype='object', length=1496)

##### Линейная регрессия: предсказание оценки по тегам и жанрам (TF-IDF)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# Features: only the TF-IDF genre/tag columns. Identifiers (movieId, userId) and
# the timestamp are not content features and are removed together with the target.
# The original list named 'userId' twice and left movieId in the feature matrix.
X = df.drop(columns=['movieId', 'userId', 'timestamp', 'rating'])
# Target: the rating given by the user.
y = df['rating']

In [None]:
df

Unnamed: 0,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,fantasy_x,filmnoir_x,...,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel,userId,rating,timestamp
0,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.482990,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.420342,1,4.0,964982703
1,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.482990,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.420342,5,4.0,847434962
2,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.482990,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.420342,7,4.5,1106635946
3,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.482990,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.420342,15,2.5,1510577970
4,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.482990,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.420342,17,4.5,1305696483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.0,0.575034,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,184,4.0,1537109082
100832,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.0,0.638968,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,184,3.5,1537109545
100833,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,184,3.5,1537109805
100834,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,184,3.5,1537110021


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.20
)

In [None]:
# Fit a linear regression on the training split (fit returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)
model

LinearRegression()

In [None]:
y_pred = model.predict(X_test)

In [None]:
# Evaluate prediction quality on the held-out test split.
from sklearn.metrics import mean_squared_error
from math import sqrt

# Square root of the mean squared error, i.e. RMSE (the label previously said "MSE").
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE:', rmse)

MSE: 7936.482447363208


In [None]:
y_pred

array([3.20149404, 4.16103487, 3.68198317, ..., 3.40995764, 3.71876024,
       3.45337257])

In [None]:
# Side-by-side table of true and predicted ratings for the test rows.
# The frame is created here (it was never defined before being assigned into);
# its index comes from y_test and y_pred is in the same row order.
df_result = pd.DataFrame({'rating_true': y_test, 'rating_pred': y_pred})
df_result

Unnamed: 0,rating_true,rating_pred
67037,3.0,3.201494
42175,4.5,4.161035
93850,4.0,3.681983
6187,3.0,3.815674
12229,5.0,3.576032
...,...,...
57416,2.0,3.257881
67290,4.0,3.333332
33423,3.0,3.409958
98552,4.0,3.718760


#### Подготовка датафрейма к обучению: средние оценки (а также медиана, дисперсия и т. д.) пользователя и фильма

In [None]:
# фильм 1|юзер 1|рейтинг фильма 3.5|рейтинг юзера 4.3
# фильм 1|юзер 2|рейтинг фильма 3.5|рейтинг юзера 2.1
# фильм 2|юзер 1|рейтинг фильма 5.0|рейтинг юзера 4.3

In [None]:
# Join movie metadata with the ratings: one row per rating in `ratings`.
df_with_ratings = pd.merge(movies, ratings, on='movieId')

In [None]:
df_with_ratings

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [None]:
# Median rating of every movie, one row per movieId.
# The 'rating' column is selected explicitly: the positional argument of
# GroupBy.median is the `numeric_only` flag, not a column name.
data1 = (
    df_with_ratings
    .groupby('movieId', as_index=False)['rating']
    .median()
    .rename(columns={'rating': 'film_rating'})
)
data1

Unnamed: 0,movieId,film_rating
0,1,4.0
1,2,3.5
2,3,3.0
3,4,3.0
4,5,3.0
...,...,...
9719,193581,4.0
9720,193583,3.5
9721,193585,3.5
9722,193587,3.5


In [None]:
# Median rating given by every user, one row per userId.
# The 'rating' column is selected explicitly: the positional argument of
# GroupBy.median is the `numeric_only` flag, not a column name.
data2 = (
    df_with_ratings
    .groupby('userId', as_index=False)['rating']
    .median()
    .rename(columns={'rating': 'user_rating'})
)
data2

Unnamed: 0,userId,user_rating
0,1,5.0
1,2,4.0
2,3,0.5
3,4,4.0
4,5,4.0
...,...,...
605,606,4.0
606,607,4.0
607,608,3.0
608,609,3.0


In [None]:
df_with_ratings

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [None]:
# Build a table of the form:
# movie 1 | user 1 | movie rating 3.5 | user rating 4.3
# movie 1 | user 2 | movie rating 3.5 | user rating 2.1
# movie 2 | user 1 | movie rating 5.0 | user rating 4.3

data = (
    df_with_ratings
    .merge(data2, on='userId')
    .merge(data1, on='movieId')
    .filter(['movieId', 'userId', 'film_rating', 'user_rating'])
)
data

Unnamed: 0,movieId,userId,film_rating,user_rating
0,1,1,4.0,5.0
1,1,5,4.0,4.0
2,1,7,4.0,3.5
3,1,15,4.0,3.5
4,1,17,4.0,4.0
...,...,...,...,...
100831,193579,184,3.5,4.0
100832,193581,184,4.0,4.0
100833,193583,184,3.5,4.0
100834,193585,184,3.5,4.0


### Линейная регрессия: предсказание оценки (медиана оценок пользователя для конкретного фильма)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# Features: TF-IDF genre/tag columns plus the per-movie and per-user median ratings.
# Identifiers, the timestamp and the true rating are dropped; the original used
# `df` as-is, which put the rating itself (and raw ids) into the feature matrix.
X = df.drop(columns=['movieId', 'userId', 'timestamp', 'rating'])
# Attach the aggregates by key rather than by row position so they stay aligned with `df`.
X['film_rating'] = df['movieId'].map(data1.set_index('movieId')['film_rating'])
X['user_rating'] = df['userId'].map(data2.set_index('userId')['user_rating'])
# Target: the actual rating a user gave to a movie (previously the user's median rating).
y = df['rating']
# NOTE(review): data1/data2 are computed from all ratings, including rows that end up
# in the test split, so the test RMSE is optimistic; compute them on the training
# rows only for an unbiased estimate.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.20
)

In [None]:
# Fit a linear regression on the training split (fit returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)
model

LinearRegression()

In [None]:
y_pred = model.predict(X_test)

In [None]:
# Evaluate prediction quality on the held-out test split.
from sklearn.metrics import mean_squared_error
from math import sqrt

# Square root of the mean squared error, i.e. RMSE (the label previously said "MSE").
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE:', rmse)

MSE: 0.5407366198308401
