Использовать dataset MovieLens
Построить рекомендации (регрессия, предсказываем оценку) на фичах:
TF-IDF на тегах и жанрах
Средние оценки (+ median, variance, etc.) пользователя и фильма
Оценить RMSE на тестовой выборке

In [226]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm_notebook
from sklearn.metrics import mean_squared_error

In [6]:
links = pd.read_csv('ml-latest-small/links.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')

In [7]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [8]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [143]:
movies[movies['title'] == 'Black Mirror']

Unnamed: 0,movieId,title,genres
9611,176601,Black Mirror,(no genres listed)


In [9]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [10]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [72]:
# соединим таблицу с рейтингами и названиями фильмов
joined_ratings = ratings.join(movies.set_index('movieId'), on='movieId')

In [73]:
joined_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [74]:
# разбиваем жанровые строки на слова
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

joined_ratings['genres'] = joined_ratings.genres.apply(change_string)
joined_ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy Romance
2,1,6,4.0,964982224,Heat (1995),Action Crime Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime Mystery Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama Horror Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action Crime Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action SciFi


In [145]:
# получаем список жанров
str_  = ''
for item in joined_ratings['genres']:
    str_ += ' ' + item
genres_list = list(set(str_.strip().split(' ')))
genres_list

['Drama',
 'Animation',
 'Documentary',
 'IMAX',
 'Adventure',
 'FilmNoir',
 'Thriller',
 'Mystery',
 'Comedy',
 'Crime',
 'Western',
 'Fantasy',
 'Children',
 'Action',
 'SciFi',
 'Musical',
 '(nogenreslisted)',
 'Romance',
 'Horror',
 'War']

In [172]:
#формируем датафрейм user_item - это средние и медианные значения оценок пользователей, в том числе по жанрам(ниже)
user_item = pd.DataFrame()
user_item['user_mean_rating'] = joined_ratings.groupby('userId').rating.mean()
user_item['user_median_rating'] = joined_ratings.groupby('userId').rating.median()
user_item['userId'] = user_item.index
user_item

Unnamed: 0_level_0,user_mean_rating,user_median_rating,userId
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4.366379,5.0,1
2,3.948276,4.0,2
3,2.435897,0.5,3
4,3.555556,4.0,4
5,3.636364,4.0,5
...,...,...,...
606,3.657399,4.0,606
607,3.786096,4.0,607
608,3.134176,3.0,608
609,3.270270,3.0,609


In [173]:
#формируем датафрейм user_item - это средние и медианные значения оценок пользователей, в том числе по жанрам
for genre in genres_list:
#    joined_ratings[joined_ratings['genres'].str.contains(genre) ].rating.mean()
    user_item['user_mean_rating_%s' % genre] = joined_ratings[joined_ratings['genres'].str.contains(genre) ].groupby('userId').rating.mean()
    user_item['user_median_rating_%s' % genre] = joined_ratings[joined_ratings['genres'].str.contains(genre) ].groupby('userId').rating.median()

  return func(self, *args, **kwargs)


In [174]:
user_item

Unnamed: 0_level_0,user_mean_rating,user_median_rating,userId,user_mean_rating_Drama,user_median_rating_Drama,user_mean_rating_Animation,user_median_rating_Animation,user_mean_rating_Documentary,user_median_rating_Documentary,user_mean_rating_IMAX,...,user_mean_rating_Musical,user_median_rating_Musical,user_mean_rating_(nogenreslisted),user_median_rating_(nogenreslisted),user_mean_rating_Romance,user_median_rating_Romance,user_mean_rating_Horror,user_median_rating_Horror,user_mean_rating_War,user_median_rating_War
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.366379,5.0,1,4.529412,5.0,4.689655,5.0,,,,...,4.681818,5.0,,,4.307692,4.0,3.470588,4.00,4.500000,5.0
2,3.948276,4.0,2,3.882353,4.0,,,4.333333,5.00,3.750000,...,,,,,4.500000,4.5,3.000000,3.00,4.500000,4.5
3,2.435897,0.5,3,0.750000,0.5,0.500000,0.5,,,,...,0.500000,0.5,,,0.500000,0.5,4.687500,4.75,0.500000,0.5
4,3.555556,4.0,4,3.483333,4.0,4.000000,4.0,4.000000,4.00,3.000000,...,4.000000,4.0,,,3.379310,3.5,4.250000,4.00,3.571429,4.0
5,3.636364,4.0,5,3.800000,4.0,4.333333,4.5,,,3.666667,...,4.400000,5.0,,,3.090909,3.0,3.000000,3.00,3.333333,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.657399,4.0,606,3.787966,4.0,3.714286,4.0,3.800000,4.00,3.062500,...,3.727273,4.0,,,3.740845,4.0,3.346154,3.50,3.792308,4.0
607,3.786096,4.0,607,4.012195,4.0,3.333333,3.0,,,5.000000,...,3.600000,3.0,,,3.517241,3.0,4.114286,4.00,4.166667,5.0
608,3.134176,3.0,608,3.437500,3.5,3.118182,3.5,3.000000,3.25,4.000000,...,2.757576,3.0,,,2.886792,3.0,3.319588,3.50,3.578947,4.0
609,3.270270,3.0,609,3.368421,3.0,3.000000,3.0,3.000000,3.00,3.000000,...,,,,,3.200000,3.0,3.500000,3.50,3.500000,3.5


In [175]:
#джойним по userId user_item и joined_ratings
joined_ratings1 = joined_ratings.join(user_item.set_index('userId'), on='userId')
joined_ratings1

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,user_mean_rating,user_median_rating,user_mean_rating_Drama,user_median_rating_Drama,...,user_mean_rating_Musical,user_median_rating_Musical,user_mean_rating_(nogenreslisted),user_median_rating_(nogenreslisted),user_mean_rating_Romance,user_median_rating_Romance,user_mean_rating_Horror,user_median_rating_Horror,user_mean_rating_War,user_median_rating_War
0,1,1,4.0,964982703,Toy Story (1995),Adventure Animation Children Comedy Fantasy,4.366379,5.0,4.529412,5.0,...,4.681818,5.0,,,4.307692,4.0,3.470588,4.0,4.500000,5.0
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy Romance,4.366379,5.0,4.529412,5.0,...,4.681818,5.0,,,4.307692,4.0,3.470588,4.0,4.500000,5.0
2,1,6,4.0,964982224,Heat (1995),Action Crime Thriller,4.366379,5.0,4.529412,5.0,...,4.681818,5.0,,,4.307692,4.0,3.470588,4.0,4.500000,5.0
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery Thriller,4.366379,5.0,4.529412,5.0,...,4.681818,5.0,,,4.307692,4.0,3.470588,4.0,4.500000,5.0
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime Mystery Thriller,4.366379,5.0,4.529412,5.0,...,4.681818,5.0,,,4.307692,4.0,3.470588,4.0,4.500000,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama Horror Thriller,3.688556,3.5,3.874739,4.0,...,3.928571,4.0,,,3.731092,3.5,3.506601,3.5,3.776596,4.0
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action Crime Thriller,3.688556,3.5,3.874739,4.0,...,3.928571,4.0,,,3.731092,3.5,3.506601,3.5,3.776596,4.0
100833,610,168250,5.0,1494273047,Get Out (2017),Horror,3.688556,3.5,3.874739,4.0,...,3.928571,4.0,,,3.731092,3.5,3.506601,3.5,3.776596,4.0
100834,610,168252,5.0,1493846352,Logan (2017),Action SciFi,3.688556,3.5,3.874739,4.0,...,3.928571,4.0,,,3.731092,3.5,3.506601,3.5,3.776596,4.0


In [176]:
#формируем датафрейм  - это средние и медианные значения оценок фильмов
film_item = pd.DataFrame()
film_item['film_mean_rating'] = joined_ratings.groupby('movieId').rating.mean()
film_item['film_median_rating'] = joined_ratings.groupby('movieId').rating.median()
film_item['movieId'] = film_item.index
film_item

Unnamed: 0_level_0,film_mean_rating,film_median_rating,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.920930,4.0,1
2,3.431818,3.5,2
3,3.259615,3.0,3
4,2.357143,3.0,4
5,3.071429,3.0,5
...,...,...,...
193581,4.000000,4.0,193581
193583,3.500000,3.5,193583
193585,3.500000,3.5,193585
193587,3.500000,3.5,193587


In [189]:
#джойним датафрейм с фильмо-рейтингом и юзер рейтингом
joined_ratings2 = joined_ratings1.join(film_item.set_index('movieId'), on='movieId')
joined_ratings2.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,user_mean_rating,user_median_rating,user_mean_rating_Drama,user_median_rating_Drama,...,user_mean_rating_(nogenreslisted),user_median_rating_(nogenreslisted),user_mean_rating_Romance,user_median_rating_Romance,user_mean_rating_Horror,user_median_rating_Horror,user_mean_rating_War,user_median_rating_War,film_mean_rating,film_median_rating
0,1,1,4.0,964982703,Toy Story (1995),Adventure Animation Children Comedy Fantasy,4.366379,5.0,4.529412,5.0,...,,,4.307692,4.0,3.470588,4.0,4.5,5.0,3.92093,4.0
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy Romance,4.366379,5.0,4.529412,5.0,...,,,4.307692,4.0,3.470588,4.0,4.5,5.0,3.259615,3.0


In [190]:
#Для нечисловых фич кодируем значения с помощью LabelEncoder()
categorical = ["title", "genres"]
for f in categorical:
        if joined_ratings2[f].dtype=='object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(joined_ratings2[f].values))
            joined_ratings2[f] = lbl.transform(list(joined_ratings2[f].values))


In [191]:
joined_ratings2

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,user_mean_rating,user_median_rating,user_mean_rating_Drama,user_median_rating_Drama,...,user_mean_rating_(nogenreslisted),user_median_rating_(nogenreslisted),user_mean_rating_Romance,user_median_rating_Romance,user_mean_rating_Horror,user_median_rating_Horror,user_mean_rating_War,user_median_rating_War,film_mean_rating,film_median_rating
0,1,1,4.0,964982703,8871,351,4.366379,5.0,4.529412,5.0,...,,,4.307692,4.0,3.470588,4.0,4.500000,5.0,3.920930,4.0
1,1,3,4.0,964981247,3661,732,4.366379,5.0,4.529412,5.0,...,,,4.307692,4.0,3.470588,4.0,4.500000,5.0,3.259615,3.0
2,1,6,4.0,964982224,3845,260,4.366379,5.0,4.529412,5.0,...,,,4.307692,4.0,3.470588,4.0,4.500000,5.0,3.946078,4.0
3,1,47,5.0,964983815,7523,937,4.366379,5.0,4.529412,5.0,...,,,4.307692,4.0,3.470588,4.0,4.500000,5.0,3.975369,4.0
4,1,50,5.0,964982931,9119,790,4.366379,5.0,4.529412,5.0,...,,,4.307692,4.0,3.470588,4.0,4.500000,5.0,4.237745,4.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,7938,852,3.688556,3.5,3.874739,4.0,...,,,3.731092,3.5,3.506601,3.5,3.776596,4.0,3.333333,4.0
100832,610,168248,5.0,1493850091,4597,260,3.688556,3.5,3.874739,4.0,...,,,3.731092,3.5,3.506601,3.5,3.776596,4.0,4.142857,4.0
100833,610,168250,5.0,1494273047,3392,910,3.688556,3.5,3.874739,4.0,...,,,3.731092,3.5,3.506601,3.5,3.776596,4.0,3.633333,4.0
100834,610,168252,5.0,1493846352,5161,330,3.688556,3.5,3.874739,4.0,...,,,3.731092,3.5,3.506601,3.5,3.776596,4.0,4.280000,4.5


In [192]:
# определяем отдельные столбцы из даты и времени

joined_ratings2['dt'] = joined_ratings2['timestamp'].apply(lambda t: datetime.fromtimestamp(t))
joined_ratings2['year'] = joined_ratings2['dt'].dt.year
joined_ratings2['month'] = joined_ratings2['dt'].dt.month
joined_ratings2["day"] = joined_ratings2['dt'].dt.day

In [193]:
joined_ratings2

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,user_mean_rating,user_median_rating,user_mean_rating_Drama,user_median_rating_Drama,...,user_mean_rating_Horror,user_median_rating_Horror,user_mean_rating_War,user_median_rating_War,film_mean_rating,film_median_rating,dt,year,month,day
0,1,1,4.0,964982703,8871,351,4.366379,5.0,4.529412,5.0,...,3.470588,4.0,4.500000,5.0,3.920930,4.0,2000-07-30 21:45:03,2000,7,30
1,1,3,4.0,964981247,3661,732,4.366379,5.0,4.529412,5.0,...,3.470588,4.0,4.500000,5.0,3.259615,3.0,2000-07-30 21:20:47,2000,7,30
2,1,6,4.0,964982224,3845,260,4.366379,5.0,4.529412,5.0,...,3.470588,4.0,4.500000,5.0,3.946078,4.0,2000-07-30 21:37:04,2000,7,30
3,1,47,5.0,964983815,7523,937,4.366379,5.0,4.529412,5.0,...,3.470588,4.0,4.500000,5.0,3.975369,4.0,2000-07-30 22:03:35,2000,7,30
4,1,50,5.0,964982931,9119,790,4.366379,5.0,4.529412,5.0,...,3.470588,4.0,4.500000,5.0,4.237745,4.5,2000-07-30 21:48:51,2000,7,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,7938,852,3.688556,3.5,3.874739,4.0,...,3.506601,3.5,3.776596,4.0,3.333333,4.0,2017-05-04 00:53:22,2017,5,4
100832,610,168248,5.0,1493850091,4597,260,3.688556,3.5,3.874739,4.0,...,3.506601,3.5,3.776596,4.0,4.142857,4.0,2017-05-04 01:21:31,2017,5,4
100833,610,168250,5.0,1494273047,3392,910,3.688556,3.5,3.874739,4.0,...,3.506601,3.5,3.776596,4.0,3.633333,4.0,2017-05-08 22:50:47,2017,5,8
100834,610,168252,5.0,1493846352,5161,330,3.688556,3.5,3.874739,4.0,...,3.506601,3.5,3.776596,4.0,4.280000,4.5,2017-05-04 00:19:12,2017,5,4


In [194]:
#удаляем лишние столбцы (которые мы уже преобразовали)
joined_ratings2 = joined_ratings2.drop(['dt', 'timestamp'], axis=1)

In [195]:
joined_ratings2

Unnamed: 0,userId,movieId,rating,title,genres,user_mean_rating,user_median_rating,user_mean_rating_Drama,user_median_rating_Drama,user_mean_rating_Animation,...,user_median_rating_Romance,user_mean_rating_Horror,user_median_rating_Horror,user_mean_rating_War,user_median_rating_War,film_mean_rating,film_median_rating,year,month,day
0,1,1,4.0,8871,351,4.366379,5.0,4.529412,5.0,4.689655,...,4.0,3.470588,4.0,4.500000,5.0,3.920930,4.0,2000,7,30
1,1,3,4.0,3661,732,4.366379,5.0,4.529412,5.0,4.689655,...,4.0,3.470588,4.0,4.500000,5.0,3.259615,3.0,2000,7,30
2,1,6,4.0,3845,260,4.366379,5.0,4.529412,5.0,4.689655,...,4.0,3.470588,4.0,4.500000,5.0,3.946078,4.0,2000,7,30
3,1,47,5.0,7523,937,4.366379,5.0,4.529412,5.0,4.689655,...,4.0,3.470588,4.0,4.500000,5.0,3.975369,4.0,2000,7,30
4,1,50,5.0,9119,790,4.366379,5.0,4.529412,5.0,4.689655,...,4.0,3.470588,4.0,4.500000,5.0,4.237745,4.5,2000,7,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,7938,852,3.688556,3.5,3.874739,4.0,3.901515,...,3.5,3.506601,3.5,3.776596,4.0,3.333333,4.0,2017,5,4
100832,610,168248,5.0,4597,260,3.688556,3.5,3.874739,4.0,3.901515,...,3.5,3.506601,3.5,3.776596,4.0,4.142857,4.0,2017,5,4
100833,610,168250,5.0,3392,910,3.688556,3.5,3.874739,4.0,3.901515,...,3.5,3.506601,3.5,3.776596,4.0,3.633333,4.0,2017,5,8
100834,610,168252,5.0,5161,330,3.688556,3.5,3.874739,4.0,3.901515,...,3.5,3.506601,3.5,3.776596,4.0,4.280000,4.5,2017,5,4


In [217]:
#заполняем нулями пропуски
joined_ratings2 = joined_ratings2.fillna(0)

In [218]:
joined_ratings2

Unnamed: 0,userId,movieId,rating,title,genres,user_mean_rating,user_median_rating,user_mean_rating_Drama,user_median_rating_Drama,user_mean_rating_Animation,...,user_median_rating_Romance,user_mean_rating_Horror,user_median_rating_Horror,user_mean_rating_War,user_median_rating_War,film_mean_rating,film_median_rating,year,month,day
0,1,1,4.0,8871,351,4.366379,5.0,4.529412,5.0,4.689655,...,4.0,3.470588,4.0,4.500000,5.0,3.920930,4.0,2000,7,30
1,1,3,4.0,3661,732,4.366379,5.0,4.529412,5.0,4.689655,...,4.0,3.470588,4.0,4.500000,5.0,3.259615,3.0,2000,7,30
2,1,6,4.0,3845,260,4.366379,5.0,4.529412,5.0,4.689655,...,4.0,3.470588,4.0,4.500000,5.0,3.946078,4.0,2000,7,30
3,1,47,5.0,7523,937,4.366379,5.0,4.529412,5.0,4.689655,...,4.0,3.470588,4.0,4.500000,5.0,3.975369,4.0,2000,7,30
4,1,50,5.0,9119,790,4.366379,5.0,4.529412,5.0,4.689655,...,4.0,3.470588,4.0,4.500000,5.0,4.237745,4.5,2000,7,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,7938,852,3.688556,3.5,3.874739,4.0,3.901515,...,3.5,3.506601,3.5,3.776596,4.0,3.333333,4.0,2017,5,4
100832,610,168248,5.0,4597,260,3.688556,3.5,3.874739,4.0,3.901515,...,3.5,3.506601,3.5,3.776596,4.0,4.142857,4.0,2017,5,4
100833,610,168250,5.0,3392,910,3.688556,3.5,3.874739,4.0,3.901515,...,3.5,3.506601,3.5,3.776596,4.0,3.633333,4.0,2017,5,8
100834,610,168252,5.0,5161,330,3.688556,3.5,3.874739,4.0,3.901515,...,3.5,3.506601,3.5,3.776596,4.0,4.280000,4.5,2017,5,4


In [219]:
#разделяем целевую переменную и остальной датафрейм
X = joined_ratings2.loc[:, joined_ratings2.columns != 'rating']
y = joined_ratings2['rating']

In [220]:
#делим на train и test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 0)

In [222]:
#обучаем
rf = RandomForestRegressor(n_estimators=100, max_features ='sqrt')
rf.fit(X_train, y_train)


RandomForestRegressor(max_features='sqrt')

In [224]:
#предсказываем
y_pred = rf.predict(X_test)

In [244]:
list(y_test)

[5.0,
 2.5,
 2.5,
 3.0,
 4.0,
 5.0,
 5.0,
 4.0,
 3.5,
 3.0,
 3.5,
 5.0,
 3.5,
 3.0,
 4.0,
 5.0,
 4.0,
 4.0,
 2.5,
 2.5,
 5.0,
 2.0,
 4.0,
 3.5,
 3.0,
 4.0,
 3.0,
 4.5,
 3.0,
 5.0,
 4.0,
 2.5,
 4.0,
 3.5,
 2.5,
 4.5,
 4.0,
 3.5,
 2.5,
 5.0,
 2.0,
 4.0,
 3.0,
 2.0,
 4.5,
 3.0,
 3.0,
 3.5,
 4.0,
 3.0,
 4.5,
 4.5,
 1.5,
 4.0,
 5.0,
 4.5,
 3.5,
 3.5,
 4.0,
 4.0,
 3.0,
 4.0,
 3.0,
 4.0,
 4.0,
 5.0,
 3.0,
 4.0,
 3.5,
 4.0,
 1.0,
 4.5,
 3.5,
 5.0,
 4.0,
 4.0,
 4.0,
 3.0,
 5.0,
 2.0,
 3.5,
 3.5,
 3.5,
 4.5,
 2.0,
 5.0,
 4.0,
 4.0,
 4.0,
 2.5,
 2.5,
 5.0,
 4.5,
 4.0,
 3.0,
 3.0,
 2.0,
 1.0,
 0.5,
 2.0,
 4.0,
 4.0,
 3.0,
 5.0,
 4.0,
 4.0,
 4.0,
 3.5,
 4.0,
 4.0,
 3.0,
 4.5,
 4.0,
 3.0,
 4.5,
 3.0,
 4.0,
 4.5,
 3.0,
 1.5,
 1.0,
 5.0,
 4.0,
 4.0,
 2.5,
 2.0,
 3.0,
 4.5,
 3.5,
 3.0,
 1.5,
 4.0,
 3.5,
 1.0,
 3.0,
 3.5,
 2.0,
 5.0,
 4.0,
 3.0,
 4.0,
 3.5,
 3.5,
 5.0,
 4.0,
 4.0,
 1.5,
 4.0,
 3.0,
 3.0,
 5.0,
 3.0,
 4.0,
 3.0,
 4.0,
 4.0,
 3.5,
 5.0,
 5.0,
 1.5,
 2.0,
 3.5,
 4.5,
 1.0,
 1.0,
 3.0,
 4.0

In [245]:
result = pd.DataFrame()
result['pred'] = y_pred
result['real'] = list(y_test)
result

Unnamed: 0,pred,real
0,4.655,5.0
1,2.175,2.5
2,2.895,2.5
3,4.095,3.0
4,3.365,4.0
...,...,...
30246,2.505,2.5
30247,4.735,4.5
30248,3.155,3.0
30249,2.760,3.5


In [228]:
#среднеквадратичная ошибка (RMSE)
rms = mean_squared_error(y_test, y_pred, squared=False)
rms

0.7868912129799702