In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [4]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [5]:
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(movie_genres)

In [6]:
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [7]:
X_tfidf_mat = X_tfidf.toarray()

In [8]:
movies = pd.concat([movies, pd.DataFrame(X_tfidf_mat)], axis=1)

In [9]:
# фильмы с tfidf из жанров
movies.head()

Unnamed: 0,movieId,title,genres,0,1,2,3,4,5,6,...,10,11,12,13,14,15,16,17,18,19
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
mean_ratings = pd.DataFrame(ratings.groupby('movieId').rating.mean())
mean_ratings.columns = ['mean_rating']

In [11]:
# рэйтинги с средней оценкой каждого фильма
ratings = ratings.merge(mean_ratings, how='left', on='movieId')

In [12]:
mean_user_ratings = pd.DataFrame(ratings.groupby('userId').rating.mean())
mean_user_ratings.columns = ['mean_user_rating']

In [13]:
ratings = ratings.merge(mean_user_ratings, how='left', on='userId')

In [14]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,mean_rating,mean_user_rating
0,1,1,4.0,964982703,3.92093,4.366379
1,1,3,4.0,964981247,3.259615,4.366379
2,1,6,4.0,964982224,3.946078,4.366379
3,1,47,5.0,964983815,3.975369,4.366379
4,1,50,5.0,964982931,4.237745,4.366379


In [15]:
data = movies.drop('genres', axis = 1).merge(ratings, how='inner', on='movieId')

In [16]:
data.head()

Unnamed: 0,movieId,title,0,1,2,3,4,5,6,7,...,15,16,17,18,19,userId,rating,timestamp,mean_rating,mean_user_rating
0,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1,4.0,964982703,3.92093,4.366379
1,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5,4.0,847434962,3.92093,3.636364
2,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7,4.5,1106635946,3.92093,3.230263
3,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,15,2.5,1510577970,3.92093,3.448148
4,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,17,4.5,1305696483,3.92093,4.209524


In [17]:
X = data.drop(['movieId', 'title', 'userId', 'rating'], axis=1)
y = data['rating']

In [24]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
model_lin = LinearRegression()
model_lin.fit(X_train, y_train)
y_pred_lin = model_lin.predict(X_test)

In [30]:
# rmse нашей модели
print(round(sqrt(mean_squared_error(y_test, y_pred_lin)), 2))

0.81
