### ПАКЕТ SURPRISE

1) используйте данные MovieLens 1M 

2) можно использовать любые модели из пакета 

3) получите RMSE на тестовом сете 0.87 и ниже 


In [71]:
from surprise import KNNWithMeans, KNNBasic, KNNBaseline, KNNWithZScore
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split, GridSearchCV
from surprise.model_selection import cross_validate

import pandas as pd
import numpy as np
import warnings
warnings.simplefilter('ignore')

In [5]:
# Load the MovieLens movie catalogue and the user ratings from the working directory.
movies, ratings = (pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv'))

In [9]:
# Peek at the first row to check which columns the ratings table has.
ratings.head(1)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703


In [7]:
# Attach every rating to its movie row (left join on movieId), renumber the
# rows, then drop rows that contain any missing value — e.g. movies that
# received no ratings at all.
movies_with_ratings = (
    movies
    .join(ratings.set_index('movieId'), on='movieId')
    .reset_index(drop=True)
    .dropna()
)

In [8]:
# Peek at the first joined row (movie metadata plus one rating).
movies_with_ratings.head(1)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982703.0


In [13]:
# Keep only the three columns needed for rating prediction, in the order
# user id, item id, rating, under the names uid / iid / rating.
# NOTE(review): `title` is used as the item id; if two distinct movieIds share
# a title, their ratings are merged into one item — confirm this is intended.
dataset = (
    movies_with_ratings[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid', 'title': 'iid'})
)

In [14]:
# Peek at the first row of the (uid, iid, rating) frame.
dataset.head(1)

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0


In [15]:
# Declare the rating scale: 0.5 is the lowest rating, 5.0 the highest.
reader = Reader(rating_scale=(0.5,5.0))

In [16]:
# Wrap the frame as a Surprise dataset; `dataset` was built with its columns
# in the order uid, iid, rating.
data = Dataset.load_from_df(dataset,reader)

In [20]:
# Hold out 15% of the ratings for the final evaluation. A fixed random_state
# makes the shuffle reproducible, so the same train/test split (and therefore
# the same test RMSE) is obtained after Restart & Run All.
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

# ITEM-BASED recsys

In [None]:
# создадим список моделей

In [24]:
# Instantiate each neighbourhood-based algorithm with its default settings.
models = [
    algo_class()
    for algo_class in (KNNWithMeans, KNNBasic, KNNBaseline, KNNWithZScore)
]

In [27]:
# Cross-validate every candidate model (5 folds, RMSE only, all CPU cores).
# Iterating over `models` directly keeps each result aligned with its model
# without index bookkeeping.
# NOTE(review): cross-validation runs on the full `data`, which also contains
# the ratings held out in `testset` — confirm this is acceptable for model
# selection.
results = [
    cross_validate(candidate, data, measures=['rmse'], cv=5, n_jobs=-1)
    for candidate in models
]

In [45]:
# Average the per-fold test RMSE of each cross-validation run. The order
# matches `models` because `results` was filled in that order.
mean_rmse = [cv_scores['test_rmse'].mean() for cv_scores in results]

In [63]:
# Pair each model with its mean cross-validated RMSE. KNNBaseline has the
# lowest mean RMSE, so the rest of the notebook works with it and tries to
# improve the result further.
list(zip(models,mean_rmse))

[(<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1173c85f8>,
  0.8976777500800583),
 (<surprise.prediction_algorithms.knns.KNNBasic at 0x1173c83c8>,
  0.9477724997220071),
 (<surprise.prediction_algorithms.knns.KNNBaseline at 0x1173c8198>,
  0.8741006610755668),
 (<surprise.prediction_algorithms.knns.KNNWithZScore at 0x1173c8ef0>,
  0.8974337120010792)]

In [None]:
# с помощью grid_search подберем лучшие параметры 

In [85]:
# Search space for the grid search: neighbourhood sizes 5, 20, ..., 95 and
# every combination of similarity measure with user-/item-based neighbours.
params = dict(
    k=np.arange(5, 100, 15),
    sim_options=dict(
        name=['pearson_baseline', 'cosine'],
        user_based=[True, False],
    ),
)

In [86]:
# Configure the search over `params` for KNNBaseline, scored by RMSE with
# 5-fold cross-validation on all CPU cores.
grid_search = GridSearchCV(
    algo_class=KNNBaseline,
    param_grid=params,
    measures=['rmse'],
    cv=5,
    n_jobs=-1,
)

In [87]:
# Run the grid search over the parameter grid.
# NOTE(review): the search is fitted on the full `data`, which still contains
# the ratings held out in `testset`, so the final test RMSE is likely
# optimistic — confirm whether tuning should be restricted to the training part.
grid_search.fit(data)

In [88]:
# Show the best parameter combination found, keyed by measure name ('rmse').
grid_search.best_params

{'rmse': {'k': 50,
  'sim_options': {'name': 'pearson_baseline', 'user_based': False}}}

In [95]:
# Build a fresh KNNBaseline configured with the parameters the grid search
# selected for RMSE.
best_rmse_params = grid_search.best_params['rmse']
alg = KNNBaseline(**best_rmse_params)

In [96]:
# Fit the tuned model on the training split only.
alg.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x10ae84748>

In [97]:
# Predict the ratings of the held-out test split with the fitted model.
pred_test = alg.test(testset)

In [98]:
# RMSE of the test-split predictions; the value is both printed and returned.
accuracy.rmse(pred_test)

RMSE: 0.8455


0.845477131441009

In [None]:
# результат достигнут