In [3]:
pip install surprise 

Collecting surprise
  Using cached surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Using cached scikit-surprise-1.1.1.tar.gz (11.8 MB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py): started
  Building wheel for scikit-surprise (setup.py): finished with status 'done'
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp38-cp38-win_amd64.whl size=729741 sha256=a4e43afda0a8618a4a22a4bc183870fe60556fb1ec1c536c95efb47c4f71a698
  Stored in directory: c:\users\unicorn\appdata\local\pip\cache\wheels\20\91\57\2965d4cff1b8ac7ed1b6fa25741882af3974b54a31759e10b6
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1
Note: you may need to restart the kernel to use updated packages.


In [26]:
from surprise import KNNWithMeans, KNNBasic
from surprise.prediction_algorithms import matrix_factorization
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
import pandas as pd

In [6]:
movies = pd.read_csv('ml-latest/movies.csv')
ratings = pd.read_csv('ml-latest/ratings.csv')

In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [8]:
#объединяем рейтинги и фильмы
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [9]:
#Для примера для пользователя 2
movies_with_ratings[movies_with_ratings.userId == 2.0].title.unique()

array(['Hackers (1995)', 'Escape from L.A. (1996)',
       'Sex, Lies, and Videotape (1989)', 'Harold and Maude (1971)',
       'Manhattan (1979)', 'Room with a View, A (1986)', 'Stripes (1981)',
       'Driving Miss Daisy (1989)', 'L.A. Story (1991)',
       'Broadcast News (1987)', 'Big Chill, The (1983)',
       'Arlington Road (1999)', 'Little Shop of Horrors (1986)',
       'Risky Business (1983)', 'American Graffiti (1973)'], dtype=object)

In [10]:
#Формируем датасет под формат для surprice
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [11]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,4.0,Toy Story (1995),4.0
1,10.0,Toy Story (1995),5.0
2,14.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),4.0
4,22.0,Toy Story (1995),4.0


In [12]:
#находим минимальные и макимальные значения для будущего прогноза рейтинга, эти данные указываем в reader
ratings.rating.min()

0.5

In [13]:
ratings.rating.max()

5.0

In [14]:
#указываем min и max в reader, загрузаем дата сет в библиотеку surprice
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)
# data_1 = Dataset.load_from_df(movies_with_ratings, reader)

In [15]:
#train и test
trainset, testset = train_test_split(data, test_size=.15)

In [23]:
#учим
algo = matrix_factorization.SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x265bc60f130>

In [24]:
#предсказываем
test_pred = algo.test(testset)

In [30]:
#для примера: для пользователя 2 , для фильма "Бойцовский клуб", модель предсказала, что пользователь поставил бы оценку 4,22
algo.predict(uid=2, iid='Fight Club (1999)')

Prediction(uid=2, iid='Fight Club (1999)', r_ui=None, est=4.220904901774226, details={'was_impossible': False})

In [31]:
#среднеквадратичная ошибка (RMSE)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.7916


0.7915579761001484

In [39]:
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7948  0.7946  0.7946  0.7943  0.7945  0.7946  0.0001  
MAE (testset)     0.5996  0.5995  0.5996  0.5994  0.5995  0.5995  0.0001  
Fit time          1234.47 1220.77 1522.14 1828.19 1722.40 1505.59 247.38  
Test time         88.08   64.99   103.29  109.24  61.76   85.47   19.34   


{'test_rmse': array([0.79476999, 0.79462093, 0.79457729, 0.79433537, 0.79450076]),
 'test_mae': array([0.59956361, 0.59949425, 0.59956288, 0.59935457, 0.59948946]),
 'fit_time': (1234.4664425849915,
  1220.769945859909,
  1522.1356534957886,
  1828.1850047111511,
  1722.400310754776),
 'test_time': (88.07547521591187,
  64.98808908462524,
  103.28835701942444,
  109.23884654045105,
  61.759963274002075)}