# 1. Загрузка и подготовка данных

In [1]:
import pandas as pd
import numpy as np

In [2]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357237 sha256=e525c0f78a99d1bb7db10d2977fe5ba4e4c9444213485f1327014da88edcd147
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully inst

In [3]:
# Download the MovieLens "ml-latest-small" archive from GroupLens and save it as MovieLens.zip.
# NOTE(review): --no-check-certificate turns off TLS certificate verification. The log of
# this run reports an expired server certificate, which is presumably why the flag was
# added — remove it once the certificate is valid again.
!wget 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip' -O  MovieLens.zip --no-check-certificate

--2024-06-24 19:38:51--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
  Issued certificate has expired.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘MovieLens.zip’


2024-06-24 19:38:52 (5.65 MB/s) - ‘MovieLens.zip’ saved [978202/978202]



In [4]:
!unzip MovieLens.zip

Archive:  MovieLens.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [5]:
# Load the two tables used in this notebook from the extracted archive:
# movie metadata and the user ratings.
df_movies = pd.read_csv('ml-latest-small/movies.csv')
df_ratings = pd.read_csv('ml-latest-small/ratings.csv')

In [6]:
# Sanity check: (rows, columns) of the movies and ratings tables.
df_movies.shape, df_ratings.shape

((9742, 3), (100836, 4))

In [7]:
# Attach movie metadata to every rating: the ratings are re-indexed by movieId and
# inner-joined onto the movies table through its movieId column, so movies without
# any rating are dropped. The index is then reset to a fresh 0..n-1 range.
df_movies_with_ratings = df_movies.join(
    df_ratings.set_index('movieId'),
    on='movieId',
    how='inner').reset_index(drop=True)

# Preview the first rows of the joined table.
df_movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [8]:
# The row count should equal the number of ratings if every rating matched a movie.
df_movies_with_ratings.shape

(100836, 6)

In [33]:
# Number of distinct movie titles and distinct users in the joined table.
df_movies_with_ratings.title.nunique(), df_movies_with_ratings.userId.nunique()

(9719, 610)

# 2. Surprise

In [9]:
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import KFold
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

In [10]:
# Build the three-column frame (user id, item id, rating) that is handed to Surprise.
# NOTE(review): the movie title is used as the item id; if two different movieIds
# share a title, their ratings are merged into one item — confirm this is intended.
dataset = (
    df_movies_with_ratings[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid', 'title': 'iid'})
)

# Preview the first rows.
dataset.head()

Unnamed: 0,uid,iid,rating
0,1,Toy Story (1995),4.0
1,5,Toy Story (1995),4.0
2,7,Toy Story (1995),4.5
3,15,Toy Story (1995),2.5
4,17,Toy Story (1995),4.5


In [11]:
# Describe the rating scale to Surprise using the observed min and max ratings,
# then wrap the (uid, iid, rating) frame as a Surprise dataset.
reader = Reader(rating_scale=(df_ratings.rating.min(), df_ratings.rating.max()))
data = Dataset.load_from_df(dataset, reader)

In [22]:
from surprise import SVD, SVDpp, KNNBaseline

In [19]:
def get_cv(model, n_splits=5, random_state=42):
  """Cross-validate `model` on the global `data` and print its RMSE per fold.

  The folds come from a seeded ``KFold`` so that every algorithm is scored on the
  same train/test splits and the printed numbers are reproducible between runs.
  (Passing a bare integer as ``cv`` lets Surprise shuffle with an unseeded RNG.)

  Args:
    model: Surprise algorithm instance to evaluate.
    n_splits: number of folds (default 5, same as before).
    random_state: seed used to shuffle the ratings into folds.

  Returns:
    None. The per-fold table is printed by ``cross_validate`` (``verbose=True``).
  """
  folds = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
  cross_validate(model, data, measures=['RMSE'], cv=folds, verbose=True)

In [63]:
def get_knn_cv(model_name, sim_options, max_range=(30, 40, 50), min_range=(1, 5, 10)):
  """Cross-validate a KNN-style algorithm over a small grid of (k, min_k) values.

  For every combination of `max_range` x `min_range`, a header line is printed, a
  fresh model is built and it is passed to `get_cv`, which prints the RMSE table.

  Args:
    model_name: the algorithm *class* (not an instance), e.g. KNNBasic; it is
      called with `k`, `min_k`, `sim_options` and `verbose` keyword arguments.
    sim_options: similarity configuration dict forwarded to the algorithm.
    max_range: candidate values for `k` (maximum number of neighbours).
    min_range: candidate values for `min_k` (minimum number of neighbours).

  Returns:
    None. Results are only printed.
  """
  # Defaults are tuples rather than lists so they cannot be mutated between calls.
  for max_k in max_range:
    for min_k in min_range:
      print(f'\n  max_k = {max_k}, min_k = {min_k}\n')
      # No random_state here: Surprise's KNN constructors have no such parameter;
      # the previous `random_state=42` was swallowed by **kwargs and had no effect.
      model = model_name(k=max_k,
                         min_k=min_k,
                         sim_options=sim_options,
                         verbose=False)
      get_cv(model)

## SVD

min RMSE (testset) = 0.8746 (mean)

In [25]:
from surprise import SVD

In [26]:
# SVD with the library's default hyperparameters; only the seed is fixed.
model = SVD(random_state=42)
get_cv(model)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8686  0.8805  0.8715  0.8742  0.8779  0.8746  0.0043  
Fit time          1.55    1.63    1.61    2.12    1.53    1.69    0.22    
Test time         0.16    0.23    0.19    0.26    0.11    0.19    0.05    


## KNNBaseline

min RMSE (testset) = 0.8571 (mean)

In [27]:
from surprise import KNNBaseline

In [64]:
# Similarity settings for KNNBaseline: 'pearson_baseline' measure, user-based.
sim_options = {'name': 'pearson_baseline',
               'user_based': True}

In [65]:
# Cross-validate KNNBaseline over the default grid (k in 30/40/50, min_k in 1/5/10).
get_knn_cv(KNNBaseline, sim_options)


  max_k = 30, min_k = 1

Evaluating RMSE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8781  0.8817  0.8769  0.8775  0.8766  0.8782  0.0018  
Fit time          0.50    0.54    0.54    0.51    0.56    0.53    0.02    
Test time         1.38    1.49    1.36    1.49    2.26    1.60    0.34    

  max_k = 30, min_k = 5

Evaluating RMSE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8464  0.8608  0.8568  0.8667  0.8642  0.8590  0.0071  
Fit time          0.51    0.51    0.55    0.56    0.55    0.54    0.02    
Test time         1.49    1.35    1.35    1.37    1.69    1.45    0.13    

  max_k = 30, min_k = 10

Evaluating RMSE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8472  0.8684  0.8474  0.8536  0.8692  0.8572  0.0098  


## KNNBasic

min RMSE (testset) = 0.9691 (mean)

In [66]:
from surprise import KNNBasic

In [67]:
# Similarity settings for KNNBasic: cosine measure, user-based.
# NOTE(review): this rebinds the global `sim_options` used by the previous section.
sim_options = {'name': 'cosine',
               'user_based': True}

In [68]:
# Cross-validate KNNBasic over the default grid (k in 30/40/50, min_k in 1/5/10).
get_knn_cv(KNNBasic, sim_options)


  max_k = 30, min_k = 1

Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9683  0.9736  0.9741  0.9697  0.9759  0.9723  0.0028  
Fit time          0.14    0.19    0.17    0.18    0.26    0.19    0.04    
Test time         1.48    1.19    1.16    1.81    1.53    1.43    0.24    

  max_k = 30, min_k = 5

Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9649  0.9740  0.9707  0.9768  0.9610  0.9695  0.0058  
Fit time          0.14    0.19    0.18    0.17    0.17    0.17    0.01    
Test time         1.34    1.20    1.34    1.18    1.94    1.40    0.28    

  max_k = 30, min_k = 10

Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9820  0.9840  0.9802  0.9754  0.9786  0.9800  0.0029  
Fit time 

## KNNWithMeans

min RMSE (testset) = 0.8902 (mean)

In [47]:
from surprise import KNNWithMeans

In [69]:
# Similarity settings for KNNWithMeans: Pearson measure, user-based.
# NOTE(review): this rebinds the global `sim_options` used by the previous section.
sim_options = {'name': 'pearson',
               'user_based': True}

In [70]:
# Cross-validate KNNWithMeans over the default grid (k in 30/40/50, min_k in 1/5/10).
get_knn_cv(KNNWithMeans, sim_options)


  max_k = 30, min_k = 1

Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8910  0.8920  0.9051  0.8965  0.9003  0.8970  0.0052  
Fit time          0.27    0.27    0.24    0.36    0.28    0.28    0.04    
Test time         1.32    1.42    1.26    2.24    1.42    1.53    0.36    

  max_k = 30, min_k = 5

Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8932  0.8943  0.8923  0.8838  0.8909  0.8909  0.0037  
Fit time          0.69    0.25    0.26    0.27    0.35    0.36    0.17    
Test time         2.11    1.42    1.33    1.57    1.97    1.68    0.31    

  max_k = 30, min_k = 10

Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8902  0.8977  0.8975  0.9004  0.9014  0.8974  0.0039

## BaselineOnly

min RMSE (testset) = 0.8670 (mean)

In [77]:
from surprise import BaselineOnly

'method': 'als'

In [86]:
# Baseline-estimate settings: ALS optimisation, 5 epochs.
bsl_options = {'method': 'als',
               'n_epochs': 5}

In [87]:
# Cross-validate BaselineOnly with the ALS settings defined above.
# NOTE(review): `model` is rebound here; the previously evaluated model is discarded.
model = BaselineOnly(bsl_options=bsl_options, verbose=False)
get_cv(model)

Evaluating RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8712  0.8814  0.8700  0.8694  0.8740  0.8732  0.0044  
Fit time          0.20    0.22    0.24    0.22    0.21    0.22    0.01    
Test time         0.08    0.20    0.17    0.20    0.07    0.14    0.06    


'method': 'sgd'

In [103]:
# Baseline-estimate settings: SGD optimisation with learning rate 0.01.
# NOTE(review): this rebinds the global `bsl_options` from the ALS experiment.
bsl_options = {'method': 'sgd',
               'learning_rate': 0.01}

In [104]:
# Cross-validate BaselineOnly with the SGD settings defined above.
model = BaselineOnly(bsl_options=bsl_options, verbose=False)
get_cv(model)

Evaluating RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8663  0.8641  0.8678  0.8709  0.8661  0.8670  0.0022  
Fit time          0.65    0.65    0.65    0.65    0.64    0.65    0.01    
Test time         0.19    0.07    0.20    0.07    0.19    0.15    0.06    


# 3. Выводы

Лучше всех справился **KNNBaseline** с параметрами

```
(k = 50, min_k = 10, 'name': 'pearson_baseline', 'user_based': True)
```

минимальное среднее RMSE на тестовой выборке = 0.8571.

В случае **BaselineOnly** с параметрами

```
('method': 'sgd', 'learning_rate': 0.01)
```

минимальное среднее RMSE на тестовой выборке = 0.8670.