https://grouplens.org/datasets/movielens/latest/

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [None]:
!gdown 1r9WD2Pe9MFB3yPBvrcT0Owhmb8JtUuwO -O MovieLens.zip

Downloading...
From: https://drive.google.com/uc?id=1r9WD2Pe9MFB3yPBvrcT0Owhmb8JtUuwO
To: /content/MovieLens.zip
  0% 0.00/978k [00:00<?, ?B/s]100% 978k/978k [00:00<00:00, 97.9MB/s]


In [None]:
!unzip MovieLens.zip

Archive:  MovieLens.zip
replace ml-latest-small/links.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [None]:
links = pd.read_csv('/content/ml-latest-small/links.csv')
movies = pd.read_csv('/content/ml-latest-small/movies.csv')
ratings = pd.read_csv('/content/ml-latest-small/ratings.csv')
tags = pd.read_csv('/content/ml-latest-small/tags.csv')

In [None]:
df_ratings = pd.DataFrame(ratings)
movies_df = pd.DataFrame(movies)

df_ratings = df_ratings.merge(movies_df[['movieId', 'title']], how='left',  on='movieId')
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1,4.0,964982703,Toy Story (1995)
1,1,3,4.0,964981247,Grumpier Old Men (1995)
2,1,6,4.0,964982224,Heat (1995)
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995)
4,1,50,5.0,964982931,"Usual Suspects, The (1995)"


In [None]:
num_users = df_ratings['userId'].nunique()
group = df_ratings.groupby('movieId')

def create_movie_vector(group):
    # Создаем массив нулей размером num_users
    result = np.zeros(num_users) 
    # Добавляем новый столбец 'user_index', вычитая 1 из значений 'userId'
    group['user_index'] = group['userId'] - 1 
    # Присваиваем значения 'rating' в соответствующие позиции в массиве 'result'
    result[group['user_index']] = group['rating']  
    return result

In [None]:
movie_vector = group.apply(create_movie_vector)

In [None]:
movie_vector

movieId
1         [4.0, 0.0, 0.0, 0.0, 4.0, 0.0, 4.5, 0.0, 0.0, ...
2         [0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 4.0, 0.0, ...
3         [4.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, ...
4         [0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, ...
5         [0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, ...
                                ...                        
193581    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
193583    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
193585    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
193587    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
193609    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Length: 9724, dtype: object

# surprise

https://surpriselib.com/

In [None]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

In [None]:
# оставляем три атрибута и переименовываем их
dataset = pd.DataFrame({
    'uid': df_ratings.userId,
    'iid': df_ratings.title,
    'rating': df_ratings.rating
})

In [None]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,1,Toy Story (1995),4.0
1,1,Grumpier Old Men (1995),4.0
2,1,Heat (1995),4.0
3,1,Seven (a.k.a. Se7en) (1995),5.0
4,1,"Usual Suspects, The (1995)",5.0


In [None]:
dataset.iid.nunique(), dataset.uid.nunique()

(9719, 610)

Так как количество item-ов больше чем количество user-ов - выбираем user-based подход

user-based KNNWithMeans

In [None]:
# используем класс Reader из Surprise, чтобы определить диапазон значений рейтингов
min_rating = dataset.rating.min()
max_rating = dataset.rating.max()
reader = Reader(rating_scale=(min_rating, max_rating))

In [None]:
# создаем объект Dataset из данных и объекта Reader
df = Dataset.load_from_df(dataset, reader)

In [None]:
algo = KNNWithMeans(k=50, sim_options={
    'name': 'cosine',
    'user_based': True  # compute  similarities between users
})

In [None]:
cv_results = cross_validate(algo, df, measures=["RMSE", "MAE"], cv=5, verbose=True)
cv_results

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9009  0.8998  0.9062  0.8944  0.8994  0.9001  0.0038  
MAE (testset)     0.6904  0.6868  0.6953  0.6824  0.6857  0.6881  0.0044  
Fit time          0.17    0.22    0.40    0.59    0.20    0.32    0.16    
Test time         2.10    2.29    4.76    3.99    1.48    2.92    1.24    


{'test_rmse': array([0.90089547, 0.89981784, 0.90622509, 0.89437346, 0.89939755]),
 'test_mae': array([0.69039366, 0.6868261 , 0.6953285 , 0.68238224, 0.68571887]),
 'fit_time': (0.1744556427001953,
  0.22360634803771973,
  0.3983745574951172,
  0.5939021110534668,
  0.20118236541748047),
 'test_time': (2.103031873703003,
  2.2856228351593018,
  4.760380268096924,
  3.9850032329559326,
  1.4790887832641602)}

In [None]:
m = cv_results['test_rmse'].mean()
print(f'mean RMSE = {m}')

mean RMSE = 0.9001418806321679


item-based KNNWithMeans

In [None]:
algo_2 = KNNWithMeans(k=50, sim_options={
    'name': 'cosine',
    'user_based': False  # compute similarities between items
})

In [None]:
cv_results_2 = cross_validate(algo_2, df, measures=["RMSE", "MAE"], cv=5, verbose=True)
cv_results_2

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8996  0.8958  0.9036  0.9096  0.9065  0.9030  0.0049  
MAE (testset)     0.6879  0.6853  0.6907  0.6948  0.6891  0.6896  0.0031  
Fit time          11.58   7.66    7.73    7.30    7.56    8.37    1.61    
Test time         10.70   8.02    8.48    8.83    8.38    8.88    0.94    


{'test_rmse': array([0.89962897, 0.89579175, 0.90361082, 0.90961811, 0.90648374]),
 'test_mae': array([0.68789547, 0.68533105, 0.69073451, 0.69476782, 0.6890914 ]),
 'fit_time': (11.579537391662598,
  7.662917137145996,
  7.728542327880859,
  7.300186634063721,
  7.557751417160034),
 'test_time': (10.698849201202393,
  8.020298480987549,
  8.484471559524536,
  8.829739093780518,
  8.377800226211548)}

In [None]:
m_2 = cv_results_2['test_rmse'].mean()
print(f'mean RMSE = {m_2}')

mean RMSE = 0.9030266787235656


random_pred.NormalPredictor

In [None]:
from surprise import NormalPredictor

In [None]:
algo_3 = NormalPredictor()

In [None]:
cv_results_3 = cross_validate(algo_3, df, measures=["RMSE", "MAE"], cv=5, verbose=True)
cv_results_3

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4238  1.4237  1.4262  1.4287  1.4362  1.4277  0.0046  
MAE (testset)     1.1368  1.1371  1.1373  1.1410  1.1448  1.1394  0.0031  
Fit time          0.11    0.14    0.14    0.34    0.13    0.17    0.09    
Test time         0.52    0.14    0.25    0.12    0.42    0.29    0.16    


{'test_rmse': array([1.42383335, 1.42368649, 1.42622801, 1.42867446, 1.43622037]),
 'test_mae': array([1.13679839, 1.13712539, 1.13727931, 1.14098752, 1.14481603]),
 'fit_time': (0.10857248306274414,
  0.13966703414916992,
  0.13981270790100098,
  0.3439011573791504,
  0.1345217227935791),
 'test_time': (0.5203654766082764,
  0.14433073997497559,
  0.2518932819366455,
  0.12046313285827637,
  0.4223442077636719)}

In [None]:
m_3 = cv_results_3['test_rmse'].mean()
print(f'mean RMSE = {m_3}')

mean RMSE = 1.4277285377393025


SVD

In [None]:
from surprise import SVD

In [None]:
algo_4 = SVD(n_factors=200, n_epochs=40, lr_all=0.01, reg_all=0.1)

In [None]:
cv_results_4 = cross_validate(algo_4, df, measures=["RMSE", "MAE"], cv=5, verbose=True)
cv_results_4

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8597  0.8455  0.8500  0.8499  0.8517  0.8514  0.0046  
MAE (testset)     0.6586  0.6495  0.6488  0.6519  0.6533  0.6524  0.0035  
Fit time          4.77    3.97    4.96    3.97    4.08    4.35    0.43    
Test time         0.25    0.23    0.18    0.14    0.14    0.19    0.04    


{'test_rmse': array([0.85970643, 0.84552143, 0.849986  , 0.8499061 , 0.85166369]),
 'test_mae': array([0.65859493, 0.64948802, 0.64884723, 0.65186976, 0.65333482]),
 'fit_time': (4.773089408874512,
  3.969111919403076,
  4.95647120475769,
  3.9720335006713867,
  4.078521966934204),
 'test_time': (0.2458205223083496,
  0.2260727882385254,
  0.18163418769836426,
  0.137467622756958,
  0.14379286766052246)}

In [None]:
m_4 = cv_results_4['test_rmse'].mean()
print(f'mean RMSE = {m_4}')

mean RMSE = 0.8513567309335114


Вывод: наименьшая на кросс-валидации RMSE, равная 0.85, получена при использовании алгоритма SVD.