# 1. Загрузка и подготовка данных

In [None]:
import pandas as pd
import numpy as np

In [None]:
!wget 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip' -O  MovieLens.zip --no-check-certificate

--2024-07-01 17:59:36--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘MovieLens.zip’


2024-07-01 17:59:37 (4.03 MB/s) - ‘MovieLens.zip’ saved [978202/978202]



In [None]:
!unzip MovieLens.zip

Archive:  MovieLens.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [None]:
# Read the movie catalogue and the ratings table from the extracted archive.
df_movies, df_ratings = (
    pd.read_csv(csv_path)
    for csv_path in ('ml-latest-small/movies.csv', 'ml-latest-small/ratings.csv')
)

In [None]:
# One row per rating, enriched with the movie's title and genres.
# An inner merge on `movieId` drops movies that nobody has rated.
df_movies_with_ratings = df_movies.merge(df_ratings, on='movieId', how='inner')

df_movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


# 2. surprise.KNNBaseline

В домашнем задании по теме "Коллаборативная фильтрация" лучший результат был у KNNBaseline(k=50, min_k=10).

Можно попробовать использовать его для фильтрации по рейтингу.

In [None]:
!pip install surprise



In [None]:
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

In [None]:
# Three-column (user, item, rating) frame that is handed to Surprise below.
# The movie title is used as the item id.
dataset = (
    df_movies_with_ratings[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid', 'title': 'iid'})
)

dataset.head()

Unnamed: 0,uid,iid,rating
0,1,Toy Story (1995),4.0
1,5,Toy Story (1995),4.0
2,7,Toy Story (1995),4.5
3,15,Toy Story (1995),2.5
4,17,Toy Story (1995),4.5


In [None]:
# Observed rating range; it defines the rating scale given to the Reader.
(df_ratings['rating'].min(), df_ratings['rating'].max())

(0.5, 5.0)

In [None]:
# The rating scale is derived from the data instead of being hard-coded.
reader = Reader(
    rating_scale=(
        df_ratings['rating'].min(),
        df_ratings['rating'].max(),
    )
)
data = Dataset.load_from_df(dataset, reader)

In [None]:
# Hold out 20% of the ratings for evaluation; the fixed seed makes the split reproducible.
trainset, testset = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
from surprise import KNNBaseline

In [None]:
# User-based neighbourhood model using the 'pearson_baseline' similarity.
sim_options = {'name': 'pearson_baseline',
               'user_based': True}

# NOTE: `random_state` was dropped from this call. It is not a documented
# KNNBaseline parameter and appears to be absorbed by **kwargs without any
# effect, so passing it only suggested a seeding that does not take place.
model_KNNBaseline = KNNBaseline(k=50, min_k=10,
                                sim_options=sim_options,
                                verbose=False)

model_KNNBaseline.fit(trainset)

<surprise.prediction_algorithms.knns.KNNBaseline at 0x7c5f63f2fd60>

In [None]:
# Predict the held-out ratings, then print and return the RMSE.
pred = model_KNNBaseline.test(testset)
accuracy.rmse(predictions=pred, verbose=True)

RMSE: 0.8550


0.8550109840707429

## Example

In [None]:
def get_topN(df_movies_with_ratings, user_id, model, N):
  """Return the N not-yet-rated movies with the highest predicted rating.

  Args:
    df_movies_with_ratings: DataFrame with one row per rating; must contain
      the `userId` and `title` columns.
    user_id: raw user id, passed to `model.predict` as `uid`.
    model: object exposing `predict(uid=..., iid=...)` whose result has an
      `.est` attribute (the estimated rating).
    N: maximum number of recommendations to return.

  Returns:
    dict mapping movie title -> estimated rating, ordered from the highest
    to the lowest estimate. Empty when `N <= 0`.
  """
  if N <= 0:
    return {}

  # Seen movies are excluded by *title* because the title is what is passed to
  # the model as `iid`. Excluding by `movieId` would let a title the user has
  # already rated slip through whenever another `movieId` shares that title.
  is_user_row = df_movies_with_ratings.userId == user_id
  seen_titles = set(df_movies_with_ratings.loc[is_user_row, 'title'])
  candidates = [title for title in df_movies_with_ratings.title.unique()
                if title not in seen_titles]

  scores = {film: model.predict(uid=user_id, iid=film).est for film in candidates}

  # sorted() is stable, so equally scored films keep their order of appearance.
  ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
  return dict(ranked[:N])

In [None]:
# Collaborative-filtering example: best predicted movies for a single user.
user_id = 10
movies_count = 5

get_topN(
    df_movies_with_ratings=df_movies_with_ratings,
    user_id=user_id,
    model=model_KNNBaseline,
    N=movies_count,
)

{"Singin' in the Rain (1952)": 4.410359764992698,
 'Star Wars: Episode VII - The Force Awakens (2015)': 4.362984559684711,
 'Christmas Story, A (1983)': 4.360833124955914,
 'Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966)': 4.27010067430524,
 'Lawrence of Arabia (1962)': 4.222312168932234}

# 3. NearestNeighbors

Фильтрация, основанная на контенте

In [None]:
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [None]:
def change(item):
  """Turn a '|'-separated genre string into space-separated lowercase tokens.

  Hyphens, parentheses, dots and all whitespace are stripped first, so every
  genre collapses into a single token (e.g. 'Sci-Fi' becomes 'scifi').
  """
  compact = re.sub(r"[-()\s\.]", "", item)
  tokens = compact.split('|')
  return ' '.join(tokens).lower()

# One normalised genre document per movie, in the row order of `df_movies`.
genres = [change(raw_genres) for raw_genres in df_movies['genres']]

In [None]:
# Learn the genre vocabulary and vectorise every movie in one pass.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(raw_documents=genres)
X_train_tfidf

<9742x20 sparse matrix of type '<class 'numpy.float64'>'
	with 22084 stored elements in Compressed Sparse Row format>

In [None]:
# Content-based index: each query returns the 20 movies closest in TF-IDF genre space.
model_NNeighbors = NearestNeighbors(
    n_neighbors=20,
    metric='euclidean',
    n_jobs=-1,
)
model_NNeighbors.fit(X_train_tfidf)

# 4. Каскадная система рекомендаций

KNNBaseline + NearestNeighbors

In [None]:
#genres_by_movies = dict(df_movies_with_ratings[['title', 'genres']].values)

In [None]:
def get_topN_movies(df_movies_with_ratings, df_movies, user_id, model_filter, model_predict, N,
                    vectorizer=None, preprocess=None):
  """Cascade recommender: content-based candidate selection, then rating prediction.

  Starting from the most recently rated movie of the user and walking back in
  time, the genres of each rated movie are vectorised and `model_filter` is
  asked for its nearest movies. Neighbours the user has not rated yet are
  scored with `model_predict`. The walk stops as soon as at least N candidates
  have been scored or the whole rating history has been used.

  Args:
    df_movies_with_ratings: DataFrame with one row per rating; must contain
      `userId`, `timestamp`, `genres` and `title`.
    df_movies: movie catalogue; the positions returned by `model_filter` are
      looked up in it with `iloc`, so its row order must match the matrix the
      filter was fitted on.
    user_id: raw user id, passed to `model_predict.predict` as `uid`.
    model_filter: fitted object exposing `kneighbors(X, return_distance=True)`.
    model_predict: object exposing `predict(uid=..., iid=...)` whose result
      has an `.est` attribute.
    N: maximum number of recommendations.
    vectorizer: object whose `transform` turns a list with one genre document
      into the query for `model_filter`. Defaults to the notebook-level `tfidf`.
    preprocess: callable normalising a raw genre string before vectorisation.
      Defaults to the notebook-level `change`.

  Returns:
    DataFrame with columns `title` and `rating` (estimated rating), sorted by
    descending estimate, with at most N rows. Empty when `N <= 0` or when the
    user has no ratings.
  """
  # Fall back to the notebook-level objects so existing six-argument calls keep working.
  if vectorizer is None:
    vectorizer = tfidf
  if preprocess is None:
    preprocess = change

  columns = ['title', 'rating']

  # The user's rating history does not change between iterations: compute it once.
  user_history = df_movies_with_ratings.loc[df_movies_with_ratings.userId == user_id].sort_values('timestamp')
  if N <= 0 or user_history.empty:
    # Nothing requested, or nothing to derive genres from.
    return pd.DataFrame([], columns=columns)

  watched_titles = user_history.title.unique()
  scores = {}
  processed_genres = set()

  # Walk the history from the newest rating to the oldest one.
  for raw_genres in user_history.genres.iloc[::-1]:
    if len(scores) >= N:
      break
    # The same genre string always leads to the same query, so it is processed only once.
    if raw_genres in processed_genres:
      continue
    processed_genres.add(raw_genres)

    X_tfidf = vectorizer.transform([preprocess(raw_genres)])
    _, neighbor_positions = model_filter.kneighbors(X_tfidf, return_distance=True)

    candidates = df_movies.iloc[neighbor_positions[0]]
    candidates = candidates.loc[~candidates.title.isin(watched_titles)].title.unique()

    for film in candidates:
      if film not in scores:
        scores[film] = model_predict.predict(uid=user_id, iid=film).est

  top = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:N]

  return pd.DataFrame(top, columns=columns)

In [None]:
# Cascade example: genre-based candidates ranked by the collaborative model.
user_id = 20
movies_count = 10

get_topN_movies(
    df_movies_with_ratings=df_movies_with_ratings,
    df_movies=df_movies,
    user_id=user_id,
    model_filter=model_NNeighbors,
    model_predict=model_KNNBaseline,
    N=movies_count,
)

Unnamed: 0,title,rating
0,Shrek 2 (2004),3.904018
1,Enchanted (2007),3.742619
2,"Princess and the Frog, The (2009)",3.731323
3,Frozen (2013),3.684311
4,"Rudolph, the Red-Nosed Reindeer (1964)",3.678261
5,Chitty Chitty Bang Bang (1968),3.647135
6,Beauty and the Beast: The Enchanted Christmas ...,3.619647
7,Aladdin and the King of Thieves (1996),3.565335
8,Song of the South (1946),3.554093
9,Rock-A-Doodle (1991),3.548113


В случае пользователя 20 как раз возникает ситуация, когда количество фильмов для рекомендаций, подобранных по жанрам последнего просмотренного фильма, меньше, чем необходимо. Поэтому итоговый датасет рекомендаций дополняется фильмами, найденными на следующей итерации (для предпоследнего фильма), и т. д.

И уже после этого отбирается топ-N рекомендаций.