# Подготавливаем датасет

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [None]:
# Download the MovieLens archive from Google Drive (by file id) and save it as MovieLens.zip.
!gdown 1r9WD2Pe9MFB3yPBvrcT0Owhmb8JtUuwO -O MovieLens.zip

Downloading...
From: https://drive.google.com/uc?id=1r9WD2Pe9MFB3yPBvrcT0Owhmb8JtUuwO
To: /content/MovieLens.zip
  0% 0.00/978k [00:00<?, ?B/s]100% 978k/978k [00:00<00:00, 124MB/s]


In [None]:
!unzip MovieLens.zip

Archive:  MovieLens.zip
replace ml-latest-small/links.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [None]:
# Load the four MovieLens tables from the extracted archive, in a fixed order
# that matches the names on the left-hand side.
links, movies, ratings, tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/ml-latest-small/links.csv',
        '/content/ml-latest-small/movies.csv',
        '/content/ml-latest-small/ratings.csv',
        '/content/ml-latest-small/tags.csv',
    )
)

In [None]:
# Wrap the loaded tables under the names used by the rest of the notebook.
df_ratings, df_movies = pd.DataFrame(ratings), pd.DataFrame(movies)

# Attach title and genres to every rating row; a left join keeps every rating.
movie_info = df_movies[['movieId', 'title', 'genres']]
df = pd.merge(df_ratings, movie_info, how='left', on='movieId')
df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi


# item-to-item collaborative filtering

In [None]:
# Number of distinct users; used as the length of every per-movie rating vector.
num_users = df['userId'].nunique(dropna=True)
# Ratings grouped per movie, so each group holds all ratings of one movieId.
group = df.groupby(by='movieId')

def create_movie_vector(group):
    """Build the user-rating vector for a single movie.

    Args:
        group: DataFrame with the ratings of one movie; must provide the
            ``userId`` and ``rating`` columns.

    Returns:
        A 1-D float array of length ``num_users`` (module-level). Position
        ``userId - 1`` holds that user's rating; users without a rating
        for this movie stay at 0.

    Note:
        The ``userId - 1`` indexing requires user ids to lie in
        ``1..num_users``; an id outside that range raises ``IndexError``.
    """
    ratings_by_user = np.zeros(num_users)
    # Compute the positions locally instead of adding a helper column to
    # ``group``: the input frame is left untouched.
    user_index = group['userId'].to_numpy() - 1
    ratings_by_user[user_index] = group['rating'].to_numpy()
    return ratings_by_user

In [None]:
# Build one rating vector per movieId group by applying create_movie_vector.
movie_vector = group.apply(create_movie_vector)

In [None]:
# Display the per-movie rating vectors (indexed by movieId).
movie_vector

movieId
1         [4.0, 0.0, 0.0, 0.0, 4.0, 0.0, 4.5, 0.0, 0.0, ...
2         [0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 4.0, 0.0, ...
3         [4.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, ...
4         [0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, ...
5         [0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, ...
                                ...                        
193581    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
193583    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
193585    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
193587    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
193609    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Length: 9724, dtype: object

In [None]:
from scipy.spatial.distance import cosine

In [None]:
def get_nearest_neighbors(title, *, n=10) -> list:
    """Return the movieIds of the ``n`` movies most similar to ``title``.

    Movies are compared through their user-rating vectors stored in the
    module-level ``movie_vector``. The measure is SciPy's cosine *distance*
    (``1 - cosine similarity``), so the smallest values are the closest
    matches.

    Args:
        title: Exact movie title as it appears in ``df['title']``.
        n: Maximum number of neighbours to return (keyword-only).

    Returns:
        A list of up to ``n`` movieIds ordered from most to least similar.
        The queried movie itself is never included.

    Raises:
        IndexError: If no row of ``df`` has the given title.
    """
    # Resolve the title to its movieId (the first matching row wins).
    film_id = df.loc[df['title'] == title, 'movieId'].iloc[0]
    # Look the reference vector up once instead of on every iteration.
    target_vector = movie_vector[film_id]

    distances = []
    for key in tqdm(movie_vector.keys()):
        if key == film_id:
            continue
        distances.append((key, cosine(target_vector, movie_vector[key])))

    # `cosine` is a distance, so sort ascending: most similar movies first.
    nearest = sorted(distances, key=lambda pair: pair[1])[:n]
    return [movie_id for movie_id, _ in nearest]

In [None]:
# Example: movieIds returned by get_nearest_neighbors for a single title.
get_nearest_neighbors('Grumpier Old Men (1995)')

  0%|          | 0/9724 [00:00<?, ?it/s]

[30, 49, 53, 55, 77, 80, 82, 83, 85, 96]

In [None]:
def get_last_seen_movies(user_id, *, n=10):
    """Return the titles of the user's ``n`` most recent movies rated 4 or 5.

    Only ratings exactly equal to 4 or 5 qualify (a 4.5 does not). Rows are
    ordered by ``timestamp``, newest first.

    Args:
        user_id: Value matched against ``df['userId']``.
        n: Maximum number of titles to return (keyword-only).

    Returns:
        A NumPy array of titles, newest rating first.
    """
    is_user = df['userId'] == user_id
    is_liked = df['rating'].isin([4, 5])

    liked_by_user = df.loc[is_user & is_liked, ['title', 'timestamp', 'rating']]
    newest_first = liked_by_user.sort_values('timestamp', ascending=False)

    return newest_first['title'].head(n).values

In [None]:
# Example: the most recent titles user 1 rated 4 or 5.
get_last_seen_movies(1)

array(['20 Dates (1998)', 'Back to the Future Part III (1990)',
       '¡Three Amigos! (1986)', 'Tombstone (1993)',
       'Canadian Bacon (1995)',
       'Messenger: The Story of Joan of Arc, The (1999)',
       'Pink Floyd: The Wall (1982)', 'Good Morning, Vietnam (1987)',
       'Rob Roy (1995)', 'Platoon (1986)'], dtype=object)

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [None]:
def change_string(s):
    """Turn a ``|``-separated genre string into space-separated tokens.

    Spaces and hyphens are removed first, so every genre becomes a single
    token, e.g. ``'Action|Sci-Fi'`` -> ``'Action SciFi'``.
    """
    # One pass that deletes every space and hyphen.
    compact = s.translate(str.maketrans('', '', ' -'))
    genre_tokens = compact.split('|')
    return ' '.join(genre_tokens)

In [None]:
# One normalized genre string per row of df, so positions in this list
# line up with the row positions of df.
movie_genres = list(map(change_string, df.genres.values))

In [None]:
# Bag-of-words counts over the genre tokens of every entry in movie_genres.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

# Learn the IDF weights from the counts, then re-weight the same matrix.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

# Euclidean nearest-neighbour index (10 neighbours per query) over the TF-IDF rows.
neigh = NearestNeighbors(metric='euclidean', n_neighbors=10, n_jobs=-1)
neigh.fit(X_train_tfidf)

In [None]:
def get_tfidf_recomendation(last_films):
    """Return movieIds of the genre-nearest neighbours of each given film.

    For every title the genre string is looked up in ``df_movies``,
    vectorized with the fitted ``count_vect`` / ``tfidf_transformer`` and
    passed to the fitted ``neigh`` model.

    ``neigh.kneighbors`` returns row positions in the matrix the model was
    fitted on. That matrix is built from ``df.genres``, one row per row of
    ``df``, so the positions are translated to movieIds through
    ``df['movieId']`` before being returned.

    Args:
        last_films: Iterable of movie titles present in ``df_movies['title']``.

    Returns:
        A flat list of movieIds (duplicates possible), with the neighbours
        of each input film appended in input order.

    Raises:
        IndexError: If a title is not found in ``df_movies``.
    """
    # Row position in the fitted matrix -> movieId of that row of df.
    movie_ids_by_row = df['movieId'].to_numpy()

    recomendations_tfidf = []
    for movie in last_films:
        genre = df_movies.query('title == @movie').genres.iloc[0]
        counts = count_vect.transform([change_string(genre)])
        features = tfidf_transformer.transform(counts)
        # A single query row, so take the first (and only) row of indices.
        neighbor_rows = neigh.kneighbors(features, return_distance=False)[0]
        recomendations_tfidf.extend(movie_ids_by_row[neighbor_rows].tolist())

    return recomendations_tfidf

# Получение рекомендации для пользователя

In [None]:
# Pick the user to build recommendations for.
user_id = 100
# The last 10 movies this user rated 4 or 5.
last_films = get_last_seen_movies(user_id, n=10)

# Everything the user has already rated; excluded from both candidate lists.
seen_movie_ids = set(df.loc[df['userId'] == user_id, 'movieId'].values)

# Collaborative candidates: 10 neighbours per film from the cosine distances
# between movie_vector entries, minus the movies already seen.
recomendations = [
    movie_id
    for film in last_films
    for movie_id in get_nearest_neighbors(film)
]
recomendations = list(set(recomendations) - seen_movie_ids)

# Genre-based candidates: 10 neighbours per film, minus the movies already seen.
recomendations_tfidf = get_tfidf_recomendation(last_films)
recomendations_tfidf = list(set(recomendations_tfidf) - seen_movie_ids)

# Keep only the candidates proposed by both algorithms.
common_elements = list(set(recomendations) & set(recomendations_tfidf))

  0%|          | 0/9724 [00:00<?, ?it/s]

  0%|          | 0/9724 [00:00<?, ?it/s]

  0%|          | 0/9724 [00:00<?, ?it/s]

  0%|          | 0/9724 [00:00<?, ?it/s]

  0%|          | 0/9724 [00:00<?, ?it/s]

  0%|          | 0/9724 [00:00<?, ?it/s]

  0%|          | 0/9724 [00:00<?, ?it/s]

  0%|          | 0/9724 [00:00<?, ?it/s]

  0%|          | 0/9724 [00:00<?, ?it/s]

  0%|          | 0/9724 [00:00<?, ?it/s]

In [None]:
# Resolve the recommended movieIds to titles and print them, one per line,
# under a header naming the user.
recommended_titles = df_movies.query('movieId.isin(@common_elements)')['title'].values
print(f'Список рекомендуемых фильмов для пользователя {user_id}:')
print(*recommended_titles, sep='\n')

Список рекомендуемых фильмов для пользователя 100:
Dracula: Dead and Loving It (1995)
Four Rooms (1995)
Assassins (1995)
White Balloon, The (Badkonake sefid) (1995)
