In [1]:
import pandas as pd
import pickle
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms import KNNBaseline
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

In [2]:
movies = pd.read_csv('movies.csv')
movies.fillna('', inplace=True)
ratings = pd.read_csv('ratings.csv')

# Коллаборативная фильтрация

In [3]:
ratings = ratings[ratings['userId'] <= 15000]

In [4]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [5]:
sim_options = {'name': 'pearson_baseline', 'user_based': True}
model = KNNBaseline(sim_options=sim_options)
model.fit(trainset)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x1adc50421d0>

In [6]:
filename = 'model.pkl' 
pickle.dump(model, open(filename, 'wb'))

In [6]:
def CFUserBased (user_id, n = 10):
    # находим фильмы которые смотрел пользователь
    watched_movies = set(ratings[ratings['userId'] == user_id]['movieId'])
    # находим все фильмы
    all_movies = set(ratings['movieId'])
    # находим не просмотренные фильмы
    unwatched_movies = all_movies - watched_movies
    predictions = []
    # проходимся по каждому не просмотренному фильму 
    for movie_id in unwatched_movies:
        predict = model.predict(user_id,movie_id).est
        predictions.append((movie_id, predict))
    predictions = sorted(predictions, key= lambda x: x[1], reverse=True)
    movies_title = []
    for movie_id, _ in predictions[:n]:
        title = movies[movies['id'] == movie_id]['title'].iloc[0]
        if title not in movies_title:
            movies_title.append(title)
    return movies_title


In [7]:
C = CFUserBased(1)
print(C)

['Unforgiven', 'Unbelievable Adventures of Italians in Russia', 'Stalag 17', 'Date and Switch', 'The Color of Pomegranates', 'Pathfinder', 'Babysitter Wanted', 'Drowning by Numbers', 'White of the Eye', 'Piggy']


# Фильтрация на основе контента 

In [7]:
movies['text'] = movies['cast'] + ' ' + movies['crew'] + ' ' + movies['genres'] + ' ' + movies['overview'] + ' ' + movies['production_countries'] + ' ' + str(movies['release_date'])
documents = [TaggedDocument(simple_preprocess(row['text']), [index]) for index, row in movies.iterrows()]
model1 = Doc2Vec(documents, vector_size=100, window=5, min_count=1, workers=4)

In [8]:
filename = 'model1.pkl' 
pickle.dump(model1, open(filename, 'wb'))

In [20]:
def ContentFiltering (user_id):
    user_watched_movies = ratings[ratings['userId'] == user_id]
    #выбираем те, у которых рейтинг больше 2.5
    user_watched_movies = user_watched_movies[user_watched_movies['rating'] > 2.5]
    #сортируем по убыванию
    user_watched_movies = user_watched_movies.sort_values(by='rating', ascending=False)
    #айдишник полученных фильмов
    watched_movies_ids = user_watched_movies['movieId'].to_list()
    recommended_movies = []
    for movie in watched_movies_ids:
        #получаем номер строки для соответсвующего айдишника
        movie_index =  movies[movies['id'] == movie].index.tolist()[0]
        # tag_of_trained_document - тег документа, который участвовал в обучении
        vector = model1.docvecs[movie_index]
        similar_movies = model1.docvecs.most_similar([vector], topn=5)
        similar_movie_ids = [sim[0] for sim in similar_movies]
        for i in similar_movie_ids:
            #обратно находим айдишник фильма
            rec_movie_id = movies.iloc[i]['id']
            if rec_movie_id not in watched_movies_ids:
                #и если пользователь его не смотрел, то добавляем в список рекомендаций 
                recommended_movies.append((rec_movie_id))
    movie_titles = []
    #по айдишнику находим названия
    for movie_id in recommended_movies:
        title = movies[movies['id'] == movie_id]['title'].iloc[0]
        if title not in movie_titles:
            #если такого фильма еще нет в рекомендациях, то дабавляем его
            movie_titles.append(title)
    return movie_titles

In [10]:
similar_movies = ContentFiltering(1)
print(similar_movies)

['Mona Lisa Smile', 'Arlington Road', "You've Got Mail", "Alice Doesn't Live Here Anymore", "The Devil at 4 O'Clock", 'Convict 13', 'Ladies and Gentlemen, the Fabulous Stains', 'The Taking of Pelham One Two Three', 'Land Without Bread', 'The Children Are Watching Us', 'The Best of Youth', 'Kamchatka', 'Betty Blue', 'Human Resources', 'All About Actresses', 'Samba', 'The Robe', 'The Great Ecstasy of Woodcarver Steiner', 'The Blood of a Poet', "George Carlin: Jammin' in New York", 'Johnny Suede', 'Drive Angry', 'The Woman in Black 2: Angel of Death', 'A Kind of Murder', '10 Years', 'Open Season 3', 'The One I Love', 'Mad Families', 'Afterschool', 'Apocalyptic', 'Legendary', 'Big Wednesday']


  vector = model1.docvecs[movie_index]
  similar_movies = model1.docvecs.most_similar([vector], topn=5)


# Общая фильтрация

In [23]:
user_id = int(input())
def recommend_film(user_id):
    film = []
    C = CFUserBased(user_id)
    film.append(C)
    similar_movies = ContentFiltering(user_id)
    film.append(similar_movies)
    return film

t = recommend_film(user_id)
print(t)


[['Unforgiven', 'Unbelievable Adventures of Italians in Russia', 'Stalag 17', 'Date and Switch', 'The Color of Pomegranates', 'Pathfinder', 'Babysitter Wanted', 'Drowning by Numbers', 'White of the Eye', 'Piggy'], ['Mona Lisa Smile', 'Arlington Road', "You've Got Mail", "Alice Doesn't Live Here Anymore", "The Devil at 4 O'Clock", 'Convict 13', 'Ladies and Gentlemen, the Fabulous Stains', 'The Taking of Pelham One Two Three', 'Land Without Bread', 'The Children Are Watching Us', 'The Best of Youth', 'Kamchatka', 'Betty Blue', 'Human Resources', 'All About Actresses', 'Samba', 'The Robe', 'The Great Ecstasy of Woodcarver Steiner', 'The Blood of a Poet', "George Carlin: Jammin' in New York", 'Johnny Suede', 'Drive Angry', 'The Woman in Black 2: Angel of Death', 'A Kind of Murder', '10 Years', 'Open Season 3', 'The One I Love', 'Mad Families', 'Afterschool', 'Apocalyptic', 'Legendary', 'Big Wednesday']]


  vector = model1.docvecs[movie_index]
  similar_movies = model1.docvecs.most_similar([vector], topn=5)


In [26]:
film = []
C = CFUserBased(user_id)
film.append(C)
print(film)


[['Unforgiven', 'Unbelievable Adventures of Italians in Russia', 'Stalag 17', 'Date and Switch', 'The Color of Pomegranates', 'Pathfinder', 'Babysitter Wanted', 'Drowning by Numbers', 'White of the Eye', 'Piggy']]


In [24]:
similar_movies = ContentFiltering(user_id)
similar_movies

  vector = model1.docvecs[movie_index]
  similar_movies = model1.docvecs.most_similar([vector], topn=5)


['Mona Lisa Smile',
 'Arlington Road',
 "You've Got Mail",
 "Alice Doesn't Live Here Anymore",
 "The Devil at 4 O'Clock",
 'Convict 13',
 'Ladies and Gentlemen, the Fabulous Stains',
 'The Taking of Pelham One Two Three',
 'Land Without Bread',
 'The Children Are Watching Us',
 'The Best of Youth',
 'Kamchatka',
 'Betty Blue',
 'Human Resources',
 'All About Actresses',
 'Samba',
 'The Robe',
 'The Great Ecstasy of Woodcarver Steiner',
 'The Blood of a Poet',
 "George Carlin: Jammin' in New York",
 'Johnny Suede',
 'Drive Angry',
 'The Woman in Black 2: Angel of Death',
 'A Kind of Murder',
 '10 Years',
 'Open Season 3',
 'The One I Love',
 'Mad Families',
 'Afterschool',
 'Apocalyptic',
 'Legendary',
 'Big Wednesday']