# Система рекомендаций по фильмам, основанная на исходящих ссылках из Википедии

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import random
import json
import time

from collections import Counter
from tqdm import tqdm

from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot

from sklearn import metrics
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso, SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from xgboost import XGBClassifier

In [3]:
!gdown --id "1xXBbql_pC1xqTkb-bGMN8XClHjg9FdGG"

Downloading...
From: https://drive.google.com/uc?id=1xXBbql_pC1xqTkb-bGMN8XClHjg9FdGG
To: /content/wp_movies_10k.ndjson
100% 31.1M/31.1M [00:00<00:00, 115MB/s] 


In [4]:
!gdown --id "1xyop7GNLa0q0TZkD90GzDOIhndMDdEQY"

Downloading...
From: https://drive.google.com/uc?id=1xyop7GNLa0q0TZkD90GzDOIhndMDdEQY
To: /content/wiki_movie_plots_deduped.csv
100% 81.2M/81.2M [00:00<00:00, 99.5MB/s]


Проставляем параметры, использующиеся в работе

In [5]:
# Seed handed to `random.seed` and to the `random_state` of the scikit-learn estimators.
rnd_seed = 17 
round_est = 4 # Number of decimal places used when rounding displayed values

Здесь можно указать один из параметров модели 
(по умолчанию исходя из описания задания: 1 - 'Работа по ссылкам (поле Links), дубликаты ссылок оставляем')

In [6]:
work_option = 1
work_option_description = {1: 'Работа по ссылкам (поле Links), дубликаты ссылок оставляем',
                           2: 'Работа по ссылкам (поле Links), дубликаты убираем',
                           3: 'Работа по описанию (поле Description)',
                           4: 'На основании ссылок (поле Links) на фильмы из датасета',
                           5: 'На основании ссылок на категории'}

### Часть 1. Анализ и предобработка данных

In [7]:
# The dump is newline-delimited JSON: one movie record per line.
# Decode it explicitly as UTF-8 - the records contain non-ASCII names and the
# platform's default text encoding is not guaranteed to be UTF-8.
with open('wp_movies_10k.ndjson', encoding='utf-8') as fin:
    movies = [json.loads(line) for line in fin]

Для работы с данными нам также понадобится датафрейм

In [8]:
# Every record is a positional list; give its five positions readable column names.
column_names = {0: "Name", 1: "Description", 2: "Links", 3: "Rating1", 4: "Rating2"}
data = pd.DataFrame(movies).rename(columns=column_names)
data[:5]

Unnamed: 0,Name,Description,Links,Rating1,Rating2
0,Deadpool (film),"{'image': 'Deadpool poster.jpg', 'name': 'Dead...","[Tim Miller (director), Simon Kinberg, Ryan Re...",84%,6.9/10
1,The Revenant (2015 film),"{'image': 'The Revenant 2015 film poster.jpg',...","[Alejandro González Iñárritu, Arnon Milchan, S...",82%,7.9/10
2,Suicide Squad (film),"{'image': 'Suicide Squad (film) Poster.png', '...","[David Ayer, Charles Roven, Richard Suckle, Wi...",26%,4.7/10
3,Spectre (2015 film),"{'image': 'spectre poster.jpg', 'name': 'Spect...","[Sam Mendes, Michael G. Wilson, Barbara Brocco...",65%,
4,Rebel Without a Cause,"{'distributor': 'Warner Bros.', 'image': 'Rebe...","[Nicholas Ray, David Weisbart, Stewart Stern, ...",96%,


In [9]:
data[:7]

Unnamed: 0,Name,Description,Links,Rating1,Rating2
0,Deadpool (film),"{'image': 'Deadpool poster.jpg', 'name': 'Dead...","[Tim Miller (director), Simon Kinberg, Ryan Re...",84%,6.9/10
1,The Revenant (2015 film),"{'image': 'The Revenant 2015 film poster.jpg',...","[Alejandro González Iñárritu, Arnon Milchan, S...",82%,7.9/10
2,Suicide Squad (film),"{'image': 'Suicide Squad (film) Poster.png', '...","[David Ayer, Charles Roven, Richard Suckle, Wi...",26%,4.7/10
3,Spectre (2015 film),"{'image': 'spectre poster.jpg', 'name': 'Spect...","[Sam Mendes, Michael G. Wilson, Barbara Brocco...",65%,
4,Rebel Without a Cause,"{'distributor': 'Warner Bros.', 'image': 'Rebe...","[Nicholas Ray, David Weisbart, Stewart Stern, ...",96%,
5,Warcraft (film),"{'image': 'Warcraft Teaser Poster.jpg', 'name'...","[Duncan Jones, Thomas Tull, Charles Roven, Cha...",28%,4.2/10
6,The Martian (film),"{'image': 'The Martian film poster.jpg', 'name...","[Ridley Scott, Simon Kinberg, Michael Schaefer...",,


Изучим имеющиеся данные

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         10000 non-null  object
 1   Description  10000 non-null  object
 2   Links        10000 non-null  object
 3   Rating1      5584 non-null   object
 4   Rating2      5584 non-null   object
dtypes: object(5)
memory usage: 390.8+ KB


In [11]:
data.iloc[0].Name

'Deadpool (film)'

In [12]:
data.iloc[0].Description

{'Software Used': 'Adobe Premier Pro',
 'alt': "Official poster shows the titular hero Deadpool standing in front of the viewers, with hugging his hands, and donning his traditional black and red suit and mask, and the film's name, credits and billing below him.",
 'budget': '$58 million',
 'caption': 'Theatrical release poster',
 'cinematography': 'Ken Seng',
 'country': 'United States',
 'director': 'Tim Miller',
 'distributor': '20th Century Fox',
 'editing': 'Julian Clarke',
 'gross': '$783.1 million',
 'image': 'Deadpool poster.jpg',
 'language': 'English',
 'music': 'Tom Holkenborg',
 'name': 'Deadpool',
 'runtime': '108 minutes'}

In [13]:
data.iloc[0].Rating1, data.iloc[0].Rating2

('84%', '6.9/10')

Создаем новые поля на основании существующих:
- MovieLinks - ссылки на названия фильмов из датасета
- CategoryLinks - ссылки на категории

In [14]:
# All page names present in the dataset, for fast membership tests.
TitleSet = frozenset(data.Name)


def _links_to_dataset_movies(links):
    """Keep only the links whose target is itself a movie in the dataset."""
    return [link for link in links if link in TitleSet]


def _links_to_categories(links):
    """Keep only the links that point to a 'Category:' page."""
    return [link for link in links if link.startswith('Category:')]


data["MovieLinks"] = data.Links.map(_links_to_dataset_movies)
data["CategoryLinks"] = data.Links.map(_links_to_categories)
data.sample(3)

Unnamed: 0,Name,Description,Links,Rating1,Rating2,MovieLinks,CategoryLinks
2331,The Quiet Ones (2014 film),{'image': 'The Quiet Ones 2014 theatrical post...,"[John Pogue, James Gay-Rees, Simon Oakes, Tobi...",37%,,[],"[Category:2014 films, Category:2014 horror fil..."
5780,A Perfect World,"{'image': 'A_Perfect_World.jpg', 'name': 'A Pe...","[Bill Gold, John Lee Hancock, Kevin Costner, L...",81%,,"[In the Line of Fire, Unforgiven]","[Category:1993 films, Category:1990s crime dra..."
9226,The Foot Fist Way,"{'image': 'Foot fist way.jpg', 'name': 'The Fo...","[Jody Hill, Danny McBride, Ben Best (actor), P...",54%,5.5/10,[Napoleon Dynamite],"[Category:2006 films, Category:American comedy..."


Обрабатываем наименование фильма, а также приводим рейтинги к единообразному виду (шкала 0–100). Создаём ещё одно поле рейтинга, объединяющее оба имеющихся (их произведение).

In [15]:
def _parse_percent(value):
    """Convert a percentage string such as '84%' to the integer 84.

    Returns None for missing or too-short strings. Values that are not strings
    (already parsed by an earlier run of this cell) are passed through
    unchanged, so the cell can be executed more than once.
    """
    if isinstance(value, str):
        return int(value[:-1]) if len(value) > 1 else None
    if value is None or pd.isna(value):
        return None
    return value


def _parse_fraction(value):
    """Convert a 'score/maximum' string such as '6.9/10' to a 0-100 scale.

    Returns None for missing or too-short strings; non-string values are
    passed through unchanged for the same re-run safety as `_parse_percent`.
    """
    if isinstance(value, str):
        if len(value) <= 1:
            return None
        parts = value.split('/')  # split once instead of once per operand
        return 100 * float(parts[0]) / float(parts[1])
    if value is None or pd.isna(value):
        return None
    return value


# Bare title: the page name without a trailing " (... film)" disambiguation.
data["Title"] = data["Name"].apply(lambda x: x.split(' (')[0])
data["Rating1"] = data["Rating1"].apply(_parse_percent)
data["Rating2"] = data["Rating2"].apply(_parse_fraction)
# Combined score: the product of both ratings; missing if either one is missing.
data["Rating3"] = data["Rating1"] * data["Rating2"]

In [16]:
data.sample(3)

Unnamed: 0,Name,Description,Links,Rating1,Rating2,MovieLinks,CategoryLinks,Title,Rating3
7249,Pet Sematary Two,"{'image': 'Pet sematary ii ver2.jpg', 'name': ...","[Mary Lambert (director), Richard Outten, Edwa...",,,[Pet Sematary (film)],"[Category:1992 films, Category:1992 horror fil...",Pet Sematary Two,
6308,The Guyver,"{'image': 'The Guyver poster.jpg', 'name': 'Th...","[Screaming Mad George, Steve Wang, Brian Yuzna...",,,[Max Steel (film)],"[Category:1991 films, Category:1990s action fi...",The Guyver,
2777,Drive Angry,"{'image': 'Drive Angry Poster.jpg', 'name': 'D...","[Patrick Lussier, Michael De Luca, Todd Farmer...",45.0,53.0,[Season of the Witch (2011 film)],"[Category:2010s action films, Category:2010s f...",Drive Angry,2385.0


Также добавим ещё один датасет (wiki_movie_plots_deduped.csv) и объединим его с существующим по названию фильма

In [17]:
data2 = pd.read_csv('wiki_movie_plots_deduped.csv')
data2.sample(3)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
19353,1957,These Dangerous Years,British,Herbert Wilcox,"George Baker, Frankie Vaughan",comedy,https://en.wikipedia.org/wiki/These_Dangerous_...,Tough gang leader and wannabee rock star Dave ...
26103,2001,Moksha: Salvation,Bollywood,Ashok Mehta,"Manisha Koirala, Arjun Rampal",drama,https://en.wikipedia.org/wiki/Moksha:_Salvation,The story is about law graduate Vikram Saigal ...
12312,1994,Sioux City,American,Lou Diamond Phillips,"Lou Diamond Phillips, Melinda Dillon",drama,https://en.wikipedia.org/wiki/Sioux_City_(film),A young Lakota Sioux (Lou Diamond Phillips) is...


In [18]:
# Left-join the plot dataset onto the movie frame using the bare title as the key.
# NOTE(review): `Title` has the "(... film)" disambiguation stripped, so films that
# share a title collide - in the sample output "Arthur (2011 film)" is paired with
# the 1981 film's record. A title with several matches presumably also yields
# several rows for one movie - confirm before using `data3` downstream.
data3 = pd.merge(data, data2, how="left", on="Title")
data3.sample(5)

Unnamed: 0,Name,Description,Links,Rating1,Rating2,MovieLinks,CategoryLinks,Title,Rating3,Release Year,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
6067,Minsara Kanavu,"{'image': 'Minsara Kanavu.jpg', 'name': 'Minsa...","[Rajiv Menon, M. Saravanan (film producer), M....",,,"[Bombay (film), Iruvar, Kandukondain Kandukond...","[Category:1997 films, Category:Indian films, C...",Minsara Kanavu,,1997.0,Tamil,Rajiv Menon,"Prabhu Deva, Arvind Swamy, Kajol\r\n",unknown,https://en.wikipedia.org/wiki/Minsara_Kanavu,Priya Amal Raj (Kajol) lost her mother at a ve...
5662,Head-On (film),"{'image': 'Gegen die Wand (2004).jpg', 'name':...","[Fatih Akın, Mehmet Kurtuluş, Birol Ünel, Sibe...",90.0,74.0,"[Caché (film), Match Point]","[Category:2004 films, Category:German films, C...",Head-On,6660.0,,,,,,,
1631,50/50 (2011 film),"{'image': '50 50 Poster.jpg', 'name': '50/50',...","[Jonathan Levine (screenwriter), Evan Goldberg...",94.0,77.0,[],"[Category:2010s comedy-drama films, Category:2...",50/50,7238.0,2011.0,American,Jonathan Levine,"Joseph Gordon-Levitt, Seth Rogen, Anna Kendric...",comedy-drama,https://en.wikipedia.org/wiki/50/50_(2011_film),Adam Lerner (Joseph Gordon-Levitt) is a 27-yea...
5706,Arthur (2011 film),"{'distributor': 'Warner Bros. Pictures', 'imag...","[Jason Winer, Larry Brezner, Kevin McCormick (...",26.0,44.0,"[Arthur (1981 film), Arthur (1981 film), Arthu...","[Category:American films, Category:2011 films,...",Arthur,1144.0,1981.0,American,Steve Gordon,"Dudley Moore, Liza Minnelli, John Gielgud, Ger...",romantic comedy,https://en.wikipedia.org/wiki/Arthur_(1981_film),Arthur Bach is a spoiled alcoholic from New Yo...
8533,Thupparivaalan,"{'director': 'Mysskin', 'name': 'Thupparivaala...","[Mysskin, Vishal (actor), Vishal (actor), Vina...",,,[],"[Category:2017 films, Category:Indian films, C...",Thupparivaalan,,2017.0,Tamil,Mysskin,"Vishal, Prasanna, Vinay, Andrea Jeremiah, Anu ...",crime thriller,https://en.wikipedia.org/wiki/Thupparivaalan,"The film starts with Dhiwakar, a software engi..."


Далее можно строить модель по многочисленным признакам помимо исходных ссылок, но в рамках данной работы это опустим

### Часть II. Построение модели выбора ближайших фильмов

In [19]:
# Rebind `movies` from the raw JSON records to the rows of the enriched frame.
# Row positions follow the DataFrame column order: 0 Name, 1 Description, 2 Links,
# 3 Rating1, 4 Rating2, 5 MovieLinks, 6 CategoryLinks, 7 Title, 8 Rating3.
movies = list(data.to_numpy())

In [20]:
def movie_option(movie, x):
    """Return the items of `movie` that the chosen work option treats as its links.

    Parameters
    ----------
    movie : sequence
        One movie record, read by position: 1 is the description mapping,
        2 the outgoing links, 5 the links to movies of the dataset and
        6 the category links.
    x : int
        Work option, 1 to 5 (the meanings are listed in `work_option_description`).

    Returns
    -------
    The stored list for options 1, 4 and 5, a set of the links for option 2,
    and the description's ``items()`` view for option 3.

    Raises
    ------
    KeyError
        If `x` is not a supported option.
    """
    # Only the requested field is touched. Building all five results up front
    # would create a throwaway set for every movie and would fail on a record
    # lacking a field that the selected option never reads.
    if x == 1:
        return movie[2]
    if x == 2:
        return set(movie[2])
    if x == 3:
        return movie[1].items()
    if x == 4:
        return movie[5]
    if x == 5:
        return movie[6]
    raise KeyError(x)

In [21]:
def common_link_counts(work_option=1):
    """Count how often each item selected by `work_option` occurs across all movies.

    Returns a `Counter` keyed by the items that `movie_option` yields for
    every record in `movies`.
    """
    return Counter(
        item
        for record in movies
        for item in movie_option(record, work_option)
    )


link_counts = common_link_counts(work_option)
link_counts.most_common(10)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867),
 ('Variety (magazine)', 5450),
 ('Metacritic', 5112),
 ('Box Office Mojo', 4186),
 ('The New York Times', 3818),
 ('The Hollywood Reporter', 3553),
 ('Roger Ebert', 2707),
 ('Los Angeles Times', 2454)]

Собираем словари индексов фильмов и ссылок, а также список пар (ссылка, фильм)

In [22]:
min_count_links = 10

# Row index of every movie, keyed by its page name.
movie_to_idx = {}
for position, record in enumerate(movies):
    movie_to_idx[record[0]] = position

# Links that occur at least `min_count_links` times, and their indices.
top_links = []
for link, occurrences in link_counts.items():
    if occurrences >= min_count_links:
        top_links.append(link)
link_to_idx = dict(zip(top_links, range(len(top_links))))


def pairs_accounting(work_option=1):
    """List the (link index, movie index) pairs for every retained link of every movie.

    Links that are absent from `link_to_idx` are skipped; repeated links of a
    movie produce repeated pairs.
    """
    collected = []
    for record in movies:
        for link in movie_option(record, work_option):
            if link in link_to_idx:
                collected.append((link_to_idx[link], movie_to_idx[record[0]]))
    return collected


pairs = pairs_accounting(work_option)
pairs_set = set(pairs)
len(movie_to_idx), len(top_links), len(pairs_set)

(10000, 19126, 505031)

Описываем генератор и саму модель

In [23]:
random.seed(rnd_seed)
np.random.seed(rnd_seed)  # the in-batch shuffle below draws from NumPy's global RNG


def batchifier(pairs, positive_samples=64, negative_ratio=8):
    """Endlessly generate training batches of (link, movie) index pairs.

    Parameters
    ----------
    pairs : list of (link_id, movie_id)
        Observed pairs; `positive_samples` of them are drawn for each batch.
    positive_samples : int
        Number of observed pairs per batch, labelled 1.
    negative_ratio : int
        Number of random pairs per observed pair. Random pairs are kept only
        if they are absent from `pairs_set`, and are labelled -1.

    Yields
    ------
    ({'link': link_ids, 'movie': movie_ids}, labels)
        Three arrays of length ``positive_samples * (1 + negative_ratio)``.
    """
    batch_size = positive_samples * (1 + negative_ratio)
    batch = np.zeros((batch_size, 3))
    while True:
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        # Rejection sampling: keep drawing until enough unseen pairs are found.
        while idx < batch_size:
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            if (link_id, movie_id) not in pairs_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        np.random.shuffle(batch)
        # `batch` is reused and overwritten on the next iteration, so hand out
        # copies: a consumer that queues batches must not see them change.
        yield (
            {'link': batch[:, 0].copy(), 'movie': batch[:, 1].copy()},
            batch[:, 2].copy(),
        )

In [24]:
embedding_size = 100
nn_metric = 'mse'


def movie_embedding_model(embedding_size=100):
    """Build and compile the link/movie embedding model.

    Two single-index inputs ('link' and 'movie') are each looked up in their
    own embedding table of width `embedding_size`; the model output is the
    normalised dot product of the two vectors, reshaped to one value per
    sample. The model is compiled with the 'nadam' optimizer and the loss
    named by `nn_metric`.
    """
    link_input = Input(name='link', shape=(1,))
    link_table = Embedding(
        name='link_embedding',
        input_dim=len(top_links),
        output_dim=embedding_size,
    )
    link_vector = link_table(link_input)

    movie_input = Input(name='movie', shape=(1,))
    movie_table = Embedding(
        name='movie_embedding',
        input_dim=len(movie_to_idx),
        output_dim=embedding_size,
    )
    movie_vector = movie_table(movie_input)

    similarity = Dot(name='dot_product', normalize=True, axes=2)([link_vector, movie_vector])
    flat_similarity = Reshape((1,))(similarity)

    network = Model(inputs=[link_input, movie_input], outputs=[flat_similarity])
    network.compile(optimizer='nadam', loss=nn_metric)
    return network


model = movie_embedding_model(embedding_size)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 link (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 movie (InputLayer)             [(None, 1)]          0           []                               
                                                                                                  
 link_embedding (Embedding)     (None, 1, 100)       1912600     ['link[0][0]']                   
                                                                                                  
 movie_embedding (Embedding)    (None, 1, 100)       1000000     ['movie[0][0]']                  
                                                                                              

Обучение построенной модели

In [25]:
positive_samples_per_batch = 256
negative_ratio = 16

epochs = 25

# `Model.fit` accepts Python generators directly; `fit_generator` is deprecated
# and is not available in newer Keras releases.
# The generator never ends, so one "epoch" is defined as enough steps to draw
# as many positive samples as there are pairs.
model.fit(
    batchifier(pairs, positive_samples=positive_samples_per_batch, negative_ratio=negative_ratio),
    epochs=epochs,
    steps_per_epoch=len(pairs) // positive_samples_per_batch,
    verbose=2
)

Epoch 1/25
2832/2832 - 80s - loss: 0.3181 - 80s/epoch - 28ms/step
Epoch 2/25
2832/2832 - 77s - loss: 0.1704 - 77s/epoch - 27ms/step
Epoch 3/25
2832/2832 - 76s - loss: 0.1681 - 76s/epoch - 27ms/step
Epoch 4/25
2832/2832 - 77s - loss: 0.1671 - 77s/epoch - 27ms/step
Epoch 5/25
2832/2832 - 77s - loss: 0.1666 - 77s/epoch - 27ms/step
Epoch 6/25
2832/2832 - 77s - loss: 0.1663 - 77s/epoch - 27ms/step
Epoch 7/25
2832/2832 - 77s - loss: 0.1660 - 77s/epoch - 27ms/step
Epoch 8/25
2832/2832 - 77s - loss: 0.1659 - 77s/epoch - 27ms/step
Epoch 9/25
2832/2832 - 77s - loss: 0.1657 - 77s/epoch - 27ms/step
Epoch 10/25
2832/2832 - 77s - loss: 0.1655 - 77s/epoch - 27ms/step
Epoch 11/25
2832/2832 - 77s - loss: 0.1655 - 77s/epoch - 27ms/step
Epoch 12/25
2832/2832 - 76s - loss: 0.1654 - 76s/epoch - 27ms/step
Epoch 13/25
2832/2832 - 76s - loss: 0.1654 - 76s/epoch - 27ms/step
Epoch 14/25
2832/2832 - 76s - loss: 0.1652 - 76s/epoch - 27ms/step
Epoch 15/25
2832/2832 - 77s - loss: 0.1652 - 77s/epoch - 27ms/step
Epoc

<keras.callbacks.History at 0x7fdbf9705610>

In [26]:
# Pull the trained weight matrix of the 'movie_embedding' layer out of the model.
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
# Scale every row to unit length, so the dot product of two rows is their cosine similarity.
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = (movie_weights.T / movie_lengths).T

def similar_movies(movie, count=10, prnt = True):
    """Return the `count` movies whose embeddings are closest to that of `movie`.

    Parameters
    ----------
    movie : str
        Page name of the query movie; must be a key of `movie_to_idx`.
    count : int
        Number of entries to return.
    prnt : bool
        When true, also print one line per entry (index, name, distance).

    Returns
    -------
    list of [name, distance]
        Ordered from closest to farthest. The distance is ``1 - cosine
        similarity`` rounded to `round_est` decimals and never negative.

    Raises
    ------
    KeyError
        If `movie` is not in `movie_to_idx`.
    """
    similarities = np.dot(normalized_movies, normalized_movies[movie_to_idx[movie]])
    closest = np.argsort(similarities)[-count:]
    closest_movies = list()
    for c in reversed(closest):
        distance = round(float(1 - similarities[c]), round_est)
        # Floating-point error can put a self-similarity slightly above 1, which
        # would be reported as "-0.0"; a cosine distance cannot be negative.
        if distance <= 0:
            distance = 0.0
        closest_movies.append([movies[c][0], distance])
        if prnt: print(c, '- ', movies[c][0], ' ---', distance)
    return closest_movies

##### Выбор набора наиболее близких фильмов по фильму

In [27]:
similar_movies('Harry Potter (film series)', 15);

33 -  Harry Potter (film series)  --- -0.0
380 -  Harry Potter and the Deathly Hallows – Part 2  --- 0.0142
332 -  Harry Potter and the Philosopher's Stone (film)  --- 0.0184
827 -  Harry Potter and the Order of the Phoenix (film)  --- 0.0209
871 -  Harry Potter and the Half-Blood Prince (film)  --- 0.0215
698 -  Harry Potter and the Deathly Hallows – Part 1  --- 0.0354
677 -  Harry Potter and the Goblet of Fire (film)  --- 0.0357
828 -  Harry Potter and the Prisoner of Azkaban (film)  --- 0.0363
169 -  Alice in Wonderland (2010 film)  --- 0.0656
91 -  The Hobbit (film series)  --- 0.0695
967 -  Harry Potter and the Chamber of Secrets (film)  --- 0.0703
56 -  Pirates of the Caribbean (film series)  --- 0.0732
93 -  The Lord of the Rings (film series)  --- 0.0799
66 -  Skyfall  --- 0.0907
961 -  The Golden Compass (film)  --- 0.0923


In [28]:
similar_movies('Titanic (1997 film)', 15);

35 -  Titanic (1997 film)  --- -0.0
84 -  Saving Private Ryan  --- 0.0293
680 -  A.I. Artificial Intelligence  --- 0.0318
245 -  Gravity (film)  --- 0.0384
972 -  Big Fish  --- 0.0436
939 -  Changeling (film)  --- 0.0439
449 -  The Curious Case of Benjamin Button (film)  --- 0.0449
2830 -  Evita (1996 film)  --- 0.0453
85 -  Inception  --- 0.0469
448 -  Minority Report (film)  --- 0.0472
582 -  King Kong (2005 film)  --- 0.0482
155 -  Gladiator (2000 film)  --- 0.0514
303 -  Raiders of the Lost Ark  --- 0.0524
1735 -  Letters from Iwo Jima  --- 0.0549
172 -  Schindler's List  --- 0.0553


In [29]:
similar_movies('Avatar (2009 film)', 15);

37 -  Avatar (2009 film)  --- 0.0
154 -  Star Trek (film)  --- 0.0177
29 -  Rogue One  --- 0.0248
19 -  Interstellar (film)  --- 0.0265
784 -  Spider-Man 2  --- 0.0268
413 -  Superman Returns  --- 0.0294
479 -  The Last Airbender  --- 0.03
613 -  Terminator Salvation  --- 0.0301
3349 -  Star Wars: The Force Awakens  --- 0.0314
86 -  Tomorrowland (film)  --- 0.0318
1159 -  Cowboys & Aliens  --- 0.0363
416 -  Spider-Man (2002 film)  --- 0.0366
118 -  Watchmen (film)  --- 0.0379
61 -  Man of Steel (film)  --- 0.0381
200 -  The Incredible Hulk (film)  --- 0.0386


In [30]:
similar_movies('Spider-Man (2002 film)', 15);

416 -  Spider-Man (2002 film)  --- -0.0
784 -  Spider-Man 2  --- 0.0133
76 -  The Dark Knight (film)  --- 0.0158
478 -  Spider-Man 3  --- 0.0165
613 -  Terminator Salvation  --- 0.0166
413 -  Superman Returns  --- 0.0167
118 -  Watchmen (film)  --- 0.0184
195 -  Iron Man (2008 film)  --- 0.0235
173 -  Batman Begins  --- 0.0255
200 -  The Incredible Hulk (film)  --- 0.0292
353 -  Iron Man 2  --- 0.0293
154 -  Star Trek (film)  --- 0.0321
19 -  Interstellar (film)  --- 0.0335
1159 -  Cowboys & Aliens  --- 0.0336
37 -  Avatar (2009 film)  --- 0.0366


In [31]:
similar_movies('Gladiator (2000 film)', 15);

155 -  Gladiator (2000 film)  --- 0.0
260 -  Braveheart  --- 0.0227
669 -  Munich (film)  --- 0.0405
1121 -  Empire of the Sun (film)  --- 0.0414
1048 -  Alexander (2004 film)  --- 0.042
3110 -  Hereafter (film)  --- 0.0422
240 -  Black Hawk Down (film)  --- 0.0457
433 -  The Bourne Ultimatum (film)  --- 0.0469
702 -  Robin Hood (2010 film)  --- 0.0501
1128 -  The Phantom of the Opera (2004 film)  --- 0.0512
35 -  Titanic (1997 film)  --- 0.0514
2343 -  United 93 (film)  --- 0.054
6841 -  Dead Again  --- 0.0545
418 -  Gangs of New York  --- 0.0549
825 -  The Good Shepherd (film)  --- 0.0552


### Часть III. Построение модели рекомендаций

##### Рекомендательная система одного фильма по заданному фильму

Фильм выбирается случайно из всех остальных фильмов датасета с весами basic^(1 − d), где d — косинусное расстояние до заданного фильма. Так сделано для того, чтобы не рекомендовать всё время одно и то же. Параметр basic регулирует, насколько чаще рекомендуются близкие фильмы по сравнению с остальными: чем он больше, тем чаще выбираются фильмы, близкие к заданному.

In [32]:
def movie_choise(best_movie, basic=1E7):
    """Pick one recommendation for `best_movie` at random, favouring close movies.

    Every movie except the top entry of the similarity ranking (normally
    `best_movie` itself) is a candidate with weight ``basic ** (1 - d)``,
    where ``d`` is the distance reported by `similar_movies`. A larger
    `basic` concentrates the choice on the nearest movies.

    Parameters
    ----------
    best_movie : str
        Page name of the movie the user likes.
    basic : float
        Base of the exponential weighting.

    Returns
    -------
    str
        Page name of the chosen movie. If there is no other movie to choose
        from, the top entry of the ranking is returned.
    """
    # Rank the whole catalogue rather than a hard-coded number of movies.
    ranked = similar_movies(best_movie, len(normalized_movies), False)
    candidates = ranked[1:]
    if not candidates:
        return ranked[0][0]
    titles = [title for title, _ in candidates]
    weights = [basic ** (1 - distance) for _, distance in candidates]
    # One weighted draw; the top-ranked entry is excluded, so it can never be
    # mistaken for a candidate.
    return random.choices(titles, weights=weights, k=1)[0]

In [33]:
def sent(movie):
    """Return a recommendation sentence for `movie`, naming one movie drawn by `movie_choise`."""
    return f'Если Вам нравится фильм {movie}, обратите внимание на фильм {movie_choise(movie)}'

In [34]:
movie = 'Harry Potter (film series)'

sent(movie)

'Если Вам нравится фильм Harry Potter (film series), обратите внимание на фильм Moana (2016 film)'

In [35]:
sent(movie)

'Если Вам нравится фильм Harry Potter (film series), обратите внимание на фильм Burnt (film)'

In [36]:
sent(movie)

'Если Вам нравится фильм Harry Potter (film series), обратите внимание на фильм Veronica Guerin (film)'

In [37]:
sent(movie)

'Если Вам нравится фильм Harry Potter (film series), обратите внимание на фильм Promised Land (2012 film)'

In [38]:
sent(movie)

'Если Вам нравится фильм Harry Potter (film series), обратите внимание на фильм Desert Dancer'

##### Рекомендательная система набора фильмов по заданному набору фильмов

In [39]:
count_raiting_movies = 10
rating = "Rating3"

# Movies with a positive value in the chosen rating column, best first.
sort_films = data[data[rating] > 0].sort_values(rating, ascending=False)

# Positive examples: a hand-picked list of well-regarded movies.
best_movies_set = [
    'Harry Potter (film series)',
    'Titanic (1997 film)',
    'Avatar (2009 film)',
    'Gladiator (2000 film)',
    'The Lord of the Rings (film series)',
    'Sherlock Holmes (2010 film)',
    'Gone with the Wind (film)',
    'The Social Network',
    'The Wizard of Oz (1939 film)',
    'The Adventures of Robin Hood',
]
# Negative examples: the tail of the ranking, i.e. the lowest-rated movies.
worst_movies_set = list(sort_films.Name.values[-count_raiting_movies:])

# Training matrix: one normalised embedding per labelled movie, good ones first.
X = np.asarray([
    normalized_movies[movie_to_idx[name]]
    for name in best_movies_set + worst_movies_set
])
y = np.asarray([1] * len(best_movies_set) + [0] * len(worst_movies_set))

In [40]:
print('Для рекомендательной системы были выбраны:')
print('Хорошие фильмы:', best_movies_set, sep='\n')
print('Плохие фильмы:', worst_movies_set, sep='\n')

Для рекомендательной системы были выбраны:
Хорошие фильмы:
['Harry Potter (film series)', 'Titanic (1997 film)', 'Avatar (2009 film)', 'Gladiator (2000 film)', 'The Lord of the Rings (film series)', 'Sherlock Holmes (2010 film)', 'Gone with the Wind (film)', 'The Social Network', 'The Wizard of Oz (1939 film)', 'The Adventures of Robin Hood']
Плохие фильмы:
['The Darkness (film)', 'Passion Play (film)', 'Down to You', 'I, Frankenstein', 'Shut In (2016 film)', 'Jack and Jill (2011 film)', 'The Legend of Hercules', 'The Apparition', 'Epic Movie', 'Left Behind (2014 film)']


Здесь можно задать номер модели классификации, с которой будем работать

In [41]:
class_option = 2
class_option_description = {1: 'SVC',
                            2: 'LinearSVC',
                            3: 'LogisticRegression',
                            4: 'SGDClassifier',
                            5: 'RidgeClassifier',
                            6: 'GradientBoostingClassifier', 
                            7: 'AdaBoostClassifier'}

In [42]:
# Registry of candidate classifiers, keyed by the names used in `class_option_description`.
models = {
    'SVC': SVC(random_state=rnd_seed),
    'LinearSVC': LinearSVC(random_state=rnd_seed),
    'LogisticRegression': LogisticRegression(multi_class='multinomial', random_state=rnd_seed),
    'SGDClassifier': SGDClassifier(random_state=rnd_seed),
    'RidgeClassifier': RidgeClassifier(random_state=rnd_seed),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=rnd_seed),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=rnd_seed),
}

Обучим модель и получим требуемые результаты

In [43]:
clf = models[class_option_description[class_option]]
clf.fit(X, y)

LinearSVC(random_state=17)

In [44]:
print('На основании построенной модели рекомендаций были выбраны:')

# Score every movie with the fitted classifier and order the indices by score, ascending.
estimated_movie_ratings = clf.decision_function(normalized_movies)
best = np.argsort(estimated_movie_ratings)

# Top 20 by descending score, then the 5 lowest-scored movies.
ranking_sections = (
    ('Лучшие фильмы:', best[::-1][:20]),
    ('Худшие фильмы:', best[:5]),
)
for header, indices in ranking_sections:
    print(header)
    for c in indices:
        print(c, movies[c][0], round(estimated_movie_ratings[c], round_est))

На основании построенной модели рекомендаций были выбраны:
Лучшие фильмы:
134 Citizen Kane 1.5131
18 Star Wars (film) 1.4388
371 Lawrence of Arabia (film) 1.4066
310 Ben-Hur (1959 film) 1.3789
129 Gone with the Wind (film) 1.3551
186 Casablanca (film) 1.2466
220 Jaws (film) 1.2062
942 Gandhi (film) 1.1754
759 The Ten Commandments (1956 film) 1.1495
303 Raiders of the Lost Ark 1.1477
172 Schindler's List 1.1289
500 The Deer Hunter 1.1113
159 Apocalypse Now 1.098
68 The Godfather 1.067
141 Alien (film) 1.0633
683 Superman (1978 film) 1.0507
1031 Sunset Boulevard (film) 1.0461
556 Doctor Zhivago (film) 1.0118
424 It's a Wonderful Life 1.0106
84 Saving Private Ryan 0.9966
Худшие фильмы:
2191 Scary Movie (film series) -1.1907
994 Not Another Teen Movie -1.1698
3526 The Best Man Holiday -1.136
3682 Harold & Kumar Escape from Guantanamo Bay -1.1298
1685 She's All That -1.0997
