In [21]:
from selenium import webdriver
import time
from bs4 import BeautifulSoup

In [22]:
def parse_movies(url):
    driver = webdriver.Firefox()
    driver.get(url)

    response = driver.page_source

    soup = BeautifulSoup(response, 'html.parser')

    movies = []

    movie_blocks = soup.find_all('div', class_='styles_root__ti07r')

    for block in movie_blocks:
        title_tag = block.find("span", class_='styles_mainTitle__IFQyZ styles_activeMovieTittle__kJdJj')
        title = title_tag.text if title_tag else "Unknown"
        info_tags = block.find_all("span", class_="desktop-list-main-info_truncatedText__IMQRP")
        if len(info_tags) > 0:
            country_genre_director = info_tags[0].text.split("•")
            country = country_genre_director[0].strip() if len(country_genre_director) > 0 else "Unknown"
            genre_director = country_genre_director[1].split("\xa0\xa0")
            genre = genre_director[0]

            director = genre_director[1].replace("Режиссёр: ", "").strip()
        else:
            genre = "Unknown"
            country = "Unknown"
            director = "Unknown"

        actors = (
            info_tags[1].text.replace("В ролях:", "").strip()
            if len(info_tags) > 1
            else "Unknown"
        )

        movies.append({
            "title": title,
            "country": country,
            "genre": genre,
            "director": director,
            "actors": actors,
        })
    driver.quit()
    return movies

In [23]:
def parse_pages(base_url):
    all_movies = []
    for page in range(1,6):
        url = f'{base_url}?page={page}'
        all_movies.extend(parse_movies(url))
    return all_movies

In [24]:
movies = parse_pages('https://www.kinopoisk.ru/lists/movies/top250/')

In [25]:
import pandas as pd

df = pd.DataFrame(movies)
df.to_csv('movies.csv', index=False, encoding='utf-8')

In [34]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
encoder = OneHotEncoder()
preprocessor = ColumnTransformer(
    transformers=[
        ('genre', vectorizer, 'genre'),
        ('actors', vectorizer, 'actors'),
        ('country', encoder, ['country'])
    ])
data = pd.read_csv("movies.csv")
X = preprocessor.fit_transform(data)
X = X.toarray()

knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(X)

In [36]:
import numpy as np
def get_recommendations(user_movies, X, knn):
    user_vector = X[user_movies].mean(axis=0)  # Усредняем векторы
    user_vector = np.asarray(user_vector).reshape(1, -1)  # Преобразуем в массив
    
    distances, indices = knn.kneighbors(user_vector)
    recommended_indices = [idx for idx in indices[0] if idx not in user_movies]
    
    return recommended_indices

# Фильмы пользователя
user_movies = [27, 185, 186, 52]  
# 27 Собачье сердце,СССР, драма,Владимир Бортко,"Евгений Евстигнеев, Владимир Толоконников"
# 185 Игра,США, триллер,Дэвид Финчер,"Майкл Дуглас, Шон Пенн"
# 186 Работа без авторства,Германия, биография,Флориан Хенкель фон Доннерсмарк,"Том Шиллинг, Себастьян Кох"
# 52 Трасса 60,Канада, фантастика,Боб Гейл,"Джеймс Марсден, Гари Олдман"

recommended_indices = get_recommendations(user_movies, X, knn)

recommended_movies = data.iloc[recommended_indices]
print("Рекомендованные фильмы:")
print(recommended_movies[['title']])

Рекомендованные фильмы:
                                  title
14                               Начало
176                               Изгой
171  Темный рыцарь: Возрождение легенды
