In [1]:
# импорт библиотек
import pandas as pd
import numpy as np


In [2]:
# увеличим ширину отображения столбцов
pd.set_option('display.max_colwidth', 200)


In [28]:
# загрузка
df = pd.read_csv('../data/movies_metadata.csv', low_memory = False)
df = df[df['title'].notna()]
df.shape

(45460, 24)

In [31]:
df.columns.tolist()

['adult',
 'belongs_to_collection',
 'budget',
 'genres',
 'homepage',
 'id',
 'imdb_id',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'video',
 'vote_average',
 'vote_count']

In [32]:
# проверим пустые значения в описаниях (overview)
df['overview'].isna().mean()

np.float64(0.020985481742190937)

In [33]:
# 2% пропущенных описаний - это мало, можно удалить эти строки
df = df.dropna(subset = ['overview']).reset_index(drop = True)

In [34]:
df['overview'].isna().mean()

np.float64(0.0)

In [18]:
from sentence_transformers import SentenceTransformer

# загрузим предобученную модель
model = SentenceTransformer('all-MiniLM-L6-v2')

# создаём эмбеддинги для всех описаний
embeddings = model.encode(df['overview'].tolist(), show_progress_bar = True)
print('начало')
print(f"Размер эмбеддингов: {embeddings.shape}")
print('конец')

Batches:   0%|          | 0/1391 [00:00<?, ?it/s]

начало
Размер эмбеддингов: (44512, 384)
конец


In [35]:
# сохраним эмбеддинги и датафрейм, чтобы не пересчитывать их каждый раз
import pickle

# сохранение эмбеддинга
with open('../data/embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings, f)


# сохранение датафрейма
df.to_csv('../data/movies_cleaned.csv', index = False)


In [36]:
with open('../data/embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)

In [37]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def recommend_movies(query_index, embeddings, df, top_n = 5):
    # считаем косинусную близость между фильмом и всеми остальными
    similarities = cosine_similarity(
        [embeddings[query_index]], 
        embeddings
    )[0]

    # получаем индексы топ-N схожих фильмов, исключая сам фильм (query_index)
    similar_indices = similarities.argsort()[::-1][1:top_n + 1]

    return df.iloc[similar_indices][['title', 'overview']]
    

In [38]:
# используем
query_index = 111   # рандом

print('Запрос:', df.iloc[query_index]['title'])
print(df.iloc[query_index]['overview'], "\n")

# рекомендации
recommend_movies(query_index, embeddings, df)


Запрос: Margaret's Museum
In a town where half the men die down the coalpit, Margaret MacNeil is quite happy being single in her small Cape Breton island town. Until she meets Neil Currie, a charming and sincere bagpipe-playing, Gaelic-speaking dishwasher. But no matter what you do, you can't avoid the spectre of the pit forever. 



Unnamed: 0,title,overview
40182,Murder in a Small Town,"A widowed theatre director moves to a small Connecticut town where he gets involved in solving the murder of a millionaire, who was the most despised man in town."
4050,I Know Where I'm Going!,Joan Webster is an ambitious and stubborn middle-class English woman determined to move forward since her childhood. She meets her father in a fancy restaurant to tell him that she will marry the ...
23900,Dim Sum: A Little Bit of Heart,"In San Francisco, an immigrant Chinese widow welcomes the new year with some unhappiness: she's 62 now, she wants to make a trip to China to pay last respects to her ancestors, a fortune teller ha..."
36146,Oddsac,"Opening with torch-wielding villagers and a wall bleeding oil, this experimental film attaches vivid scenery and strange characters to the wonderful melodic wavelengths of the band Animal Collecti..."
28579,Fifty/Fifty,Two bickering mercenaries are hired by the CIA to overthrow a South East Asian dictator.


In [39]:
from sklearn.metrics.pairwise import cosine_similarity

def get_recommendations(title: str, df, embeddings, top_n = 5) -> list:
    # Находит фильмы, похожие по описанию
    title = title.lower()
    matches = df[df['title'].str.lower().str.contains(title)]

    if matches.empty:
        return []

    query_index = matches.index[0]

    sims = cosine_similarity(
        [embeddings[query_index]],
        embeddings
    )[0]

    similar_indices = sims.argsort()[::-1][1:top_n+1]
    return df.iloc[similar_indices][['title', 'overview']].to_dict(orient='records')


In [40]:
import nest_asyncio
nest_asyncio.apply()

In [43]:
from telegram import Update
from telegram.ext import ApplicationBuilder, CommandHandler, ContextTypes, MessageHandler, filters
import logging
from dotenv import load_dotenv
import os

load_dotenv()

BOT_TOKEN = os.getenv('BOT_TOKEN')


logging.basicConfig(
    format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s', 
    level = logging.INFO
)


async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    await update.message.reply_text(
        "Привет, напиши название фильма, и я порекомендую похожие."
    )


async def recommend(update: Update, context: ContextTypes.DEFAULT_TYPE):
    user_text = update.message.text
    print(f"Получено сообщение: {user_text}")

    recommendations = get_recommendations(user_text, df, embeddings, top_n=5)
    print(f"Найдено рекомендаций: {len(recommendations)}")

    if not recommendations:
        await update.message.reply_text("Фильм не найден или похожих нет.")
    else:
        response = ""
        for rec in recommendations:
            response += f"{rec['title']}\n{rec['overview']}\n\n"
        await update.message.reply_text(response)


def main():
    app = ApplicationBuilder().token(TOKEN).build()

    app.add_handler(CommandHandler("start", start))
    app.add_handler(MessageHandler(filters.ALL, recommend))

    print("Бот запущен...")
    app.run_polling()


In [None]:
main()