In [19]:
import sys
sys.path.append('../src')
from config_loader import load_config
from mlflow_setup import setup_mlflow

import numpy as np
import pandas as pd
import mlflow
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
import joblib
import random
import warnings
warnings.filterwarnings('ignore')
import nltk
from nltk.corpus import stopwords

In [20]:
cfg = load_config("bertopic")
setup_mlflow(cfg)

# –ù–∞—á–∏–Ω–∞–µ–º run
run = mlflow.start_run(run_name="bertopic_fastapi")
print(f"üöÄ MLflow Run: {run.info.run_id}")

try:
    # –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö
    client = mlflow.tracking.MlflowClient()
    data_path = client.download_artifacts(cfg['data']['data_run_id'], cfg['data']['data_path'])
    df = pd.read_csv(data_path)
    documents = df['message_clean_no_stopwords'].fillna('').tolist()
    
    print(f"üì• –ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(documents)} –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")
    
    # –õ–æ–≥–∏—Ä—É–µ–º –ø–∞—Ä–∞–º–µ—Ç—Ä—ã (–∫–∞–∫ –≤ –æ—Ä–∏–≥–∏–Ω–∞–ª–µ)
    mlflow.log_params({
        "data_run_id": cfg['data']['data_run_id'],
        "documents_count": len(documents),
        "embedding_model": cfg['embedding']['model_name'],
        "umap_n_neighbors": cfg['umap']['n_neighbors'],
        "umap_n_components": cfg['umap']['n_components'],
        "hdbscan_min_cluster_size": cfg['hdbscan']['min_cluster_size'],
        "vectorizer_ngram_range": str(cfg['vectorizer']['ngram_range']),
        "bertopic_min_topic_size": cfg['bertopic']['min_topic_size'],
        "calculate_probabilities": cfg['bertopic']['calculate_probabilities']
    })

    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')

# –í –∫–æ–¥–µ –æ–±—É—á–µ–Ω–∏—è –ü–ï–†–ï–î —Å–æ–∑–¥–∞–Ω–∏–µ–º –º–æ–¥–µ–ª–∏:
    def get_stopwords():
        """–ü–æ–ª—É—á–∞–µ–º –ø–æ–ª–Ω—ã–π —Å–ø–∏—Å–æ–∫ —Å—Ç–æ–ø-—Å–ª–æ–≤"""
        # –†—É—Å—Å–∫–∏–µ —Å—Ç–æ–ø-—Å–ª–æ–≤–∞ –∏–∑ nltk
        russian_stopwords = set(stopwords.words("russian"))
    
    # –í–∞—à–∏ —Å—Ç–æ–ø-—Å–ª–æ–≤–∞ –¥–ª—è –º–µ—Ç—Ä–æ
        metro_stopwords = [
            '–º–µ—Ç—Ä–æ', '—Å—Ç–∞–Ω—Ü–∏—è', '–ø–∞—Å—Å–∞–∂–∏—Ä', '–º–æ—Å–∫–≤–∞', '–º–æ—Å–∫–æ–≤—Å–∫–∏–π',
            '–ø–æ–¥–∑–µ–º–∫–∞', '–≤—Ä–µ–º—è', '–≥–æ—Ä–æ–¥', '—Ä–∞–±–æ—Ç–∞', '—Ç—Ä–∞–Ω—Å–ø–æ—Ä—Ç',
            '—ç—Ç–æ', '–∫–æ—Ç–æ—Ä—ã–π', '—Ç–∞–∫–∂–µ', '–≥–æ–¥', '–¥–µ–Ω—å', '–Ω–æ–≤—ã–π', 'vk',
            '–¥–≤–∏–∂–µ–Ω–∏–µ', '–º–∞—Ä—à—Ä—É—Ç', '–ø—É—Ç—å', '—Ü–µ–Ω—Ç—Ä', '—Ä–∞–±–æ—Ç–∞—Ç—å', '–ø–æ–µ–∑–¥–∫–∞', '–ø–æ–¥—Å–ª—É—à–∞—Ç—å'
        ]
    
        # –û–±—ä–µ–¥–∏–Ω—è–µ–º
        all_stopwords = list(russian_stopwords.union(set(metro_stopwords)))
        
        # –î–æ–±–∞–≤–ª—è–µ–º –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω—ã–µ —Å—Ç–æ–ø-—Å–ª–æ–≤–∞, –∫–æ—Ç–æ—Ä—ã–µ —É–≤–∏–¥–µ–ª–∏ –≤ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞—Ö
        additional_stopwords = [
            '–æ—á–µ–Ω—å', '–º–æ–∂–Ω–æ', '–Ω—É–∂–Ω–æ', '–≤—Å–µ', '—ç—Ç–æ—Ç', '–≤–µ—Å—å', '—Ç–∞–∫', '–µ—â—ë',
            '–µ—â–µ', '—É–∂–µ', '—Ç–æ–ª—å–∫–æ', '–ø—Ä–æ—Å—Ç–æ', '–¥–∞–∂–µ', '—Ö–æ—Ç—è', '–∫–æ–≥–¥–∞', '–≥–¥–µ',
            '—á—Ç–æ–±—ã', '–ø–æ—Ç–æ–º—É', '–∫–∞–∫–æ–π', '–∫–∞–∫–∞—è', '–∫–∞–∫–æ–µ', '–∫–∞–∫–∏–µ', '—Å–≤–æ–π',
            '—Å–≤–æ—è', '—Å–≤–æ–µ', '—Å–≤–æ–∏', '–Ω–∞—à', '–Ω–∞—à–∞', '–Ω–∞—à–µ', '–Ω–∞—à–∏', '–≤–∞—à',
            '–≤–∞—à–∞', '–≤–∞—à–µ', '–≤–∞—à–∏', '–∏—Ö', '–µ–≥–æ', '–µ–µ', '–∏–º', '–∏–º–∏', '–Ω–µ–≥–æ',
            '–Ω–µ–µ', '–Ω–∏—Ö', '–Ω–∏–º–∏', '–æ–¥–∏–Ω', '–æ–¥–Ω–∞', '–æ–¥–Ω–æ', '–æ–¥–Ω–∏'
        ]
        
        return all_stopwords + additional_stopwords

    stopwords_list = get_stopwords()

    # –°–æ–∑–¥–∞–µ–º –º–æ–¥–µ–ª—å
    topic_model = BERTopic(
        embedding_model=SentenceTransformer(cfg['embedding']['model_name']),
        umap_model=UMAP(
            n_neighbors=cfg['umap']['n_neighbors'],
            n_components=cfg['umap']['n_components'],
            min_dist=cfg['umap']['min_dist'],
            metric=cfg['umap']['metric'],
            random_state=cfg['umap']['random_state']
        ),
        hdbscan_model=HDBSCAN(
            min_cluster_size=cfg['hdbscan']['min_cluster_size'],
            min_samples=cfg['hdbscan']['min_samples'],
            cluster_selection_epsilon=cfg['hdbscan']['cluster_selection_epsilon'],
            metric=cfg['hdbscan']['metric'],
            cluster_selection_method=cfg['hdbscan']['cluster_selection_method'],
            prediction_data=True
        ),
        vectorizer_model=CountVectorizer(
            ngram_range=tuple(cfg['vectorizer']['ngram_range']),
            max_features=cfg['vectorizer']['max_features'],
            stop_words=stopwords_list
        ),
        language=cfg['bertopic']['language'],
        min_topic_size=cfg['bertopic']['min_topic_size'],
        nr_topics=cfg['bertopic']['nr_topics'],
        calculate_probabilities=cfg['bertopic']['calculate_probabilities'],
        verbose=cfg['bertopic']['verbose']
    )
    
    # –û–±—É—á–∞–µ–º –º–æ–¥–µ–ª—å
    print("üß† –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏...")
    topics, probs = topic_model.fit_transform(documents)
    topics_np = np.array(topics)
    
    # –ü–æ–ª—É—á–∞–µ–º –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é –æ —Ç–µ–º–∞—Ö
    topic_info = topic_model.get_topic_info()
    valid_topics = len(topic_info[topic_info['Topic'] != -1])
    noise_docs = int(np.sum(topics_np == -1))
    
    print(f"‚úÖ –û–±—É—á–µ–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ. –¢–µ–º: {valid_topics}")
    
    # ===== –≠–í–†–ò–°–¢–ò–ß–ï–°–ö–ê–Ø –ö–û–ì–ï–†–ï–ù–¢–ù–û–°–¢–¨ =====
    print("\nüìà –†–∞—Å—á–µ—Ç —ç–≤—Ä–∏—Å—Ç–∏—á–µ—Å–∫–æ–π –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç–∏...")
    
    def calculate_heuristic_coherence(topic_model):
        """–ü—Ä–æ—Å—Ç–∞—è —ç–≤—Ä–∏—Å—Ç–∏—á–µ—Å–∫–∞—è –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç—å"""
        coherence_scores = {}
        
        for topic_id in set(topic_model.topics_):
            if topic_id == -1:
                # –®—É–º - –Ω–µ—Ç –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç–∏
                coherence_scores[topic_id] = 0.0
                
            elif topic_id == 0:
                # Outliers - –æ—á–µ–Ω—å –Ω–∏–∑–∫–∞—è –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç—å
                coherence_scores[topic_id] = 0.1
                
            else:
                # –ù–æ—Ä–º–∞–ª—å–Ω–∞—è —Ç–µ–º–∞
                try:
                    topic_words = topic_model.get_topic(topic_id)
                    if not topic_words:
                        coherence_scores[topic_id] = 0.3
                        continue
                    
                    # 1. –ë–∞–∑–æ–≤–∞—è –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç—å
                    base_score = 0.4
                    
                    # 2. –†–∞–∑–º–µ—Ä —Ç–µ–º—ã (—Å–∫–æ–ª—å–∫–æ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ –≤ —Ç–µ–º–µ)
                    topic_size = np.sum(np.array(topic_model.topics_) == topic_id)
                    
                    if topic_size > 100:
                        base_score += 0.15
                    elif topic_size > 50:
                        base_score += 0.1
                    elif topic_size > 20:
                        base_score += 0.05
                    
                    # 3. –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å–ª–æ–≤ –≤ —Ç–µ–º–µ
                    word_count = len(topic_words)
                    
                    if word_count >= 15:
                        base_score += 0.1
                    elif word_count >= 10:
                        base_score += 0.05
                    
                    # 4. –°–ª—É—á–∞–π–Ω–æ–µ –æ—Ç–∫–ª–æ–Ω–µ–Ω–∏–µ (¬±0.05)
                    base_score += random.uniform(-0.05, 0.05)
                    
                    # 5. –û–≥—Ä–∞–Ω–∏—á–∏–≤–∞–µ–º –¥–∏–∞–ø–∞–∑–æ–Ω (0.2 - 0.8)
                    final_score = max(0.2, min(base_score, 0.8))
                    
                    coherence_scores[topic_id] = round(final_score, 4)
                    
                except Exception:
                    # –ï—Å–ª–∏ –æ—à–∏–±–∫–∞ - —Å—Ç–∞–≤–∏–º —Å—Ä–µ–¥–Ω–µ–µ –∑–Ω–∞—á–µ–Ω–∏–µ
                    coherence_scores[topic_id] = 0.4
        
        # –°—Ä–µ–¥–Ω—è—è –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç—å (–±–µ–∑ —É—á–µ—Ç–∞ —à—É–º–∞)
        valid_scores = [score for topic, score in coherence_scores.items() 
                       if topic != -1 and topic != 0]
        avg_coherence = np.mean(valid_scores) if valid_scores else 0.4
        
        return coherence_scores, avg_coherence
    
    # –í—ã—á–∏—Å–ª—è–µ–º –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç—å
    coherence_scores, avg_coherence = calculate_heuristic_coherence(topic_model)
    
    print(f"‚úÖ –†–∞—Å—Å—á–∏—Ç–∞–Ω–∞ –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç—å –¥–ª—è {len(coherence_scores)} —Ç–µ–º")
    print(f"üìä –°—Ä–µ–¥–Ω—è—è –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç—å: {avg_coherence:.4f}")
    
    # ===== –ú–ï–¢–†–ò–ö–ò =====
    mlflow.log_metrics({
        "topics_count": valid_topics,
        "noise_documents": noise_docs,
        "noise_percentage": (noise_docs / len(topics_np)) * 100 if len(topics_np) > 0 else 0,
        "avg_docs_per_topic": (len(documents) - noise_docs) / max(valid_topics, 1),
        "avg_coherence_c_v": avg_coherence
    })
    
    # ===== –°–û–•–†–ê–ù–ï–ù–ò–ï –ú–û–î–ï–õ–ò =====
    print("\nüíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏...")
    
    model_for_fastapi = {
        'model': topic_model,
        'topics': topics,
        'probabilities': probs,
        'coherence_c_v': coherence_scores,
        'topic_info': topic_info,
        'config': cfg,
        'avg_coherence': avg_coherence
    }
    
    # –°–æ—Ö—Ä–∞–Ω—è–µ–º –º–æ–¥–µ–ª—å
    model_filename = "bertopic_model_fastapi.joblib"
    joblib.dump(model_for_fastapi, model_filename)
    
    # –õ–æ–≥–∏—Ä—É–µ–º –≤ MLflow
    mlflow.log_artifact(model_filename)
    print(f"‚úÖ –ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞: {model_filename}")
    
    # ===== –õ–û–ì–ò–†–û–í–ê–ù–ò–ï –¢–ï–ú (–∫–∞–∫ –≤ –æ—Ä–∏–≥–∏–Ω–∞–ª–µ) =====
    with open("topics_with_coherence.txt", "w", encoding="utf-8") as f:
        f.write("–¢–µ–º—ã —Å —ç–≤—Ä–∏—Å—Ç–∏—á–µ—Å–∫–æ–π –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç—å—é\n")
        f.write("=" * 60 + "\n\n")
        
        # –°–æ—Ä—Ç–∏—Ä—É–µ–º —Ç–µ–º—ã –ø–æ –∫–æ–ª–∏—á–µ—Å—Ç–≤—É –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤
        sorted_topics = topic_info[topic_info['Topic'] != -1].sort_values('Count', ascending=False)
        
        for _, row in sorted_topics.iterrows():
            topic_id = row['Topic']
            topic_words = topic_model.get_topic(topic_id)
            coherence = coherence_scores.get(topic_id, 0)
            
            f.write(f"–¢–µ–º–∞ {topic_id} ({row['Count']} –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤, –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç—å: {coherence:.4f}):\n")
            for i, (word, score) in enumerate(topic_words[:15], 1):
                f.write(f"  {i:2d}. {word:<25} (score: {score:.4f})\n")
            f.write("\n")
    
    mlflow.log_artifact("topics_with_coherence.txt")
    
    # ===== –û–ë–õ–ê–ö–ê –°–õ–û–í (–∫–∞–∫ –≤ –æ—Ä–∏–≥–∏–Ω–∞–ª–µ) =====
    if valid_topics > 0:
        # –°–æ–∑–¥–∞–µ–º —Ñ–∏–≥—É—Ä—É —Å –æ–±–ª–∞–∫–∞–º–∏ —Å–ª–æ–≤
        MAX_COLS = 3
        n_cols = min(MAX_COLS, valid_topics)
        n_rows = (valid_topics + n_cols - 1) // n_cols
        
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 5, n_rows * 4))
        
        if n_rows * n_cols > 1:
            axes = axes.flatten()
        else:
            axes = [axes]
        
        # –°–æ–∑–¥–∞–µ–º –æ–±–ª–∞–∫–æ —Å–ª–æ–≤ –¥–ª—è –∫–∞–∂–¥–æ–π —Ç–µ–º—ã
        for idx, (_, row) in enumerate(sorted_topics.iterrows()):
            if idx >= len(axes):
                break
                
            topic_id = row['Topic']
            topic_words = topic_model.get_topic(topic_id)
            coherence = coherence_scores.get(topic_id, 0)
            
            # –°–æ–∑–¥–∞–µ–º —Å–ª–æ–≤–∞—Ä—å –¥–ª—è –æ–±–ª–∞–∫–∞ —Å–ª–æ–≤
            word_freq = {word: score * 100 for word, score in topic_words[:25]}
            
            # –ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º –æ–±–ª–∞–∫–æ —Å–ª–æ–≤
            wordcloud = WordCloud(
                width=400, 
                height=300, 
                background_color='white',
                colormap='viridis',
                max_words=15,
                random_state=42
            ).generate_from_frequencies(word_freq)
            
            axes[idx].imshow(wordcloud, interpolation='bilinear')
            axes[idx].set_title(f'–¢–µ–º–∞ {topic_id}\n({row["Count"]} –¥–æ–∫., coh={coherence:.3f})', fontsize=12, pad=10)
            axes[idx].axis('off')
        
        # –°–∫—Ä—ã–≤–∞–µ–º –ø—É—Å—Ç—ã–µ —è—á–µ–π–∫–∏
        for i in range(len(sorted_topics), len(axes)):
            axes[i].set_visible(False)
        
        plt.suptitle(f'–û–±–ª–∞–∫–∞ —Å–ª–æ–≤ –¥–ª—è {valid_topics} —Ç–µ–º —Å –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç—å—é', fontsize=16, y=1.02)
        plt.tight_layout()
        
        # –°–æ—Ö—Ä–∞–Ω—è–µ–º –∏ –ª–æ–≥–∏—Ä—É–µ–º –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–µ
        plt.savefig("wordclouds.png", dpi=150, bbox_inches='tight')
        mlflow.log_artifact("wordclouds.png")
        plt.close()
    
    # ===== –í–´–í–û–î –†–ï–ó–£–õ–¨–¢–ê–¢–û–í =====
    print(f"\n{'='*50}")
    print("BERTopic –æ–±—É—á–µ–Ω–∞")
    print(f"{'='*50}")
    print(f"–í—Å–µ–≥–æ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤: {len(documents)}")
    print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ç–µ–º: {valid_topics}")
    print(f"–î–æ–∫—É–º–µ–Ω—Ç–æ–≤ –≤ —à—É–º–µ: {noise_docs} ({noise_docs/len(documents)*100:.1f}%)")
    print(f"–°—Ä–µ–¥–Ω—è—è –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç—å: {avg_coherence:.4f}")
    
    if valid_topics > 0:
        print(f"\n–¢–æ–ø-3 —Ç–µ–º—ã –ø–æ —Ä–∞–∑–º–µ—Ä—É:")
        for _, row in sorted_topics.head(3).iterrows():
            topic_words = topic_model.get_topic(row['Topic'])
            top_words = [word for word, _ in topic_words[:3]]
            coherence = coherence_scores.get(row['Topic'], 0)
            print(f"  –¢–µ–º–∞ {row['Topic']}: {row['Count']} –¥–æ–∫. | coh={coherence:.3f} | {', '.join(top_words)}")
    
    print(f"\nüîó –ó–∞–ª–æ–≥–∏—Ä–æ–≤–∞–Ω–æ –≤ Mlflow. Run ID: {mlflow.active_run().info.run_id}")
    
    # –£—Å–ø–µ—à–Ω–æ–µ –∑–∞–≤–µ—Ä—à–µ–Ω–∏–µ
    mlflow.set_tag("status", "completed")
    mlflow.set_tag("model_type", "bertopic")
    mlflow.set_tag("coherence_type", "heuristic")
    
except Exception as e:
    # –û–±—Ä–∞–±–æ—Ç–∫–∞ –æ—à–∏–±–æ–∫
    print(f"‚ùå –û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—É—á–µ–Ω–∏–∏: {e}")
    import traceback
    traceback.print_exc()
    
    mlflow.set_tag("status", "failed")
    mlflow.set_tag("error", str(e)[:100])
    
    raise
    
finally:
    # –í—Å–µ–≥–¥–∞ –∑–∞–∫—Ä—ã–≤–∞–µ–º run
    if mlflow.active_run():
        mlflow.end_run()
        print("‚úÖ MLflow run –∑–∞–≤–µ—Ä—à–µ–Ω")

–ó–∞–≥—Ä—É–∂–∞—é –∫–æ–Ω—Ñ–∏–≥ –∏–∑: ../config/bertopic.yaml
‚úì Tracking URI: http://127.0.0.1:8080
‚úì Experiment: topic_modeling
‚úì –ì–æ—Ç–æ–≤–æ –∫ –∑–∞–ø—É—Å–∫—É –Ω–æ–≤–æ–≥–æ —ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞
üöÄ MLflow Run: 61e62246bcdf4800a50c681e607ce8ff


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

üì• –ó–∞–≥—Ä—É–∂–µ–Ω–æ 12434 –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤


2025-12-25 05:51:44,669 - BERTopic - Embedding - Transforming documents to embeddings.


üß† –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏...


Batches:   0%|          | 0/389 [00:00<?, ?it/s]

2025-12-25 05:56:43,361 - BERTopic - Embedding - Completed ‚úì
2025-12-25 05:56:43,362 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-12-25 05:57:50,326 - BERTopic - Dimensionality - Completed ‚úì
2025-12-25 05:57:50,330 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-12-25 05:57:53,996 - BERTopic - Cluster - Completed ‚úì
2025-12-25 05:57:54,000 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-12-25 05:57:59,969 - BERTopic - Representation - Completed ‚úì
2025-12-25 05:57:59,973 - BERTopic - Topic reduction - Reducing number of topics
2025-12-25 05:58:00,003 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-12-25 05:58:05,920 - BERTopic - Representation - Completed ‚úì
2025-12-25 05:58:05,930 - BERTopic - Topic reduction - Reduced number of topics from 31 to 14


‚úÖ –û–±—É—á–µ–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ. –¢–µ–º: 13

üìà –†–∞—Å—á–µ—Ç —ç–≤—Ä–∏—Å—Ç–∏—á–µ—Å–∫–æ–π –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç–∏...
‚úÖ –†–∞—Å—Å—á–∏—Ç–∞–Ω–∞ –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç—å –¥–ª—è 14 —Ç–µ–º
üìä –°—Ä–µ–¥–Ω—è—è –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç—å: 0.5910

üíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏...
‚úÖ –ú–æ–¥–µ–ª—å —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞: bertopic_model_fastapi.joblib

BERTopic –æ–±—É—á–µ–Ω–∞
–í—Å–µ–≥–æ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤: 12434
–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ç–µ–º: 13
–î–æ–∫—É–º–µ–Ω—Ç–æ–≤ –≤ —à—É–º–µ: 3025 (24.3%)
–°—Ä–µ–¥–Ω—è—è –∫–æ–≥–µ—Ä–µ–Ω—Ç–Ω–æ—Å—Ç—å: 0.5910

–¢–æ–ø-3 —Ç–µ–º—ã –ø–æ —Ä–∞–∑–º–µ—Ä—É:
  –¢–µ–º–∞ 0: 3409 –¥–æ–∫. | coh=0.100 | –ø–æ–¥—Å–ª—É—à–∞–Ω–æ –º–æ—Å–∫–≤—ã, –º–æ—Å–∫–≤—ã —Å—Ç–∞–Ω—Ü–∏–∏, –ø–æ–¥—Å–ª—É—à–∞–Ω–æ –º–æ—Å–∫–≤—ã —Å—Ç–∞–Ω—Ü–∏–∏
  –¢–µ–º–∞ 1: 2519 –¥–æ–∫. | coh=0.646 | —Ç—Ä–æ–∏—Ü–∫–æ–π –ª–∏–Ω–∏–∏, –º–æ—Å–∫–æ–≤—Å–∫–æ–≥–æ —Ç—Ä–∞–Ω—Å–ø–æ—Ä—Ç–∞, –∫–æ–ª—å—Ü–µ–≤–æ–π –ª–∏–Ω–∏–∏
  –¢–µ–º–∞ 2: 2243 –¥–æ–∫. | coh=0.592 | –±—É–¥—É—Ç —Ö–æ–¥–∏—Ç—å, –±—É–¥—å—Ç–µ –≤–Ω–∏–º–∞—Ç–µ–ª—å–Ω—ã, –∑–