In [None]:
import os
import json
import pandas as pd
from datetime import datetime
import requests
import re
import logging
import sys
import numpy as np
import textwrap
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score, silhouette_samples, calinski_harabasz_score, davies_bouldin_score
import umap
import plotly.express as px



In [None]:
def call_gpt_oss_embedding(prompt, model="bge-m3:latest"):
    """Request an embedding vector for ``prompt`` from the embedding HTTP service.

    Args:
        prompt: Text to embed.
        model: Model identifier sent in the request body.

    Returns:
        The value stored under the ``embedding`` key of the JSON response,
        or ``None`` when the response has no such key.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.RequestException: On connection problems or after the
            120-second timeout (a blank ``url`` also fails here).
        ValueError: If the response body is not valid JSON.
    """
    # Endpoint and credentials are blank placeholders in this notebook;
    # fill them in before running.
    url = ''
    api_key = ''
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "application/json",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "prompt": prompt,
        # The reply is parsed below as ONE JSON document, so a streamed
        # (multi-document) response must not be requested.
        "stream": False,
    }
    response = requests.post(url, json=payload, headers=headers, timeout=120)
    response.raise_for_status()
    return response.json().get('embedding')


def clean_complaint_text(text):
    """Normalize complaint text before embedding.

    Every run of digits is deleted, then every run of whitespace is collapsed
    into a single space and the result is trimmed at both ends.
    """
    without_digits = re.sub(r'\d+', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_digits)
    return single_spaced.strip()



In [None]:
def generate_embeddings(complaints, output_dir="embedding_results", batch_size=100):
    """Embed every complaint and persist the outcome as JSON chunk files.

    Each complaint is cleaned with ``clean_complaint_text`` and sent to
    ``call_gpt_oss_embedding``. After every ``batch_size`` records (and after
    the last one) the records accumulated since the previous save are written
    to ``embeddings_<timestamp>_upto_<idx>.json`` inside ``output_dir`` and
    the in-memory buffers are emptied.

    Args:
        complaints: Sized iterable of complaint texts; ids are 1-based
            positions within this iterable.
        output_dir: Directory for the chunk files; created when missing.
        batch_size: Number of records between two saves.

    Returns:
        None. Results are only written to disk.

    Notes:
        A failure on a single record (including an empty or missing embedding
        in the service reply) is logged and stored in the chunk's ``errors``
        list; processing continues with the next record.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    results = []
    errors = []
    total = len(complaints)

    for idx, complaint in enumerate(complaints, 1):
        try:
            cleaned_text = clean_complaint_text(complaint)
            embedding = call_gpt_oss_embedding(cleaned_text)
            if not embedding:
                # Previously such a reply was stored as a "successful" result
                # with dimension 0 and silently dropped at load time.
                raise ValueError("Сервис вернул пустой эмбеддинг")

            results.append({
                "id": idx,
                "original_text": complaint,
                "cleaned_text": cleaned_text,
                "embedding": embedding,
                "embedding_dimension": len(embedding),
                "processed_at": datetime.now().isoformat()
            })
        except Exception as exc:
            # Best effort: one bad record must not abort the whole run.
            errors.append({
                "id": idx,
                "original_text": complaint,
                "error": str(exc),
                "processed_at": datetime.now().isoformat()
            })
            logging.error(f"Ошибка при обработке записи {idx}: {exc}")

        if idx % batch_size == 0 or idx == total:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chunk_path = os.path.join(output_dir, f"embeddings_{timestamp}_upto_{idx}.json")
            payload = {
                "metadata": {
                    "total": total,
                    "saved_upto": idx,
                    "timestamp": timestamp,
                    "processed_at": datetime.now().isoformat()
                },
                "results": results,
                "errors": errors
            }
            with open(chunk_path, 'w', encoding='utf-8') as f:
                json.dump(payload, f, ensure_ascii=False, indent=2)
            # Each chunk file holds only the records since the previous save.
            results.clear()
            errors.clear()
            logging.info(f"Сохранен промежуточный файл: {chunk_path}")


def load_embedding_results(output_dir="embedding_results"):
    """Collect stored embeddings and their metadata from JSON chunk files.

    Every ``*.json`` file in ``output_dir`` is read in sorted filename order.
    Entries of its ``results`` list that carry a non-empty ``embedding`` are
    kept; all other entries are ignored.

    Returns:
        A pair ``(vectors, records)`` of equally long lists: the embeddings
        and one metadata dict per embedding (including ``source_file``).
        Both lists are empty when ``output_dir`` does not exist (a warning
        is logged in that case).
    """
    vectors, records = [], []

    if not os.path.exists(output_dir):
        logging.warning(f"Директория {output_dir} не найдена")
        return vectors, records

    copied_keys = ("id", "original_text", "cleaned_text", "embedding_dimension", "processed_at")
    json_names = [name for name in sorted(os.listdir(output_dir)) if name.endswith('.json')]

    for name in json_names:
        with open(os.path.join(output_dir, name), 'r', encoding='utf-8') as handle:
            chunk = json.load(handle)

        for entry in chunk.get('results', []):
            vector = entry.get('embedding')
            if not vector:
                continue
            vectors.append(vector)
            record = {key: entry.get(key) for key in copied_keys}
            record["source_file"] = name
            records.append(record)

    return vectors, records



In [None]:
# ========== АНАЛИТИКА ЭМБЕДДИНГОВ ==========

def prepare_embedding_matrix(output_dir="embedding_results"):
    """Load all stored embeddings and build raw and standardized matrices.

    Returns:
        A 4-tuple ``(X, X_scaled, meta_df, scaler)``: the float matrix of
        embeddings, the same matrix after ``StandardScaler.fit_transform``,
        a DataFrame with one metadata row per embedding, and the fitted
        scaler.

    Raises:
        ValueError: If no embeddings were found in ``output_dir``.
    """
    vectors, meta = load_embedding_results(output_dir)
    if not vectors:
        raise ValueError(f"Не найдены эмбеддинги в директории {output_dir}")

    raw_matrix = np.array(vectors, dtype=float)
    scaler = StandardScaler()
    scaled_matrix = scaler.fit_transform(raw_matrix)
    return raw_matrix, scaled_matrix, pd.DataFrame(meta), scaler


def project_embeddings_umap(X, n_neighbors=30, min_dist=0.1, n_components=2, metric='cosine', random_state=42):
    """Fit a UMAP reducer on ``X`` and return the projected coordinates.

    All keyword arguments are forwarded unchanged to ``umap.UMAP``.

    Returns:
        A pair ``(coords, reducer)``: the output of ``fit_transform(X)`` and
        the fitted UMAP object.
    """
    umap_settings = {
        "n_neighbors": n_neighbors,
        "min_dist": min_dist,
        "n_components": n_components,
        "metric": metric,
        "random_state": random_state,
    }
    reducer = umap.UMAP(**umap_settings)
    return reducer.fit_transform(X), reducer


def evaluate_k_values(X, k_values, use_minibatch=True, random_state=42):
    """Score candidate cluster counts by silhouette.

    Args:
        X: Sample matrix (array-like with one row per sample).
        k_values: Iterable of candidate cluster counts.
        use_minibatch: Use ``MiniBatchKMeans`` when true, ``KMeans`` otherwise.
        random_state: Seed passed to the clustering model.

    Returns:
        DataFrame with columns ``k`` and ``silhouette``, one row per evaluated
        k. The columns are present even when no k could be evaluated, so
        callers can sort/filter an empty result safely.

    Notes:
        Values of k outside ``2 <= k <= n_samples - 1`` are skipped: k-means
        rejects more clusters than samples and the silhouette score is not
        defined when every sample is its own cluster.
    """
    n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
    scores = []
    for k in k_values:
        if k < 2:
            continue
        if k >= n_samples:
            logging.warning(
                f"Пропускаем k={k}: требуется 2 <= k <= n_samples - 1 (n_samples={n_samples})"
            )
            continue
        if use_minibatch:
            model = MiniBatchKMeans(n_clusters=k, random_state=random_state, batch_size=1024)
        else:
            model = KMeans(n_clusters=k, random_state=random_state, n_init='auto')
        labels = model.fit_predict(X)
        # The model may still collapse everything into a single cluster.
        if len(set(labels)) < 2:
            continue
        score = silhouette_score(X, labels)
        scores.append({"k": k, "silhouette": score})
    return pd.DataFrame(scores, columns=["k", "silhouette"])


def evaluate_multiple_cluster_metrics(X, k_values, use_minibatch=True, random_state=42):
    """Compute several clustering quality metrics for each candidate k.

    Args:
        X: Sample matrix (array-like with one row per sample).
        k_values: Iterable of candidate cluster counts.
        use_minibatch: Use ``MiniBatchKMeans`` when true, ``KMeans`` otherwise.
        random_state: Seed passed to the clustering model.

    Returns:
        DataFrame with columns ``k``, ``silhouette``, ``calinski_harabasz``,
        ``davies_bouldin`` and ``inertia``, one row per evaluated k. The
        columns are present even when no k could be evaluated.

    Notes:
        Values of k outside ``2 <= k <= n_samples - 1`` are skipped: k-means
        rejects more clusters than samples, and the three label-based metrics
        require fewer labels than samples.
    """
    columns = ["k", "silhouette", "calinski_harabasz", "davies_bouldin", "inertia"]
    n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
    rows = []
    for k in k_values:
        if k < 2:
            continue
        if k >= n_samples:
            logging.warning(
                f"Пропускаем k={k}: требуется 2 <= k <= n_samples - 1 (n_samples={n_samples})"
            )
            continue
        if use_minibatch:
            model = MiniBatchKMeans(n_clusters=k, random_state=random_state, batch_size=1024)
        else:
            model = KMeans(n_clusters=k, random_state=random_state, n_init='auto')
        labels = model.fit_predict(X)
        # The model may still collapse everything into a single cluster.
        if len(set(labels)) < 2:
            continue
        rows.append({
            "k": k,
            "silhouette": silhouette_score(X, labels),
            "calinski_harabasz": calinski_harabasz_score(X, labels),
            "davies_bouldin": davies_bouldin_score(X, labels),
            "inertia": getattr(model, 'inertia_', None)
        })
    return pd.DataFrame(rows, columns=columns)


def cluster_embeddings(X, n_clusters, use_minibatch=True, random_state=42):
    """Cluster ``X`` into ``n_clusters`` groups with k-means.

    ``MiniBatchKMeans`` (batch size 1024) is used when ``use_minibatch`` is
    true, plain ``KMeans`` otherwise.

    Returns:
        A pair ``(labels, model)``: the output of ``fit_predict(X)`` and the
        fitted estimator.
    """
    shared = {"n_clusters": n_clusters, "random_state": random_state}
    estimator = (
        MiniBatchKMeans(batch_size=1024, **shared)
        if use_minibatch
        else KMeans(n_init='auto', **shared)
    )
    assignments = estimator.fit_predict(X)
    return assignments, estimator


def plot_inertia_curve(metrics_df, title="Elbow curve (Inertia vs k)"):
    """Draw and show the inertia-versus-k line chart.

    Args:
        metrics_df: DataFrame with ``k`` and ``inertia`` columns.
        title: Figure title.

    Returns:
        The Plotly figure (it has already been shown via ``fig.show()``).

    Raises:
        ValueError: If ``metrics_df`` is empty.
    """
    if metrics_df.empty:
        raise ValueError("metrics_df пуст — нечего визуализировать")

    ordered_by_k = metrics_df.sort_values('k')
    fig = px.line(ordered_by_k, x='k', y='inertia', markers=True, title=title)
    fig.update_layout(xaxis_title='k', yaxis_title='Inertia')
    fig.show()
    return fig


def wrap_text_for_hover(text, width=120, max_lines=None):
    """Wrap ``text`` for a Plotly hover label, joining lines with ``<br>``.

    Non-string input is converted with ``str()``. When ``max_lines`` is given
    and the wrapped text is longer, only the first ``max_lines`` lines are
    kept and an extra ``...`` line is appended.
    """
    as_string = text if isinstance(text, str) else str(text)
    wrapped = textwrap.wrap(as_string, width=width)
    too_long = max_lines is not None and len(wrapped) > max_lines
    if too_long:
        wrapped = [*wrapped[:max_lines], '...']
    return '<br>'.join(wrapped)


def visualize_clusters(coords, labels, meta_df=None, title="Карта жалоб", save_path=None, show=True, point_metrics=None):
    """Build an interactive scatter plot of clustered points.

    Args:
        coords: 2-D array-like of projected coordinates; the first two
            columns are plotted as ``dim_1`` / ``dim_2``.
        labels: Cluster label per point (array-like); shown as strings.
        meta_df: Optional DataFrame with per-point metadata, aligned by
            position. ``original_text`` (or, failing that, ``cleaned_text``)
            is used as hover text.
        title: Figure title.
        save_path: When given, the figure is written there as standalone HTML.
        show: Call ``fig.show()`` before returning.
        point_metrics: Optional DataFrame aligned by position; a
            ``silhouette_point`` column is shown in the hover label.

    Returns:
        A pair ``(df_plot, fig)``: the assembled plotting table (coordinates,
        cluster, metadata, metrics, ``hover_text``) and the Plotly figure.
    """
    coords = np.asarray(coords)
    df_plot = pd.DataFrame(coords, columns=[f"dim_{i+1}" for i in range(coords.shape[1])])
    # np.asarray lets plain lists of labels through as well.
    df_plot['cluster'] = np.asarray(labels).astype(str)
    if meta_df is not None:
        df_plot = pd.concat([df_plot, meta_df.reset_index(drop=True)], axis=1)
    if point_metrics is not None:
        df_plot = pd.concat([df_plot, point_metrics.reset_index(drop=True)], axis=1)

    # Choose the hover text column; fall back to empty strings so the call
    # also works without metadata (a bare '' has no .apply()).
    for text_column in ('original_text', 'cleaned_text'):
        if text_column in df_plot.columns:
            text_source = df_plot[text_column]
            break
    else:
        text_source = pd.Series('', index=df_plot.index)
    df_plot['hover_text'] = text_source.apply(wrap_text_for_hover)

    # The hover template always references a silhouette value; supply NaN
    # for plotting only, without altering the returned table.
    plot_frame = df_plot
    if 'silhouette_point' not in plot_frame.columns:
        plot_frame = plot_frame.assign(silhouette_point=np.nan)

    # custom_data must be a list of column names (or one array per column).
    # A stacked (n, 3) array would be read as n separate columns, and column
    # names also keep rows aligned when px splits points into per-cluster traces.
    fig = px.scatter(
        plot_frame,
        x='dim_1',
        y='dim_2',
        color='cluster',
        title=title,
        height=650,
        custom_data=['cluster', 'silhouette_point', 'hover_text']
    )

    fig.update_traces(
        hovertemplate=(
            "Кластер: %{customdata[0]}<br>"
            "Silhouette: %{customdata[1]:.3f}<br>"
            "%{customdata[2]}<extra></extra>"
        )
    )

    fig.update_layout(
        legend_title_text='Кластер',
        hoverlabel=dict(
            bgcolor='white',
            font_color='black',
            bordercolor='rgba(0,0,0,0.4)',
            align='left'
        )
    )

    if save_path:
        fig.write_html(save_path, include_plotlyjs='inline')
        logging.info(f"Интерактивный график сохранён: {save_path}")

    if show:
        fig.show()

    return df_plot, fig



In [None]:
# Send log records to stdout so progress messages show up in the cell output.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# The source file is read as UTF-16, tab-separated; complaint texts come from one column.
df = pd.read_csv('complaints.csv', sep='\t', encoding='utf-16')
complaints = [text for text in df['Описание претензии']]



In [None]:
# Embed only the first 10 complaints here; change the slice to process another range.
generate_embeddings(complaints[:10], output_dir="embedding_results", batch_size=5)



In [None]:
# === Full pipeline example: choose k, cluster, visualize ===

OUTPUT_DIR = "embedding_results"

# 1. Load the stored embeddings
X_raw, X_scaled, meta_df, scaler = prepare_embedding_matrix(OUTPUT_DIR)
print(f"Загружено {X_raw.shape[0]} эмбеддингов размерности {X_raw.shape[1]}")

# 2. UMAP projection to 2D (used for both plotting and clustering)
coords_2d, umap_model = project_embeddings_umap(
    X_scaled,
    n_neighbors=40,
    min_dist=0.05,
    n_components=2,
    metric='cosine'
)
print("UMAP готов:", coords_2d.shape)

# 3. Sweep the number of clusters; adjust k_values as needed.
# One sweep is enough: evaluate_multiple_cluster_metrics fits the same models
# with the same seed as evaluate_k_values, so its silhouette column is
# identical and the second (O(n^2) per k) sweep was pure duplication.
k_values = range(5, 31, 5)
metrics_df = evaluate_multiple_cluster_metrics(coords_2d, k_values)
if metrics_df.empty:
    # Without any evaluated k there is nothing to rank or plot.
    raise ValueError("Ни одно значение k не удалось оценить — проверьте k_values и размер выборки")

score_df = metrics_df[['k', 'silhouette']].copy()
display(score_df.sort_values('silhouette', ascending=False))
display(metrics_df.sort_values('silhouette', ascending=False))
plot_inertia_curve(metrics_df)

best_k = int(score_df.loc[score_df['silhouette'].idxmax(), 'k'])
print(f"Выбранное k: {best_k}")

# 4. Clustering (MiniBatchKMeans on the UMAP coordinates)
cluster_labels, cluster_model = cluster_embeddings(coords_2d, n_clusters=best_k)
print("Кластеризация завершена. Всего кластеров:", len(np.unique(cluster_labels)))

point_metrics_df = pd.DataFrame({
    'silhouette_point': silhouette_samples(coords_2d, cluster_labels)
})

# 5. Visualization and the table with coordinates
cluster_map_df, cluster_fig = visualize_clusters(
    coords_2d,
    cluster_labels,
    meta_df=meta_df,
    title=f"UMAP + KMeans (k={best_k})",
    save_path=os.path.join(OUTPUT_DIR, f"cluster_map_k{best_k}.html"),
    show=False,
    point_metrics=point_metrics_df
)

cluster_fig.show()
cluster_map_df.head()



In [None]:
# === Stage 1. Tune UMAP parameters (k is fixed for a rough comparison) ===
# This cell rebinds the notebook-level names coords_2d, umap_model, k_values,
# metrics_df, best_k, cluster_labels, cluster_model, point_metrics_df,
# cluster_map_df and cluster_fig.
OUTPUT_DIR = "embedding_results"

# Reuse the matrices when an earlier cell has already loaded them; otherwise load them here.
if 'X_scaled' not in globals() or 'meta_df' not in globals():
    X_raw, X_scaled, meta_df, scaler = prepare_embedding_matrix(OUTPUT_DIR)
    print(f"Загружено {X_raw.shape[0]} эмбеддингов размерности {X_raw.shape[1]}")

# Candidate UMAP settings to compare.
umap_param_grid = [
    {"n_neighbors": 20, "min_dist": 0.01},
    {"n_neighbors": 40, "min_dist": 0.05},
    {"n_neighbors": 60, "min_dist": 0.2}
]

umap_eval_rows = []
fixed_k_for_umap = 50  # number of clusters held fixed while comparing UMAP settings

# For each setting: project, cluster with the fixed k, and record the silhouette score.
for params in umap_param_grid:
    coords_tmp, _ = project_embeddings_umap(
        X_scaled,
        n_neighbors=params["n_neighbors"],
        min_dist=params["min_dist"],
        n_components=2,
        metric='cosine'
    )
    labels_tmp, _ = cluster_embeddings(coords_tmp, n_clusters=fixed_k_for_umap)
    sil = silhouette_score(coords_tmp, labels_tmp)
    umap_eval_rows.append({
        "n_neighbors": params["n_neighbors"],
        "min_dist": params["min_dist"],
        "silhouette": sil
    })

umap_eval_df = pd.DataFrame(umap_eval_rows)
display(umap_eval_df.sort_values('silhouette', ascending=False))

# Pick the UMAP setting with the highest silhouette
if not umap_eval_df.empty:
    # The selected row is a DataFrame row; its values are cast back to int/float below.
    best_umap_params = umap_eval_df.sort_values('silhouette', ascending=False).iloc[0]
    print("Лучшие параметры UMAP:", best_umap_params.to_dict())
else:
    best_umap_params = {"n_neighbors": 40, "min_dist": 0.05}

# Recompute the UMAP projection with the selected parameters
coords_2d, umap_model = project_embeddings_umap(
    X_scaled,
    n_neighbors=int(best_umap_params["n_neighbors"]),
    min_dist=float(best_umap_params["min_dist"]),
    n_components=2,
    metric='cosine'
)
print("Пересчитан UMAP с лучшими параметрами:", coords_2d.shape)

# === Stage 2. Choose the number of clusters on the selected projection ===
k_values = range(20, 121, 5)
metrics_df = evaluate_multiple_cluster_metrics(coords_2d, k_values)
display(metrics_df.sort_values('silhouette', ascending=False))
plot_inertia_curve(metrics_df)

# Take the k with the highest silhouette; fall back to 50 when nothing was evaluated.
if not metrics_df.empty:
    best_k = int(metrics_df.sort_values('silhouette', ascending=False).iloc[0]['k'])
else:
    best_k = 50
print(f"Оптимальное k по silhouette: {best_k}")

# Final clustering on the selected projection with the chosen k.
cluster_labels, cluster_model = cluster_embeddings(coords_2d, n_clusters=best_k)
print("Кластеризация завершена. Всего кластеров:", len(np.unique(cluster_labels)))

# Per-point silhouette values, shown in the hover labels of the map.
point_metrics_df = pd.DataFrame({
    'silhouette_point': silhouette_samples(coords_2d, cluster_labels)
})

# Build the interactive map and save it as HTML inside OUTPUT_DIR.
cluster_map_df, cluster_fig = visualize_clusters(
    coords_2d,
    cluster_labels,
    meta_df=meta_df,
    title=f"UMAP + KMeans (k={best_k})",
    save_path=os.path.join(OUTPUT_DIR, f"cluster_map_k{best_k}.html"),
    show=False,
    point_metrics=point_metrics_df
)
cluster_fig.show()
cluster_map_df.head()

