In [None]:
import os
import json
import pandas as pd
from datetime import datetime
import requests
import re
import logging
import sys



In [None]:
def call_gpt_oss_embedding(prompt, model="bge-m3:latest"):
    """Request an embedding for ``prompt`` from the embedding HTTP endpoint.

    Args:
        prompt: Text to embed; sent as the ``prompt`` field of the JSON body.
        model: Model name sent as the ``model`` field of the JSON body.

    Returns:
        The value stored under the ``embedding`` key of the JSON response,
        or ``None`` when the response contains no such key.

    Raises:
        requests.exceptions.RequestException: On connection failure, when the
            120-second timeout expires, or on an HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    # NOTE(review): endpoint and key are blank placeholders here; supply them
    # locally (preferably from configuration or the environment) and never
    # commit real credentials.
    url = ''
    api_key = ''
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "application/json",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "prompt": prompt,
        # The reply is parsed below as a single JSON document, so the server
        # must not be asked for a streamed (multi-chunk) response.
        "stream": False,
    }
    response = requests.post(url, json=payload, headers=headers, timeout=120)
    response.raise_for_status()
    return response.json().get('embedding')


def clean_complaint_text(text):
    """Remove digits from ``text`` and normalize its whitespace.

    Every run of digits is deleted, each run of whitespace is then collapsed
    into a single space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw complaint string.

    Returns:
        The cleaned string.
    """
    digits_removed = re.sub(r'\d+', '', text)
    single_spaced = re.sub(r'\s+', ' ', digits_removed)
    return single_spaced.strip()



In [None]:
def generate_embeddings(complaints, output_dir="embedding_results", batch_size=100):
    """Embed every complaint and write the outcomes to JSON files in chunks.

    Each complaint is cleaned with ``clean_complaint_text`` and embedded with
    ``call_gpt_oss_embedding``. Successful records and failures are buffered
    separately; after every ``batch_size`` records, and after the last one,
    the buffers are written to
    ``<output_dir>/embeddings_<timestamp>_upto_<idx>.json`` and then emptied,
    so each file holds only the records processed since the previous file.

    Args:
        complaints: Iterable of complaint texts. It is materialized into a
            list, so generators and other one-shot iterables are accepted.
        output_dir: Directory for the chunk files; created if missing.
        batch_size: Number of records between two consecutive chunk files.

    Returns:
        None. All output goes to the chunk files and the log.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Materialize once so len() works for any iterable.
    complaints = list(complaints)
    total = len(complaints)
    results = []
    errors = []

    # "id" is the 1-based position of the complaint within ``complaints``.
    for idx, complaint in enumerate(complaints, 1):
        try:
            cleaned_text = clean_complaint_text(complaint)
            embedding = call_gpt_oss_embedding(cleaned_text)
            if not embedding:
                # Still stored as a result (dimension 0), but no longer silent.
                logging.warning(f"Пустой эмбеддинг для записи {idx}")
            embedding_dimension = len(embedding) if embedding else 0

            results.append({
                "id": idx,
                "original_text": complaint,
                "cleaned_text": cleaned_text,
                "embedding": embedding,
                "embedding_dimension": embedding_dimension,
                "processed_at": datetime.now().isoformat()
            })
        except Exception as exc:
            # Best effort: one failing record must not abort the whole run.
            errors.append({
                "id": idx,
                "original_text": complaint,
                "error": str(exc),
                "processed_at": datetime.now().isoformat()
            })
            logging.error(f"Ошибка при обработке записи {idx}: {exc}")

        if idx % batch_size == 0 or idx == total:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            chunk_path = os.path.join(output_dir, f"embeddings_{timestamp}_upto_{idx}.json")
            payload = {
                "metadata": {
                    "total": total,
                    "saved_upto": idx,
                    "timestamp": timestamp,
                    "processed_at": datetime.now().isoformat()
                },
                "results": results,
                "errors": errors
            }
            with open(chunk_path, 'w', encoding='utf-8') as f:
                json.dump(payload, f, ensure_ascii=False, indent=2)
            # Start the next chunk with empty buffers.
            results.clear()
            errors.clear()
            logging.info(f"Сохранен промежуточный файл: {chunk_path}")



In [None]:
# Load the complaints export (UTF-16, tab-separated) and keep the complaint
# description column as a plain Python list.
df = pd.read_csv('complaints.csv', sep='\t', encoding='utf-16')
complaints = [description for description in df['Описание претензии']]

# Route log records of level INFO and above to stdout, prefixed with the
# timestamp and the level name.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)



In [None]:
# Trial run: embed only the first ten complaints (replace the slice with the
# desired range), writing a chunk file after every five records.
generate_embeddings(complaints[:10], batch_size=5, output_dir="embedding_results")

