In [None]:
import re
import json
import os
import numpy as np
import pandas as pd
from datetime import datetime
from joblib import Parallel, delayed
import time
import requests
import logging
import sys


# Fallback classification record. Functions in this file copy it and overwrite
# "explanation" when the model answer is empty, unparsable, or processing fails.
DEFAULT_CLASSIFICATION = {
    "type": "–î–†–£–ì–û–ï",
    "product": "–ù–ï–û–ü–†–ï–î–ï–õ–ï–ù",
    "confidence_level": "–Ω–∏–∑–∫–∏–π",
    "explanation": ""
}



In [None]:
def call_gpt_oss(prompt, model="gpt-oss:120b", timeout=(10, 600)):
    """Send *prompt* to the generation endpoint and return the streamed answer.

    The response is consumed line by line; every line is expected to be a JSON
    object whose ``"response"`` field carries the next piece of text. Pieces
    are echoed to stdout as they arrive and concatenated into the return value.

    Args:
        prompt: Full prompt text to send.
        model: Model name placed in the request payload.
        timeout: ``requests`` timeout, either a single number or a
            ``(connect, read)`` tuple in seconds. Without one, a stalled
            server would block the worker forever.

    Returns:
        The concatenated answer text, or ``None`` when the HTTP request fails
        (connection error, timeout, non-2xx status).
    """
    # NOTE: `url` and `api_key` are empty placeholders in this file and must be
    # filled in before the call can succeed.
    url = ''
    api_key = ''
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "application/json",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": True
    }
    try:
        # The context manager releases the streamed connection even if the
        # loop below exits early.
        with requests.post(url, json=payload, stream=True, headers=headers, timeout=timeout) as response:
            response.raise_for_status()
            logging.info("response code - %s", response.status_code)
            tokens = []
            for line in response.iter_lines():
                if not line:
                    continue
                try:
                    chunk = json.loads(line)
                except ValueError:
                    # One malformed line should not discard the whole answer.
                    logging.warning("Skipping non-JSON stream line: %r", line[:200])
                    continue
                token = chunk.get("response") if isinstance(chunk, dict) else None
                if isinstance(token, str) and token:
                    tokens.append(token)
                    print(token, end="", flush=True)
            print()
            return "".join(tokens)
    except requests.exceptions.RequestException as e:
        print(f"Error calling Ollama API: {e}")
        return None


def call_gpt_oss_embedding(prompt, model="bge-m3:latest", timeout=(10, 120)):
    """Request an embedding vector for *prompt* from the embedding endpoint.

    Args:
        prompt: Text to embed.
        model: Embedding model name placed in the request payload.
        timeout: ``requests`` timeout, either a single number or a
            ``(connect, read)`` tuple in seconds.

    Returns:
        The value stored under the ``"embedding"`` key of the JSON response.

    Raises:
        requests.exceptions.RequestException: On connection problems, timeouts
            or a non-2xx HTTP status.
        KeyError: If the JSON response has no ``"embedding"`` key.
    """
    # NOTE: `url` and `api_key` are empty placeholders in this file and must be
    # filled in before the call can succeed.
    url = ''
    api_key = ''
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "application/json",
        "Content-Type": "application/json",
    }
    # NOTE(review): "stream" is kept as in the original payload; it is probably
    # ignored by an embedding endpoint - confirm against the server API.
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": True
    }
    response = requests.post(url, json=payload, stream=False, headers=headers, timeout=timeout)
    # Fail with the real HTTP error instead of an opaque KeyError/JSON error
    # when the server answers with an error body.
    response.raise_for_status()
    return response.json()['embedding']



In [None]:
def clean_complaint_text(text):
    """Return *text* with all digit runs removed and whitespace normalised.

    Digits are deleted first; afterwards every run of whitespace is collapsed
    to a single space and leading/trailing whitespace is trimmed.
    """
    without_digits = re.sub(r'\d+', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_digits)
    return single_spaced.strip()


def load_knowledge_bases():
    """Read both reference JSON files from the current working directory.

    Returns:
        A ``(products_kb, complaint_categories_kb)`` tuple holding the parsed
        contents of ``products.json`` and ``types_products.json``.
    """
    loaded = []
    for kb_path in ('products.json', 'types_products.json'):
        with open(kb_path, 'r', encoding='utf-8') as handle:
            loaded.append(json.load(handle))
    products_kb, complaint_categories_kb = loaded
    return products_kb, complaint_categories_kb


def create_prompt(products_kb, categories_kb, complaint_text):
    """Build the classification prompt for one complaint.

    Both reference structures are rendered as indented, non-ASCII-preserving
    JSON and substituted into the fixed template together with the complaint
    text.

    Args:
        products_kb: JSON-serialisable products reference.
        categories_kb: JSON-serialisable complaint types/products reference.
        complaint_text: Complaint text inserted verbatim.

    Returns:
        The fully formatted prompt string.
    """
    def _as_pretty_json(reference):
        return json.dumps(reference, ensure_ascii=False, indent=2)

    template = """
# –†–æ–ª—å
–¢—ã ‚Äî –æ–ø—ã—Ç–Ω—ã–π –∞–Ω–∞–ª–∏—Ç–∏–∫ —Å–ª—É–∂–±—ã –ø–æ–¥–¥–µ—Ä–∂–∫–∏ –±–∞–Ω–∫–∞. –¢–≤–æ—è –∑–∞–¥–∞—á–∞ ‚Äî —Ç–æ—á–Ω–æ –∫–ª–∞—Å—Å–∏—Ñ–∏—Ü–∏—Ä–æ–≤–∞—Ç—å –∫–ª–∏–µ–Ω—Ç—Å–∫–∏–µ –∂–∞–ª–æ–±—ã –Ω–∞ –æ—Å–Ω–æ–≤–µ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω–Ω—ã—Ö —Å–ø—Ä–∞–≤–æ—á–Ω–∏–∫–æ–≤.

# –ë–∞–∑–∞ –∑–Ω–∞–Ω–∏–π

## –°–ø—Ä–∞–≤–æ—á–Ω–∏–∫ –ø—Ä–æ–¥—É–∫—Ç–æ–≤:
{products_json}

## –°–ø—Ä–∞–≤–æ—á–Ω–∏–∫ —Ç–∏–ø–æ–≤ –∂–∞–ª–æ–± –∏ –ø—Ä–æ–¥—É–∫—Ç–æ–≤:
{types_products_json}

# –ò–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏ –ø–æ –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–∏:
1. –ê–Ω–∞–ª–∏–∑ —Ç–µ–∫—Å—Ç–∞ –∂–∞–ª–æ–±—ã: –í–Ω–∏–º–∞—Ç–µ–ª—å–Ω–æ –ø—Ä–æ—á–∏—Ç–∞–π –∂–∞–ª–æ–±—É –∏ –≤—ã–¥–µ–ª–∏ –∫–ª—é—á–µ–≤—ã–µ —Å–ª–æ–≤–∞, —Ñ—Ä–∞–∑—ã –∏ —Å—É—Ç—å –ø—Ä–æ–±–ª–µ–º—ã.
2. –û–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ü—Ä–æ–¥—É–∫—Ç–∞: –°–æ–ø–æ—Å—Ç–∞–≤—å —Å–æ–¥–µ—Ä–∂–∞–Ω–∏–µ –∂–∞–ª–æ–±—ã —Å –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∏—è–º–∏ –∏ –∫–ª—é—á–µ–≤—ã–º–∏ —Å–ª–æ–≤–∞–º–∏ –∏–∑ products.json. –í—ã–±–µ—Ä–∏ –Ω–∞–∏–±–æ–ª–µ–µ –ø–æ–¥—Ö–æ–¥—è—â–∏–π –ø—Ä–æ–¥—É–∫—Ç –∏–∑ —Å–ø–∏—Å–∫–∞. –ï—Å–ª–∏ –∂–∞–ª–æ–±–∞ –∑–∞—Ç—Ä–∞–≥–∏–≤–∞–µ—Ç –Ω–µ—Å–∫–æ–ª—å–∫–æ –ø—Ä–æ–¥—É–∫—Ç–æ–≤, –≤—ã–±–µ—Ä–∏ –æ—Å–Ω–æ–≤–Ω–æ–π.
3. –û–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –¢–∏–ø–∞: –ò—Å–ø–æ–ª—å–∑—É–π types_products.json –¥–ª—è –ø–æ–∏—Å–∫–∞ –ø–æ–¥—Ö–æ–¥—è—â–µ–≥–æ —Ç–∏–ø–∞. –£—á–∏—Ç—ã–≤–∞–π —Å–≤—è–∑—å –º–µ–∂–¥—É —Ç–∏–ø–æ–º –∏ –ø—Ä–æ–¥—É–∫—Ç–æ–º - —Ç–∏–ø –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å —Å–≤—è–∑–∞–Ω —Å –≤—ã–±—Ä–∞–Ω–Ω—ã–º –ø—Ä–æ–¥—É–∫—Ç–æ–º. –°–æ–ø–æ—Å—Ç–∞–≤—å –∫–ª—é—á–µ–≤—ã–µ —Å–ª–æ–≤–∞ –∏–∑ –∂–∞–ª–æ–±—ã —Å type_tags –∏ type_description.

# –°–¢–†–û–ì–ò–ï –ü–†–ê–í–ò–õ–ê:
- –ò—Å–ø–æ–ª—å–∑—É–π –¢–û–õ–¨–ö–û –Ω–∞–∑–≤–∞–Ω–∏—è —Ç–∏–ø–æ–≤ –∏ –ø—Ä–æ–¥—É–∫—Ç–æ–≤ –∏–∑ –ø—Ä–µ–¥–æ—Å—Ç–∞–≤–ª–µ–Ω–Ω—ã—Ö —Å–ø—Ä–∞–≤–æ—á–Ω–∏–∫–æ–≤
- –ù–µ –ø—Ä–∏–¥—É–º—ã–≤–∞–π –Ω–æ–≤—ã–µ –Ω–∞–∑–≤–∞–Ω–∏—è –∏–ª–∏ –∫–∞—Ç–µ–≥–æ—Ä–∏–∏
- –ï—Å–ª–∏ –Ω–µ—Ç —Ç–æ—á–Ω–æ–≥–æ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏—è, –≤—ã–±–µ—Ä–∏ –Ω–∞–∏–±–æ–ª–µ–µ –±–ª–∏–∑–∫–∏–π –≤–∞—Ä–∏–∞–Ω—Ç –∏–∑ —Å—É—â–µ—Å—Ç–≤—É—é—â–∏—Ö –≤ —Å–ø—Ä–∞–≤–æ—á–Ω–∏–∫–∞—Ö
- –ù–∞–∑–≤–∞–Ω–∏—è —Ç–∏–ø–æ–≤ –∏ –ø—Ä–æ–¥—É–∫—Ç–æ–≤ –¥–æ–ª–∂–Ω—ã —Ç–æ—á–Ω–æ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–æ–≤–∞—Ç—å —Å–ø—Ä–∞–≤–æ—á–Ω–∏–∫–∞–º

# –§–û–†–ú–ê–¢ –û–¢–í–ï–¢–ê (JSON):
{{
    "type": "–Ω–∞–∑–≤–∞–Ω–∏–µ –≤—ã–±—Ä–∞–Ω–Ω–æ–≥–æ —Ç–∏–ø–∞",
    "product": "–Ω–∞–∑–≤–∞–Ω–∏–µ –≤—ã–±—Ä–∞–Ω–Ω–æ–≥–æ –ø—Ä–æ–¥—É–∫—Ç–∞",
    "confidence_level": "–≤—ã—Å–æ–∫–∏–π/—Å—Ä–µ–¥–Ω–∏–π/–Ω–∏–∑–∫–∏–π",
    "explanation": "–∫—Ä–∞—Ç–∫–æ–µ –æ–±–æ—Å–Ω–æ–≤–∞–Ω–∏–µ –≤—ã–±–æ—Ä–∞ –Ω–∞ –æ—Å–Ω–æ–≤–µ –∫–ª—é—á–µ–≤—ã—Ö —Å–ª–æ–≤ –∏–∑ —Ç–µ–∫—Å—Ç–∞"
}}

–¢–µ–∫—Å—Ç –∂–∞–ª–æ–±—ã –¥–ª—è –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–∏: 
{complaint_text}

–¢–≤–æ–π –æ—Ç–≤–µ—Ç:
"""
    substitutions = {
        "products_json": _as_pretty_json(products_kb),
        "types_products_json": _as_pretty_json(categories_kb),
        "complaint_text": complaint_text,
    }
    return template.format(**substitutions)


def _load_json_payload(response_text):
    """Decode *response_text* as JSON, tolerating text around the JSON object.

    Models frequently wrap the requested JSON in a markdown code fence or add
    prose around it. When decoding the whole text fails, the span from the
    first ``{`` to the last ``}`` is tried instead.
    """
    try:
        return json.loads(response_text)
    except json.JSONDecodeError:
        start = response_text.find("{")
        end = response_text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(response_text[start:end + 1])


def parse_classification_response(response_text, error_message=""):
    """Turn the raw model answer into a classification dict.

    Args:
        response_text: Raw text returned by the model (may be empty or None).
        error_message: Context appended to the explanation when a fallback
            classification is returned.

    Returns:
        A dict with the keys ``type``, ``product``, ``confidence_level`` and
        ``explanation``. Keys missing from the model answer take their values
        from ``DEFAULT_CLASSIFICATION``. An empty or unparsable answer yields a
        copy of ``DEFAULT_CLASSIFICATION`` whose explanation describes the
        problem.
    """
    if not response_text:
        classification = DEFAULT_CLASSIFICATION.copy()
        classification["explanation"] = f"–ü—É—Å—Ç–æ–π –æ—Ç–≤–µ—Ç –º–æ–¥–µ–ª–∏. {error_message}".strip()
        return classification

    try:
        parsed = _load_json_payload(response_text)
        # Some answers nest the fields under a "classification" key.
        if isinstance(parsed, dict) and isinstance(parsed.get("classification"), dict):
            parsed = parsed["classification"]
        if not isinstance(parsed, dict):
            raise TypeError(f"expected a JSON object, got {type(parsed).__name__}")

        return {
            key: parsed[key] if key in parsed else DEFAULT_CLASSIFICATION[key]
            for key in ("type", "product", "confidence_level", "explanation")
        }
    except (json.JSONDecodeError, AttributeError, TypeError) as exc:
        classification = DEFAULT_CLASSIFICATION.copy()
        classification["explanation"] = f"–û—à–∏–±–∫–∞ –ø–∞—Ä—Å–∏–Ω–≥–∞: {exc}. {error_message}".strip()
        return classification



In [None]:
def process_single_complaint(complaint_text, products_kb, categories_kb, index, text_column='text'):
    """Clean, embed and classify one complaint.

    Args:
        complaint_text: Raw complaint text.
        products_kb: Products reference passed to ``create_prompt``.
        categories_kb: Complaint types/products reference passed to
            ``create_prompt``.
        index: Zero-based position of the complaint; stored as ``batch_index``
            and, plus one, as ``id``.
        text_column: Accepted for interface compatibility; not used here.

    Returns:
        A result dict with the keys ``id``, ``original_text``, ``cleaned_text``,
        ``embedding``, ``embedding_dimension``, ``prompt``, ``classification``,
        ``processed_at`` and ``batch_index``. If any step raises, the record
        carries a fallback classification whose explanation holds the error,
        an empty cleaned text/prompt and no embedding.
    """
    try:
        cleaned_text = clean_complaint_text(complaint_text)
        # The full vector is no longer printed: dumping it for every complaint
        # flooded the output without adding information.
        embedding = call_gpt_oss_embedding(cleaned_text, model="bge-m3:latest")

        individual_prompt = create_prompt(products_kb, categories_kb, cleaned_text)
        classification_response = call_gpt_oss(individual_prompt, model="gpt-oss:120b")
        classification_data = parse_classification_response(classification_response, error_message=f"–ñ–∞–ª–æ–±–∞ #{index + 1}")

        return {
            'id': index + 1,
            'original_text': complaint_text,
            'cleaned_text': cleaned_text,
            'embedding': embedding,
            'embedding_dimension': len(embedding) if embedding else 0,
            'prompt': individual_prompt,
            'classification': classification_data,
            'processed_at': datetime.now().isoformat(),
            'batch_index': index
        }

    except Exception as e:
        # Deliberate catch-all: one failing complaint must not abort the batch.
        print(f"‚ùå –û—à–∏–±–∫–∞ –æ–±—Ä–∞–±–æ—Ç–∫–∏ –∂–∞–ª–æ–±—ã {index}: {e}")
        fallback = DEFAULT_CLASSIFICATION.copy()
        fallback["explanation"] = f"–û—à–∏–±–∫–∞ –æ–±—Ä–∞–±–æ—Ç–∫–∏: {e}"
        return {
            'id': index + 1,
            'original_text': complaint_text,
            'cleaned_text': '',
            'embedding': None,
            'embedding_dimension': 0,
            'prompt': '',
            'classification': fallback,
            'processed_at': datetime.now().isoformat(),
            'batch_index': index
        }


def save_results(results, batch_number, total_processed, output_dir="classification_results"):
    """Write one batch of results to a timestamped JSON file.

    Args:
        results: List of result dicts for the batch.
        batch_number: Batch number, used in the file name and metadata.
        total_processed: Running count of processed complaints (metadata only).
        output_dir: Target directory; created if it does not exist.

    Returns:
        Path of the written file, ``<output_dir>/batch_<n>_<timestamp>.json``.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = os.path.join(output_dir, f"batch_{batch_number}_{timestamp}.json")

    # Take the dimension from the first result that actually has an embedding;
    # looking only at results[0] reported 0 whenever the first item had failed.
    embedding_dimension = next(
        (len(item['embedding']) for item in (results or ()) if item.get('embedding')),
        0,
    )

    results_data = {
        "metadata": {
            "batch_number": batch_number,
            "total_processed": total_processed,
            "timestamp": timestamp,
            "saved_at": datetime.now().isoformat(),
            "embedding_dimension": embedding_dimension
        },
        "results": results
    }

    with open(filename, 'w', encoding='utf-8') as f:
        # default=str stringifies anything json cannot serialise natively.
        json.dump(results_data, f, ensure_ascii=False, indent=2, default=str)

    print(f"‚úì Batch {batch_number} —Å–æ—Ö—Ä–∞–Ω–µ–Ω: {filename} ({total_processed} –∂–∞–ª–æ–±)")
    return filename


def merge_all_results(output_dir="classification_results"):
    """Concatenate the results of every ``batch_*.json`` file in *output_dir*.

    Files are merged in numeric batch order (a plain lexicographic sort would
    place ``batch_10`` before ``batch_2``); files sharing a batch number are
    ordered by name. Every matching file in the directory is included, so
    files left over from earlier runs are merged too.

    Args:
        output_dir: Directory holding the batch files.

    Returns:
        The merged list of result dicts, also written to
        ``<output_dir>/all_results_merged.json``; ``None`` when the directory
        does not exist.
    """
    if not os.path.exists(output_dir):
        print("‚ùå –î–∏—Ä–µ–∫—Ç–æ—Ä–∏—è —Å —Ä–µ–∑—É–ª—å—Ç–∞—Ç–∞–º–∏ –Ω–µ –Ω–∞–π–¥–µ–Ω–∞")
        return

    def _batch_sort_key(name):
        # Names that do not follow batch_<number>_... sort after the numbered ones.
        match = re.match(r'batch_(\d+)_', name)
        if match:
            return (0, int(match.group(1)), name)
        return (1, 0, name)

    batch_files = [
        name for name in os.listdir(output_dir)
        if name.startswith('batch_') and name.endswith('.json')
    ]

    all_results = []
    for batch_file in sorted(batch_files, key=_batch_sort_key):
        batch_path = os.path.join(output_dir, batch_file)
        with open(batch_path, 'r', encoding='utf-8') as f:
            all_results.extend(json.load(f)['results'])

    merged_file = os.path.join(output_dir, "all_results_merged.json")
    with open(merged_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2, default=str)

    print(f"‚úÖ –í—Å–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –æ–±—ä–µ–¥–∏–Ω–µ–Ω—ã: {merged_file}")
    print(f"üìÅ –í—Å–µ–≥–æ –∑–∞–ø–∏—Å–µ–π: {len(all_results)}")
    return all_results


def process_complaints_parallel(complaints, batch_size=100, n_jobs=-1, output_dir="classification_results", text_column='text'):
    """Classify complaints batch by batch, running each batch through joblib.

    After every batch the results are saved with ``save_results`` and a
    progress/ETA line is printed. When all batches are done, the batch files
    in *output_dir* are merged with ``merge_all_results``.

    Args:
        complaints: Sliceable sequence of complaint texts.
        batch_size: Number of complaints per batch; must be positive.
        n_jobs: joblib ``n_jobs`` value (``-1`` means all available cores).
        output_dir: Directory for per-batch and merged result files.
        text_column: Forwarded to ``process_single_complaint``.

    Returns:
        List of result dicts for all complaints, in input order.

    Raises:
        ValueError: If *batch_size* is not positive.
    """
    if batch_size <= 0:
        # A zero step makes range() fail and a negative one silently skips all work.
        raise ValueError("batch_size must be a positive integer")

    products_kb, categories_kb = load_knowledge_bases()
    total_complaints = len(complaints)

    print(f"üöÄ –ù–∞—á–∞–ª–æ –ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–π –æ–±—Ä–∞–±–æ—Ç–∫–∏ {total_complaints} –∂–∞–ª–æ–±")
    print(f"üîß –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —è–¥–µ—Ä: {n_jobs if n_jobs != -1 else '–≤—Å–µ –¥–æ—Å—Ç—É–ø–Ω—ã–µ'}")
    print(f"üì¶ –†–∞–∑–º–µ—Ä batch: {batch_size}")
    print("‚îÄ" * 50)

    start_time = time.time()
    all_results = []
    batch_number = 1

    for batch_start in range(0, total_complaints, batch_size):
        batch_end = min(batch_start + batch_size, total_complaints)
        batch_complaints = complaints[batch_start:batch_end]
        batch_indices = range(batch_start, batch_end)

        print(f"üîÑ –û–±—Ä–∞–±–æ—Ç–∫–∞ batch {batch_number}: {batch_start}-{batch_end-1}")
        batch_results = Parallel(n_jobs=n_jobs, verbose=10)(
            delayed(process_single_complaint)(
                complaint, products_kb, categories_kb, idx, text_column
            ) for idx, complaint in zip(batch_indices, batch_complaints)
        )

        all_results.extend(batch_results)
        save_results(batch_results, batch_number, batch_end, output_dir)

        # Linear extrapolation from the average time per processed complaint.
        elapsed_time = time.time() - start_time
        estimated_total = (elapsed_time / batch_end) * total_complaints if batch_end else 0
        remaining_time = estimated_total - elapsed_time

        print(f"‚úÖ Batch {batch_number} –∑–∞–≤–µ—Ä—à–µ–Ω: {batch_end}/{total_complaints}")
        print(f"‚è±Ô∏è  –ü—Ä–æ—à–ª–æ: {elapsed_time/60:.1f} –º–∏–Ω | –û—Å—Ç–∞–ª–æ—Å—å: ~{remaining_time/60:.1f} –º–∏–Ω")
        print("‚îÄ" * 50)

        batch_number += 1

    total_time = time.time() - start_time
    # Elapsed time can be exactly 0 (empty input, coarse clock); avoid dividing by it.
    complaints_per_minute = total_complaints / (total_time / 60) if total_time > 0 else 0.0
    print(f"üéâ –û–±—Ä–∞–±–æ—Ç–∫–∞ –∑–∞–≤–µ—Ä—à–µ–Ω–∞ –∑–∞ {total_time/60:.1f} –º–∏–Ω—É—Ç")
    print(f"üìä –°—Ä–µ–¥–Ω—è—è —Å–∫–æ—Ä–æ—Å—Ç—å: {complaints_per_minute:.1f} –∂–∞–ª–æ–±/–º–∏–Ω")

    merge_all_results(output_dir)
    return all_results


def create_comparison_dataframe(merged_results_file, original_df):
    """Attach the model's classification columns to a copy of *original_df*.

    Each record of the merged results file is placed at row label ``id - 1``.
    When the file holds several records with the same ``id`` (for example
    after repeated runs into the same directory) the last one wins; without
    this, the index-aligned assignment fails on duplicate labels. Rows with no
    matching record get NaN in the model columns.

    NOTE(review): alignment is by index label, so this presumably expects
    *original_df* to carry the default 0..n-1 index - confirm with callers.

    Args:
        merged_results_file: Path to the JSON list of result dicts.
        original_df: DataFrame to extend; it is not modified.

    Returns:
        A copy of *original_df* with four additional model columns.
    """
    with open(merged_results_file, 'r', encoding='utf-8') as f:
        classification_results = json.load(f)

    model_columns = ['–º–æ–¥–µ–ª—å_–ø—Ä–æ–¥—É–∫—Ç', '–º–æ–¥–µ–ª—å_—Ç–∏–ø_–∂–∞–ª–æ–±—ã', '–º–æ–¥–µ–ª—å_—É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å', '–º–æ–¥–µ–ª—å_–æ–±–æ—Å–Ω–æ–≤–∞–Ω–∏–µ']

    model_data = []
    for result in classification_results:
        classification = result.get('classification')
        if isinstance(classification, str):
            classification = parse_classification_response(classification)
        elif not isinstance(classification, dict):
            # Missing or malformed classification: use the default record.
            classification = DEFAULT_CLASSIFICATION.copy()

        model_data.append({
            'index': result['id'] - 1,
            '–º–æ–¥–µ–ª—å_–ø—Ä–æ–¥—É–∫—Ç': classification.get('product', '---'),
            '–º–æ–¥–µ–ª—å_—Ç–∏–ø_–∂–∞–ª–æ–±—ã': classification.get('type', '---'),
            '–º–æ–¥–µ–ª—å_—É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å': classification.get('confidence_level', '–Ω–∏–∑–∫–∏–π'),
            '–º–æ–¥–µ–ª—å_–æ–±–æ—Å–Ω–æ–≤–∞–Ω–∏–µ': classification.get('explanation', '')
        })

    # Passing the columns explicitly keeps set_index working for an empty file.
    model_df = pd.DataFrame(model_data, columns=['index'] + model_columns)
    model_df = model_df.set_index('index')
    model_df = model_df.loc[~model_df.index.duplicated(keep='last')]

    comparison_df = original_df.copy()
    for col in model_columns:
        comparison_df[col] = model_df[col]

    return comparison_df



In [None]:
# Load the complaints table (UTF-16 encoded, tab-separated) and extract one
# column as a plain list; its values are passed on as the complaint texts.
df = pd.read_csv('complaints.csv', encoding='utf-16', sep='\t')
complaints = list(df['–û–ø–∏—Å–∞–Ω–∏–µ –ø—Ä–µ—Ç–µ–Ω–∑–∏–∏'])



In [None]:
# Send INFO-level (and above) log records to stdout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)

# Only the first complaint is processed here (complaints[:1]); widen the slice
# to process more. Batch files and the merged file go to "parallel_results".
results = process_complaints_parallel(
    complaints=complaints[:1],
    batch_size=100,
    n_jobs=-1,
    output_dir="parallel_results",
    text_column='text'
)

# Join the model output from the merged results file onto the selected
# columns of the source table.
comparison_df = create_comparison_dataframe(
    merged_results_file='parallel_results/all_results_merged.json',
    original_df=df[["–û–ø–∏—Å–∞–Ω–∏–µ –ø—Ä–µ—Ç–µ–Ω–∑–∏–∏", "–¢–∏–ø", "–ü–æ–¥—Ç–∏–ø", "–ü—Ä–µ—Ç–µ–Ω–∑–∏—è –ø–æ –ø—Ä–æ–¥—É–∫—Ç—É"]]
)

# comparison_df.to_csv('final_classified_complaints.csv', index=False, encoding='utf-8-sig')
# print("‚úÖ –ò—Ç–æ–≥–æ–≤—ã–π –¥–∞—Ç–∞—Ñ—Ä–µ–π–º —Å–æ—Ö—Ä–∞–Ω–µ–Ω –≤ final_classified_complaints.csv")

# Show the model columns for the first rows of the comparison table.
print("\n=== –ü–ï–†–í–´–ï 5 –ó–ê–ü–ò–°–ï–ô ===")
print(comparison_df[['–º–æ–¥–µ–ª—å_–ø—Ä–æ–¥—É–∫—Ç', '–º–æ–¥–µ–ª—å_—Ç–∏–ø_–∂–∞–ª–æ–±—ã', '–º–æ–¥–µ–ª—å_—É–≤–µ—Ä–µ–Ω–Ω–æ—Å—Ç—å']].head())

