In [None]:
import os
import json
import shutil
import re

# --- Paths -------------------------------------------------------------------
# Folder that holds one sub-folder per report; the copy loop below treats each
# sub-folder name as the report's SHA1.
# The path is assembled with os.path.join instead of the former literal
# "RAG-Challenge-\archive": inside a normal string "\a" is the BEL control
# character (0x07), not a backslash followed by "a", so that literal never
# pointed at the intended directory.
base_dir = os.path.join("RAG-Challenge-", "archive")
# Destination folder for the copied, company-named markdown files.
output_dir = "/renamed_reports"
# JSON metadata file; the code below reads "sha1" and "company_name" from each
# of its records.
json_file_path = "/data/meta.json"

os.makedirs(output_dir, exist_ok=True)

# --- Load metadata -----------------------------------------------------------
try:
    with open(json_file_path, 'r', encoding='utf-8') as f:
        company_data = json.load(f)
    print(f"JSON данные успешно загружены из {json_file_path}")
except Exception as e:
    print(f"Ошибка при загрузке JSON файла: {str(e)}")
    # Stop here: nothing below can work without the metadata.
    # `raise SystemExit` is used instead of `exit(1)` because `exit` is a helper
    # meant for interactive sessions (the `site` docs advise against using it in
    # programs) and may not halt a notebook cell immediately
    # (NOTE(review): confirm against the IPython kernel in use). Raising
    # SystemExit halts a notebook cell and still exits a plain script with
    # status 1.
    raise SystemExit(1) from e

def sanitize_filename(filename):
    """Return ``filename`` with every forbidden character replaced by ``_``.

    The forbidden characters are: backslash, forward slash, asterisk,
    question mark, colon, double quote, less-than, greater-than and pipe.
    All other characters are returned unchanged.
    """
    forbidden_chars = re.compile(r'[\\/*?:"<>|]')
    return forbidden_chars.sub('_', filename)

# Lookup table built from the metadata records: report SHA1 -> company name.
sha1_to_company = {item["sha1"]: item["company_name"] for item in company_data}

# Counters reported in the summary at the end.
processed = 0
skipped = 0
errors = 0

# Destination path -> SHA1 that already produced it during this run.
# Different SHA1 folders can map to the same company name (or to names that
# become identical after sanitising); without this check the later copy would
# silently overwrite the earlier one. Keys go through os.path.normcase so that
# names differing only by case are also treated as the same file on platforms
# where normcase folds case.
claimed_targets = {}

print(f"Начинаем обработку папок в {base_dir}...")
# os.listdir returns entries in arbitrary order; sorting makes it deterministic
# which folder wins when two of them map to the same output file.
for sha1 in sorted(os.listdir(base_dir)):
    folder_path = os.path.join(base_dir, sha1)
    # Only sub-folders are candidates; stray files in base_dir are ignored
    # without being counted.
    if not os.path.isdir(folder_path):
        continue

    if sha1 not in sha1_to_company:
        print(f"Предупреждение: Не найдено название компании для SHA1 {sha1}, пропускаем...")
        skipped += 1
        continue

    company_name = sha1_to_company[sha1]
    safe_company_name = sanitize_filename(company_name)

    # The markdown file is expected to be named after its folder: <sha1>/<sha1>.md
    md_file = os.path.join(folder_path, f"{sha1}.md")

    if not os.path.exists(md_file):
        print(f"Предупреждение: Не найден markdown файл для SHA1 {sha1}, пропускаем...")
        skipped += 1
        continue

    new_file = os.path.join(output_dir, f"{safe_company_name}.md")

    # Refuse to overwrite a file that was already written during this run.
    target_key = os.path.normcase(new_file)
    if target_key in claimed_targets:
        print(
            f"Предупреждение: Файл {safe_company_name}.md уже создан из SHA1 "
            f"{claimed_targets[target_key]}, SHA1 {sha1} пропускаем..."
        )
        skipped += 1
        continue

    try:
        shutil.copy2(md_file, new_file)
    except Exception as e:
        # Best effort: report the failure and carry on with the next folder.
        print(f"Ошибка копирования {sha1}.md: {str(e)}")
        errors += 1
    else:
        # Claim the destination only after a successful copy, so a failed copy
        # does not block another folder that maps to the same name.
        claimed_targets[target_key] = sha1
        print(f"Успешно скопирован: {sha1}.md -> {safe_company_name}.md")
        processed += 1


# Summary of the run.
print(f"- Успешно обработано файлов: {processed}")
print(f"- Пропущено файлов: {skipped}")
print(f"- Встречено ошибок: {errors}")
print(f"Переименованные файлы находятся в: {output_dir}")

In [None]:

import os
import re
from opensearchpy import OpenSearch

# Connection settings for the OpenSearch cluster.
# Host, port and credentials are read from environment variables so that the
# notebook contains no secrets and remains valid Python (the previous blank
# placeholder `"port": }` was a syntax error that prevented the cell from
# running at all).
# The variable names OPENSEARCH_HOST / OPENSEARCH_PORT / OPENSEARCH_USER /
# OPENSEARCH_PASSWORD are introduced here; the fallbacks "localhost" and 9200
# are assumed defaults — TODO confirm against the target deployment.
OPENSEARCH_CONFIG = {
    "hosts": [{
        "host": os.environ.get("OPENSEARCH_HOST", "localhost"),
        # The port is stored as an integer; a non-numeric OPENSEARCH_PORT raises
        # ValueError here, before any connection is attempted.
        "port": int(os.environ.get("OPENSEARCH_PORT", "9200")),
    }],
    # (user, password) pair; both fall back to an empty string when unset,
    # matching the former blank placeholders.
    "http_auth": (
        os.environ.get("OPENSEARCH_USER", ""),
        os.environ.get("OPENSEARCH_PASSWORD", ""),
    ),
    "use_ssl": True,
    "verify_certs": True,
    "ssl_show_warn": False,
    "timeout": 120,
    "retry_on_timeout": True,
    "max_retries": 3
}

# Client object used by the rest of this cell for all index and search calls.
client = OpenSearch(**OPENSEARCH_CONFIG)

# Name of the index that stores one document per report page.
INDEX_NAME = "annual_reports"

# Create the index on first use only; an existing index is left untouched.
if client.indices.exists(index=INDEX_NAME):
    print(f"Индекс {INDEX_NAME} уже существует.")
else:
    index_body = {
        "settings": {
            "analysis": {
                # Custom analyzer: standard tokenizer, then lowercasing,
                # English stop-word removal and English stemming.
                "analyzer": {
                    "rebuilt_english": {
                        "tokenizer": "standard",
                        "filter": ["lowercase", "english_stop", "english_stemmer"]
                    }
                },
                # Definitions of the two named token filters used above.
                "filter": {
                    "english_stop": {"type": "stop", "stopwords": "_english_"},
                    "english_stemmer": {"type": "stemmer", "language": "english"}
                }
            }
        },
        "mappings": {
            "properties": {
                # Exact-match field holding the company name.
                "company": {"type": "keyword"},
                "page_number": {"type": "integer"},
                # Full-text field analysed with the custom analyzer.
                "text": {"type": "text", "analyzer": "rebuilt_english"}
            }
        }
    }
    client.indices.create(index=INDEX_NAME, body=index_body)
    print(f"Индекс {INDEX_NAME} создан.")

# Folder with the company-named markdown reports to index.
MARKDOWN_FOLDER = "/renamed_reports"

# A page marker is a whole line of the form "{<digits>}" followed by one or more
# dashes (optionally trailing whitespace); the digits are captured as the page
# number.
page_marker_pattern = re.compile(r"^\{(\d+)\}-+\s*$", re.MULTILINE)

# sorted() gives a stable processing order; os.listdir order is arbitrary.
for file in sorted(os.listdir(MARKDOWN_FOLDER)):
    if not file.endswith(".md"):
        continue

    # The file name without its extension is used as the company name.
    company_name = os.path.splitext(file)[0]
    file_path = os.path.join(MARKDOWN_FOLDER, file)
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Because the pattern has exactly one capturing group, split() returns
    #   [preamble, number, text, number, text, ...]
    # i.e. always an odd-length list whose element 0 is whatever precedes the
    # first marker (possibly an empty string).
    parts = page_marker_pattern.split(content)

    # Start at index 1 unconditionally. The earlier logic dropped parts[0] only
    # when it was blank, so any text before the first marker shifted the
    # number/text pairing by one and caused every page of the file to be
    # skipped. Text before the first marker has no page number and is not
    # indexed.
    for i in range(1, len(parts), 2):
        # parts[i] was matched by (\d+), so int() cannot fail here.
        page_num = int(parts[i])
        page_text = parts[i + 1].strip()
        doc = {
            "company": company_name,
            "page_number": page_num,
            "text": page_text
        }
        # A deterministic document id makes re-running this cell overwrite the
        # same documents instead of adding duplicates under fresh auto-generated
        # ids. NOTE(review): if one file repeats a page number, the later page
        # replaces the earlier one — confirm markers are unique per file.
        client.index(index=INDEX_NAME, body=doc, id=f"{company_name}_{page_num}")
    print(f"Проиндексирован файл: {file}")

def search_relevant_page(query, result_size=5):
    """Run a ``match`` query on the ``text`` field of ``INDEX_NAME``.

    Args:
        query: Search text; it is analysed with the ``rebuilt_english`` analyzer.
        result_size: Value sent as the request's ``size`` (default 5).

    Returns:
        Whatever ``client.search`` returns for the request, unmodified.
    """
    match_clause = {
        "query": query,
        "analyzer": "rebuilt_english"
    }
    return client.search(
        index=INDEX_NAME,
        body={
            "query": {"match": {"text": match_clause}},
            "size": result_size
        },
    )

# Ask the user for a query and run it against the index.
user_query = input("Введите поисковый запрос: ")
search_results = search_relevant_page(user_query)

# Print every hit: a header line with score, company and page number, then the
# stored page text, then an 80-character separator line.
print("\nРезультаты поиска:")
for hit in search_results["hits"]["hits"]:
    source = hit["_source"]
    header = f"Score: {hit['_score']} | Company: {source['company']} | Page: {source['page_number']}"
    print(header)
    print(source["text"])
    print("-" * 80)
