In [None]:
# Install the third-party packages for this notebook (versions are not pinned).
# Note: rank_bm25 is not imported in any of the visible cells.
!pip install rank_bm25 rbo sentence-transformers


In [None]:
import json

def parse_our_model_with_queries(path):
    """Parse a text log of model runs into (query, games) pairs.

    The file is scanned line by line. A line whose stripped text starts with
    ``"Текст:"`` opens a record; the rest of that line is the query. The first
    following line that starts with ``{`` opens the JSON response, which is
    collected until its braces balance and is then decoded with ``json.loads``.
    The ``"data"`` value of the response is split on commas into game names.

    Robustness rules:
      * A query that is followed by another query line before any JSON line is
        reported and skipped, so it is never paired with a later query's
        response.
      * Braces inside JSON string literals are not counted when balancing.
      * If a new query line appears while a JSON block is still open, the block
        is treated as unterminated and the new query is parsed normally.
      * ``"data"`` may be a string, a list of names, or missing/null.

    Args:
        path: Path of the log file. It is read as UTF-8; undecodable bytes are
            replaced rather than raising.

    Returns:
        list[dict]: ``{"query": str, "games": list[str]}`` for every record
        whose JSON decoded successfully, in file order. Records with invalid
        JSON are reported via ``print`` and skipped.
    """
    query_prefix = "Текст:"

    with open(path, "r", encoding="utf-8", errors="replace") as f:
        lines = f.readlines()

    results = []
    i = 0
    n = len(lines)

    while i < n:
        line = lines[i].strip()

        if not line.startswith(query_prefix):
            i += 1
            continue

        # 1) Query text is whatever follows the prefix on the same line
        query = line[len(query_prefix):].strip()

        # 2) Advance to the JSON response, but stop at the next query line so
        #    that a missing response cannot steal the following record's JSON
        i += 1
        while i < n:
            head = lines[i].lstrip()
            if head.startswith("{") or head.startswith(query_prefix):
                break
            i += 1

        if i >= n:
            break  # end of file reached without finding a response

        if not lines[i].lstrip().startswith("{"):
            print("Нет JSON-ответа для запроса:", query)
            continue  # `i` already points at the next query line

        # 3) Collect the JSON block by balancing braces outside string literals
        json_lines = []
        brace_level = 0
        started = False

        while i < n:
            current = lines[i]
            if json_lines and current.lstrip().startswith(query_prefix):
                # Unterminated JSON: leave the next record for the outer loop
                break
            json_lines.append(current)

            # JSON strings cannot contain raw newlines, so the in-string state
            # is reset per line; a stray quote cannot poison the rest of the file
            in_string = False
            escaped = False
            for ch in current:
                if in_string:
                    if escaped:
                        escaped = False
                    elif ch == "\\":
                        escaped = True
                    elif ch == '"':
                        in_string = False
                elif ch == '"':
                    in_string = True
                elif ch == "{":
                    brace_level += 1
                    started = True
                elif ch == "}":
                    brace_level -= 1

            i += 1
            if started and brace_level == 0:
                break

        json_str = "".join(json_lines).strip()

        # 4) Decode the JSON; on failure report it and move on to the next record
        try:
            obj = json.loads(json_str)
        except json.JSONDecodeError as e:
            print("Ошибка JSON для запроса:", query)
            print("Причина:", e)
            continue

        # The text starts with "{", so a successful decode always yields a dict
        raw_games = obj.get("data", "")
        if isinstance(raw_games, str):
            candidates = raw_games.split(",")
        elif isinstance(raw_games, (list, tuple)):
            candidates = [str(g) for g in raw_games]
        else:
            candidates = []  # null or any other non-textual payload

        games = [g.strip() for g in candidates if g.strip()]
        results.append({
            "query": query,
            "games": games
        })

    return results


# Parse the uploaded model log and report how many (query, answer) pairs were found
parsed = parse_our_model_with_queries('/content/data (2).txt')

print("Найдено пар (запрос + ответ):", len(parsed))
print("Пример первой пары:")
# Guarded preview: indexing an empty result would abort the cell with IndexError
parsed[0] if parsed else None


In [None]:
import json
import pandas as pd

# Location of the baseline output file (in Colab, upload it first)
json_path = "/content/queries_part1_full.json"

# Load the baseline model's JSON document
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

results = data["results"]

# One record per baseline result, numbered from 1 in file order
baseline_rows = [
    {
        "index": position,
        "query": entry["query"],
        "baseline_answer": entry["response"],
    }
    for position, entry in enumerate(results, start=1)
]

df_baseline = pd.DataFrame(baseline_rows)

print("Пример данных baseline:")
df_baseline.head()


In [None]:
# Split every baseline answer into a list of game names.
# Empty fragments (blank answers, trailing or doubled commas) are dropped, the
# same way parse_our_model_with_queries builds its "games" lists; otherwise a
# blank answer would become [""] and be scored as if it were a game title.
baseline_processed = []

for row in df_baseline.to_dict("records"):
    answer = row["baseline_answer"]
    # A missing (non-string) answer is treated as "no games" instead of crashing
    answer_text = answer if isinstance(answer, str) else ""
    games = [g.strip() for g in answer_text.split(",") if g.strip()]
    baseline_processed.append({
        "query": row["query"],
        "baseline_games": games,
        "baseline_str": row["baseline_answer"],
    })

df_baseline_clean = pd.DataFrame(baseline_processed)
df_baseline_clean.head()

In [None]:
# Reshape the parsed model output: keep the list of games and add a
# comma-joined string version of the same list
our_processed = [
    {
        "query": record["query"],
        "our_games": record["games"],
        "our_str": ", ".join(record["games"]),
    }
    for record in parsed
]

df_our_clean = pd.DataFrame(our_processed)
df_our_clean.head()

In [None]:
# Keep only the queries present in both tables (inner join on the query text)
df_full = pd.merge(df_our_clean, df_baseline_clean, on="query", how="inner")

# Prepend a 1-based running index as the first column
df_full.insert(loc=0, column="index", value=range(1, len(df_full) + 1))

len(df_full)

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer
import rbo

# Sentence-embedding model used by embedding_similarity below
# (the same model is instantiated again in a later cell).
model = SentenceTransformer("all-MiniLM-L6-v2")

def jaccard(a, b):
    """Jaccard index of two collections, compared as sets.

    Returns ``|a ∩ b| / |a ∪ b|``, or ``0`` when both collections are empty.
    """
    left, right = set(a), set(b)
    union = left | right
    if not union:
        return 0
    return len(left & right) / len(union)

def overlap_at_5(a, b, k=5):
    """Count the distinct items shared by the top-``k`` entries of two lists.

    Only the first ``k`` entries of each list are compared, so the result can
    never exceed ``k``; lists with at most ``k`` entries are compared whole.

    Args:
        a: First ranked list of items.
        b: Second ranked list of items.
        k: Cutoff applied to both lists (default 5, matching the function name).

    Returns:
        int: Number of distinct items present in both truncated lists.
    """
    top_a = set(list(a)[:k])
    top_b = set(list(b)[:k])
    return len(top_a & top_b)

def embedding_similarity(list1, list2):
    """Cosine similarity between the mean embeddings of two lists of strings.

    Each list is encoded with the module-level ``model`` (unit-normalised
    embeddings) and averaged into one centroid. The centroids are then
    re-normalised before the dot product: the mean of several unit vectors is
    generally shorter than 1, so without this step two identical lists would
    score below 1.0.

    Args:
        list1: First list of strings.
        list2: Second list of strings.

    Returns:
        float: Cosine similarity of the two centroids, or ``0.0`` when either
        list is empty or a centroid has zero length.
    """
    if not list1 or not list2:
        return 0.0

    emb1 = model.encode(list1, normalize_embeddings=True)
    emb2 = model.encode(list2, normalize_embeddings=True)

    centroid1 = emb1.mean(axis=0)
    centroid2 = emb2.mean(axis=0)

    norm_product = float(np.linalg.norm(centroid1) * np.linalg.norm(centroid2))
    if norm_product == 0.0:
        return 0.0  # degenerate centroid: similarity is undefined, report none

    return float(np.dot(centroid1, centroid2) / norm_product)


# Attach the list-level metrics; each one compares "our_games" with
# "baseline_games" row by row
for column_name, metric_fn in (
    ("overlap5", overlap_at_5),
    ("jaccard", jaccard),
    ("embed_sim", embedding_similarity),
):
    df_full[column_name] = df_full.apply(
        lambda row, fn=metric_fn: fn(row["our_games"], row["baseline_games"]),
        axis=1,
    )

df_full.head()


In [11]:
import numpy as np
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used by embed_list below
# (an earlier cell instantiates the same model under the same name).
model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_list(games):
    """Encode a list of game names with the module-level ``model``.

    Returns the unit-normalised embeddings produced by ``model.encode``. For an
    empty input a single all-zero row of width 384 is returned instead.
    """
    if games:
        return model.encode(games, normalize_embeddings=True)
    return np.zeros((1, 384))


In [12]:
def semantic_rbo(listA, listB, p=0.9):
    """Top-weighted average of best-match similarities from listA to listB.

    Every item of ``listA`` is scored by its highest dot-product similarity to
    any item of ``listB`` (embeddings come from ``embed_list``). The scores are
    averaged with geometric weights ``p ** rank`` (rank 0 is the first item),
    so earlier items of ``listA`` count more.

    The weights are normalised by their sum. The un-normalised geometric series
    only reaches ``1 - p ** n`` for ``n`` items, which would cap a perfect
    5-item match at about 0.41 for ``p = 0.9``.

    Args:
        listA: Ranked list whose items are scored.
        listB: List searched for the best match of each ``listA`` item.
        p: Persistence in ``[0, 1]``; smaller values concentrate the weight on
            the top of ``listA``, ``p = 1`` gives a plain mean.

    Returns:
        float: Weighted mean similarity, or ``0.0`` when either list is empty.
    """
    if not listA or not listB:
        return 0.0

    A = embed_list(listA)
    B = embed_list(listB)

    # Pairwise similarity matrix, shape len(A) x len(B)
    sim_matrix = np.dot(A, B.T)

    # Best match in B for each element of A (stands in for exact title equality)
    sim_scores = sim_matrix.max(axis=1)

    # Geometric rank weights 1, p, p^2, ... normalised to sum to one
    weights = float(p) ** np.arange(len(sim_scores))
    return float(np.dot(weights, sim_scores) / weights.sum())


In [13]:
def semantic_mrr(listA, listB):
    """Mean reciprocal position, within listB, of each listA item's best match.

    For every item of ``listA`` the most similar item of ``listB`` is located
    (dot product of the embeddings from ``embed_list``); the item contributes
    ``1 / (position + 1)`` where ``position`` is the 0-based index of that best
    match in ``listB``. The contributions are averaged over ``listA``.

    Empty inputs return ``0.0``. Without this guard the all-zero placeholder
    row from ``embed_list`` makes index 0 the "best match" and produces a
    perfect score of 1.0 for a missing answer.

    Args:
        listA: Items whose best matches are looked up.
        listB: Ranked list in which the best matches are located.

    Returns:
        float: Value in ``(0, 1]``, or ``0.0`` when either list is empty.
    """
    if not listA or not listB:
        return 0.0

    A = embed_list(listA)
    B = embed_list(listB)
    sim_matrix = np.dot(A, B.T)

    # Index in B of the most similar entry for every row of A
    best_positions = np.argmax(sim_matrix, axis=1)

    return float(np.mean(1.0 / (best_positions + 1)))


In [14]:
def ndcg_at_k(rels, k=5):
    """Normalised discounted cumulative gain at cutoff ``k``.

    Gain is ``2 ** rel - 1`` and the discount for 1-based position ``i`` is
    ``log2(i + 1)``. DCG uses the first ``k`` relevances in the given order.
    The ideal DCG uses the ``k`` largest relevances of the *whole* list, so
    relevant items ranked below the cutoff lower the score instead of being
    ignored.

    Args:
        rels: Relevance scores in ranked order (sequence or 1-D array).
        k: Number of top positions to evaluate.

    Returns:
        float: ``DCG@k / IDCG@k``, or ``0.0`` when the ideal DCG is not positive
        (for example an empty or all-zero relevance list).
    """
    scores = np.asarray(rels, dtype=float)

    def _dcg(values):
        # Positions 1..n map to discounts log2(2)..log2(n + 1)
        discounts = np.log2(np.arange(2, len(values) + 2))
        return np.sum((2 ** values - 1) / discounts)

    dcg = _dcg(scores[:k])
    idcg = _dcg(np.sort(scores)[::-1][:k])
    return float(dcg / idcg) if idcg > 0 else 0.0

def semantic_ndcg(listA, listB):
    """NDCG@5 over the best-match similarities of listA's items against listB.

    Both lists are embedded with ``embed_list``; each ``listA`` item gets, as
    its relevance, the largest dot-product similarity to any ``listB`` item.
    The resulting relevance list is passed to ``ndcg_at_k`` with ``k=5``.
    """
    similarities = np.dot(embed_list(listA), embed_list(listB).T)

    # Row-wise maximum: one relevance value per listA item
    best_per_item = similarities.max(axis=1)
    return ndcg_at_k(best_per_item, k=5)


In [15]:
# Compute each semantic metric row by row, comparing "our_games" with
# "baseline_games", and store it in a column of the same name
for column_name, metric_fn in (
    ("semantic_rbo", semantic_rbo),
    ("semantic_mrr", semantic_mrr),
    ("semantic_ndcg", semantic_ndcg),
):
    df_full[column_name] = df_full.apply(
        lambda row, fn=metric_fn: fn(row["our_games"], row["baseline_games"]),
        axis=1,
    )


In [21]:
# Blend the individual metrics into a single score; the weights add up to 1.0
score_weights = [
    ("embed_sim", 0.25),
    ("semantic_rbo", 0.25),
    ("semantic_ndcg", 0.20),
    ("semantic_mrr", 0.20),
    ("jaccard", 0.10),
]
weighted_terms = [weight * df_full[column] for column, weight in score_weights]

# Accumulate left to right, in the order listed above
blended = weighted_terms[0]
for term in weighted_terms[1:]:
    blended = blended + term

df_full["final_score"] = blended

# The same score expressed on a 0-100 scale
df_full["final_percent"] = df_full["final_score"] * 100




In [None]:
# Average of every metric (and of the blended score) over all rows of df_full
report_columns = [
    "overlap5",
    "jaccard",
    "embed_sim",
    "semantic_rbo",
    "semantic_mrr",
    "semantic_ndcg",
    "final_score",
]
report_stats = df_full[report_columns].mean()

report_stats


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram (with KDE overlay) of the blended score in percent
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df_full["final_percent"], bins=40, kde=True, ax=ax)
ax.set_title("Распределение итогового качества (final_score %) нашей модели")
ax.set_xlabel("Процент качества (%)")
ax.set_ylabel("Количество запросов")
plt.show()


In [None]:
metrics = ["overlap5", "jaccard", "embed_sim", "semantic_rbo", "semantic_mrr", "semantic_ndcg"]

# One histogram per metric, laid out on a 2 x 3 grid
fig, axes = plt.subplots(2, 3, figsize=(14, 8))
for ax, metric in zip(axes.flat, metrics):
    sns.histplot(df_full[metric], kde=True, ax=ax)
    ax.set_title(metric)
fig.tight_layout()
plt.show()


In [None]:
# The 20 rows with the lowest blended score
worst = df_full.sort_values(by="final_score", ascending=True).iloc[:20]
worst.loc[:, ["index", "query", "our_str", "baseline_str", "final_percent"]]


In [None]:
# The 20 rows with the highest blended score
best = df_full.sort_values(by="final_score", ascending=False).iloc[:20]
best.loc[:, ["index", "query", "our_str", "baseline_str", "final_percent"]]
