# Test Semantic Search on Labeled Set

In [None]:
import pandas as pd
from semantic_search.data import build_corpus
from semantic_search.local import LocalKnowledgeBase

## Configs

In [None]:
# Classification level; interpolated into the level CSV filename when loading data.
LEVEL: int = 4
# Model identifier passed to the knowledge base as `model_id`.
MODEL_ID: str = "BAAI/bge-m3"
# Template for each corpus text; formatted with `title` and `description`
# keyword arguments (unused placeholders are simply ignored by str.format).
DESCRIPTOR: str = """{title}"""

---

## Data

In [None]:
# Labeled test set: one query text and one gold code per row.
test_df = pd.read_csv("classification/ateco_2025/ateco_2025_index.csv")

# Classification entries for the configured level.
# The "code" column is read as text: numeric-looking codes (e.g. "01.11") would
# otherwise be inferred as numbers by read_csv and lose leading/trailing zeros.
# NOTE(review): assumes codes are meant to be compared as strings - confirm
# against the CSV contents.
ateco_df = pd.read_csv(
    f"classification/ateco_2025/ateco_2025_level_{LEVEL}.csv",
    dtype={"code": str},
)

# Render one descriptor per classification entry; missing titles or
# descriptions are replaced by the empty string before formatting.
descriptors = [
    DESCRIPTOR.format(
        title="" if pd.isna(title) else title,
        description="" if pd.isna(description) else description,
    )
    for title, description in zip(ateco_df["title"], ateco_df["description"])
]

# Query texts and their gold codes, aligned by position.
y_texts = test_df["COMBO_PARLATA_IT"].tolist()
y_true = test_df["ATECO2025"].tolist()

Create the knowledge base.

In [None]:
# One metadata record per corpus entry, carrying that entry's code.
code_metadata = [{"code": code} for code in ateco_df["code"]]

# Corpus built from the descriptor texts, with the DataFrame index as ids.
corpus = build_corpus(texts=descriptors, ids=ateco_df.index, metadata=code_metadata)

# Knowledge base over the corpus, configured with MODEL_ID and a batch size of 64.
base = LocalKnowledgeBase(corpus, model_id=MODEL_ID, batch_size=64)

## Semantic Search

In [None]:
# Number of results retrieved per query.
TOP_K = 5

# Retrieve the TOP_K corpus entries for every labeled query text.
results = base.search(y_texts, top_k=TOP_K)

# A query scores 1 when any retrieved code occurs inside its gold label, else 0.
# NOTE(review): `in` is a substring test, so a code can match anywhere in the
# gold label, not only at its start - confirm whether a prefix match is intended.
correct = [
    int(any(str(hit.metadata["code"]) in gold for hit in hits))
    for hits, gold in zip(results, y_true)
]

n_correct = sum(correct)
n_total = len(y_true)
# Guard the ratio so an empty labeled set reports 0% instead of raising
# ZeroDivisionError.
accuracy = n_correct / n_total if n_total else 0.0

print(f"Accuracy (Top {TOP_K}): {accuracy:.2%} ({n_correct}/{n_total})")

### Diagnostics

In [None]:
# Attach the per-query hit flag (1 or 0) to the labeled set.
test_df["CORRECT"] = correct

# Leading characters of the gold code: the first two and the first five.
test_df["Level2"] = test_df["ATECO2025"].str.slice(0, 2)
test_df["Level4"] = test_df["ATECO2025"].str.slice(0, 5)

# Mean hit rate and row count per Level4 group, lowest hit rate first.
per_level = test_df.groupby("Level4").agg({"CORRECT": "mean", "ATECO2025": "count"})
level_diagnostic = per_level.sort_values(by="CORRECT", ascending=True)

# Show the ten groups with the lowest hit rate.
level_diagnostic.head(10)