# ATECO 2025 Classification Preprocessing

In [None]:
import csv
import pandas as pd

# Load the raw ATECO 2025 table. The code below relies on its "GERARCHIA"
# and "CODICE" columns.
df = pd.read_excel("classification/ateco_2025/ateco_2025_raw.xlsx")

# Tag every row with the CODICE of the nearest row at or above it whose
# GERARCHIA equals 1 (presumably the top-level section rows — confirm against
# the raw file). Rows that appear before the first such row get "".
# Vectorized: mask out every non-level-1 code, then forward-fill the gaps.
df["main"] = (
    df["CODICE"]
    .where(df["GERARCHIA"] == 1)
    .ffill()
    .fillna("")
)

## Level-specific CSV

In [49]:
# Hierarchy levels to export (values of the "GERARCHIA" column) and the label
# written to the "level" column of each output file.
level_ids = [1, 2, 3, 4]
level_labels = ["sezione", "divisione", "gruppo", "classe"]

# Write one CSV per hierarchy level with the columns
# code, level, title, description (one row per distinct code).
#
# NOTE(review): the "main" column on df is not part of the export; add it to
# the aggregation below if the CSVs are supposed to carry it.
for level_id, level_label in zip(level_ids, level_labels):
    level_df = (
        # Select rows and columns in a single .loc call so the chain works on
        # a fresh frame instead of mutating a filtered slice of df.
        df.loc[df["GERARCHIA"] == level_id, ["CODICE", "IT_TITOLO", "IT_NOTA"]]
        .rename(columns={
            "CODICE": "code",
            "IT_TITOLO": "title",
            "IT_NOTA": "description",
        })
        # A code may span several rows: keep the first title and concatenate
        # the non-null notes.
        .groupby("code")
        .aggregate({
            "title": lambda titles: titles.iloc[0],
            # The separator is a literal backslash followed by "n" (raw
            # string), not a newline character.
            "description": lambda notes: r"\n".join(notes.dropna().astype(str)),
        })
        .reset_index()
        .assign(level=level_label)
        .loc[:, ["code", "level", "title", "description"]]
    )

    level_df.to_csv(
        f"classification/ateco_2025/ateco_2025_level_{level_id}.csv",
        quoting=csv.QUOTE_ALL,
        index=False,
    )

---

## Semantic Search

In [None]:
from semantic_search.data import build_corpus
from semantic_search.local import LocalKnowledgeBase

Build one text descriptor per classification entry: the title as a heading, followed by the description.

In [7]:
# Hierarchy level whose exported CSV is indexed for semantic search.
LEVEL: int = 4

df = pd.read_csv(f"classification/ateco_2025/ateco_2025_level_{LEVEL}.csv")

# Each descriptor is a small document: the title as a heading, then the
# description on the following line(s).
descriptor_template = """# {title}
{description}"""

# read_csv parses empty fields as NaN, which str.format would render as the
# text "nan"; missing descriptions are therefore replaced by "" here. df itself
# is left untouched.
# NOTE(review): if the CSV comes from the level-specific export, descriptions
# may contain literal "\n" (backslash + n) separators — confirm whether they
# should be turned into real newlines before embedding.
descriptors = [
    descriptor_template.format(title=title, description=description)
    for title, description in zip(df["title"], df["description"].fillna(""))
]

Build the knowledge base.

In [None]:
# One metadata record per descriptor, carrying the classification code taken
# from the same row of df.
code_metadata = [{"code": code} for code in df["code"]]

# Descriptors are identified by their row index in df.
corpus = build_corpus(texts=descriptors, ids=df.index, metadata=code_metadata)

# Knowledge base over the corpus, configured with the model id and batch size
# given below.
base = LocalKnowledgeBase(
    corpus=corpus,
    model_id="paraphrase-multilingual-MiniLM-L12-v2",
    batch_size=32,
)

Search the knowledge base.

In [None]:
# Example free-text query (Italian).
query = "Vendita di ortaggi."

# Run the search with top_k=3 and leave the result as the cell's last
# expression so the notebook displays it.
search_hits = base.search(query, top_k=3)
search_hits