In [None]:
# Install the semantic-search package directly from its GitHub repository.
# NOTE(review): no tag or commit is pinned, so the installed version can change between runs — consider pinning.
%pip install git+https://github.com/istat-methodology/semantic-search.git

# Build ATECO 2022 vector database

In [1]:
import re
import pandas as pd
from semantic_search.local import LocalKnowledgeBase
from semantic_search.data import build_corpus

  from .autonotebook import tqdm as notebook_tqdm


## Utility functions

In [2]:
def preprocess_ateco_df(df: pd.DataFrame) -> pd.DataFrame:
    """Select, rename and normalise the ATECO columns used by the notebook.

    The input frame is not modified; a new DataFrame is returned.

    Args:
        df: ATECO table. Must contain the columns ``main``, ``section``,
            ``title``, ``description``, ``exclusion_list`` and
            ``exclusion_description``. ``exclusion_title`` is optional and is
            created as an empty-string column when absent.

    Returns:
        A DataFrame with the columns ``main``, ``code`` (renamed from
        ``section``), ``title``, ``description``, ``exclusion_title``,
        ``exclusion_list`` and ``exclusion_description``, in that order.
        ``code`` is cast to ``str``; ``title`` is cast to ``str`` and
        lower-cased; in the four free-text columns missing values become
        ``''``, surrounding whitespace is stripped and every internal run of
        whitespace is collapsed to a single space.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
    """
    text_columns = ['description', 'exclusion_title', 'exclusion_list', 'exclusion_description']

    # Optional column: it has to be added *before* the column selection below,
    # otherwise the selection raises KeyError and the fallback is never reached.
    if 'exclusion_title' not in df.columns:
        df = df.assign(exclusion_title='')

    df = df[['main', 'section', 'title', *text_columns]].copy()
    df = df.rename(columns={'section': 'code'})

    # Normalisation
    df['code'] = df['code'].astype(str)
    df['title'] = df['title'].astype(str).str.lower()

    for col in text_columns:
        # astype(str) keeps the whitespace clean-up from failing on non-string cells
        cleaned = df[col].fillna('').astype(str)
        df[col] = cleaned.map(lambda text: re.sub(r'\s+', ' ', text.strip()))

    return df


## Configs

In [3]:
# Path of the raw ATECO 2022 classification CSV read by this notebook.
ATECO_PATH: str = "classification/ateco_2022/ateco_2022_raw.csv"

# One entry per classification level:
#   level    - level name written to the "level" column
#   level_id - number used in the per-level CSV file name
#   pattern  - regex the whole code has to match; None for "sezione",
#              which is selected without a regex
LEVEL_CONFIG = [
    dict(level="sezione", level_id=1, pattern=None),
    dict(level="divisione", level_id=2, pattern=r'\d{2}'),
    dict(level="gruppo", level_id=3, pattern=r'\d{2}\.\d'),
    dict(level="classe", level_id=4, pattern=r'\d{2}\.\d{2}'),
]


## Data Load & Preprocessing
We import the ATECO 2022 classification and perform the pre-processing tasks needed to build the vector database.

---

In [4]:
# Load the raw ATECO 2022 classification table from the configured CSV path
raw_ateco_df = pd.read_csv(filepath_or_buffer=ATECO_PATH)

Build one dataset per classification level, save each to its own CSV file, then save the combined dataset.

In [5]:
# Build one pre-processed DataFrame per entry of LEVEL_CONFIG, write each to its
# own CSV, then concatenate them into the complete dataset.
ateco_levels = []

for config in LEVEL_CONFIG:
    # Select the rows belonging to this level
    if config["pattern"] is None:
        # No regex for this level: keep the rows whose code appears among the 'main' values
        level_mask = raw_ateco_df['section'].isin(raw_ateco_df['main'].unique())
    else:
        # The whole code must match the level pattern. na=False turns a missing
        # code into False instead of NaN, which boolean indexing would reject.
        level_mask = raw_ateco_df['section'].str.fullmatch(config["pattern"], na=False)
    df_filtered = raw_ateco_df[level_mask].copy()

    # Pre-processing
    df_prepared = preprocess_ateco_df(df_filtered)
    df_prepared["level"] = config["level"]
    ateco_levels.append(df_prepared)

    # Save the per-level CSV
    df_prepared.to_csv(
        f"classification/ateco_2022/ateco_2022_level_{config['level_id']}.csv",
        index=False
    )

# Save the complete dataset (all levels stacked, columns in a fixed order)
ateco_2022 = pd.concat(ateco_levels, ignore_index=True)
ateco_2022 = ateco_2022[["main", "code", "level", "title", "description", "exclusion_title", "exclusion_list", "exclusion_description"]]
ateco_2022.to_csv("classification/ateco_2022/ateco_2022.csv", index=False)


---

## Semantic Search
Now, we can build a semantic search database containing ATECO codes. To do this, we will use the `LocalKnowledgeBase` class from the `semantic_search` library.

### Define a descriptor TEMPLATE
This template is used to create the texts (**descriptors**) that will be loaded into the vector database.

In [7]:
# Descriptor text for one entry: a Markdown heading holding the title,
# a blank line, the description and a trailing newline.
descriptor_template = "\n".join([
    "# {title}",
    "",
    "{description}",
    "",
])

# Text for the exclusions of an entry; placeholders are
# {exclusion_list} and {exclusion_description}.
exclusion_template = "\n".join([
    "",
    "Questa sezione ATECO non comprende le seguenti attività:",
    "",
    "{exclusion_list}, {exclusion_description}",
    "",
])


### ATECO 2022 Knowledge Base (for a specific level)

In [8]:
# Reload the combined dataset from disk. Every column is read as text and empty
# cells are kept as '' rather than NaN: with the default parsing an empty
# description would come back as NaN and be formatted into the descriptor text
# as the literal string "nan".
ateco_2022 = pd.read_csv(
    "classification/ateco_2022/ateco_2022.csv",
    dtype=str,
    keep_default_na=False,
)

Filter ATECO 2022 to the specified level and build one descriptor per row.

In [9]:
# Level to index; one of: sezione, divisione, gruppo, classe
LEVEL: str = "divisione"

# Keep only the rows of the chosen level
df = ateco_2022.loc[ateco_2022['level'] == LEVEL]

# One descriptor text per row, filled from the row's title and description
descriptors = [
    descriptor_template.format(title=title, description=description)
    for title, description in zip(df["title"], df["description"])
]


In [10]:
# Model identifier handed to LocalKnowledgeBase as model_id
# (alternatives noted by the author: paraphrase-multilingual-MiniLM-L12-v2, BAAI/bge-m3)
MODEL_ID: str = "paraphrase-multilingual-MiniLM-L12-v2"

# One corpus entry per row of the filtered frame: the descriptor text, the row
# index as id, and the code/title/level of the row as metadata.
corpus = build_corpus(
    texts=descriptors,
    ids=df.index,
    metadata=[
        {"code": code, "title": title, "level": level}
        for code, title, level in zip(df["code"], df["title"], df["level"])
    ],
)

# Build the knowledge base from the corpus with the configured model
base = LocalKnowledgeBase(corpus=corpus, model_id=MODEL_ID, batch_size=32)

Batches: 100%|██████████| 3/3 [00:03<00:00,  1.11s/it]


#### Let's query the knowledge base

In [None]:
# Example questions to run against the knowledge base
QUERY = [
    "Ho una pescheria",
    "Ciao, io ho un'attività immobiliare che si occupa di affitti, quale codice ATECO devo usare?",
]

# Search all queries at once, requesting top_k=5 results for each
results = base.search(QUERY, top_k=5)

# Print one "score | title" line per result, then a separator after each query
for hits in results:
    for hit in hits:
        print("{:.2f} | {}".format(hit.score, hit.metadata['title']))
    print("---")
