In [None]:
%pip install git+https://github.com/istat-methodology/semantic-search.git

# Build a vector database with ATECO 2022 Datapoints

In [None]:
import re
import pandas as pd
from semantic_search.local import LocalKnowledgeBase
from semantic_search.data import build_corpus

## Utility functions

In [None]:
def prepare_ateco_level_with_descriptor(
    df: pd.DataFrame,
    desc_template: str,
    exclusion_template: str
) -> pd.DataFrame:
    """
    Clean, normalize and enrich an ATECO-level DataFrame.

    Steps performed on a copy of the input (the input is never mutated):
    - Keep only the classification columns and rename 'section' to 'code'
    - Lowercase the title (missing titles become an empty string)
    - Fill missing text values with '' and collapse runs of whitespace
    - Keep 'exclusion_title' when the input provides it, otherwise add it as ''
    - Render a text descriptor for every row from the provided templates

    Args:
        df: Frame with at least the columns 'main', 'section', 'title',
            'description', 'exclusion_list' and 'exclusion_description'.
            An 'exclusion_title' column is optional.
        desc_template: ``str.format`` template receiving ``title`` and
            ``description``.
        exclusion_template: ``str.format`` template receiving
            ``exclusion_title``, ``exclusion_list`` and
            ``exclusion_description``. It is appended only for rows where at
            least one of those three fields is non-empty.

    Returns:
        A new DataFrame with the columns 'main', 'code', 'title',
        'description', 'exclusion_list', 'exclusion_description',
        'exclusion_title' and 'descriptor'.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
    """
    required_cols = ['main', 'section', 'title', 'description', 'exclusion_list', 'exclusion_description']

    # Check on the *input* frame: selecting a fixed column list first would
    # always discard an existing 'exclusion_title' column.
    has_exclusion_title = 'exclusion_title' in df.columns
    selected_cols = required_cols + (['exclusion_title'] if has_exclusion_title else [])

    # Select and rename relevant columns
    df = df[selected_cols].copy()
    df = df.rename(columns={'section': 'code'})

    # Normalize fields
    df['code'] = df['code'].astype(str)
    # fillna first so a missing title does not become the literal string 'nan'
    df['title'] = df['title'].fillna('').astype(str).str.lower()

    if not has_exclusion_title:
        df['exclusion_title'] = ''

    whitespace_run = re.compile(r'\s+')
    for col in ['description', 'exclusion_list', 'exclusion_description', 'exclusion_title']:
        # astype(str) keeps the cleanup from failing on non-string cells
        df[col] = (
            df[col]
            .fillna('')
            .astype(str)
            .map(lambda text: whitespace_run.sub(' ', text.strip()))
        )

    # Apply templates to build descriptor
    def render_descriptor(row):
        base = desc_template.format(
            title=row['title'],
            description=row['description']
        )
        if any([row['exclusion_title'], row['exclusion_list'], row['exclusion_description']]):
            exclusion_block = exclusion_template.format(
                exclusion_title=row['exclusion_title'],
                exclusion_list=row['exclusion_list'],
                exclusion_description=row['exclusion_description']
            )
            return base + exclusion_block
        return base

    # Iterating over records (instead of df.apply(axis=1)) also works when the
    # frame is empty, where apply would not return a Series.
    df['descriptor'] = [render_descriptor(row) for row in df.to_dict('records')]
    return df


## Configs
Set configuration parameters, such as the ATECO 2022 path and the embedding model.

In [None]:
# Location of the ATECO 2022 classification CSV
ATECO_PATH: str = "classification/ateco_2022/ateco_2022.csv"

# Column layout applied to every per-level DataFrame
COLUMN_ORDER = [
    'main',
    'code',
    'level',
    'title',
    'description',
    'descriptor',
    'exclusion_list',
    'exclusion_description',
    'exclusion_title',
]

# Sentence-embedding model identifier.
# Candidates noted for this setting: all-MiniLM-L6-v2, BAAI/bge-m3
MODEL_ID: str = "all-MiniLM-L6-v2"

## Define a descriptor TEMPLATE
These templates are used to build the texts (**descriptors**) that will be loaded into the vector database.

In [None]:
# Main part of a descriptor: a markdown heading with the title, then the description.
desc_template = (
    "# {title}\n"
    "\n"
    "{description}\n"
)

# Optional trailer listing what the entry does NOT cover.
exclusion_template = (
    "\n"
    "Questa sezione ATECO non comprende le seguenti attività:\n"
    "\n"
    "{exclusion_list}, {exclusion_description}\n"
)


## Data Load & Preprocessing
We load the ATECO 2022 classification and perform the pre-processing steps needed to build the vector database.

---

In [None]:
# Read ATECO 2022.
# The code columns are forced to string so that values such as "01" or "01.10"
# keep their zeros and the `.str.fullmatch` filters used later always operate
# on text, regardless of what pandas would infer for the column.
raw_ateco_df = pd.read_csv(ATECO_PATH, dtype={'main': str, 'section': str})

Build one dataset for each ATECO level (section, division, group and class).

In [None]:
def _build_ateco_level(level_mask: pd.Series, level_name: str) -> pd.DataFrame:
    """
    Build the prepared dataset for a single ATECO level.

    Args:
        level_mask: Boolean mask over ``raw_ateco_df`` selecting the rows of the level.
        level_name: Label stored in the 'level' column (e.g. "sezione").

    Returns:
        The prepared rows with descriptors, with columns arranged as ``COLUMN_ORDER``.
    """
    level_df = prepare_ateco_level_with_descriptor(
        raw_ateco_df[level_mask], desc_template, exclusion_template
    )
    level_df["level"] = level_name
    return level_df[COLUMN_ORDER]


section_codes = raw_ateco_df['section']

# Level 1: rows whose code is one of the values found in the 'main' column
ateco_level_1 = _build_ateco_level(
    section_codes.isin(raw_ateco_df['main'].dropna().unique()), "sezione"
)

# Levels 2-4 are recognized by the shape of the code.
# na=False keeps rows with a missing code out of the mask; without it the mask
# contains NaN and boolean indexing raises.

# Level 2: two digits
ateco_level_2 = _build_ateco_level(
    section_codes.str.fullmatch(r'\d{2}', na=False), "divisione"
)

# Level 3: two digits, a dot, one digit
ateco_level_3 = _build_ateco_level(
    section_codes.str.fullmatch(r'\d{2}\.\d', na=False), "gruppo"
)

# Level 4: two digits, a dot, two digits
ateco_level_4 = _build_ateco_level(
    section_codes.str.fullmatch(r'\d{2}\.\d{2}', na=False), "classe"
)

Merge all the levels into a single DataFrame

In [None]:
# Stack the four per-level frames row-wise and renumber the index from 0.
ateco_2022 = pd.concat(
    (ateco_level_1, ateco_level_2, ateco_level_3, ateco_level_4),
    ignore_index=True,
)

Save to files in the **ateco_2022** folder

In [None]:
# Each frame is paired with its destination; the last entry is the complete
# ATECO 2022 DataFrame with descriptors.
csv_exports = [
    (ateco_level_1, "classification/ateco_2022/ateco_2022_level_1.csv"),
    (ateco_level_2, "classification/ateco_2022/ateco_2022_level_2.csv"),
    (ateco_level_3, "classification/ateco_2022/ateco_2022_level_3.csv"),
    (ateco_level_4, "classification/ateco_2022/ateco_2022_level_4.csv"),
    (ateco_2022, "classification/ateco_2022/ateco_2022_descriptor.csv"),
]

for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)

---

In [None]:
# Sanity check: list the distinct level labels present in the merged frame.
print(ateco_2022.loc[:, 'level'].unique())

In [None]:
# Inspect the rows labelled as level 2 ("divisione").
ateco_2022.loc[ateco_2022['level'].eq('divisione')]

## Semantic Search
Now, we can build a semantic search database containing ATECO codes. To do this, we will use the `LocalKnowledgeBase` class from the `semantic_search` library.

### Knowledge Base: ATECO - Section (level 1)

In [None]:
# Corpus for level 1: descriptors as texts, ATECO codes as ids, and one
# metadata dict (code, title, level) per row.
corpus_level_1 = build_corpus(
    texts=ateco_level_1['descriptor'],
    ids=ateco_level_1['code'],
    metadata=[
        {"code": code, "title": title, "level": level}
        for code, title, level in zip(
            ateco_level_1["code"],
            ateco_level_1["title"],
            ateco_level_1["level"],
        )
    ],
)

# Knowledge base over the level-1 corpus, using the configured embedding model.
base_level_1 = LocalKnowledgeBase(corpus=corpus_level_1, model_id=MODEL_ID)

### Knowledge Base: ATECO - Division (level 2)

In [None]:
# Corpus for level 2: descriptors as texts, ATECO codes as ids, and one
# metadata dict (code, title, level) per row.
corpus_level_2 = build_corpus(
    texts=ateco_level_2['descriptor'],
    ids=ateco_level_2['code'],
    metadata=[
        {"code": code, "title": title, "level": level}
        for code, title, level in zip(
            ateco_level_2["code"],
            ateco_level_2["title"],
            ateco_level_2["level"],
        )
    ],
)

# Knowledge base over the level-2 corpus, using the configured embedding model.
base_level_2 = LocalKnowledgeBase(corpus=corpus_level_2, model_id=MODEL_ID)

#### Let's query the knowledge base

In [None]:
# Example natural-language queries (in Italian) run against the level-2 knowledge base.
QUERY = ["Ho una pescheria", 
         "Ciao, io ho un'attività immobiliare che si occupa di affitti, quale codice ATECO devo usare?"]

results = base_level_2.search(QUERY, top_k=5)

# `results` is iterated once per query; each inner item exposes `.score` and `.metadata`.
for res in results:
    for res_i in res:
        # Single quotes inside the f-string: reusing double quotes inside a
        # double-quoted f-string is a SyntaxError before Python 3.12.
        print(f"{res_i.score:.2f} | {res_i.metadata['title']}")
    print("---")
