# Structuring ECOICOP information for classification

The ECOICOP classification contains unstructured information for its most detailed level of classification. This notebook leverages LLMs in order to structure this information in a format which can be used for similarity search.

In [1]:
import time # Add delays because of free API rate limits
from datetime import datetime

import pandas as pd

from typing import List
from pydantic import BaseModel
from pydantic_ai import Agent
from pydantic_ai.models.mistral import MistralModel
#from pydantic_ai.models.groq import GroqModel

from dotenv import dotenv_values

In [2]:
# Jupyter already runs its own event loop; nest_asyncio is applied here so that
# code which needs to drive an event loop can still run inside the notebook.
# NOTE(review): presumably required by the synchronous agent.run_sync calls made
# later in this notebook — confirm before removing.
import nest_asyncio # Fix issues with Jupyter notebook event loop
nest_asyncio.apply()

Load environment variables (including the API keys) from the local `.env` file

In [3]:
# Read the key/value pairs stored in the local .env file; the Mistral API key
# is looked up in this mapping when the model is created.
config = dotenv_values(dotenv_path=".env")

Define the Pydantic model used to parse the additional information attached to each ECOICOP level 4 item

In [4]:
# Structured result the agent has to return for a single inclusion note.
# NOTE(review): documented with comments rather than a class docstring on
# purpose — pydantic is understood to copy a model's docstring into its JSON
# schema description, which could change what is sent to the LLM. Confirm
# before converting this into a docstring.
class EcoicopDetails(BaseModel):
    # One self-contained item description per entry, as extracted from the note.
    examples: List[str]

Initialize PydanticAI agent to structure information

In [5]:
# Instructions sent to the LLM as the system prompt. Kept as a named constant so
# the wording can be reviewed and tuned independently of the agent wiring.
# (Fixes the "self-explainig" typo and the "items descriptions" grammar slip.)
SYSTEM_PROMPT = (
    'You are an expert data curator. You will receive a string of text '
    'with examples of items to be included into a certain classification. '
    'Your task is to transform this text into a list of self-explaining '
    'item descriptions, exploding the examples to the most granular level you can '
    'identify in the text. '
    'Each item description should contain all necessary information for classification '
    'as provided in the original text. Prefer complete descriptions rather than single words.'
)

# The API key comes from the .env mapping; config.get returns None when the
# "MISTRAL_API_KEY" entry is missing from that file.
model = MistralModel('mistral-large-latest', api_key=config.get("MISTRAL_API_KEY"))

# The agent's result is typed as EcoicopDetails, i.e. a list of example strings.
agent = Agent(
    model=model,
    retries=3,
    result_type=EcoicopDetails,
    system_prompt=SYSTEM_PROMPT,
)

Read ECOICOP definitions (only selected columns)

In [6]:
# Columns needed downstream: the class code and name, its level and parent in
# the hierarchy, and the free-text inclusion note that the LLM will structure.
usecols = ["CODE", "NAME_EN", "LEVEL", "PARENT_CODE", "InclusionNote"]

# The label file is tab-separated; every other column is skipped at read time.
data_df = pd.read_csv(
    "ecoicop_v1/ECOICOP-HICP_Structure_Labels.tsv",
    usecols=usecols,
    sep="\t",
)

Filter only level 4 items

In [8]:
# Boolean mask marking the rows whose LEVEL equals 4; only those rows are kept.
is_level_4 = data_df["LEVEL"].eq(4)
data_df = data_df.loc[is_level_4]

In [11]:
# One plain dict per row (column name -> value), in row order. Despite its name,
# data_dict is therefore a list of dicts that the processing loop iterates over.
data_dict = data_df.to_dict("records")

Run calls to the Agent to extract and format information

In [None]:
# Seconds to wait before each API request (free-tier rate limits).
REQUEST_DELAY_SECONDS = 3

results = []        # one row per (class, item description) pair
failed_calls = []   # original rows whose agent call raised an exception
total_items = len(data_dict)

for i, item in enumerate(data_dict):
    print(f"Processing item {i+1} out of {total_items}")
    class_name = item.get("NAME_EN")
    note = item.get("InclusionNote")

    # pandas reads empty cells as NaN (a float), which is not a usable prompt.
    # With nothing to structure, keep the class name as the only entry and skip
    # both the API call and the rate-limit delay.
    if pd.isna(note) or not str(note).strip():
        results.append({**item, "NAME_EN": class_name})
        continue

    # Time delay to respect API rate limits
    time.sleep(REQUEST_DELAY_SECONDS)
    try:
        agent_result = agent.run_sync(note, model_settings={'temperature': 0.0})
        examples = agent_result.data.examples
    except Exception as e:
        # Best effort: remember the row so it can be retried later, report the
        # error, and move on to the next item.
        failed_calls.append(item)
        print(e)
        continue

    # Emit the original class name first, then every extracted example.
    for ex in [class_name] + list(examples):
        # the new "NAME_EN" is inserted at the end, so it overwrites the original one
        results.append({**item, "NAME_EN": ex})

Processing item 1 out of 295
Processing item 2 out of 295
Processing item 3 out of 295
Processing item 4 out of 295
Processing item 5 out of 295
Processing item 6 out of 295
Processing item 7 out of 295
Processing item 8 out of 295
Processing item 9 out of 295
Processing item 10 out of 295
Processing item 11 out of 295
Processing item 12 out of 295
Processing item 13 out of 295
Processing item 14 out of 295
Processing item 15 out of 295
Processing item 16 out of 295
Processing item 17 out of 295
Processing item 18 out of 295
Processing item 19 out of 295
Processing item 20 out of 295
Processing item 21 out of 295
Processing item 22 out of 295
Processing item 23 out of 295
Processing item 24 out of 295
Processing item 25 out of 295
Processing item 26 out of 295
Processing item 27 out of 295
Processing item 28 out of 295
Processing item 29 out of 295
Processing item 30 out of 295
Processing item 31 out of 295
Processing item 32 out of 295
Processing item 33 out of 295


Save results and failed calls

In [22]:
# Date stamp (YYYY-MM-DD) embedded in the output file name.
today = datetime.now().strftime("%Y-%m-%d")
output_path = "results/ecoicop_v1_{}.csv".format(today)

results_df = pd.DataFrame(results)

# The inclusion note and level are only needed during processing, so they are
# left out of the exported file; results_df itself keeps all columns.
export_df = results_df.drop(columns=["InclusionNote", "LEVEL"])
export_df.to_csv(output_path, index=False)

In [None]:
# Persist the rows whose agent call failed so they can be inspected or retried.
failed_df = pd.DataFrame(failed_calls)
failed_df.to_csv(
    # DataFrame.to_csv names its target `path_or_buf`; the previous
    # `filepath_or_buf` keyword is not accepted and raised a TypeError.
    path_or_buf="results/failed_ecoicop_v1_{}.csv".format(datetime.now().strftime("%Y-%m-%d")),
    index=False)