# Structuring ECOICOP v1 information for classification

The ECOICOP v1 classification contains unstructured information for its most detailed level of classification. This notebook leverages LLMs in order to structure this information in a format which can be used for similarity search.

In [None]:
import os
import time # Add delays because of free API rate limits
from datetime import datetime
import asyncio
from typing import List

import pandas as pd
from ftfy import fix_text
from unidecode import unidecode

from beeai_framework.adapters.watsonx.backend.chat import WatsonxChatModel
from pydantic import BaseModel, Field
from beeai_framework.backend.message import UserMessage, SystemMessage

from dotenv import dotenv_values

In [None]:
# Settings (watsonx project, API key, endpoint) are read from the local .env file.
config = dotenv_values(dotenv_path=".env")

In [None]:
# Output schema for the structured-generation request: this model is passed as
# the "schema" of the create_structure call, so the LLM answer is expected to
# carry a single "examples" list.
class EcoicopDetails(BaseModel):
    # Product/service names generated for one subclass; the Field description
    # is part of the schema definition itself.
    examples: List[str] = Field(description="List of products or services in the subclass.")

In [None]:
# Columns kept from the label file: the code hierarchy, the English name, and
# the free-text inclusion/exclusion notes used to build the prompts.
usecols = [
    "CODE",
    "NAME_EN",
    "LEVEL",
    "PARENT_CODE",
    "InclusionNote",
    "ExclusionNote",
]

# The structure/labels file is tab-separated.
data_df = pd.read_csv(
    "ecoicop_v1/ECOICOP-HICP_Structure_Labels.tsv",
    sep="\t",
    usecols=usecols,
)

In [None]:
# Restrict to LEVEL == 4 rows: the level this notebook structures (the most
# detailed one, per the introduction).
data_df = data_df[data_df["LEVEL"].eq(4)]

In [None]:

# One plain dict per remaining row, keyed by column name.
data_dict = data_df.to_dict("records")

In [None]:
# System message: tells the model how to turn a subclass description into a
# list of specific, semantically distinct product/service names.
system_prompt = """You are an expert data curator. When provided with a subclass name and its inclusion/exclusion description:

1. Identify specific products/services that belong in this subclass based on the inclusion description. Use exclusions only to understand boundaries.

2. Generate a list of specific product/service names that:
   - Belong within the defined subclass
   - Represent specific items, not categories
   - Cover the full range of inclusions

3. Make each name semantically unique by:
   - Avoiding generic terms ("other," "miscellaneous")
   - Minimizing word overlap between entries
   - Using varied terminology

4. Output in the same language as input, preserving industry terminology.

5. Format as a simple list of product/service names.
"""

# User message template, filled per subclass via str.format with the keys
# `title`, `description` (inclusion note) and `excludes` (exclusion note).
subclass_prompt = """Subclass title: {title}

Inclusion note: {description}

Exclusion note: {excludes}
"""

In [None]:
# Granite instruct chat model accessed through the watsonx adapter; the
# project, API key and endpoint are looked up in the .env configuration.
llm = WatsonxChatModel(
    model_id="ibm/granite-3-8b-instruct",
    **{
        keyword: config.get(env_name)
        for keyword, env_name in {
            "project_id": "GRANITE_PROJECT",
            "api_key": "GRANITE_API_KEY",
            "api_base": "GRANITE_ENDPOINT",
        }.items()
    },
)

In [None]:
async def watson_structure(llm: WatsonxChatModel, system_prompt: str, subclass_prompt: str, item: dict) -> dict:
    """Request structured example products/services for one subclass record.

    Args:
        llm: Chat model that receives the structured-generation request.
        system_prompt: Text sent as the system message.
        subclass_prompt: Template with ``{title}``, ``{description}`` and
            ``{excludes}`` placeholders, filled from ``item``.
        item: One classification record; ``NAME_EN``, ``InclusionNote`` and
            ``ExclusionNote`` are read from it.

    Returns:
        The ``object`` attribute of the model response, constrained by the
        ``EcoicopDetails`` schema. The calling cell reads it with
        ``.get("examples")``, i.e. treats it as a mapping.
        NOTE(review): confirm the exact type against the beeai_framework
        ``create_structure`` documentation.
    """

    def _note(key: str) -> str:
        # Empty TSV cells are loaded by pandas as NaN (and absent keys give
        # None); render both as an empty string so the prompt never contains
        # the literal text "nan" or "None".
        value = item.get(key)
        return "" if pd.isna(value) else value

    system_message = SystemMessage(system_prompt)
    user_message = UserMessage(
        subclass_prompt.format(
            title=item.get("NAME_EN"),
            description=_note("InclusionNote"),
            excludes=_note("ExclusionNote"),
        )
    )
    response = await llm.create_structure(
        {
            "schema": EcoicopDetails,
            "messages": [system_message, user_message],
        }
    )
    return response.object

In [None]:
results = []
failed_calls = []
for i, item in enumerate(data_dict):
    # Print every 20 items to show progress
    if i % 20 == 0:
        print(f"Processing item {i+1} out of {len(data_dict)}")
    # Add switch to skip none items
    if item.get("InclusionNote") is None:
        # No information to parse, just append existing item
        results.append(item)
        continue
    # Time delay to respect API rate limits
    time.sleep(0.5)
    try:
        agent_result = await watson_structure(llm, system_prompt, subclass_prompt, item)
        # Add to results all examples, including the original class name...list concatenation
        for ex in [item.get("NAME_EN")] + agent_result.get("examples"):
            # the new "Description" is inserted at the end, so it overwrites the original one
            results.append({**item, "NAME_EN": ex})
    except Exception as e:
        failed_calls.append(item)
        print(f"Error processing item {i+1} out of {len(data_dict)}")
        print(item)
        print(e)
        continue

In [None]:
# Write the expanded rows to a timestamped CSV. The note and level columns
# were only needed to build the prompts and select rows, so they are dropped.
# The output directory is created first: to_csv does not create missing folders.
os.makedirs("results", exist_ok=True)
results_df = pd.DataFrame(results)
results_df.drop(columns=["InclusionNote", "LEVEL", "ExclusionNote"]).to_csv(
    "results/ecoicop_v1_{}_{}.csv".format(
        "ibm-granite-3-8b-instruct",
        datetime.now().strftime("%Y-%m-%d_%H%M%S")),
    index=False)

In [None]:
# Write the records whose LLM request failed to a timestamped CSV so they can
# be inspected or retried. The output directory is created first: to_csv does
# not create missing folders.
os.makedirs("results", exist_ok=True)
failed_df = pd.DataFrame(failed_calls)
failed_df.to_csv(
    # Prefix names the classification processed here (ECOICOP v1); the previous
    # "failed_coicop2018" prefix pointed at a different classification.
    "results/failed_ecoicop_v1_{}_{}.csv".format(
        "ibm-granite-3-8b-instruct",
        datetime.now().strftime("%Y-%m-%d_%H%M%S")),
    index=False)