# Structuring COICOP 2018 information for classification

The COICOP 2018 classification contains unstructured information for its most detailed level of classification. This notebook leverages LLMs in order to structure this information in a format which can be used for similarity search.

In [None]:
import os
import time # Add delays because of free API rate limits
from datetime import datetime
import asyncio
from typing import List

import pandas as pd
from ftfy import fix_text
from unidecode import unidecode

from beeai_framework.adapters.watsonx.backend.chat import WatsonxChatModel
from pydantic import BaseModel, Field
from beeai_framework.backend.message import UserMessage, SystemMessage

from dotenv import dotenv_values

In [None]:
# Load key/value settings from the local ".env" file. The GRANITE_PROJECT,
# GRANITE_API_KEY and GRANITE_ENDPOINT entries are read from this mapping
# when the chat models are constructed.
config = dotenv_values(".env")

In [None]:
# Target structure handed to the chat model's create_structure call as
# "schema": a single list of example product/service names.
# NOTE(review): documented with comments rather than a class docstring on
# purpose -- pydantic may surface a docstring in the generated schema, which
# would change what the model sees. Confirm before adding one.
class CoicopDetails(BaseModel):
    # Names of products or services that belong to the subclass.
    examples: List[str] = Field(description="List of products or services in the subclass.")

In [None]:
# Columns of the COICOP workbook that the processing functions read.
usecols = ["code","title","intro", "includes", "alsoIncludes", "excludes"]

# Read the edited English COICOP 2018 structure workbook, restricted to the
# columns listed above.
data_df = pd.read_excel(
    "coicop_2018/COICOP_2018_English_structure_edited.xlsx", 
    usecols=usecols,)

In [None]:
def process_subclasses(df):
    """
    Extract and clean the subclass-level rows of the COICOP structure table.

    Operations:
    1. Keep rows whose ``code`` contains exactly 3 dots
    2. Combine ``intro``, ``includes`` and ``alsoIncludes`` into a single
       ``description`` column, skipping missing values
    3. Repair encoding issues in ``description`` and ``excludes``
    4. Remove classification markers such as ``(ND)`` from ``title``

    Args:
        df (pd.DataFrame): Input DataFrame with the columns ``code``,
            ``title``, ``intro``, ``includes``, ``alsoIncludes`` and
            ``excludes``. It is not modified.

    Returns:
        pd.DataFrame: New DataFrame with the columns ``code``, ``title``,
            ``description`` and ``excludes``.
    """
    columns_to_concat = ["intro", "includes", "alsoIncludes"]

    def clean_text(series):
        # Same cleaning chain for every free-text column: fix_text, then
        # unidecode, then replace the literal "_x000D_" token with a space
        # (presumably carriage-return escapes left by the Excel export).
        cleaned = series.apply(fix_text).apply(unidecode)
        # regex=False: the token is a literal, independent of pandas' default.
        return cleaned.str.replace("_x000D_", " ", regex=False)

    # 1. Select rows where code has exactly 3 dots. The explicit copy makes
    #    the selection an independent frame, so the column assignments below
    #    neither touch `df` nor trigger SettingWithCopyWarning.
    is_subclass = df['code'].str.count(r'\.') == 3
    result_df = df[is_subclass].copy()

    # 2. Combine description columns. Built from plain row tuples instead of
    #    DataFrame.apply(axis=1), so an empty selection simply yields an
    #    empty column.
    combined = [
        ' \n '.join(str(value) for value in values if pd.notna(value))
        for values in result_df[columns_to_concat].itertuples(index=False, name=None)
    ]
    description = pd.Series(combined, index=result_df.index, dtype=object)

    # 3. Fix encoding issues in the description and exclusion columns
    result_df['description'] = clean_text(description)
    result_df['excludes'] = clean_text(result_df['excludes'].fillna(''))

    # 4. Remove classification markers, e.g. "(ND)" or "(S)", from the title
    markers_pattern = r'\s*\((ND|SD|S|D)\)'
    result_df['title'] = result_df['title'].str.replace(markers_pattern, '', regex=True)

    return result_df[["code", "title", "description", "excludes"]]

In [None]:
def process_subsubclasses(df):
    """
    Extract and clean the rows one level below subclass and relabel them
    with their parent subclass code.

    Operations:
    1. Keep rows whose ``code`` contains exactly 4 dots
    2. Combine ``intro``, ``includes`` and ``alsoIncludes`` into a single
       ``description`` column, skipping missing values
    3. Repair encoding issues in ``description`` and ``excludes``
    4. Remove classification markers such as ``(ND)`` from ``title``
    5. Censor the code to subclass level by dropping its last segment

    Args:
        df (pd.DataFrame): Input DataFrame with the columns ``code``,
            ``title``, ``intro``, ``includes``, ``alsoIncludes`` and
            ``excludes``. It is not modified.

    Returns:
        pd.DataFrame: New DataFrame with the columns ``code``, ``title``,
            ``description`` and ``excludes``.
    """
    columns_to_concat = ["intro", "includes", "alsoIncludes"]

    def clean_text(series):
        # Same cleaning chain for every free-text column: fix_text, then
        # unidecode, then replace the literal "_x000D_" token with a space
        # (presumably carriage-return escapes left by the Excel export).
        cleaned = series.apply(fix_text).apply(unidecode)
        # regex=False: the token is a literal, independent of pandas' default.
        return cleaned.str.replace("_x000D_", " ", regex=False)

    # 1. Select rows where code has exactly 4 dots. The explicit copy makes
    #    the selection an independent frame, so the column assignments below
    #    neither touch `df` nor trigger SettingWithCopyWarning.
    is_subsubclass = df['code'].str.count(r'\.') == 4
    result_df = df[is_subsubclass].copy()

    # 2. Combine description columns. Built from plain row tuples instead of
    #    DataFrame.apply(axis=1), so an empty selection simply yields an
    #    empty column.
    combined = [
        ' \n '.join(str(value) for value in values if pd.notna(value))
        for values in result_df[columns_to_concat].itertuples(index=False, name=None)
    ]
    description = pd.Series(combined, index=result_df.index, dtype=object)

    # 3. Fix encoding issues in the description and exclusion columns
    result_df['description'] = clean_text(description)
    result_df['excludes'] = clean_text(result_df['excludes'].fillna(''))

    # 4. Remove classification markers, e.g. "(ND)" or "(S)", from the title
    markers_pattern = r'\s*\((ND|SD|S|D)\)'
    result_df['title'] = result_df['title'].str.replace(markers_pattern, '', regex=True)

    # 5. Censor the code to subclass level: drop everything from the last dot
    #    on. For codes laid out like "01.1.1.1.1" this equals the first eight
    #    characters, but it does not depend on the width of each segment.
    result_df['code'] = result_df['code'].str.rsplit('.', n=1).str[0]

    return result_df[["code", "title", "description", "excludes"]]

In [None]:
# Rows with four dots in their code, cleaned and relabelled with the parent
# subclass code.
subsub_df = process_subsubclasses(data_df)

In [None]:
# Rows with three dots in their code (subclass level), cleaned.
sub_df = process_subclasses(data_df)

In [None]:
# Stack both levels row-wise into one table. This rebinds data_df, replacing
# the raw workbook contents; the original row labels are kept as the index.
data_df = pd.concat([sub_df, subsub_df], axis=0)

Remove divisions 14 and 15, as they do not add further information

In [None]:
# Keep only rows whose code does not begin with "14" or "15"
# (str.startswith accepts a tuple of prefixes).
data_df = data_df[~data_df["code"].str.startswith(("14", "15"))]

In [None]:
# One dict per row (keys: code, title, description, excludes); this list is
# what the per-model processing loops iterate over.
data_dict = data_df.to_dict(orient="records")

In [None]:
# System message shared by every model run: tells the model how to turn a
# subclass title plus its inclusion/exclusion notes into a list of specific
# product/service names.
system_prompt = """You are an expert data curator. When provided with a subclass name and its inclusion/exclusion description:

1. Identify specific products/services that belong in this subclass based on the inclusion description. Use exclusions only to understand boundaries.

2. Generate a list of specific product/service names that:
   - Belong within the defined subclass
   - Represent specific items, not categories
   - Cover the full range of inclusions

3. Make each name semantically unique by:
   - Avoiding generic terms ("other," "miscellaneous")
   - Minimizing word overlap between entries
   - Using varied terminology

4. Output in the same language as input, preserving industry terminology.

5. Format as a simple list of product/service names.
"""

# User message template; filled per record via str.format with the record's
# title, description and excludes values.
subclass_prompt = """Subclass title: {title}

Inclusion note: {description}

Exclusion note: {excludes}
"""

## Llama 3.3 70B

In [None]:
# Model identifier passed as model_id when the chat client for this section
# is built.
llm_model = "meta-llama/llama-3-3-70b-instruct"

In [None]:
# Chat client for the model named in llm_model. Project id, API key and
# endpoint are taken from the .env configuration; config.get returns None
# for any entry that is missing there.
llm = WatsonxChatModel(
    model_id=llm_model,
    project_id=config.get("GRANITE_PROJECT"),
    api_key=config.get("GRANITE_API_KEY"),
    api_base=config.get("GRANITE_ENDPOINT"),
)

In [None]:
async def watson_structure(llm: WatsonxChatModel, system_prompt: str, subclass_prompt: str, item: dict) -> dict:
    """Request structured example names for one COICOP record.

    Args:
        llm: Chat model whose ``create_structure`` coroutine is awaited.
        system_prompt: Instruction text sent as the system message.
        subclass_prompt: Template containing the ``{title}``,
            ``{description}`` and ``{excludes}`` placeholders.
        item: Record supplying the ``title``, ``description`` and
            ``excludes`` values; a missing key is formatted as ``None``.

    Returns:
        The ``object`` attribute of the model response.
        NOTE(review): the calling loop reads it with ``.get("examples")``,
        i.e. treats it as a mapping shaped like ``CoicopDetails`` -- confirm
        against the installed beeai_framework version.
    """
    filled_prompt = subclass_prompt.format(
        title=item.get("title"),
        description=item.get("description"),
        excludes=item.get("excludes"),
    )
    messages = [SystemMessage(system_prompt), UserMessage(filled_prompt)]
    # CoicopDetails is the schema the model output has to follow.
    response = await llm.create_structure(
        {
            "schema": CoicopDetails,
            "messages": messages,
        }
    )
    return response.object

In [None]:
results = []
failed_calls = []
for i, item in enumerate(data_dict):
    # Print every 20 items to show progress
    if i % 20 == 0:
        print(f"Processing item {i+1} out of {len(data_dict)}")
    # Add switch to skip none items
    if item.get("description") is None:
        # No information to parse, just append existing item
        results.append(item)
        continue
    # Time delay to respect API rate limits
    time.sleep(0.5)
    try:
        agent_result = await watson_structure(llm, system_prompt, subclass_prompt, item)
        # Add to results all examples, including the original class name...list concatenation
        for ex in [item.get("title")] + agent_result.get("examples"):
            # the new "Description" is inserted at the end, so it overwrites the original one
            results.append({**item, "title": ex})
    except Exception as e:
        failed_calls.append(item)
        print(f"Error processing item {i+1} out of {len(data_dict)}")
        print(item)
        print(e)
        continue

In [None]:
# Write the expanded titles to a timestamped CSV under results/.
# Creating the directory first avoids losing a finished run to a missing
# output folder.
os.makedirs("results", exist_ok=True)
results_df = pd.DataFrame(results)
# The prompt inputs are not needed in the output. errors="ignore" keeps the
# export working when `results` is empty (every call failed) and the frame
# therefore has no columns at all.
results_df.drop(columns=["description", "excludes"], errors="ignore").to_csv(
    "results/coicop2018_{}_{}.csv".format(
        "llama-3-3-70b-instruct",
        datetime.now().strftime("%Y-%m-%d_%H%M%S")),
    index=False)

In [None]:
# Write the records whose model call failed to a timestamped CSV under
# results/ (all columns kept so they can be retried). Creating the directory
# first avoids failing on a missing output folder.
os.makedirs("results", exist_ok=True)
failed_df = pd.DataFrame(failed_calls)
failed_df.to_csv(
    "results/failed_coicop2018_{}_{}.csv".format(
        "llama-3-3-70b-instruct",
        datetime.now().strftime("%Y-%m-%d_%H%M%S")),
    index=False)

## Mistral large

In [None]:
# Model identifier passed as model_id when the chat client for this section
# is built.
llm_model = "mistralai/mistral-large"

In [None]:
# Chat client for the model named in llm_model. Project id, API key and
# endpoint are taken from the .env configuration; config.get returns None
# for any entry that is missing there.
llm = WatsonxChatModel(
    model_id=llm_model,
    project_id=config.get("GRANITE_PROJECT"),
    api_key=config.get("GRANITE_API_KEY"),
    api_base=config.get("GRANITE_ENDPOINT"),
)

In [None]:
async def watson_structure(llm: WatsonxChatModel, system_prompt: str, subclass_prompt: str, item: dict) -> dict:
    """Request structured example names for one COICOP record.

    Args:
        llm: Chat model whose ``create_structure`` coroutine is awaited.
        system_prompt: Instruction text sent as the system message.
        subclass_prompt: Template containing the ``{title}``,
            ``{description}`` and ``{excludes}`` placeholders.
        item: Record supplying the ``title``, ``description`` and
            ``excludes`` values; a missing key is formatted as ``None``.

    Returns:
        The ``object`` attribute of the model response.
        NOTE(review): the calling loop reads it with ``.get("examples")``,
        i.e. treats it as a mapping shaped like ``CoicopDetails`` -- confirm
        against the installed beeai_framework version.
    """
    filled_prompt = subclass_prompt.format(
        title=item.get("title"),
        description=item.get("description"),
        excludes=item.get("excludes"),
    )
    messages = [SystemMessage(system_prompt), UserMessage(filled_prompt)]
    # CoicopDetails is the schema the model output has to follow.
    response = await llm.create_structure(
        {
            "schema": CoicopDetails,
            "messages": messages,
        }
    )
    return response.object

In [None]:
results = []
failed_calls = []
for i, item in enumerate(data_dict):
    # Print every 20 items to show progress
    if i % 20 == 0:
        print(f"Processing item {i+1} out of {len(data_dict)}")
    # Add switch to skip none items
    if item.get("description") is None:
        # No information to parse, just append existing item
        results.append(item)
        continue
    # Time delay to respect API rate limits
    time.sleep(0.5)
    try:
        agent_result = await watson_structure(llm, system_prompt, subclass_prompt, item)
        # Add to results all examples, including the original class name...list concatenation
        for ex in [item.get("title")] + agent_result.get("examples"):
            # the new "Description" is inserted at the end, so it overwrites the original one
            results.append({**item, "title": ex})
    except Exception as e:
        failed_calls.append(item)
        print(f"Error processing item {i+1} out of {len(data_dict)}")
        print(item)
        print(e)
        continue

In [None]:
# Write the expanded titles to a timestamped CSV under results/.
# Creating the directory first avoids losing a finished run to a missing
# output folder.
os.makedirs("results", exist_ok=True)
results_df = pd.DataFrame(results)
# The prompt inputs are not needed in the output. errors="ignore" keeps the
# export working when `results` is empty (every call failed) and the frame
# therefore has no columns at all.
results_df.drop(columns=["description", "excludes"], errors="ignore").to_csv(
    "results/coicop2018_{}_{}.csv".format(
        "mistral-large",
        datetime.now().strftime("%Y-%m-%d_%H%M%S")),
    index=False)

In [None]:
# Write the records whose model call failed to a timestamped CSV under
# results/ (all columns kept so they can be retried). Creating the directory
# first avoids failing on a missing output folder.
os.makedirs("results", exist_ok=True)
failed_df = pd.DataFrame(failed_calls)
failed_df.to_csv(
    "results/failed_coicop2018_{}_{}.csv".format(
        "mistral-large",
        datetime.now().strftime("%Y-%m-%d_%H%M%S")),
    index=False)

## IBM Granite 3 8B

In [None]:
# Model identifier passed as model_id when the chat client for this section
# is built.
llm_model = "ibm/granite-3-8b-instruct"

In [None]:
# Chat client for the model named in llm_model. Project id, API key and
# endpoint are taken from the .env configuration; config.get returns None
# for any entry that is missing there.
llm = WatsonxChatModel(
    model_id=llm_model,
    project_id=config.get("GRANITE_PROJECT"),
    api_key=config.get("GRANITE_API_KEY"),
    api_base=config.get("GRANITE_ENDPOINT"),
)

In [None]:
async def watson_structure(llm: WatsonxChatModel, system_prompt: str, subclass_prompt: str, item: dict) -> dict:
    """Request structured example names for one COICOP record.

    Args:
        llm: Chat model whose ``create_structure`` coroutine is awaited.
        system_prompt: Instruction text sent as the system message.
        subclass_prompt: Template containing the ``{title}``,
            ``{description}`` and ``{excludes}`` placeholders.
        item: Record supplying the ``title``, ``description`` and
            ``excludes`` values; a missing key is formatted as ``None``.

    Returns:
        The ``object`` attribute of the model response.
        NOTE(review): the calling loop reads it with ``.get("examples")``,
        i.e. treats it as a mapping shaped like ``CoicopDetails`` -- confirm
        against the installed beeai_framework version.
    """
    filled_prompt = subclass_prompt.format(
        title=item.get("title"),
        description=item.get("description"),
        excludes=item.get("excludes"),
    )
    messages = [SystemMessage(system_prompt), UserMessage(filled_prompt)]
    # CoicopDetails is the schema the model output has to follow.
    response = await llm.create_structure(
        {
            "schema": CoicopDetails,
            "messages": messages,
        }
    )
    return response.object

In [None]:
results = []
failed_calls = []
for i, item in enumerate(data_dict):
    # Print every 20 items to show progress
    if i % 20 == 0:
        print(f"Processing item {i+1} out of {len(data_dict)}")
    # Add switch to skip none items
    if item.get("description") is None:
        # No information to parse, just append existing item
        results.append(item)
        continue
    # Time delay to respect API rate limits
    time.sleep(0.5)
    try:
        agent_result = await watson_structure(llm, system_prompt, subclass_prompt, item)
        # Add to results all examples, including the original class name...list concatenation
        for ex in [item.get("title")] + agent_result.get("examples"):
            # the new "Description" is inserted at the end, so it overwrites the original one
            results.append({**item, "title": ex})
    except Exception as e:
        failed_calls.append(item)
        print(f"Error processing item {i+1} out of {len(data_dict)}")
        print(item)
        print(e)
        continue

In [None]:
# Write the expanded titles to a timestamped CSV under results/.
# Creating the directory first avoids losing a finished run to a missing
# output folder.
os.makedirs("results", exist_ok=True)
results_df = pd.DataFrame(results)
# The prompt inputs are not needed in the output. errors="ignore" keeps the
# export working when `results` is empty (every call failed) and the frame
# therefore has no columns at all.
results_df.drop(columns=["description", "excludes"], errors="ignore").to_csv(
    "results/coicop2018_{}_{}.csv".format(
        "granite-3-8b-instruct",
        datetime.now().strftime("%Y-%m-%d_%H%M%S")),
    index=False)

In [None]:
# Write the records whose model call failed to a timestamped CSV under
# results/ (all columns kept so they can be retried). Creating the directory
# first avoids failing on a missing output folder.
os.makedirs("results", exist_ok=True)
failed_df = pd.DataFrame(failed_calls)
failed_df.to_csv(
    "results/failed_coicop2018_{}_{}.csv".format(
        "granite-3-8b-instruct",
        datetime.now().strftime("%Y-%m-%d_%H%M%S")),
    index=False)