# Structuring COICOP 1999 information for classification

The COICOP 1999 classification contains unstructured information at its most detailed level of classification. This notebook leverages LLMs to structure this information in a format that can be used for similarity search.

In [None]:
import os
import time # Add delays because of free API rate limits
from datetime import datetime

import pandas as pd

from typing import List
from pydantic import BaseModel
from pydantic_ai import Agent
from pydantic_ai.models.mistral import MistralModel
from pydantic_ai.models.groq import GroqModel

from dotenv import dotenv_values

In [None]:
# Work around event-loop issues when running inside a Jupyter notebook.
# Presumably required by the synchronous `agent.run_sync(...)` calls further down -- TODO confirm.
import nest_asyncio # Fix issues with Jupyter notebook event loop
nest_asyncio.apply()

Import environment variables with API keys

In [None]:
# Settings read from the local ".env" file; the API keys used below
# ("MISTRAL_API_KEY", "GROQ_API_KEY") are looked up in this mapping.
config = dotenv_values(".env")

Define the Pydantic model for parsing the additional information in each COICOP 1999 level 3

In [None]:
# Structured output type handed to every Agent below through `result_type`.
# Documented with `#` comments on purpose: a class docstring would become part of
# the model's schema and could change what is sent to the LLM.
class CoicopDetails(BaseModel):
    # Item descriptions extracted from one class's text, one string per item.
    examples: List[str]

Read COICOP 1999 definitions (different languages)

In [None]:
# Columns to load. The English and French files share one layout; the Spanish
# file names its title and note columns differently.
usecols = ["Code", "Description", "ExplanatoryNote"]
usecols_es = ["Code", "Title", "ExplanatoryNoteInclusion"]

# One raw table per language, restricted to the columns listed above.
data_en = pd.read_csv("coicop_1999/coicop1999_en.csv", usecols=usecols)
data_fr = pd.read_csv("coicop_1999/coicop1999_fr.csv", usecols=usecols)
data_es = pd.read_csv("coicop_1999/coicop1999_es.csv", usecols=usecols_es)

Remove exclusion note from the English and French files, rename Spanish columns.

Remove (ND), (SD), (D), (S) markings from the class names

Keep only the level 3 classes (codes with exactly two dots) for the LLM queries

In [None]:
def process_classes(df):
    """
    Return a cleaned copy of a COICOP table restricted to codes with two dots.

    The input frame is left untouched. On the returned frame:
    - only rows whose ``Code`` contains exactly 2 dots are kept
      (original index labels are preserved);
    - ``ExplanatoryNote`` is cut just before the first 'Excludes:' marker or,
      when that marker is absent, before the first 'Sont exclus' marker, and
      is stripped of surrounding whitespace; missing notes stay missing;
    - the markers (ND), (SD), (S) and (D), together with any whitespace
      preceding them, are removed from ``Description``.

    Args:
        df (pd.DataFrame): Table with columns Code, Description and ExplanatoryNote.

    Returns:
        pd.DataFrame: The filtered and cleaned table.
    """
    # Checked in this order: 'Excludes:' wins when both markers are present.
    exclusion_markers = ('Excludes:', 'Sont exclus')
    markers_pattern = r'\s*\((ND|SD|S|D)\)'

    def truncate_at_exclusions(note):
        # Missing values are passed through unchanged.
        if pd.isna(note):
            return note
        for marker in exclusion_markers:
            if marker in note:
                # Keep only what precedes the first occurrence of the marker.
                kept, _, _ = note.partition(marker)
                return kept.strip()
        return note.strip()

    # Work on an independent copy of the matching rows so the caller's frame is not modified.
    has_two_dots = df['Code'].str.count(r'\.') == 2
    cleaned = df.loc[has_two_dots].copy()

    cleaned['ExplanatoryNote'] = cleaned['ExplanatoryNote'].apply(truncate_at_exclusions)
    cleaned['Description'] = cleaned['Description'].str.replace(markers_pattern, '', regex=True)

    return cleaned

In [None]:
# Clean the English and French tables and keep only their two-dot (level 3) codes.
level3_en = process_classes(data_en)
level3_fr = process_classes(data_fr)

In [None]:
# Align the Spanish column names with the English/French layout expected by
# `process_classes`. `rename` silently ignores names that are absent, so this
# cell can be re-run safely (the previous copy-then-drop version raised a
# KeyError on "Title" the second time it was executed).
data_es = data_es.rename(
    columns={
        "Title": "Description",
        "ExplanatoryNoteInclusion": "ExplanatoryNote",
    }
)
level3_es = process_classes(data_es)

In [None]:
# Stack the three per-language tables into one frame with a fresh 0..n-1 index.
data_df = pd.concat([level3_en, level3_fr, level3_es], ignore_index=True)

In [None]:
# Despite the name this is a list with one dict per row (column name -> value);
# it is the collection the LLM loops below iterate over.
data_dict = data_df.to_dict(orient="records")

## Mistral

Initialize PydanticAI agent to structure information

In [None]:
# Model identifier for this section; also embedded in the output file names below.
llm_model = "mistral-large-latest"

In [None]:
# Mistral-backed model; the API key is taken from the ".env" configuration.
model = MistralModel(model_name=llm_model, api_key=config.get("MISTRAL_API_KEY"))
# Agent whose result is typed as `CoicopDetails`, i.e. a list of item
# descriptions under `examples`. The system prompt asks for granular,
# self-contained item descriptions in the same language as the input.
agent = Agent(
    model=model,
    retries=3,
    result_type=CoicopDetails,
    system_prompt=(
        'You are an expert data curator. You will receive a string of text '
        'with examples of items to be included into a certain classification. '
        'Your task is to transform this text into a list of self-explainig '
        'items descriptions, exploding the examples to the most granular level you can '
        'identify in the text. '
        'Each item description should contain all necessary information for classification '
        'as provided in the original text. Prefer complete descriptions rather than single words. '
        'If possible, avoid the use of ambiguous or generic terms such as `other` or `miscellaneous`. '
        'Your output should be in the same language as the input text. '
        ),
)

Run calls to the Agent to extract and format information

In [None]:
# Expand every class into one row per extracted example.
# `results` collects the output rows; `failed_calls` keeps the input rows whose
# LLM call raised, so they can be inspected or retried later.
results = []
failed_calls = []
total_items = len(data_dict)
for i, item in enumerate(data_dict):
    # Print every 20 items to show progress
    if i % 20 == 0:
        print(f"Processing item {i+1} out of {total_items}")
    note = item.get("ExplanatoryNote")
    # Missing notes come out of `DataFrame.to_dict` as float NaN, not None, so
    # an `is None` test lets them through and the model gets prompted with
    # "<Description>. nan". `pd.isna` covers both None and NaN; a blank note
    # carries nothing to parse either.
    if pd.isna(note) or not str(note).strip():
        # No information to parse, just append existing item
        results.append(item)
        continue
    # Time delay to respect API rate limits
    time.sleep(3)
    try:
        prompt = "{}. {}".format(item.get("Description"), note)
        agent_result = agent.run_sync(prompt, model_settings={'temperature': 0.0})
        # Keep the original class name as the first row, followed by every extracted example.
        for ex in [item.get("Description")] + agent_result.data.model_dump().get("examples"):
            # "Description" comes last in the dict literal, so it overrides the original value.
            results.append({**item, "Description": ex})
    except Exception as e:
        # Best effort: record the failure and carry on with the next class.
        failed_calls.append(item)
        print(f"Error processing item {i+1} out of {total_items}")
        print(item)
        print(e)
        continue

Save results and failed calls

In [None]:
# Create the output folder if needed, so the (slow, rate-limited) run above is
# not lost to a missing directory at write time.
os.makedirs("results", exist_ok=True)
results_df = pd.DataFrame(results)
# The raw note is dropped; file name carries the model id and a timestamp.
results_df.drop(columns=["ExplanatoryNote"]).to_csv(
    "results/coicop1999_{}_{}.csv".format(
        llm_model,
        datetime.now().strftime("%Y-%m-%d_%H%M%S")),
    index=False)

In [None]:
# Create the output folder if needed before writing the rows whose LLM call failed.
os.makedirs("results", exist_ok=True)
failed_df = pd.DataFrame(failed_calls)
failed_df.to_csv(
    "results/failed_coicop1999_{}_{}.csv".format(
        llm_model,
        datetime.now().strftime("%Y-%m-%d_%H%M%S")),
    index=False)

## Llama 3.3 on Groq

In [None]:
# Model identifier for this section; also embedded in the output file names below.
llm_model = "llama-3.3-70b-versatile"

In [None]:
# Groq-backed model; the API key is taken from the ".env" configuration.
model = GroqModel(
    model_name=llm_model, 
    api_key=config.get("GROQ_API_KEY"))
# Agent whose result is typed as `CoicopDetails`, i.e. a list of item
# descriptions under `examples`. The system prompt asks for granular,
# self-contained item descriptions in the same language as the input.
agent = Agent(
    model=model,
    retries=3,
    result_type=CoicopDetails,
    system_prompt=(
        'You are an expert data curator. You will receive a string of text '
        'with examples of items to be included into a certain classification. '
        'Your task is to transform this text into a list of self-explainig '
        'items descriptions, exploding the examples to the most granular level you can '
        'identify in the text. '
        'Each item description should contain all necessary information for classification '
        'as provided in the original text. Prefer complete descriptions rather than single words. '
        'If possible, avoid the use of ambiguous or generic terms such as `other` or `miscellaneous`. '
        'Your output should be in the same language as the input text. '
        ),
)

In [None]:
# Expand every class into one row per extracted example.
# `results` collects the output rows; `failed_calls` keeps the input rows whose
# LLM call raised, so they can be inspected or retried later.
results = []
failed_calls = []
total_items = len(data_dict)
for i, item in enumerate(data_dict):
    # Print every 20 items to show progress
    if i % 20 == 0:
        print(f"Processing item {i+1} out of {total_items}")
    note = item.get("ExplanatoryNote")
    # Missing notes come out of `DataFrame.to_dict` as float NaN, not None, so
    # an `is None` test lets them through and the model gets prompted with
    # "<Description>. nan". `pd.isna` covers both None and NaN; a blank note
    # carries nothing to parse either.
    if pd.isna(note) or not str(note).strip():
        # No information to parse, just append existing item
        results.append(item)
        continue
    # Time delay to respect API rate limits
    time.sleep(3)
    try:
        prompt = "{}. {}".format(item.get("Description"), note)
        agent_result = agent.run_sync(prompt, model_settings={'temperature': 0.0})
        # Keep the original class name as the first row, followed by every extracted example.
        for ex in [item.get("Description")] + agent_result.data.model_dump().get("examples"):
            # "Description" comes last in the dict literal, so it overrides the original value.
            results.append({**item, "Description": ex})
    except Exception as e:
        # Best effort: record the failure and carry on with the next class.
        failed_calls.append(item)
        print(f"Error processing item {i+1} out of {total_items}")
        print(item)
        print(e)
        continue

In [None]:
# Create the output folder if needed, so the (slow, rate-limited) run above is
# not lost to a missing directory at write time.
os.makedirs("results", exist_ok=True)
results_df = pd.DataFrame(results)
# The raw note is dropped; file name carries the model id and a timestamp.
results_df.drop(columns=["ExplanatoryNote"]).to_csv(
    "results/coicop1999_{}_{}.csv".format(
        llm_model,
        datetime.now().strftime("%Y-%m-%d_%H%M%S")),
    index=False)

In [None]:
# Create the output folder if needed before writing the rows whose LLM call failed.
os.makedirs("results", exist_ok=True)
failed_df = pd.DataFrame(failed_calls)
failed_df.to_csv(
    "results/failed_coicop1999_{}_{}.csv".format(
        llm_model,
        datetime.now().strftime("%Y-%m-%d_%H%M%S")),
    index=False)

## Deepseek on Groq

In [None]:
# Model identifier for this section; also embedded in the output file names below.
llm_model = "deepseek-r1-distill-llama-70b"

In [None]:
# Groq-backed model; the API key is taken from the ".env" configuration.
model = GroqModel(
    model_name=llm_model, 
    api_key=config.get("GROQ_API_KEY"))
# Agent whose result is typed as `CoicopDetails`, i.e. a list of item
# descriptions under `examples`. The system prompt asks for granular,
# self-contained item descriptions in the same language as the input.
agent = Agent(
    model=model,
    retries=3,
    result_type=CoicopDetails,
    system_prompt=(
        'You are an expert data curator. You will receive a string of text '
        'with examples of items to be included into a certain classification. '
        'Your task is to transform this text into a list of self-explainig '
        'items descriptions, exploding the examples to the most granular level you can '
        'identify in the text. '
        'Each item description should contain all necessary information for classification '
        'as provided in the original text. Prefer complete descriptions rather than single words. '
        'If possible, avoid the use of ambiguous or generic terms such as `other` or `miscellaneous`. '
        'Your output should be in the same language as the input text. '
        ),
)

In [None]:
# Expand every class into one row per extracted example.
# `results` collects the output rows; `failed_calls` keeps the input rows whose
# LLM call raised, so they can be inspected or retried later.
results = []
failed_calls = []
total_items = len(data_dict)
for i, item in enumerate(data_dict):
    # Print every 20 items to show progress
    if i % 20 == 0:
        print(f"Processing item {i+1} out of {total_items}")
    note = item.get("ExplanatoryNote")
    # Missing notes come out of `DataFrame.to_dict` as float NaN, not None, so
    # an `is None` test lets them through and the model gets prompted with
    # "<Description>. nan". `pd.isna` covers both None and NaN; a blank note
    # carries nothing to parse either.
    if pd.isna(note) or not str(note).strip():
        # No information to parse, just append existing item
        results.append(item)
        continue
    # Time delay to respect API rate limits
    time.sleep(3)
    try:
        prompt = "{}. {}".format(item.get("Description"), note)
        agent_result = agent.run_sync(prompt, model_settings={'temperature': 0.0})
        # Keep the original class name as the first row, followed by every extracted example.
        for ex in [item.get("Description")] + agent_result.data.model_dump().get("examples"):
            # "Description" comes last in the dict literal, so it overrides the original value.
            results.append({**item, "Description": ex})
    except Exception as e:
        # Best effort: record the failure and carry on with the next class.
        failed_calls.append(item)
        print(f"Error processing item {i+1} out of {total_items}")
        print(item)
        print(e)
        continue

In [None]:
# Create the output folder if needed, so the (slow, rate-limited) run above is
# not lost to a missing directory at write time.
os.makedirs("results", exist_ok=True)
results_df = pd.DataFrame(results)
# The raw note is dropped; file name carries the model id and a timestamp.
results_df.drop(columns=["ExplanatoryNote"]).to_csv(
    "results/coicop1999_{}_{}.csv".format(
        llm_model,
        datetime.now().strftime("%Y-%m-%d_%H%M%S")),
    index=False)

In [None]:
# Create the output folder if needed before writing the rows whose LLM call failed.
os.makedirs("results", exist_ok=True)
failed_df = pd.DataFrame(failed_calls)
failed_df.to_csv(
    "results/failed_coicop1999_{}_{}.csv".format(
        llm_model,
        datetime.now().strftime("%Y-%m-%d_%H%M%S")),
    index=False)

# Combine results

In [None]:
# Per-model result files only: the "failed_" and "consolidated_" outputs do not
# start with "coicop1999". `os.listdir` returns names in arbitrary order, so the
# list is sorted to make the row order of the combined table reproducible.
result_files = sorted(
    f for f in os.listdir("results/")
    if f.startswith("coicop1999") and f.endswith(".csv")
)

In [None]:
# Load every per-model result file and stack them into a single table.
# Index labels are kept as read, so they repeat across files.
results_list = [
    pd.read_csv(os.path.join("results", file_name))
    for file_name in result_files
]

results_df = pd.concat(results_list)

In [None]:
# Number of distinct class codes before cleaning, for comparison with the count after filtering.
results_df["Code"].nunique()

Normalize descriptions to lowercase, drop duplicate rows, and remove all labels containing "other" or "miscellaneous"

In [None]:
# Lowercase. `assign` builds a new frame, so re-running this cell on the
# already-filtered table does not write into a slice.
results_df = results_df.assign(Description=results_df["Description"].str.lower())
# Rows without a description cannot be used and would otherwise make the
# boolean mask below contain NaN, on which `~` fails.
results_df = results_df.dropna(subset=["Description"])
# Remove duplicates
results_df = results_df.drop_duplicates(ignore_index=True)
# Remove items with "other(s)" or "miscellaneous" as whole words. A plain
# substring match would also discard e.g. "mother", "brother" or "another".
# NOTE(review): only English terms are filtered; French/Spanish equivalents are kept.
generic_terms = r"\b(?:others?|miscellaneous)\b"
is_generic = results_df["Description"].str.contains(generic_terms, regex=True, na=False)
results_df = results_df[~is_generic]

In [None]:
# Number of distinct class codes left after cleaning; a drop versus the earlier
# count means some classes lost all of their rows to the filter.
results_df["Code"].nunique()

In [None]:
# Write the cleaned, combined table; the file name carries today's date.
consolidated_path = "results/consolidated_coicop1999_{}.csv".format(
    datetime.now().strftime("%Y-%m-%d"))
results_df.to_csv(consolidated_path, index=False)