# RAC for COICOP 2018 classification

This notebook compares the performance of different LLMs for Retrieval Augmented Classification (RAC), aimed at classifying web-scraped products into COICOP 2018 subclasses.

In [1]:
from datetime import datetime
from collections import Counter

import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client import models as qdrant_models
from sentence_transformers import SentenceTransformer

from outlines import models, generate
from openai import AsyncOpenAI
from outlines.models.openai import OpenAIConfig

In [2]:
def count_value_occurrences(my_list):
    """Count how often each value occurs and return the counts ordered by value.

    Args:
        my_list: Iterable of hashable, mutually comparable values (e.g. ints).
            It is consumed only once, so iterators/generators work as well.

    Returns:
        A dictionary whose keys are the unique values of ``my_list`` in
        ascending sorted order (NOT in order of first appearance) and whose
        values are the number of times each key occurs. An empty input
        yields an empty dictionary.

    Raises:
        TypeError: If the values cannot be compared with each other for sorting.
    """
    value_counts = Counter(my_list)

    # Sort the keys so the result reads like a histogram (0, 1, 2, ...).
    return {key: value_counts[key] for key in sorted(value_counts)}

## Load data

In [3]:
# Reference table that gets indexed for retrieval; later cells read its "title" and "code" columns.
grid_df = pd.read_csv("results/consolidated_coicop2018_2025-03-16.csv")
# Manually labelled evaluation set; later cells read its "name", "category" and "code" columns.
test_df = pd.read_csv("manual_labels/manual_labels_coicop2018.csv")

In [4]:
# Despite the `_dict` suffix, both names hold lists of row dicts (one dict per record),
# which is the shape the loops below iterate over.
test_dict = test_df.to_dict(orient="records")
grid_dict = grid_df.to_dict(orient="records")

Load data in Qdrant

In [5]:
# Local in-memory Qdrant instance: the collection is not persisted and must be rebuilt every session.
client = QdrantClient(":memory:")

In [6]:
# Sentence-embedding model used for both the indexed titles and the search queries.
encoder = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")

In [7]:
# Create the vector collection that all search strategies below query.
client.create_collection(
    collection_name="coicop2018",
    vectors_config=qdrant_models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),  # Vector size is defined by used model
        distance=qdrant_models.Distance.DOT, # Try with Cosine as well
    ),
)

True

In [8]:
%%time
# Encode all titles in a single batched call instead of one `encode` call per
# document; the encoder batches internally and returns one vector per title,
# in input order.
title_vectors = encoder.encode([doc["title"] for doc in grid_dict])

# Point ids are 1-based; the complete source record is stored as payload so
# that "title" and "code" can be read back from the search hits.
client.upload_points(
    collection_name="coicop2018",
    points=[
        qdrant_models.PointStruct(id=idx + 1, vector=vector.tolist(), payload=doc)
        for idx, (doc, vector) in enumerate(zip(grid_dict, title_vectors))
    ],
)

CPU times: user 3min 15s, sys: 759 ms, total: 3min 16s
Wall time: 3min 16s


  return self._client.upload_points(


Create an index for the COICOP code to facilitate grouped search (only for Qdrant server)

In [None]:
%%time
# Keyword index on the payload field "code", the field the grouped search groups by.
# The return value is assigned to `_` only to suppress the cell output.
_ = client.create_payload_index(
    collection_name="coicop2018",
    field_name="code",
    field_schema=qdrant_models.PayloadSchemaType.KEYWORD,
)

# Search strategies

### 12 results from product - category combination

In [9]:
%%time
# Strategy 1: a single query built as "<name> <category>", keeping the top 12 hits.
correct = 0
n_correct_method1 = []
total_items = len(test_dict)

for position, item in enumerate(test_dict, start=1):
    # Progress message on the 1st, 501st, 1001st, ... item.
    if position % 500 == 1:
        print(f"Processing item {position} out of {total_items}")

    query_vector = encoder.encode("{} {}".format(item["name"], item["category"])).tolist()
    response = client.query_points(
        collection_name="coicop2018",
        query=query_vector,
        limit=12,
    )

    # Candidate title -> code; hits that share a title collapse into one entry.
    title_to_code = {}
    for hit in response.points:
        title_to_code[hit.payload.get("title")] = hit.payload.get("code")

    # Number of distinct candidates carrying the true code of this item.
    n_true_code = list(title_to_code.values()).count(item["code"])
    if n_true_code > 0:
        correct += 1
    n_correct_method1.append(n_true_code)

print("Search accuracy: {:.2f}%".format((correct / len(test_dict))*100))

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993
Processing item 2001 out of 4993
Processing item 2501 out of 4993
Processing item 3001 out of 4993
Processing item 3501 out of 4993
Processing item 4001 out of 4993
Processing item 4501 out of 4993
Search accuracy: 77.75%
CPU times: user 1h 23min 26s, sys: 1min 53s, total: 1h 25min 20s
Wall time: 5min 46s


In [10]:
# Distribution over test items of how many retrieved candidates carry the true code (strategy 1).
count_value_occurrences(n_correct_method1)

{0: 1111,
 1: 565,
 2: 432,
 3: 433,
 4: 305,
 5: 345,
 6: 412,
 7: 459,
 8: 462,
 9: 235,
 10: 146,
 11: 60,
 12: 28}

### 12 results from  category - product combination

In [11]:
%%time
# Strategy 2: a single query built as "<category> <name>", keeping the top 12 hits.
correct = 0
n_correct_method2 = []
total_items = len(test_dict)

for position, item in enumerate(test_dict, start=1):
    # Progress message on the 1st, 501st, 1001st, ... item.
    if position % 500 == 1:
        print(f"Processing item {position} out of {total_items}")

    query_vector = encoder.encode("{} {}".format(item["category"], item["name"])).tolist()
    response = client.query_points(
        collection_name="coicop2018",
        query=query_vector,
        limit=12,
    )

    # Candidate title -> code; hits that share a title collapse into one entry.
    title_to_code = {}
    for hit in response.points:
        title_to_code[hit.payload.get("title")] = hit.payload.get("code")

    # Number of distinct candidates carrying the true code of this item.
    n_true_code = list(title_to_code.values()).count(item["code"])
    if n_true_code > 0:
        correct += 1
    n_correct_method2.append(n_true_code)

print("Search accuracy: {:.2f}%".format((correct / len(test_dict))*100))

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993
Processing item 2001 out of 4993
Processing item 2501 out of 4993
Processing item 3001 out of 4993
Processing item 3501 out of 4993
Processing item 4001 out of 4993
Processing item 4501 out of 4993
Search accuracy: 76.73%
CPU times: user 1h 21min 27s, sys: 1min 53s, total: 1h 23min 21s
Wall time: 5min 30s


In [12]:
# Distribution over test items of how many retrieved candidates carry the true code (strategy 2).
count_value_occurrences(n_correct_method2)

{0: 1162,
 1: 559,
 2: 438,
 3: 437,
 4: 339,
 5: 308,
 6: 384,
 7: 427,
 8: 452,
 9: 264,
 10: 135,
 11: 55,
 12: 33}

### 6 results from category  and 6 from product

In [13]:
%%time
# Strategy 3: two separate queries (product name, then category), 6 hits each, merged by title.
correct = 0
n_correct_method3 = []
total_items = len(test_dict)

for position, item in enumerate(test_dict, start=1):
    # Progress message on the 1st, 501st, 1001st, ... item.
    if position % 500 == 1:
        print(f"Processing item {position} out of {total_items}")

    # Candidate title -> code. The name query is inserted first; a category hit
    # with an already-seen title overwrites that entry instead of adding a new one.
    title_to_code = {}
    for field in ("name", "category"):
        response = client.query_points(
            collection_name="coicop2018",
            query=encoder.encode(item[field]).tolist(),
            limit=6,
        )
        for hit in response.points:
            title_to_code[hit.payload.get("title")] = hit.payload.get("code")

    # Number of distinct candidates carrying the true code of this item.
    n_true_code = list(title_to_code.values()).count(item["code"])
    if n_true_code > 0:
        correct += 1
    n_correct_method3.append(n_true_code)

print("Search accuracy: {:.2f}%".format((correct / len(test_dict))*100))

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993
Processing item 2001 out of 4993
Processing item 2501 out of 4993
Processing item 3001 out of 4993
Processing item 3501 out of 4993
Processing item 4001 out of 4993
Processing item 4501 out of 4993
Search accuracy: 71.78%
CPU times: user 2h 29min 2s, sys: 3min 27s, total: 2h 32min 29s
Wall time: 9min 52s


In [14]:
# Distribution over test items of how many retrieved candidates carry the true code (strategy 3).
count_value_occurrences(n_correct_method3)

{0: 1409,
 1: 560,
 2: 539,
 3: 678,
 4: 469,
 5: 398,
 6: 467,
 7: 217,
 8: 133,
 9: 72,
 10: 32,
 11: 12,
 12: 7}

## 12 results from grouped search on product - category combination

In [15]:
%%time
# Strategy 4: grouped search on "<name> <category>", grouped by the payload
# field "code" with limit=6 and group_size=2 (at most 12 candidates in total).
correct = 0
n_correct_method4 = []
total_items = len(test_dict)

for position, item in enumerate(test_dict, start=1):
    # Progress message on the 1st, 501st, 1001st, ... item.
    if position % 500 == 1:
        print(f"Processing item {position} out of {total_items}")

    query_vector = encoder.encode("{} {}".format(item["name"], item["category"])).tolist()
    response = client.query_points_groups(
        collection_name="coicop2018",
        query=query_vector,
        group_by="code",
        limit=6,
        group_size=2,
    )

    # Flatten the groups into one candidate title -> code mapping;
    # hits that share a title collapse into one entry.
    title_to_code = {
        hit.payload.get("title"): hit.payload.get("code")
        for group in response.groups
        for hit in group.hits
    }

    # Number of distinct candidates carrying the true code of this item.
    n_true_code = list(title_to_code.values()).count(item["code"])
    if n_true_code > 0:
        correct += 1
    n_correct_method4.append(n_true_code)

print("Search accuracy: {:.2f}%".format((correct / len(test_dict))*100))

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993
Processing item 2001 out of 4993
Processing item 2501 out of 4993
Processing item 3001 out of 4993
Processing item 3501 out of 4993
Processing item 4001 out of 4993
Processing item 4501 out of 4993
Search accuracy: 78.41%
CPU times: user 3h 19min 12s, sys: 1min 24s, total: 3h 20min 37s
Wall time: 52min 36s


In [16]:
# Distribution over test items of how many retrieved candidates carry the true code (grouped search).
count_value_occurrences(n_correct_method4)

{0: 1078, 2: 3915}

## Analysis of missed retrievals

In [17]:
# One record per test item: the original fields plus, for each search strategy,
# a flag telling whether at least one retrieved candidate carried the true code.
retrieval_dict = [
    {
        **item,
        "prod_cat_search": n_correct_method1[idx] > 0,
        "cat_prod_search": n_correct_method2[idx] > 0,
        "separate_search": n_correct_method3[idx] > 0,
        "group_search": n_correct_method4[idx] > 0,
    }
    for idx, item in enumerate(test_dict)
]

In [18]:
# Tabular view of the per-item retrieval flags (one row per test item).
retrieval_df = pd.DataFrame(retrieval_dict)

In [19]:
# Per code: share of test items for which each search strategy retrieved no
# candidate with the true code (i.e. the share of False in each '_search' column).
search_columns = [name for name in retrieval_df.columns if name.endswith('_search')]
items_by_code = retrieval_df.groupby('code')
missed_retrieval_df = items_by_code[search_columns].apply(lambda flags: (flags < 1).mean())

# Number of test items per code, to judge how reliable each share is.
missed_retrieval_df['count'] = items_by_code.size()

Save search results for analysis of missed retrievals by COICOP subclass.

In [20]:
# Timestamp the file name so repeated runs do not overwrite earlier exports.
export_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
missed_retrieval_df.to_csv("analysis/missed_retrieval_{}.csv".format(export_stamp))

# RAC models

LLM template, including 12 search examples with the product name-category combination (~1% less successful than the grouped search, but ~6 times faster)

In [21]:
# Prompt for the choice-constrained LLM call. Placeholders are filled via str.format:
#   {options[0]} .. {options[11]} - candidate titles; the list passed as `options`
#                                   needs at least 12 entries, otherwise formatting
#                                   raises IndexError
#   {name}, {category}            - product name and its commercial category
# Option 13 ("none of the above") is hard-coded in the text.
prompt_template = """You are an expert data curator. You are given a product name and the commercial category where it belongs.
Your task is to find the most similar match from a list of possible options. 
If no option is suitable, you should output "none of the above". The options are:

1. {options[0]}
2. {options[1]}
3. {options[2]}
4. {options[3]}
5. {options[4]}
6. {options[5]}
7. {options[6]}
8. {options[7]}
9. {options[8]}
10. {options[9]}
11. {options[10]}
12. {options[11]}
13. none of the above

Your output should only be the exact text of one of the options above, and nothing else.

The product name is: {name}
The commercial category where the product belongs is: {category}
The most similar option is:"""

## SmolLM2

In [22]:
# OpenAI-compatible client pointed at a local endpoint; the api_key value is a placeholder.
# NOTE(review): port 11434 suggests a local Ollama server — confirm.
llmclient = AsyncOpenAI(
    api_key="none",
    base_url='http://localhost:11434/v1/',
)
# temperature=0 is requested for every model in this notebook so runs are comparable.
config = OpenAIConfig(model="smollm2:135m", temperature=0)
model = models.openai(llmclient, config)

In [23]:
%%time
# For every test item: retrieve 12 candidates with the "<name> <category>" query,
# then let the LLM pick one candidate title (or "none of the above").
processed_items = []
total_items = len(test_dict)

for position, item in enumerate(test_dict, start=1):
    # Progress message on the 1st, 501st, 1001st, ... item.
    if position % 500 == 1:
        print(f"Processing item {position} out of {total_items}")

    query_vector = encoder.encode("{} {}".format(item["name"], item["category"])).tolist()
    response = client.query_points(
        collection_name="coicop2018",
        query=query_vector,
        limit=12,
    )

    # Candidate title -> code; hits that share a title collapse into one entry.
    title_to_code = {}
    for hit in response.points:
        title_to_code[hit.payload.get("title")] = hit.payload.get("code")
    candidate_codes = list(title_to_code.values())

    # NOTE(review): prompt_template indexes options[0]..options[11]; with fewer than
    # 11 distinct titles the str.format call below raises IndexError.
    options = [*title_to_code, "none of the above"]
    chooser = generate.choice(model, options)
    answer = chooser(prompt_template.format(options=options, name=item["name"], category=item["category"]))

    processed_items.append(
        {
            **item,
            # None when the model answers "none of the above" (not a key of the mapping).
            "prediction": title_to_code.get(answer),
            "correct_option": item["code"] in candidate_codes,
            "correct_n": candidate_codes.count(item["code"]),
        }
    )
    

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993
Processing item 2001 out of 4993
Processing item 2501 out of 4993
Processing item 3001 out of 4993
Processing item 3501 out of 4993
Processing item 4001 out of 4993
Processing item 4501 out of 4993
CPU times: user 2h 33min 3s, sys: 1min 29s, total: 2h 34min 33s
Wall time: 49min 20s


In [24]:
# One row per test item: original fields plus prediction, correct_option and correct_n.
results_df = pd.DataFrame(processed_items)

In [25]:
# True where the code of the chosen option equals the labelled code.
results_df["match"] = results_df["prediction"] == results_df["code"]

In [26]:
# Search accuracy: share of items whose true code was among the retrieved candidates.
# LLM accuracy: share of items whose predicted code equals the true code.
print("Search accuracy: {:.2f}%".format((results_df["correct_option"].sum() / len(results_df))*100))
print("LLM accuracy: {:.2f}%".format((results_df["match"].sum() / len(results_df))*100))

Search accuracy: 77.75%
LLM accuracy: 0.92%


In [27]:
# Timestamp the file name so repeated runs do not overwrite earlier exports.
export_stamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
results_df.to_csv("results/rac_coicop2018_smollm2-135m_{}.csv".format(export_stamp), index=False)

## Qwen 2.5 

In [52]:
# Switch to the next model; reuses `llmclient` created in the SmolLM2 section.
config = OpenAIConfig(model="qwen2.5:3b-instruct", temperature=0)
model = models.openai(llmclient, config)

In [53]:
%%time
# For every test item: retrieve 12 candidates with the "<name> <category>" query,
# then let the LLM pick one candidate title (or "none of the above").
processed_items = []
total_items = len(test_dict)

for position, item in enumerate(test_dict, start=1):
    # Progress message on the 1st, 501st, 1001st, ... item.
    if position % 500 == 1:
        print(f"Processing item {position} out of {total_items}")

    query_vector = encoder.encode("{} {}".format(item["name"], item["category"])).tolist()
    response = client.query_points(
        collection_name="coicop2018",
        query=query_vector,
        limit=12,
    )

    # Candidate title -> code; hits that share a title collapse into one entry.
    title_to_code = {}
    for hit in response.points:
        title_to_code[hit.payload.get("title")] = hit.payload.get("code")
    candidate_codes = list(title_to_code.values())

    # NOTE(review): prompt_template indexes options[0]..options[11]; with fewer than
    # 11 distinct titles the str.format call below raises IndexError.
    options = [*title_to_code, "none of the above"]
    chooser = generate.choice(model, options)
    answer = chooser(prompt_template.format(options=options, name=item["name"], category=item["category"]))

    processed_items.append(
        {
            **item,
            # None when the model answers "none of the above" (not a key of the mapping).
            "prediction": title_to_code.get(answer),
            "correct_option": item["code"] in candidate_codes,
            "correct_n": candidate_codes.count(item["code"]),
        }
    )
    

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993
Processing item 2001 out of 4993
Processing item 2501 out of 4993
Processing item 3001 out of 4993
Processing item 3501 out of 4993
Processing item 4001 out of 4993
Processing item 4501 out of 4993
CPU times: user 2h 33min 12s, sys: 1min 32s, total: 2h 34min 44s
Wall time: 1h 23min 48s


In [54]:
# One row per test item: original fields plus prediction, correct_option and correct_n.
results_df = pd.DataFrame(processed_items)

In [55]:
# True where the code of the chosen option equals the labelled code.
results_df["match"] = results_df["prediction"] == results_df["code"]

In [56]:
# Search accuracy: share of items whose true code was among the retrieved candidates.
# LLM accuracy: share of items whose predicted code equals the true code.
print("Search accuracy: {:.2f}%".format((results_df["correct_option"].sum() / len(results_df))*100))
print("LLM accuracy: {:.2f}%".format((results_df["match"].sum() / len(results_df))*100))

Search accuracy: 77.75%
LLM accuracy: 39.34%


In [57]:
# Timestamp the file name so repeated runs do not overwrite earlier exports.
export_stamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
results_df.to_csv("results/rac_coicop2018_qwen25-3b_{}.csv".format(export_stamp), index=False)

## Llama 3.2

In [34]:
# Switch to the next model; reuses `llmclient` created in the SmolLM2 section.
config = OpenAIConfig(model="llama3.2:1b", temperature=0)
model = models.openai(llmclient, config)

In [35]:
%%time
# For every test item: retrieve 12 candidates with the "<name> <category>" query,
# then let the LLM pick one candidate title (or "none of the above").
processed_items = []
total_items = len(test_dict)

for position, item in enumerate(test_dict, start=1):
    # Progress message on the 1st, 501st, 1001st, ... item.
    if position % 500 == 1:
        print(f"Processing item {position} out of {total_items}")

    query_vector = encoder.encode("{} {}".format(item["name"], item["category"])).tolist()
    response = client.query_points(
        collection_name="coicop2018",
        query=query_vector,
        limit=12,
    )

    # Candidate title -> code; hits that share a title collapse into one entry.
    title_to_code = {}
    for hit in response.points:
        title_to_code[hit.payload.get("title")] = hit.payload.get("code")
    candidate_codes = list(title_to_code.values())

    # NOTE(review): prompt_template indexes options[0]..options[11]; with fewer than
    # 11 distinct titles the str.format call below raises IndexError.
    options = [*title_to_code, "none of the above"]
    chooser = generate.choice(model, options)
    answer = chooser(prompt_template.format(options=options, name=item["name"], category=item["category"]))

    processed_items.append(
        {
            **item,
            # None when the model answers "none of the above" (not a key of the mapping).
            "prediction": title_to_code.get(answer),
            "correct_option": item["code"] in candidate_codes,
            "correct_n": candidate_codes.count(item["code"]),
        }
    )
    

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993
Processing item 2001 out of 4993
Processing item 2501 out of 4993
Processing item 3001 out of 4993
Processing item 3501 out of 4993
Processing item 4001 out of 4993
Processing item 4501 out of 4993
CPU times: user 2h 32min 47s, sys: 1min 29s, total: 2h 34min 17s
Wall time: 59min 8s


In [36]:
# One row per test item: original fields plus prediction, correct_option and correct_n.
results_df = pd.DataFrame(processed_items)

In [37]:
# True where the code of the chosen option equals the labelled code.
results_df["match"] = results_df["prediction"] == results_df["code"]

In [38]:
# Search accuracy: share of items whose true code was among the retrieved candidates.
# LLM accuracy: share of items whose predicted code equals the true code.
print("Search accuracy: {:.2f}%".format((results_df["correct_option"].sum() / len(results_df))*100))
print("LLM accuracy: {:.2f}%".format((results_df["match"].sum() / len(results_df))*100))

Search accuracy: 77.75%
LLM accuracy: 33.35%


In [39]:
# Timestamp the file name so repeated runs do not overwrite earlier exports.
export_stamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
results_df.to_csv(path_or_buf="results/rac_coicop2018_llama32-1b_{}.csv".format(export_stamp), index=False)

## Deepseek

In [46]:
# Switch to the next model; reuses `llmclient` created in the SmolLM2 section.
config = OpenAIConfig(model="deepseek-r1:1.5b", temperature=0)
model = models.openai(llmclient, config)

In [47]:
%%time
# For every test item: retrieve 12 candidates with the "<name> <category>" query,
# then let the LLM pick one candidate title (or "none of the above").
processed_items = []
total_items = len(test_dict)

for position, item in enumerate(test_dict, start=1):
    # Progress message on the 1st, 501st, 1001st, ... item.
    if position % 500 == 1:
        print(f"Processing item {position} out of {total_items}")

    query_vector = encoder.encode("{} {}".format(item["name"], item["category"])).tolist()
    response = client.query_points(
        collection_name="coicop2018",
        query=query_vector,
        limit=12,
    )

    # Candidate title -> code; hits that share a title collapse into one entry.
    title_to_code = {}
    for hit in response.points:
        title_to_code[hit.payload.get("title")] = hit.payload.get("code")
    candidate_codes = list(title_to_code.values())

    # NOTE(review): prompt_template indexes options[0]..options[11]; with fewer than
    # 11 distinct titles the str.format call below raises IndexError.
    options = [*title_to_code, "none of the above"]
    chooser = generate.choice(model, options)
    answer = chooser(prompt_template.format(options=options, name=item["name"], category=item["category"]))

    processed_items.append(
        {
            **item,
            # None when the model answers "none of the above" (not a key of the mapping).
            "prediction": title_to_code.get(answer),
            "correct_option": item["code"] in candidate_codes,
            "correct_n": candidate_codes.count(item["code"]),
        }
    )
    

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993
Processing item 2001 out of 4993
Processing item 2501 out of 4993
Processing item 3001 out of 4993
Processing item 3501 out of 4993
Processing item 4001 out of 4993
Processing item 4501 out of 4993
CPU times: user 2h 33min 27s, sys: 1min 29s, total: 2h 34min 56s
Wall time: 1h 17min 17s


In [48]:
# One row per test item: original fields plus prediction, correct_option and correct_n.
results_df = pd.DataFrame(processed_items)

In [49]:
# True where the code of the chosen option equals the labelled code.
results_df["match"] = results_df["prediction"] == results_df["code"]

In [50]:
# Search accuracy: share of items whose true code was among the retrieved candidates.
# LLM accuracy: share of items whose predicted code equals the true code.
print("Search accuracy: {:.2f}%".format((results_df["correct_option"].sum() / len(results_df))*100))
print("LLM accuracy: {:.2f}%".format((results_df["match"].sum() / len(results_df))*100))

Search accuracy: 77.75%
LLM accuracy: 2.44%


In [51]:
# Timestamp the file name so repeated runs do not overwrite earlier exports.
export_stamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
results_df.to_csv(path_or_buf="results/rac_coicop2018_deepseek-1_5b_{}.csv".format(export_stamp), index=False)