# RAG for COICOP 2018 classification

This notebook compares the performance of different LLMs in a RAG pipeline that classifies web-scraped products into COICOP 2018 subclasses.

In [13]:
from datetime import datetime
from collections import Counter
import pandas as pd
from qdrant_client import QdrantClient
from outlines import models, generate
from openai import AsyncOpenAI
from outlines.models.openai import OpenAIConfig

In [18]:
def count_value_occurrences(my_list):
    """Count how often each distinct value occurs in a list.

    Args:
        my_list: The input list. Its values must be hashable and mutually
            orderable, because the distinct values are sorted.

    Returns:
        A dictionary mapping each distinct value to the number of times it
        appears. Keys are in ascending sorted order, NOT in order of first
        appearance.
    """
    value_counts = Counter(my_list)

    # The Counter's keys are exactly the distinct values of the list, so they
    # can be sorted directly without building a separate set.
    return {key: value_counts[key] for key in sorted(value_counts)}

## Load data

In [2]:
# Reference records that get indexed in the vector store (their "title" column
# is used as the document text further down).
grid_df = pd.read_csv("results/consolidated_coicop2018_20250210.csv")
# Manually labelled products used as the evaluation set (the loops below read
# their "name", "category" and "code" fields).
test_df = pd.read_csv("manual_labels/manual_labels_coicop2018.csv")

In [3]:
# Turn both frames into lists of row dictionaries (one dict per row), which is
# the shape the indexing and evaluation cells iterate over.
grid_dict = grid_df.to_dict("records")
test_dict = test_df.to_dict("records")

In [4]:
# Payload for the vector store: the text to embed, the full record as metadata,
# and 1-based integer point ids.
grid = {
    "docs": [record.get("title") for record in grid_dict],
    "metadata": grid_dict,
    "ids": list(range(1, len(grid_dict) + 1)),
}

Load data in Qdrant

In [5]:
# In-memory Qdrant instance: nothing is persisted, so the collection has to be
# rebuilt after every kernel restart.
client = QdrantClient(":memory:")

In [6]:
# Select the embedding model used by the client's add/query helpers
# (multilingual, presumably because product titles are not in English — confirm).
client.set_model("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")


In [7]:
%%time 

# Index every title in the "coicop2018" collection, attaching the full record
# as metadata. The return value is bound to `_` so the cell prints nothing but
# the timing.
# NOTE(review): embedding presumably happens inside `add` with the model chosen
# via `set_model` — confirm against the qdrant-client documentation.
_ = client.add(
    collection_name="coicop2018",
    documents=grid["docs"],
    metadata=grid["metadata"],
    ids=grid["ids"]
)

CPU times: user 7min 39s, sys: 658 ms, total: 7min 39s
Wall time: 57.6 s


# Search strategies

### 6 results from product - category combination

In [8]:
# Method 1: query with "<name> <category>" and keep the top 6 hits.
# `correct` counts test items whose labelled code shows up among the hits;
# `n_correct_method1` records, per item, how many hits carry that code.
correct = 0
n_correct_method1 = []
for i, item in enumerate(test_dict):
    if not i % 500:
        print(f"Processing item {i+1} out of {len(test_dict)}")

    query_text = "{} {}".format(item["name"], item["category"])
    search_result = client.query(collection_name="coicop2018", query_text=query_text, limit=6)

    # title -> code; hits that share a title collapse into a single entry.
    search_dict = {}
    for hit in search_result:
        search_dict[hit.metadata.get("document")] = hit.metadata.get("code")

    retrieved_codes = list(search_dict.values())
    n_hits = retrieved_codes.count(item["code"])
    if n_hits > 0:
        correct += 1
    n_correct_method1.append(n_hits)

accuracy_pct = (correct / len(test_dict)) * 100
print("Search accuracy: {:.2f}%".format(accuracy_pct))

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993
Processing item 2001 out of 4993
Processing item 2501 out of 4993
Processing item 3001 out of 4993
Processing item 3501 out of 4993
Processing item 4001 out of 4993
Processing item 4501 out of 4993
Search accuracy: 64.97%


In [19]:
# Tally of test items by how many of their retrieved hits carry the labelled code (method 1).
count_value_occurrences(n_correct_method1)

{0: 1749, 1: 680, 2: 582, 3: 679, 4: 388, 5: 334, 6: 581}

### 12 results from product - category combination

In [9]:
# Method 2: query with "<name> <category>" and keep the top 12 hits.
# `correct` counts test items whose labelled code shows up among the hits;
# `n_correct_method2` records, per item, how many hits carry that code.
correct = 0
n_correct_method2 = []
for i, item in enumerate(test_dict):
    if not i % 500:
        print(f"Processing item {i+1} out of {len(test_dict)}")

    query_text = "{} {}".format(item["name"], item["category"])
    search_result = client.query(collection_name="coicop2018", query_text=query_text, limit=12)

    # title -> code; hits that share a title collapse into a single entry.
    search_dict = {}
    for hit in search_result:
        search_dict[hit.metadata.get("document")] = hit.metadata.get("code")

    retrieved_codes = list(search_dict.values())
    n_hits = retrieved_codes.count(item["code"])
    if n_hits > 0:
        correct += 1
    n_correct_method2.append(n_hits)

accuracy_pct = (correct / len(test_dict)) * 100
print("Search accuracy: {:.2f}%".format(accuracy_pct))

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993
Processing item 2001 out of 4993
Processing item 2501 out of 4993
Processing item 3001 out of 4993
Processing item 3501 out of 4993
Processing item 4001 out of 4993
Processing item 4501 out of 4993
Search accuracy: 74.34%


In [20]:
# Tally of test items by how many of their retrieved hits carry the labelled code (method 2).
count_value_occurrences(n_correct_method2)

{0: 1281,
 1: 673,
 2: 482,
 3: 617,
 4: 409,
 5: 329,
 6: 208,
 7: 152,
 8: 143,
 9: 257,
 10: 263,
 11: 124,
 12: 55}

### 6 results from category - product combination

In [10]:
# Method 3: query with "<category> <name>" and keep the top 6 hits.
# `correct` counts test items whose labelled code shows up among the hits;
# `n_correct_method3` records, per item, how many hits carry that code.
correct = 0
n_correct_method3 = []

for i, item in enumerate(test_dict):
    if not i % 500:
        print(f"Processing item {i+1} out of {len(test_dict)}")

    query_text = "{} {}".format(item["category"], item["name"])
    search_result = client.query(collection_name="coicop2018", query_text=query_text, limit=6)

    # title -> code; hits that share a title collapse into a single entry.
    search_dict = {}
    for hit in search_result:
        search_dict[hit.metadata.get("document")] = hit.metadata.get("code")

    retrieved_codes = list(search_dict.values())
    n_hits = retrieved_codes.count(item["code"])
    if n_hits > 0:
        correct += 1
    n_correct_method3.append(n_hits)

accuracy_pct = (correct / len(test_dict)) * 100
print("Search accuracy: {:.2f}%".format(accuracy_pct))

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993
Processing item 2001 out of 4993
Processing item 2501 out of 4993
Processing item 3001 out of 4993
Processing item 3501 out of 4993
Processing item 4001 out of 4993
Processing item 4501 out of 4993
Search accuracy: 63.91%


In [21]:
# Tally of test items by how many of their retrieved hits carry the labelled code (method 3).
count_value_occurrences(n_correct_method3)

{0: 1802, 1: 651, 2: 571, 3: 664, 4: 355, 5: 346, 6: 604}

### 12 results from  category - product combination

In [11]:
# Method 4: query with "<category> <name>" and keep the top 12 hits.
# `correct` counts test items whose labelled code shows up among the hits;
# `n_correct_method4` records, per item, how many hits carry that code.
correct = 0
n_correct_method4 = []

for i, item in enumerate(test_dict):
    if not i % 500:
        print(f"Processing item {i+1} out of {len(test_dict)}")

    query_text = "{} {}".format(item["category"], item["name"])
    search_result = client.query(collection_name="coicop2018", query_text=query_text, limit=12)

    # title -> code; hits that share a title collapse into a single entry.
    search_dict = {}
    for hit in search_result:
        search_dict[hit.metadata.get("document")] = hit.metadata.get("code")

    retrieved_codes = list(search_dict.values())
    n_hits = retrieved_codes.count(item["code"])
    if n_hits > 0:
        correct += 1
    n_correct_method4.append(n_hits)

accuracy_pct = (correct / len(test_dict)) * 100
print("Search accuracy: {:.2f}%".format(accuracy_pct))

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993
Processing item 2001 out of 4993
Processing item 2501 out of 4993
Processing item 3001 out of 4993
Processing item 3501 out of 4993
Processing item 4001 out of 4993
Processing item 4501 out of 4993
Search accuracy: 74.38%


In [22]:
# Tally of test items by how many of their retrieved hits carry the labelled code (method 4).
count_value_occurrences(n_correct_method4)

{0: 1279,
 1: 681,
 2: 493,
 3: 607,
 4: 394,
 5: 305,
 6: 237,
 7: 164,
 8: 161,
 9: 240,
 10: 230,
 11: 149,
 12: 53}

### 6 results from category  and 6 from product

In [12]:
# Method 5: run two separate queries per item (name only, then category only),
# 6 hits each, and merge both result sets into one title -> code mapping.
correct = 0
n_correct_method5 = []

for i, item in enumerate(test_dict):
    if not i % 500:
        print(f"Processing item {i+1} out of {len(test_dict)}")

    search_result = client.query(
        collection_name="coicop2018", query_text="{}".format(item["name"]), limit=6
    )
    search_result2 = client.query(
        collection_name="coicop2018", query_text="{}".format(item["category"]), limit=6
    )

    # Name hits are inserted first; a category hit with an already-seen title
    # overwrites that title's code instead of adding a new entry.
    search_dict = {}
    for hit in [*search_result, *search_result2]:
        search_dict[hit.metadata.get("document")] = hit.metadata.get("code")

    retrieved_codes = list(search_dict.values())
    n_hits = retrieved_codes.count(item["code"])
    if n_hits > 0:
        correct += 1
    n_correct_method5.append(n_hits)

accuracy_pct = (correct / len(test_dict)) * 100
print("Search accuracy: {:.2f}%".format(accuracy_pct))

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993
Processing item 2001 out of 4993
Processing item 2501 out of 4993
Processing item 3001 out of 4993
Processing item 3501 out of 4993
Processing item 4001 out of 4993
Processing item 4501 out of 4993
Search accuracy: 64.41%


In [23]:
# Tally of test items by how many of their merged hits carry the labelled code (method 5).
count_value_occurrences(n_correct_method5)

{0: 1777,
 1: 535,
 2: 501,
 3: 584,
 4: 381,
 5: 279,
 6: 589,
 7: 171,
 8: 142,
 9: 28,
 10: 6}

LLM template, including 12 search examples

In [24]:
# Prompt filled in with `str.format(options=..., name=..., category=...)`.
# The template indexes `options[0]` through `options[11]`, so the `options`
# sequence passed in must hold at least 12 elements; a shorter one makes
# `format` raise IndexError. "none of the above" is hard-coded as choice 13.
prompt_template = """You are an expert data curator. You are given a product name and the commercial category where it belongs.
Your task is to classify the product into the most likely type from the following options:
1. {options[0]}
2. {options[1]}
3. {options[2]}
4. {options[3]}
5. {options[4]}
6. {options[5]}
7. {options[6]}
8. {options[7]}
9. {options[8]}
10. {options[9]}
11. {options[10]}
12. {options[11]}
13. none of the above

Your output should only be the exact text of one of the options above, and nothing else.

The product name is: {name}
The commercial category is: {category}
The correct type is:"""

## SmolLM2

In [25]:
# Load SmolLM2 (135M, instruct) locally on the GPU through outlines.
# NOTE(review): `temperature=0` combined with `do_sample=True` looks
# contradictory, and these values are passed as model-loading kwargs rather
# than to the generator — confirm they actually influence sampling here.
model = models.transformers(
    "HuggingFaceTB/SmolLM2-135M-Instruct",
    device="cuda",
    model_kwargs={"temperature": 0, "do_sample": True}
    )

In [26]:
%%time
processed_items = []
for i, item in enumerate(test_dict):
    if i % 500 == 0:
        print(f"Processing item {i+1} out of {len(test_dict)}")
    search_result = client.query(
        collection_name="coicop2018",
        query_text="{} {}".format(item["category"], item["name"]),
        limit=12
    )

    search_dict = {
        item.metadata.get("document"): item.metadata.get("code")
        for item in search_result
    }
    correct_option = item["code"] in search_dict.values()
    correct_n = list(search_dict.values()).count(item["code"])
    options = list(search_dict.keys()) + ["none of the above"]
    generator = generate.choice(model, options)
    res = generator(prompt_template.format(options=options, name=item["name"], category=item["category"]))
    processed_items.append({**item, "prediction": search_dict.get(res), "correct_option": correct_option, "correct_n": correct_n})
    

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993
Processing item 2001 out of 4993


IndexError: list index out of range

In [27]:
# One row per processed test item: the item's own fields plus "prediction",
# "correct_option" and "correct_n". If the loop above stopped early, only the
# items processed before the stop are present.
results_df = pd.DataFrame(processed_items)

In [28]:
# Flag rows where the code of the LLM-selected title equals the labelled code.
results_df["match"] = results_df["prediction"].eq(results_df["code"])

In [29]:
# Both rates are computed over the rows present in results_df, i.e. only the
# items the classification loop actually processed.
n_evaluated = len(results_df)
search_accuracy = (results_df["correct_option"].sum() / n_evaluated) * 100
llm_accuracy = (results_df["match"].sum() / n_evaluated) * 100
print("Search accuracy: {:.2f}%".format(search_accuracy))
print("LLM accuracy: {:.2f}%".format(llm_accuracy))

Search accuracy: 83.02%
LLM accuracy: 29.78%


In [30]:
# Persist the SmolLM2 results under a timestamped file name so reruns never
# overwrite earlier exports.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_path = "results/rag_results_smollm2_{}.csv".format(run_stamp)
results_df.to_csv(output_path, index=False)

## Qwen 2.5 

In [31]:
# OpenAI-compatible client pointed at a server on localhost:11434
# (presumably a local Ollama instance — confirm). The api_key value is a
# placeholder, not a real credential.
llmclient = AsyncOpenAI(
    api_key="none",
    base_url='http://localhost:11434/v1/',
)
# temperature=0 requests deterministic output from the served model.
config = OpenAIConfig(model="qwen2.5:1.5b-instruct", temperature=0)
# Rebinds `model`: from here on the SmolLM2 model is no longer referenced by this name.
model = models.openai(llmclient, config)

In [32]:
%%time
processed_items = []
for i, item in enumerate(test_dict):
    if i % 500 == 0:
        print(f"Processing item {i+1} out of {len(test_dict)}")
    search_result = client.query(
        collection_name="coicop2018",
        query_text="{} {}".format(item["category"], item["name"]),
        limit=12
    )

    search_dict = {
        item.metadata.get("document"): item.metadata.get("code")
        for item in search_result
    }
    correct_option = item["code"] in search_dict.values()
    correct_n = list(search_dict.values()).count(item["code"])
    options = list(search_dict.keys()) + ["none of the above"]
    generator = generate.choice(model, options)
    res = generator(prompt_template.format(options=options, name=item["name"], category=item["category"]))
    processed_items.append({**item, "prediction": search_dict.get(res), "correct_option": correct_option, "correct_n": correct_n})
    

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993
Processing item 2001 out of 4993


IndexError: list index out of range

In [33]:
# One row per processed test item: the item's own fields plus "prediction",
# "correct_option" and "correct_n". If the loop above stopped early, only the
# items processed before the stop are present.
results_df = pd.DataFrame(processed_items)

In [34]:
# Flag rows where the code of the LLM-selected title equals the labelled code.
results_df["match"] = results_df["prediction"].eq(results_df["code"])

In [35]:
# Both rates are computed over the rows present in results_df, i.e. only the
# items the classification loop actually processed.
n_evaluated = len(results_df)
search_accuracy = (results_df["correct_option"].sum() / n_evaluated) * 100
llm_accuracy = (results_df["match"].sum() / n_evaluated) * 100
print("Search accuracy: {:.2f}%".format(search_accuracy))
print("LLM accuracy: {:.2f}%".format(llm_accuracy))

Search accuracy: 83.02%
LLM accuracy: 53.83%


In [36]:
# Persist the Qwen 2.5 results under a timestamped file name so reruns never
# overwrite earlier exports.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_path = "results/rag_results_qwen25_{}.csv".format(run_stamp)
results_df.to_csv(output_path, index=False)

## Llama 3.2

In [37]:
# Switch to Llama 3.2 (1B). This reuses `llmclient` created in the Qwen 2.5
# section, so that cell must have been run first.
config = OpenAIConfig(model="llama3.2:1b", temperature=0)
model = models.openai(llmclient, config)

In [None]:
%%time
processed_items = []
for i, item in enumerate(test_dict):
    if i % 500 == 0:
        print(f"Processing item {i+1} out of {len(test_dict)}")
    search_result = client.query(
        collection_name="coicop2018",
        query_text="{} {}".format(item["category"], item["name"]),
        limit=12
    )

    search_dict = {
        item.metadata.get("document"): item.metadata.get("code")
        for item in search_result
    }
    correct_option = item["code"] in search_dict.values()
    correct_n = list(search_dict.values()).count(item["code"])
    options = list(search_dict.keys()) + ["none of the above"]
    generator = generate.choice(model, options)
    res = generator(prompt_template.format(options=options, name=item["name"], category=item["category"]))
    processed_items.append({**item, "prediction": search_dict.get(res), "correct_option": correct_option, "correct_n": correct_n})
    

Processing item 1 out of 4993
Processing item 501 out of 4993
Processing item 1001 out of 4993
Processing item 1501 out of 4993


In [None]:
# One row per processed test item: the item's own fields plus "prediction",
# "correct_option" and "correct_n". If the loop above stopped early, only the
# items processed before the stop are present.
results_df = pd.DataFrame(processed_items)

In [None]:
# Flag rows where the code of the LLM-selected title equals the labelled code.
results_df["match"] = results_df["prediction"].eq(results_df["code"])

In [None]:
# Both rates are computed over the rows present in results_df, i.e. only the
# items the classification loop actually processed.
n_evaluated = len(results_df)
search_accuracy = (results_df["correct_option"].sum() / n_evaluated) * 100
llm_accuracy = (results_df["match"].sum() / n_evaluated) * 100
print("Search accuracy: {:.2f}%".format(search_accuracy))
print("LLM accuracy: {:.2f}%".format(llm_accuracy))

In [None]:
# Persist the Llama 3.2 results under a timestamped file name so reruns never
# overwrite earlier exports.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
output_path = "results/rag_results_llama32_{}.csv".format(run_stamp)
results_df.to_csv(output_path, index=False)