# run on laptop

In [170]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import bm25s

In [171]:
df = pd.read_csv('harmonized-system.csv')

In [172]:
df

Unnamed: 0,section,hscode,description,parent,level
0,I,01,Animals; live,TOTAL,2
1,I,0101,"Horses, asses, mules and hinnies; live",01,4
2,I,010121,"Horses; live, pure-bred breeding animals",0101,6
3,I,010129,"Horses; live, other than pure-bred breeding an...",0101,6
4,I,010130,Asses; live,0101,6
...,...,...,...,...,...
6935,XXI,970690,Antiques; of an age exceeding 100 years but no...,9706,6
6936,TOTAL,99,Commodities not specified according to kind,TOTAL,2
6937,TOTAL,9999,Commodities not specified according to kind,99,4
6938,TOTAL,999999,Commodities not specified according to kind,9999,6


In [173]:
df["description"] = df["description"].astype(str).str.strip()
df = df[df["description"] != ""].reset_index(drop=True)

In [174]:
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

In [175]:
hs_data = df[["hscode", "description"]].values.tolist()

In [176]:
corpus = df["description"].tolist()  # for use with bm25s
corpus_tokens = bm25s.tokenize(corpus, stopwords="en")
retriever = bm25s.BM25()
retriever.index(corpus_tokens)

                                                              

In [177]:
descriptions = [desc for _, desc in hs_data]
embeddings = model.encode(descriptions, normalize_embeddings=True)
embeddings = np.array(embeddings).astype('float32')  # FAISS expects float32

In [178]:
embeddings.shape  # can check there are over 6k hs codes

(6940, 768)

In [179]:
faiss.normalize_L2(embeddings)

In [180]:
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)  # cosine sim
index.add(embeddings)
print(f"Stored {index.ntotal} HS embeddings in FAISS index.")

Stored 6940 HS embeddings in FAISS index.


In [181]:
query = "klicheandel plast betalt af stok"
query_emb = model.encode([query]).astype('float32')
faiss.normalize_L2(query_emb)

query_tokens = bm25s.tokenize(query)  # try bm25s and see if it works any better
results, scores = retriever.retrieve(query_tokens, k=2)

D, I = index.search(query_emb, k=20)   # get top 20 for safety
faiss_scores = D[0]
faiss_ids = I[0]

# BM25S output fix
bm25_ids, bm25_scores = retriever.retrieve(query_tokens, k=20)
bm25_ids = bm25_ids[0].tolist()
bm25_scores = bm25_scores[0].tolist()

# Build hybrid scores
hybrid_scores = {}

# Add FAISS scores
for doc_id, s in zip(faiss_ids, faiss_scores):
    hybrid_scores[int(doc_id)] = hybrid_scores.get(int(doc_id), 0) + 0.9 * float(s)

# Add BM25 scores
for doc_id, s in zip(bm25_ids, bm25_scores):
    hybrid_scores[int(doc_id)] = hybrid_scores.get(int(doc_id), 0) + 0.1 * float(s)

# Sort by hybrid score
ranked = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
top_k = ranked[:3]

# Print results
for rank, (doc_id, score) in enumerate(top_k, 1):
    hs_code, desc = hs_data[doc_id]
    print(f"{rank}. HS {hs_code} – {desc} (hybrid={score:.3f})")


                                                     

1. HS 39 – Plastics and articles thereof (hybrid=0.677)
2. HS 3925 – Plastics; builders' wares n.e.c. or included (hybrid=0.674)
3. HS 3923 – Plastic articles for the conveyance or packing of goods; stoppers, lids, caps and other closures of plastics (hybrid=0.659)




In [182]:
# save

import os
import joblib
import faiss
import numpy as np

os.makedirs("hs_model", exist_ok=True)

joblib.dump(hs_data, "hs_model/hs_data.pkl")
faiss.write_index(index, "hs_model/faiss_index.index")
model.save("hs_model/embedding_model")

In [183]:
# load

hs_data = joblib.load("hs_model/hs_data.pkl")

index = faiss.read_index("hs_model/faiss_index.index")
embeddings = joblib.load("hs_model/embeddings.pkl")
model = SentenceTransformer("hs_model/embedding_model")

query = "pap20 - corrugated fibreboard"
query_emb = model.encode([query], normalize_embeddings=True).astype("float32")
faiss.normalize_L2(query_emb)

D, I = index.search(query_emb, k=3)

for rank, idx in enumerate(I[0]):
    hs_code, desc = hs_data[idx]
    print(f"{rank+1}. HS {hs_code} - {desc} (score={D[0][rank]:.3f})")

FileNotFoundError: [Errno 2] No such file or directory: 'hs_model/embeddings.pkl'

In [None]:
CHAPTER_RULES = {
    "live animal": "01", "cattle": "01", "horses": "01", "sheep": "01",

    "meat": "02", "carcass": "02",

    "fish": "03", "seafood": "03", "shellfish": "03",

    "milk": "04", "dairy": "04", "cheese": "04", "yogurt": "04",

    "honey": "04",  # sometimes 04 or 21, but 04 is safe
    "egg": "04",

    "animal product": "05", "ivory": "05", "whale bone": "05",
    "mother of pearl": "05", "animal gut": "05",

	    "plant": "06", "live plant": "06", "flower": "06",

    "vegetable": "07", "onion": "07", "carrot": "07",

    "fruit": "08", "berry": "08", "banana": "08",

    "coffee": "09", "tea": "09", "spice": "09",

    "grain": "10", "wheat": "10", "corn": "10", "rice": "10",

    "milling": "11", "flour": "11",

    "soybean": "12", "seeds": "12", "vegetable oilseeds": "12",

    "animal feed": "23",   # note: feed can appear in both 12 and 23

    "oil": "15",           # general vegetable fat/oil

    "rubber latex": "40",  # rubber seeds relate to 40 not 12

    "wood": "44", "timber": "44",
    "butter": "15", "fatty acid": "15", "olive oil": "15",

    "meat prep": "16", "sausage": "16", "canned meat": "16",

    "sugar": "17", "molasses": "17",

    "cocoa": "18", "chocolate": "18",

    "bread": "19", "cake": "19", "pasta": "19",

    "vegetable prep": "20", "preserve": "20", "pickled": "20",

    "water": "22", "beer": "22", "wine": "22", "spirits": "22",

    "tobacco": "24",
    "salt": "25", "cement": "25", "minerals": "25",

    "ore": "26", "metal ore": "26",

    "coal": "27", "oil fuel": "27", "diesel": "27", "gasoline": "27",
    "chemical": "28", "element": "28", "acid": "28",
    "alkali": "28", "compound": "28",

    "organic chemical": "29", "ketone": "29", "ester": "29",
    "ethylene glycol": "29", "polyethylene glycol": "29",
    "propylene glycol": "29",

    "medicine": "30", "pharmaceutical": "30", "drug": "30", "vaccine": "30",

    "fertilizer": "31",

    "pigment": "32", "ink": "32", "dye": "32", "paint": "32",

    "tanning": "32", "leather chemical": "32",

    "essential oil": "33", "perfume": "33", "cosmetic": "33",

    "soap": "34", "wax": "34", "cleaning product": "34",

    "glue": "35", "enzyme": "35",

    "explosive": "36", "fireworks": "36",

    "photographic": "37",

    "plastic": "39", "polyethylene": "39", "polypropylene": "39",
    "polyester": "55",  # for fibers
    "polystyrene": "39", "PVC": "39", "UPVC": "39",
"polyethylene": "39",   # must map to chapter 39
"polypropylene": "39",
"plastic": "39",
    "plastic": "39", "resin": "39", "polymer": "39",

    "rubber": "40", "latex": "40",

    "leather": "41", "hide": "41", "animal skin": "41",

    "fur": "43",
    "wood": "44", "plywood": "44", "lumber": "44",

    "cork": "45",

    "straw": "46", "basket": "46",

    "paper": "48", "cardboard": "48", "newsprint": "48",

    "book": "49", "printing": "49", "newspaper": "49",
    "silk": "50",

    "wool": "51",

    "cotton": "52",

    "linen": "53", "jute": "53",

    "synthetic fiber": "54",
    "polyester fiber": "54",
    "nylon": "54",

    "yarn": "55",

    "fabric": "59", "textile": "59",

    "carpet": "57", "rug": "57",

    "clothing": "61", "garment": "62",

    "blanket": "63", "home textile": "63",
    "shoes": "64", "footwear": "64",

    "hat": "65", "headgear": "65",

    "umbrella": "66",

    "feather": "67",
    "stone": "68", "granite": "68", "concrete": "68",

    "ceramic": "69", "tile": "69",

    "glass": "70", "bottle": "70",

    "jewelry": "71", "gold": "71", "silver": "71", "gemstone": "71",
    "steel": "72", "iron": "72",

    "stainless": "72", "ferrous": "72",

    "copper": "74",

    "nickel": "75",

    "aluminum": "76",

    "lead": "78",
    "zinc": "79",
    "tin": "80",

    "tools": "82", "hand tool": "82",

    "hardware": "83", "lock": "83",
    "machine": "84", "motor": "84", "pump": "84",
    "compressor": "84", "industrial machine": "84",
    
    "computer": "84", "server": "84", "cpu": "84",

    "electronics": "85", "battery": "85", "cable": "85",
    "transformer": "85", "semiconductor": "85",
    "train": "86",

    "car": "87", "vehicle": "87", "motorcycle": "87",

    "tractor": "87",

    "aircraft": "88", "drone": "88",

    "boat": "89", "ship": "89", "yacht": "89",
    "instrument": "90", "medical device": "90",
    "camera": "90", "optical": "90",

    "clock": "91", "watch": "91",

    "music": "92", "instrument": "92", "guitar": "92",
    "gun": "93", "firearm": "93", "rifle": "93",
    "ammunition": "93", "cartridge": "93", "clip": "93", "magazine": "93",
    "furniture": "94",

    "matress": "94", "lamp": "94",

    "toy": "95", "game": "95", "sports equipment": "95",

    "brush": "96", "broom": "96",

    "pen": "96", "stationery": "96",
    "art": "97", "painting": "97", "sculpture": "97",

    "antiques": "97",

    "coins": "97",

    "special import": "99", "special category": "99",
}
