In [1]:
import pandas as pd, re, pint
from tqdm.auto import tqdm

In [2]:
# pint unit registry instance; none of the parsing code shown in this part of the
# notebook references it.
ureg = pint.UnitRegistry()

# --- synonym maps ---
# Every spelling accepted in a (lower-cased) product title, mapped to its
# canonical unit name.  The canonical singular names are listed as keys too:
# because the regexes require a word boundary after the unit, a title such as
# "250 gram" or "1 liter" would otherwise match no alternative at all.
UNIT_SYNONYMS = {
    "ml": "milliliter", "milliliter": "milliliter", "milliliters": "milliliter",
    "l":  "liter",      "liter": "liter",           "liters":      "liter",
    "cl": "centiliter", "centiliter": "centiliter",
    "dl": "deciliter",  "deciliter": "deciliter",
    "g":  "gram",       "gram": "gram", "grams": "gram", "gr": "gram",
    "kg": "kilogram",   "kilogram": "kilogram", "kilograms": "kilogram",
    "mg": "milligram",  "milligram": "milligram", "milligrams": "milligram",
    "oz": "ounce",      "ounce": "ounce", "ounces": "ounce",
    "lb": "pound", "lbs": "pound", "pound": "pound", "pounds": "pound",
    "pc": "piece", "pcs": "piece", "piece": "piece", "pieces": "piece",
    "tab": "tablet", "tabs": "tablet", "tablet": "tablet", "tablets": "tablet",
    "pack": "pack", "packs": "pack"
}

# Longest spellings first so that e.g. "grams" wins over "gr" and "g" in the
# alternation; keys are escaped so a future key with a regex metacharacter
# cannot corrupt the pattern.
UNIT_PATTERN = "|".join(map(re.escape, sorted(UNIT_SYNONYMS, key=len, reverse=True)))

# A quantity: integer or decimal, with either "." or "," as decimal separator.
_QTY_PATTERN = r"\d+(?:[\.,]\d+)?"
# A multiplier sign with optional surrounding whitespace.  An explicit sign is
# required: accepting bare whitespace made any two neighbouring numbers look
# like "count x size" (e.g. "No. 7 500 g" was read as 7 * 500 g).
_TIMES_PATTERN = r"\s*[×x*]\s*"

# "<qty> <unit>", e.g. "360 grams", "250-g", "1,5l".
UNIT_RX = re.compile(rf"(?P<qty>{_QTY_PATTERN})[\s\-]*?(?P<unit>{UNIT_PATTERN})\b", re.I)
# "<count> x <qty> <unit>" (groups count1/qty1/unit1) or
# "<qty> <unit> x <count>" (groups qty2/unit2/count2).
MULTIPACK_RX = re.compile(
    rf"(?:(?P<count1>\d+){_TIMES_PATTERN}(?P<qty1>{_QTY_PATTERN})[\s\-]*?(?P<unit1>{UNIT_PATTERN})\b)|"
    rf"(?:(?P<qty2>{_QTY_PATTERN})[\s\-]*?(?P<unit2>{UNIT_PATTERN}){_TIMES_PATTERN}(?P<count2>\d+))", re.I
)

def parse_size_unit(title: str):
    """Extract the package size from a product title.

    The title is lower-cased and matched against ``MULTIPACK_RX`` first; on a
    hit the per-item quantity is multiplied by the pack count.  Otherwise the
    first ``UNIT_RX`` match is used as-is.  A decimal comma in the quantity is
    treated as a decimal point.

    Parameters
    ----------
    title : str
        Product title.  Non-string values (``None``, or the float NaN that
        pandas uses for an empty CSV cell) are accepted and treated as
        "nothing to parse".

    Returns
    -------
    tuple
        ``(qty, canonical_unit)`` with ``qty`` a float and ``canonical_unit``
        a value of ``UNIT_SYNONYMS``, or ``(None, None)`` when no size could
        be extracted.
    """
    # Guard before .lower(): a single missing title would otherwise raise
    # AttributeError and abort the whole parsing loop.
    if not isinstance(title, str):
        return (None, None)

    title = title.lower()

    # multipack: "<count> x <qty> <unit>" or "<qty> <unit> x <count>"
    mpack = MULTIPACK_RX.search(title)
    if mpack:
        # Only one of the two alternatives populates its groups.
        if mpack.group("count1"):
            count, qty, unit = int(mpack["count1"]), mpack["qty1"], mpack["unit1"]
        else:
            count, qty, unit = int(mpack["count2"]), mpack["qty2"], mpack["unit2"]
        qty = float(qty.replace(",", "."))
        unit = UNIT_SYNONYMS.get(unit.lower())
        # Report the total content of the pack, not the size of one item.
        return (qty * count, unit) if unit else (None, None)

    # single: "<qty> <unit>"
    m = UNIT_RX.search(title)
    if m:
        qty  = float(m["qty"].replace(",", "."))
        unit = UNIT_SYNONYMS.get(m["unit"].lower())
        return (qty, unit) if unit else (None, None)

    return (None, None)


In [3]:
# Load the product table; later cells read its "translated_itemname" and
# "itemquantity" columns.
df = pd.read_csv("../data/productsDB.csv")

In [4]:
# Parse every title once, collecting one (quantity, unit) pair per row.
# Rows the parser cannot handle keep the table's own "itemquantity" value
# and are tagged with the placeholder unit "raw".
parsed_rows = []
title_and_fallback = zip(df["translated_itemname"], df["itemquantity"])
for title, fallback in tqdm(title_and_fallback, total=len(df)):
    qty, unit = parse_size_unit(title)
    if qty is None:
        qty, unit = fallback, "raw"
    parsed_rows.append((qty, unit))

# Split the pairs back into two parallel lists, one per new column.
qtys = [row[0] for row in parsed_rows]
units = [row[1] for row in parsed_rows]

df["parsed_qty"] = qtys      # parsed float, or the untouched fallback value
df["parsed_unit"] = units    # canonical unit string, or "raw"

  0%|          | 0/141180 [00:00<?, ?it/s]

In [6]:
# Preview the first 30 rows: each title next to its parsed quantity and unit.
df[["translated_itemname", "parsed_qty", "parsed_unit"]].head(30)

Unnamed: 0,translated_itemname,parsed_qty,parsed_unit
0,Tortilla chips 360 grams,360.0,gram
1,French butter flash-shard by weight,84.0,raw
2,Sucralose 200 tablets,200.0,tablet
3,Sucralose Jar 1200 Tablets,1200.0,tablet
4,Kerem almond flour - 250 g,250.0,gram
5,Liquid Sugarlite 220 grams,220.0,gram
6,Sucralose Patent 700 Tablets,700.0,tablet
7,Sucralose 120 sachets,86.0,raw
8,Popstar reduced fat natural popcorn 5 pcs,5.0,piece
9,Spaghetti Barilla No. 7 Barilla,99.0,raw


In [7]:
# Write the table (now including parsed_qty / parsed_unit) to a separate file,
# leaving the input CSV untouched; index=False omits the row index column.
df.to_csv("../data/unit_productsDB.csv", index=False)