In [11]:
import polars as pl

df = pl.read_ndjson("../data/output.ndjson")
df = df.explode("items").unnest(
    "items"
)  # .filter(pl.col("category").str.to_lowercase().str.contains("coffee"))

In [None]:
import difflib
from typing import List

import polars_ds as pds


def longest_common_subsequence(str1, str2):
    sequence_matcher = difflib.SequenceMatcher(None, str1, str2)
    match = sequence_matcher.find_longest_match(0, len(str1), 0, len(str2))
    return {"lcs": str1[match.a : match.a + match.size], "start_index": match.a}


# Define the function to compute the similarity ratio
def similarity_ratio(lcs, str1, str2):
    return len(lcs) / max(min(len(str1), len(str2)), 1)


def get_hash_map(
    df, terms: List[str], col_name="description", col_name_to_match="for_maarten"
):
    assert col_name in df.columns, f"The DataFrame should have a {col_name} column"

    def normalize_col(col_name: str) -> pl.Expr:
        return (
            pds.normalize_whitespace(
                pds.remove_diacritics(
                    pl.col(col_name)
                    .str.to_lowercase()
                    .str.strip_chars()
                    .str.replace_all(r"\s+", " ")
                    .str.split(" ")
                    .list.set_difference(["boni", "bio", "everyday"])
                    .list.set_difference(pds.extract_numbers(col_name))
                    .list.join(" ")
                )
            )
        ).alias(f"{col_name}_normalized")

    cross_joined = df.with_columns(pl.lit(terms).alias(col_name_to_match)).explode(
        col_name_to_match
    )

    return (
        cross_joined.with_columns(
            normalize_col(col_name), normalize_col(col_name_to_match)
        )
        .with_columns(
            [
                pl.struct(f"{col_name}_normalized", f"{col_name_to_match}_normalized")
                .map_elements(
                    lambda x: longest_common_subsequence(
                        x[f"{col_name}_normalized"],
                        x[f"{col_name_to_match}_normalized"],
                    ),
                    return_dtype=pl.Struct({"lcs": pl.Utf8, "start_index": pl.Int64}),
                )
                .alias("lcs_struct")
            ]
        )
        .unnest("lcs_struct")
        .with_columns(pl.col("lcs").str.len_chars().alias("lcs_len"))
        .with_columns(
            pl.struct("lcs", col_name, col_name_to_match)
            .map_elements(
                lambda row: similarity_ratio(
                    row["lcs"], row[col_name], row[col_name_to_match]
                ),
                return_dtype=pl.Float64,
            )
            .alias("similarity_ratio"),
            pl.col("lcs").str.len_chars().alias("lcs_length"),
        )
        .with_columns(
            pl.col("similarity_ratio")
            .max()
            .over(col_name)
            .alias("max_similarity_ratio")
        )
        .filter(pl.col("similarity_ratio") == pl.col("max_similarity_ratio"))
        .filter(pl.col("similarity_ratio") >= 0.9)
        .unique("description")
    )

In [10]:
import json

json.dumps(
    get_hash_map(df, ["soya", "espresso", "koffie", "graindor", "bananen", "actimel"])
    .sort("max_similarity_ratio", descending=True)
    .select("description", (pl.col("unit_price") * pl.col("amount")).alias("price"))
    .to_dicts()
)

'[{"description": "GRAINDOR Bonen Espresso RFA 2.5kg", "price": 7.19}, {"description": "BONI BIO bananen Fairtrade +1kg", "price": 2.98}]'

In [None]:
import polars_ds as pds

df.with_columns(
    pl.lit(["soya", "espresso", "koffie", "graindor", "bananen", "actimel"]).alias(
        "for_maarten"
    ),
    pl.col("description")
    .str.to_lowercase()
    .str.split(" ")
    .list.set_difference(["boni", "bio", "everyday"])
    .list.set_difference(pds.extract_numbers("description"))
    .list.join(" ")
    .alias("description_prepped"),
).explode("for_maarten").with_columns(
    pds.str_fuzz(
        "description_prepped",
        "for_maarten",
    ).alias("score")
).sort("score", descending=True).filter(pl.col("score") > 0.38).unique("description")

date,page,total_amount,unit_price,unit,amount,description,category,path,for_maarten,description_prepped,score
str,i64,f64,f64,str,i64,str,str,str,str,str,f64
"""2023-10-19""",1,66.28,1.99,"""piece""",1,"""BONI champignons groot 500g""","""Vegetables""","""data/Kasticket_19022025_17h09_…","""graindor""","""500g champignons groot""",0.4
"""2023-10-19""",1,66.28,2.98,"""piece""",1,"""BONI BIO bananen Fairtrade 1kg""","""Fruits""","""data/Kasticket_19022025_17h09_…","""bananen""","""1kg fairtrade bananen""",0.5
"""2023-10-19""",1,66.28,7.19,"""piece""",1,"""GRAINDOR Bonen Espresso RFA 2.…","""Meat""","""data/Kasticket_19022025_17h09_…","""espresso""","""graindor bonen espresso rfa 2.…",0.390244
"""2023-10-19""",1,66.28,1.19,"""piece""",1,"""mango ready to eat""","""Fruits""","""data/Kasticket_19022025_17h09_…","""graindor""","""mango ready to eat""",0.384615
"""2023-10-19""",1,66.28,1.59,"""piece""",1,"""BONI groene pesto 190g""","""cooking""","""data/Kasticket_19022025_17h09_…","""espresso""","""190g groene pesto""",0.4
