In [42]:
import duckdb

In [43]:
# Open a connection to the DuckDB database file "fake.duckdb"; later cells use it as `con`.
con = duckdb.connect("fake.duckdb")

In [None]:
# Read the first source CSV through the connection. The result is not assigned,
# so this cell only serves to display/inspect the file's contents.
con.read_csv("./generated_data_01.csv")

In [25]:
# Materialise the first CSV as table data_01.
# CREATE OR REPLACE keeps this cell re-runnable against the persistent database
# file (a plain CREATE TABLE fails once the table exists), and the file name is
# quoted so it is unambiguously treated as a path.
sql = """
CREATE OR REPLACE TABLE data_01 AS
(select SKU, description, price from 'generated_data_01.csv')
"""

con.sql(sql)

In [26]:
# Materialise the second CSV as table data_02.
# CREATE OR REPLACE keeps this cell re-runnable against the persistent database
# file (a plain CREATE TABLE fails once the table exists), and the file name is
# quoted so it is unambiguously treated as a path.
sql = """
CREATE OR REPLACE TABLE data_02 AS
(select SKU, description, price from 'generated_data_02.csv')
"""

con.sql(sql)

In [27]:
# Close the connection to fake.duckdb. The tables created above live in that
# DuckDB database file; nothing is exported to Parquet in this cell.
con.close()

In [28]:
# Reopen the same database file ("fake.duckdb") and rebind `con` to the new connection.
con = duckdb.connect("fake.duckdb")

In [29]:
# Result table for pairwise description comparisons: one row per (sku_a, sku_b)
# pair together with its similarity score.
# IF NOT EXISTS keeps the cell re-runnable against the persistent database file;
# a plain CREATE TABLE raises once the table has been created.
sql = "CREATE TABLE IF NOT EXISTS similarity_matrix (sku_a TEXT, description_a TEXT, sku_b TEXT, description_b TEXT, similar_ratio FLOAT);"
con.execute(sql)

<duckdb.duckdb.DuckDBPyConnection at 0x10d2b2ab0>

In [25]:
# First page of data_01: up to 500 (sku, description) rows, fetched as a list of tuples.
# NOTE(review): the query has no ORDER BY, so which rows land on this page is not
# pinned down by the SQL itself — confirm whether a deterministic order is needed.
sql = "select sku, description from data_01 limit 500;"
rs = con.execute(sql).fetchall()

In [26]:
# Second page of data_01: up to 500 (sku, description) rows after skipping the first 500.
# NOTE(review): the query has no ORDER BY, so the page boundary is not pinned down
# by the SQL itself — confirm whether a deterministic order is needed.
sql = "select sku, description from data_01 limit 500 offset 500;"
rs2 = con.execute(sql).fetchall()

In [None]:
# Peek at the first few rows only; displaying the whole page (up to 500 tuples)
# buries the rest of the notebook under output.
rs[:5]

In [None]:
# Peek at the first few rows only; displaying the whole page (up to 500 tuples)
# buries the rest of the notebook under output.
rs2[:5]

In [None]:
from thefuzz import fuzz

# Pairs are kept only when their fuzz.ratio score is strictly above this value.
SIMILARITY_THRESHOLD = 70

# Compare every description in the first page (rs) with every description in the
# second page (rs2); each row is a (sku, description) tuple.
sm = []
for sku_a, description_a in rs:
    for sku_b, description_b in rs2:
        sr = fuzz.ratio(description_a, description_b)
        if sr > SIMILARITY_THRESHOLD:
            print(f"{description_a}, {description_b}, {sr}")
            sm.append((sku_a, description_a, sku_b, description_b, sr))

sql = "insert into similarity_matrix (sku_a, description_a, sku_b, description_b, similar_ratio) values (?, ?, ?, ?, ?);"
# executemany rejects an empty list of parameter sets, so skip the insert
# entirely when no pair cleared the threshold.
if sm:
    con.executemany(sql, sm)

In [30]:
# Close the current database connection; a later cell must reconnect before querying.
con.close()

In [None]:
# Reconnect and show the first 100 stored pairs, ordered by sku_a with the best
# matches first (sku_b breaks ties).
# similarity_matrix is created in fake.duckdb earlier in this notebook, so that
# same database file is opened here instead of an unrelated one.
con = duckdb.connect("fake.duckdb")
con.execute("select * from similarity_matrix order by sku_a, similar_ratio desc, sku_b limit 100 ").fetchall()

In [None]:
# https://medium.com/@ilakk2023/advanced-sql-features-in-duckdb-window-functions-common-table-expressions-and-more-bbf9c4216986
# Keep the two highest-scoring matches per sku_a (row numbers 1 and 2).
# sku_b is added to the window ordering as a tie-breaker so that rows with equal
# scores are ranked the same way on every run.
sql = """
select * from (
SELECT sku_a, description_a, sku_b, description_b, similar_ratio,
        row_number() OVER (PARTITION BY sku_a ORDER BY similar_ratio desc, sku_b) as rn
FROM similarity_matrix
) where rn < 3;
"""
con.execute(sql).fetchall()

In [1]:
# COPY (SELECT * FROM tbl) TO 'output.csv' (HEADER, DELIMITER ',');
# Export, for every sku_a, its two best matches scoring above 80 from data_all to
# a CSV file with a header row. This cell needs an open connection `con` and an
# existing data_all table.
# sku_b is added to the window ordering as a tie-breaker so that rows with equal
# scores are ranked (and therefore exported) the same way on every run.
sql = """
COPY (
    select * from (
    SELECT sku_a, description_a, sku_b, description_b, similarity,
            row_number() OVER (PARTITION BY sku_a ORDER BY similarity desc, sku_b) as rn
    FROM data_all where similarity > 80
    ) where rn < 3
) TO 'similarity_matrix_output.csv' (HEADER, DELIMITER ',');
"""
con.execute(sql)

NameError: name 'con' is not defined

In [None]:
from duckdb.typing import *
from faker import Faker

# One shared generator, so a new Faker instance is not rebuilt on every call.
_fake = Faker()


def generate_random_name():
    """Return a random name produced by Faker's ``name()`` provider."""
    return _fake.name()

# Register the Python function as a zero-argument scalar UDF returning VARCHAR.
# side_effects=True tells DuckDB the result is not a pure function of its inputs,
# so the call is re-evaluated instead of being treated as a constant expression.
duckdb.create_function("random_name", generate_random_name, [], VARCHAR, side_effects=True)
res = duckdb.sql("SELECT random_name()").fetchall()
print(res)

In [16]:
# Run the random_name() query again on the default connection and print the fetched rows.
res = duckdb.sql("SELECT random_name()").fetchall()
print(res)

[('Lauren Kemp',)]


In [46]:
from thefuzz import fuzz
def fuzzy_similarity(str1: str, str2: str) -> int:
    """Score how alike two strings are by delegating to ``fuzz.ratio``."""
    score = fuzz.ratio(str1, str2)
    return score

In [47]:
# Expose fuzzy_similarity to SQL on this connection under the same name.
# No parameter or return types are passed here — presumably DuckDB derives them
# from the function's type annotations; confirm against the DuckDB docs.
con.create_function("fuzzy_similarity", fuzzy_similarity)


<duckdb.duckdb.DuckDBPyConnection at 0x10df683f0>

In [36]:
# Smoke-test the UDF from SQL with two different single-character strings.
res = con.sql("SELECT fuzzy_similarity('a', 'b')").fetchall()
res

[(0,)]

In [38]:
# Score every data_02 row against every data_01 row (the comma in FROM makes this
# a cross product) and pull all scored pairs into Python.
# NOTE(review): fetchall() materialises the entire cross product in memory —
# confirm the table sizes make that feasible, or filter/aggregate in SQL first.
sql = """
SELECT t2.sku as sku_a, t2.description as description_a, t0.sku as sku_b, t0.description as description_b,
        fuzzy_similarity(t0.description, t2.description) as similarity
FROM data_02 t2, data_01 t0
"""
res = con.execute(sql).fetchall()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [41]:
# Close the current database connection; cells that use `con` afterwards need a fresh connection.
con.close()

In [44]:
# Build the full pair table: every row of the first CSV paired with every row of
# the second, with similarity initialised to 0 as a placeholder.
# CREATE OR REPLACE keeps the cell re-runnable against the persistent database
# file; the file names are quoted so they are unambiguously paths, and the pairing
# is written as an explicit CROSS JOIN instead of a comma list that ended in a
# dangling trailing comma.
sql = """
CREATE OR REPLACE TABLE data_all as
select t0.sku as sku_a, t0.description as description_a,
       t1.sku as sku_b, t1.description as description_b,
       0 as similarity
from
(select SKU, description, price from 'generated_data_01.csv') t0
cross join
(select SKU, description, price from 'generated_data_02.csv') t1
"""

con.sql(sql)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [48]:
# Fill in the similarity column by running the fuzzy_similarity UDF on each row's
# pair of descriptions. There is no WHERE clause, so every row of data_all is updated;
# fetchall() returns the statement's result row(s).
sql = """
update data_all set similarity = fuzzy_similarity(description_a, description_b);
"""
con.execute(sql).fetchall()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[(100000000,)]

In [49]:
# List all pairs scoring above 80, ordered by sku_a, then sku_b, then similarity (descending).
sql = """
select * from data_all where similarity > 80 order by sku_a, sku_b, similarity desc;
"""
con.execute(sql).fetchall()

[('001-21-6217',
  'Reverse-engineered well-modulated matrix',
  '707-13-6022',
  'Reverse-engineered well-modulated conglomeration',
  86),
 ('001-24-3801',
  'Cross-group foreground portal',
  '544-40-0053',
  'Cross-group foreground intranet',
  83),
 ('001-38-6253',
  'Integrated 5thgeneration moratorium',
  '437-80-3650',
  'Inverse 5thgeneration moderator',
  82),
 ('001-38-6253',
  'Integrated 5thgeneration moratorium',
  '446-64-4004',
  'Integrated 4thgeneration moratorium',
  97),
 ('001-38-6253',
  'Integrated 5thgeneration moratorium',
  '560-25-3133',
  'Ameliorated 4thgeneration moratorium',
  85),
 ('001-49-4694',
  'Function-based scalable pricing structure',
  '233-35-0529',
  'Function-based scalable infrastructure',
  89),
 ('001-52-2137',
  'Upgradable incremental time-frame',
  '199-83-6564',
  'Versatile incremental time-frame',
  83),
 ('001-63-6504',
  'User-friendly mobile concept',
  '279-18-3815',
  'User-friendly logistical concept',
  83),
 ('001-65-2310',
