In [2]:
import polars as pl

In [39]:
ids = [1,2,3]
id_vals = ['a','b','c']
dts = ['2023-01-01','2023-02-05','2023-02-08']

df = pl.DataFrame(
    {'id': ids, 'vals': id_vals, 'dts': dts}, 
    schema={'id': pl.Int64, 'vals': pl.Utf8, 'dts': pl.Date}
)

ids = [2,3,4]
id_vals = ['b','c','d']
dts = ['2023-02-01','2023-03-05','2023-04-08']

df2 = pl.DataFrame(
   {'id': ids, 'vals': id_vals, 'dts': dts}, 
    schema={'id': pl.Int64, 'vals': pl.Utf8, 'dts': pl.Date}
)


#### Testing Unique Counts

In [None]:
ids = [1,1,1,2,2,3,3,3,3]
id_vals = ['a','b','b','x','y','z','a','a','b']
dfx = pl.DataFrame(
   {'id': ids, 'vals': id_vals}, 
    schema={'id': pl.Int64, 'vals': pl.Utf8}
)

dfx.group_by("id").agg(pl.col("vals").unique().count())



id,vals
i64,u32
1,2
2,2
3,3


In [71]:
df3 = pl.DataFrame([
    {'order_id': 1, 'order_line_id':1, 'qty': 5, 'total_price': 20.35},
    {'order_id': 1, 'order_line_id':2, 'qty': 4, 'total_price': 8.46},
    {'order_id': 2, 'order_line_id':1, 'qty': 12, 'total_price': 51.44},
    {'order_id': 2, 'order_line_id':2, 'qty': 7, 'total_price': 36.82}
])
res = df3.group_by("order_id").agg([
    pl.col("order_line_id").unique().count().alias("line_cnt"),
    pl.col("qty").sum().alias("total_qty"),
    pl.col("total_price").sum().alias("total_price")
])

In [72]:
res.write_clipboard()

In [34]:
df.select(pl.col("dts").min().alias("blah"), pl.col('id').count().unique().alias('a'))

blah,a
date,u32
2023-01-01,3


In [43]:
# df2 = pl.DataFrame({'id':[i for i in range(0,9)]})

# print(df2.select(pl.col('id').max()))

df.join(df2, on="id", how="anti")



id,vals,dts
i64,str,date
1,"""a""",2023-01-01


### Fuzzy Match on Dates

In [19]:
txn_ids = [1,2,3]
txn_dts = ['2023-01-01','2023-02-05','2023-02-08']
ord_types = ['online','in-store','online']

df = pl.DataFrame(
    {'txn_id': txn_ids, 'txn_dt': txn_dts, 'ord_type': ord_types}, 
    schema={'txn_id': pl.Int64, 'txn_dt': pl.Date, 'ord_type': pl.Utf8, }
)

txn_ids = [1,2,3]
txn_dts = ['2023-01-04','2023-02-04','2023-02-09']
ord_prices = [21.39, 26.46, 34.33]

df2 = pl.DataFrame(
    {'txn_id': txn_ids, 'txn_dt': txn_dts, 'ord_price': ord_prices}, 
    schema={'txn_id': pl.Int64, 'txn_dt': pl.Date, 'ord_price': pl.Float32, }
)
print(df)
print(df2)

shape: (3, 3)
┌────────┬────────────┬──────────┐
│ txn_id ┆ txn_dt     ┆ ord_type │
│ ---    ┆ ---        ┆ ---      │
│ i64    ┆ date       ┆ str      │
╞════════╪════════════╪══════════╡
│ 1      ┆ 2023-01-01 ┆ online   │
│ 2      ┆ 2023-02-05 ┆ in-store │
│ 3      ┆ 2023-02-08 ┆ online   │
└────────┴────────────┴──────────┘
shape: (3, 3)
┌────────┬────────────┬───────────┐
│ txn_id ┆ txn_dt     ┆ ord_price │
│ ---    ┆ ---        ┆ ---       │
│ i64    ┆ date       ┆ f32       │
╞════════╪════════════╪═══════════╡
│ 1      ┆ 2023-01-04 ┆ 21.389999 │
│ 2      ┆ 2023-02-04 ┆ 26.459999 │
│ 3      ┆ 2023-02-09 ┆ 34.330002 │
└────────┴────────────┴───────────┘


In [20]:
joined_df = (
    df.join(df2, on="txn_id", how="inner", suffix="_right")
    .filter((pl.col("txn_dt_right") - pl.col("txn_dt")).abs() <= pl.duration(days=2))
    .select("txn_id", "txn_dt", "ord_type", "ord_price")
)
print(joined_df)

shape: (2, 4)
┌────────┬────────────┬──────────┬───────────┐
│ txn_id ┆ txn_dt     ┆ ord_type ┆ ord_price │
│ ---    ┆ ---        ┆ ---      ┆ ---       │
│ i64    ┆ date       ┆ str      ┆ f32       │
╞════════╪════════════╪══════════╪═══════════╡
│ 2      ┆ 2023-02-05 ┆ in-store ┆ 26.459999 │
│ 3      ┆ 2023-02-08 ┆ online   ┆ 34.330002 │
└────────┴────────────┴──────────┴───────────┘
