In [23]:
import kumoai.experimental.rfm as rfm, os
from pathlib import Path
import pandas as pd
import numpy as np

In [24]:
# Read the Kumo API key from ~/kumoai_key.txt (surrounding whitespace removed) and
# store it in the KUMO_API_KEY environment variable.
# NOTE(review): presumably rfm.init() picks the key up from this variable — confirm.
home_api_key_file = Path.home() / "kumoai_key.txt"
api_key = home_api_key_file.read_text().strip()
os.environ["KUMO_API_KEY"] = api_key

In [25]:
# Initialise the Kumo RFM client; no arguments are passed.
# NOTE(review): the recorded output shows the call makes no changes when a client
# already exists in the kernel, so a changed API key needs a new interpreter.
rfm.init()

Client has already been created. To re-initialize Kumo, please start a new interpreter. No changes will be made to the current session.


In [26]:
# Load the three tables of the public online-shopping dataset into pandas.
root = 's3://kumo-sdk-public/rfm-datasets/online-shopping'
users_df, items_df, orders_df = (
    pd.read_parquet(f'{root}/{table_name}.parquet')
    for table_name in ('users', 'items', 'orders')
)

In [29]:
# Wrap each DataFrame in a LocalTable and let the SDK infer its metadata.
# Tables are built in the order users, items, orders.
users, items, orders = (
    rfm.LocalTable(frame, name=table_name).infer_metadata()
    for table_name, frame in (
        ("users", users_df),
        ("items", items_df),
        ("orders", orders_df),
    )
)

Detected primary key 'user_id' in table 'users'
Detected primary key 'item_id' in table 'items'
Detected primary key 'order_id' and time column 'date' in table 'orders'


In [30]:
# Show the inferred metadata of every table, in the order users, items, orders.
for _table in (users, items, orders):
    _table.print_metadata()

### 🏷️ Metadata of Table `users` (1,000 rows)

name,dtype,stype,is_primary_key,is_time_column
user_id,int,ID,True,False
active,bool,categorical,False,False
age,int,numerical,False,False


### 🏷️ Metadata of Table `items` (1,000 rows)

name,dtype,stype,is_primary_key,is_time_column
item_id,int,ID,True,False
item_name,string,text,False,False
category,string,categorical,False,False
color,string,categorical,False,False
descriptions,string,text,False,False


### 🏷️ Metadata of Table `orders` (267,774 rows)

name,dtype,stype,is_primary_key,is_time_column
user_id,int,ID,False,False
item_id,int,ID,False,False
date,date,timestamp,False,True
sales_channel_id,int,ID,False,False
price,float,numerical,False,False
order_id,int,ID,True,False


In [31]:
# Assemble the three tables into one graph; the foreign-key links are added in the
# following cells.
graph = rfm.LocalGraph(tables=[users, items, orders])

In [32]:
# Link orders.user_id (foreign key) to the users table. The call's return value
# is displayed as the cell output.
graph.link(src_table="orders", fkey="user_id", dst_table="users")

LocalGraph(
  tables=[users, items, orders],
  edges=[Edge(src_table='orders', fkey='user_id', dst_table='users')],
)

In [33]:
# Link orders.item_id (foreign key) to the items table. The call's return value
# is displayed as the cell output.
graph.link(src_table="orders", fkey="item_id", dst_table="items")

LocalGraph(
  tables=[users, items, orders],
  edges=[Edge(src_table='orders', fkey='user_id', dst_table='users'), Edge(src_table='orders', fkey='item_id', dst_table='items')],
)

In [34]:
# Print the graph-level table summary and the registered links, to confirm that
# both edges are present.
graph.print_metadata()
graph.print_links()

### 🗂️ Graph Metadata

name,primary_key,time_column
users,user_id,-
items,item_id,-
orders,order_id,date


### 🕸️ Graph Links (FK ↔️ PK)

- `orders.item_id` ↔️ `items.item_id`
- `orders.user_id` ↔️ `users.user_id`

In [35]:
# Build the KumoRFM model on top of the graph; `model` is the object used by the
# get_train_table / predict / evaluate calls in the cells below.
model = rfm.KumoRFM(graph)

Output()

In [36]:
# Summary statistics of the order dates (count, mean, min, quartiles, max), used
# to see which time span the anchor dates below can be drawn from.
orders_df[['date']].describe()

Unnamed: 0,date
count,267774
mean,2023-09-15 02:06:26.385534
min,2022-09-20 00:00:00
25%,2023-03-31 00:00:00
50%,2023-08-24 00:00:00
75%,2024-03-27 00:00:00
max,2024-09-19 00:00:00


In [119]:
# Back-test of per-item sales predictions over the next `days` days, at several
# anchor dates. For each anchor date one line is printed:
#     anchor_date  z  avg_abs_error  mae_value
# where
#   z             = largest absolute gap between the train table's TARGET and the
#                   sum of prices recomputed from orders_df (sanity check of the window),
#   avg_abs_error = mean absolute error of model.predict against the recomputed actuals,
#   mae_value     = the 'mae' row reported by model.evaluate for the same query.

days = 30
samp_size = 5000

anchor_dates = [
    pd.Timestamp("2022-12-20"),
    pd.Timestamp("2023-03-31"),
    pd.Timestamp("2023-08-24"),
    pd.Timestamp("2024-03-27"),
    pd.Timestamp("2024-07-18"),
]

# Query used only to draw the entity sample. It is defined before the loop so the
# cell runs on a fresh kernel (previously `query` was read before its first
# assignment) and so every anchor date samples with the same query instead of the
# one left over from the previous iteration.
sampling_query = (
    f"PREDICT SUM(orders.price, 0, {days}, days) "
    f"FOR items.item_id in {tuple(items_df['item_id'].tolist())}"
)

for anchor_date in anchor_dates:

    # End of the target window. The name is kept because later cells reuse it;
    # the actual length is `days`.
    anchor_date_plus_30 = anchor_date + pd.Timedelta(days=days)

    train_table = model.get_train_table(sampling_query, size=samp_size, anchor_time=anchor_date)
    sample_item_ids = tuple(train_table.ENTITY)

    query = f"PREDICT SUM(orders.price, 0, {days}, days) FOR items.item_id in {sample_item_ids}"
    predictions = model.predict(query, anchor_time=anchor_date, verbose=False)[['ENTITY', 'TARGET_PRED']]

    # Orders of the sampled items inside the window (anchor_date, anchor_date + days].
    orders_for_sample_in_next_x_days = orders_df[
        (orders_df['item_id'].isin(sample_item_ids)) &
        (orders_df['date'] > anchor_date) &
        (orders_df['date'] <= anchor_date_plus_30)
    ]
    actuals = (
        orders_for_sample_in_next_x_days[['item_id', 'price']]
        .groupby('item_id', as_index=False)['price']
        .sum()
    )

    # Left merge + fillna(0): an item with no orders in the window has an actual
    # of 0 and must stay in the comparison. An inner merge silently drops it and
    # biases the error towards items that did sell.
    actuals_and_actuals = train_table.merge(actuals, left_on='ENTITY', right_on='item_id', how='left')
    actuals_and_actuals['price'] = actuals_and_actuals['price'].fillna(0.0)
    z = (actuals_and_actuals['price'] - actuals_and_actuals['TARGET']).abs().max()

    preds_and_actuals = predictions.merge(actuals, left_on='ENTITY', right_on='item_id', how='left')
    preds_and_actuals['price'] = preds_and_actuals['price'].fillna(0.0)
    avg_abs_error = (preds_and_actuals['TARGET_PRED'] - preds_and_actuals['price']).abs().mean()

    evaluation = model.evaluate(query, anchor_time=anchor_date, verbose=False)
    mae_value = evaluation.loc[evaluation['metric'] == 'mae', 'value'].iloc[0]

    print(anchor_date, z, avg_abs_error, mae_value)


2022-12-20 00:00:00 8.789062485448085e-05 162.90065910270624 168.64468383789062
2023-03-31 00:00:00 8.300781246362021e-05 158.72074088989257 154.96910095214844
2023-08-24 00:00:00 7.812500007275958e-05 152.13921017990268 153.7912139892578
2024-03-27 00:00:00 6.835937506366463e-05 157.3534420472113 155.17543029785156
2024-07-18 00:00:00 4.3945312427240424e-05 157.9994882330579 167.11534118652344


In [144]:
# Display `predictions` as it was last bound (the back-test loop rebinds it on every
# anchor date, so this shows the final iteration's ENTITY / TARGET_PRED frame).
predictions

Unnamed: 0,ENTITY,TARGET_PRED
0,978,329.257477
1,933,486.274384
2,859,349.018311
3,916,359.525208
4,127,338.536438
...,...,...
995,548,511.728363
996,609,304.197754
997,961,500.260071
998,973,373.070221


In [113]:
# Sanity check on a small sample (27 entities): recompute each item's summed price
# from orders_df and show the largest absolute gap to the train table's TARGET.
# Relies on `query`, `anchor_date` and `anchor_date_plus_30` left by earlier cells.
train_table = model.get_train_table(query, size=27, anchor_time=anchor_date)
sample_item_ids = tuple(train_table.ENTITY)

is_sampled_item = orders_df['item_id'].isin(sample_item_ids)
is_after_anchor = orders_df['date'] > anchor_date
is_within_window = orders_df['date'] <= anchor_date_plus_30
orders_for_sample_in_next_x_days = orders_df[is_sampled_item & is_after_anchor & is_within_window]

price_per_order = orders_for_sample_in_next_x_days[['item_id', 'price']]
actuals = price_per_order.groupby('item_id', as_index=False)['price'].sum()

actuals_and_actuals = train_table.merge(
    actuals,
    left_on='ENTITY',
    right_on='item_id',
    how='inner',
)
(actuals_and_actuals['price'] - actuals_and_actuals['TARGET']).abs().max()


np.float64(2.4414062522737368e-05)

In [123]:
# Earliest order date strictly after the anchor, shown next to the anchor itself.
orders_df.loc[orders_df['date'] > anchor_date, 'date'].min(), anchor_date

(Timestamp('2024-07-19 00:00:00'), Timestamp('2024-07-18 00:00:00'))

In [128]:
# Build a query restricted to the first three sampled item ids; the bare `query`
# on the last line displays the rendered string.
query = f"PREDICT SUM(orders.price, 0, {days}, days) FOR items.item_id in {sample_item_ids[0:3]}"
query

'PREDICT SUM(orders.price, 0, 30, days) FOR items.item_id in (978, 933, 859)'

In [None]:
# Date range of orders_df['date'] (copied from the describe() output above):
# min	2022-09-20 00:00:00
# max	2024-09-19 00:00:00

In [142]:
# Evaluate the three-item query (written out literally here) at anchor 2024-08-20.
# The call's result, a metric/value table, is displayed as the cell output.
query = 'PREDICT SUM(orders.price, 0, 30, days) FOR items.item_id in (978, 933, 859)'
model.evaluate(query, anchor_time=pd.Timestamp("2024-08-20"), verbose=True)

Output()

Unnamed: 0,metric,value
0,mae,160.63678
1,mse,49606.113281
2,rmse,222.724304


In [134]:
# Scratch arithmetic: square of 195.908081.
# NOTE(review): presumably an RMSE from an earlier evaluate run being squared to
# compare against the reported MSE — that run is not shown here; confirm or delete.
195.908081**2

38379.97620110257

In [130]:
# Predict for whatever `query` and `anchor_date` were last assigned by earlier cells
# (hidden state: the displayed result depends on the order in which cells were run).
model.predict(query, anchor_time=anchor_date, verbose=True)

Output()

Unnamed: 0,ENTITY,ANCHOR_TIMESTAMP,TARGET_PRED
0,978,2024-07-18T00:00:00,288.427399
1,933,2024-07-18T00:00:00,518.106506
2,859,2024-07-18T00:00:00,337.2005


Hello. Based on the explanation that comes with the output of model.evaluate, the metrics (e.g., MAE) are based on 1000 test examples. Is there a way to restrict the metrics to only the entities specified in the query? For example, I have query = f"PREDICT SUM(orders.price, 0, 30, days) FOR items.item_id in (978,933,859)", and may want to get the MAE only for those 3 items. And can you describe how those 1000 test examples are selected, and whether their selection depends on the 3 items in the query? 

In [146]:
# Predict for the first seven sampled item ids at anchor 2024-08-20; the result
# table (ENTITY, ANCHOR_TIMESTAMP, TARGET_PRED) is displayed as the cell output.
query = f"PREDICT SUM(orders.price, 0, 30, days) FOR items.item_id in {sample_item_ids[0:7]}"
model.predict(query, anchor_time=pd.Timestamp("2024-08-20"), verbose=True)

Output()

Unnamed: 0,ENTITY,ANCHOR_TIMESTAMP,TARGET_PRED
0,978,2024-08-20T00:00:00,209.930023
1,933,2024-08-20T00:00:00,208.928314
2,859,2024-08-20T00:00:00,233.45343
3,916,2024-08-20T00:00:00,368.220856
4,127,2024-08-20T00:00:00,208.719055
5,608,2024-08-20T00:00:00,362.014709
6,856,2024-08-20T00:00:00,326.346008


### MY QUESTIONS
- How do I access the actual (ground-truth) value I am predicting, without having to calculate it manually?
- How should I interpret the result of `evaluate`? It does not seem to be just the metrics for a single prediction.