In [19]:
import kumoai.experimental.rfm as rfm, os
from pathlib import Path
import pandas as pd
import csv
import math
import random
from datetime import datetime, timedelta, timezone


In [2]:
home_api_key_file = Path.home() / "kumoai_key.txt"
with open(home_api_key_file, "r") as file:
    api_key = file.read().strip()
os.environ["KUMO_API_KEY"] = api_key

rfm.init()

[2025-08-17 09:50:28 - kumoai:196 - INFO] Successfully initialized the Kumo SDK against deployment https://kumorfm.ai/api, with log level INFO.


## Exercise

Create a synthetic dataset with the purpose of testing how KumoRFM deals with temporal dependency that is not in the transactional history, but instead in a user type variable that can change over time.
Essentially, there are users of 2 types (premium/free), and premium users can do actions that free users cannot.
The exercise is about testing if KumoRFM detects this dependency and consideres it for predictions.

### Dataset description
- Create a synthetic dataset representing users of 2 types (tier free or tier premium), uploading files of different sizes, and with their user tiers changing over time.
- While on Premium tier, users can upload any size. 
- While on free tier, users can upload files below 10GB. 
- For simplicity, there are only 2 sizes: 5GB and 50GB.
- Users can change their tier no earlier than 24 hours after the last change. 
- Within an hour of becoming premium, the likelihood of the user uploading a 50GB is much higher than after the first hour.
- There are 50 users, covering 5 cohorts:
    - Always premium (unknown start) → a single interval that spans the whole window.
    - Always free → a single interval that spans the whole window.
    - Premium → Free once (≥24h after start).
    - Free → Premium once (≥24h).
    - Free → Premium → Free (each change ≥24h apart; all within window).
- The history of transactions last 10 days from March 1st to March 11th, 2025.
- The prediction tasks will be done for different users, at different points in time, predicting their likelihood of uploading a 50GB file within the next hour.
- Expectations: 
    - For users that are Free tier, this should be 0. 
    - For users that just became Premium it should be high during the first hour. 
    - For users that had been Premium for sometime (beyond the first hour), it should be >> 0 but not as high. 

### Tables

- users (50 users). This set is just random ids and names.
    - user_id (PK)
    - name

- items (80 different items: 45 of 5 GB, 35 of 50 GB). Basically random ids.
    - item_id (PK)
    - size_gb in {5, 50}

- tiers. See creation process below.
    - tier_status_id (PK)
    - user_id (FK -> users.user_id)
    - from_datetime
    - until_datetime
    - status in {'free', 'premium'}


- uploads. See creation process below.
    - upload_id (PK)
    - user_id (FK -> users.user_id)
    - item_id (FK -> items.item_id)
    - datetime


### Creation process (not efficient, prioritize simplicity/clarity)

- tiers:
    - User ids from 1 to 10: always premium: 1 row in tiers.
        - from_date = START, until_date = END, status = 'premium'
    - User ids from 11 to 20: always free: 1 row in tiers.
        - from_date = START, until_date = END: status = 'free'    
    - User ids from 21 to 30: premium to free: 2 rows in tiers.
        - Take a random date X in (START, END)
        - from_date = START, until_date = X: status = 'premium'
        - from_date = X, until_date = END: status = 'free'
    - user ids from 31 to 40: free to premium: 2 rows in tiers.
        - Take a random date X in (START, END)
        - from_date = START, until_date = X: status = 'free'
        - from_date = X, until_date = END: status = 'premium'
    - user ids from 41 to 50: free to premium to free: 3 rows in tiers.
        - Take random dates X1 < X2 in (START, END)
        - from_date = START, until_date = X1: status = 'free'
        - from_date = X1, until_date = X2: status = 'premium'
        - from_date = X2, until_date = END: status = 'free'

- uploads:
    - for each tier_status_id, create the uploads in the corresponding interval.
    - Given a tier_status_id, get the details: user_id, from_date, until_date, status.
    - If status = 'free':
        - compute the number of hours delta_h from from_date to until_date.
        - take a sample of r = ceil(delta_h/5) times in (from_date, until_date).
        - use those as datetime.
        - generate the corresponding number of rows for uploads.
        - keep track of the last generated upload_id.
        - the user_id is the current one.
        - items_id's should be taken at random from those with 5GB.
    - if status = 'premium':
        - Do the same as for free.
        - Then do the same as for free, but sampling ceil(delta_h/10) times in (from_date, until_date), and from 50GB items.
        - And then:
        - compute the from_date_plus as 1 hour after from_date.
        - Generate 10 datetimes in (from_date, from_date_plus), each with probability 50%.
        - use those as datetime, generate the corresponding number of rows for uploads.
        - keep track of the last generated upload_id.
        - use the same user id, and sample item_id's from those with 50GB.


In [21]:
import csv
import math
import random
from datetime import datetime, timedelta, timezone

# ----------------------------
# Config
# ----------------------------
SEED = 42
random.seed(SEED)

START = datetime(2025, 3, 1, 0, 0, 0, tzinfo=timezone.utc)
END   = datetime(2025, 3, 11, 0, 0, 0, tzinfo=timezone.utc)  # half-open [START, END)

USERS_CSV   = "users.csv"
ITEMS_CSV   = "items.csv"
TIERS_CSV   = "tiers.csv"
UPLOADS_CSV = "uploads.csv"

N_USERS = 50
N_ITEMS_5GB = 45
N_ITEMS_50GB = 35

# ----------------------------
# Helpers
# ----------------------------
def rand_dt(a, b):
    total_sec = int((b - a).total_seconds())
    if total_sec <= 1:
        return a + timedelta(seconds=1)
    off = random.randint(1, total_sec - 1)
    return a + timedelta(seconds=off)

def sample_times(n, a, b):
    return [rand_dt(a, b) for _ in range(n)]

def hours_between(a, b):
    return (b - a).total_seconds() / 3600.0

# ----------------------------
# 1) users
# ----------------------------
users = []
for uid in range(1, N_USERS + 1):
    users.append([uid, f"User {uid:03d}"])

# ----------------------------
# 2) items
# ----------------------------
items = []
for iid in range(1, N_ITEMS_5GB + 1):
    items.append([iid, 5])
for iid in range(N_ITEMS_5GB + 1, N_ITEMS_5GB + N_ITEMS_50GB + 1):
    items.append([iid, 50])

item_ids_5 = [row[0] for row in items if row[1] == 5]
item_ids_50 = [row[0] for row in items if row[1] == 50]

# ----------------------------
# 3) tiers
# ----------------------------
tiers = []
tier_status_id = 1

def add_interval(uid, a, b, status):
    global tier_status_id
    tiers.append([tier_status_id, uid, a, b, status])
    tier_status_id += 1

# always premium
for uid in range(1, 11):
    add_interval(uid, START, END, "premium")

# always free
for uid in range(11, 21):
    add_interval(uid, START, END, "free")

# premium -> free
for uid in range(21, 31):
    lo = START + timedelta(hours=24)
    hi = END - timedelta(hours=24)
    x = lo + timedelta(seconds=random.randint(0, int((hi - lo).total_seconds())))
    add_interval(uid, START, x, "premium")
    add_interval(uid, x, END, "free")

# free -> premium
for uid in range(31, 41):
    lo = START + timedelta(hours=24)
    hi = END - timedelta(hours=24)
    x = lo + timedelta(seconds=random.randint(0, int((hi - lo).total_seconds())))
    add_interval(uid, START, x, "free")
    add_interval(uid, x, END, "premium")

# free -> premium -> free
for uid in range(41, 51):
    lo1 = START + timedelta(hours=24)
    hi1 = END - timedelta(hours=48)
    x1 = lo1 + timedelta(seconds=random.randint(0, int((hi1 - lo1).total_seconds())))

    lo2 = x1 + timedelta(hours=24)
    hi2 = END - timedelta(seconds=1)
    if lo2 >= hi2:
        hi2 = x1 + timedelta(hours=48)
    x2 = lo2 + timedelta(seconds=random.randint(0, int((hi2 - lo2).total_seconds())))

    add_interval(uid, START, x1, "free")
    add_interval(uid, x1, x2, "premium")
    add_interval(uid, x2, END, "free")

# ----------------------------
# 4) uploads
# ----------------------------
uploads = []
upload_id = 1

for row in tiers:
    uid = row[1]
    a = row[2]
    b = row[3]
    status = row[4]

    delta_h = hours_between(a, b)
    if delta_h <= 0:
        continue

    if status == "free":
        r = math.ceil(delta_h / 5.0)
        times = sample_times(r, a, b)
        for t in times:
            uploads.append([upload_id, uid, random.choice(item_ids_5), t])
            upload_id += 1

    if status == "premium":
        r5 = math.ceil(delta_h / 5.0)
        times5 = sample_times(r5, a, b)
        for t in times5:
            uploads.append([upload_id, uid, random.choice(item_ids_5), t])
            upload_id += 1

        r50 = math.ceil(delta_h / 10.0)
        times50 = sample_times(r50, a, b)
        for t in times50:
            uploads.append([upload_id, uid, random.choice(item_ids_50), t])
            upload_id += 1

        burst_end = min(a + timedelta(hours=1), b)
        for _ in range(10):
            if random.random() < 0.5:
                t = rand_dt(a, burst_end)
                uploads.append([upload_id, uid, random.choice(item_ids_50), t])
                upload_id += 1

# ----------------------------
# Write CSVs
# ----------------------------
with open(USERS_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["user_id", "name"])
    w.writerows(users)

with open(ITEMS_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["item_id", "size_gb"])
    w.writerows(items)

with open(TIERS_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["tier_status_id", "user_id", "from_datetime", "until_datetime", "status"])
    for tr in tiers:
        w.writerow([
            tr[0],
            tr[1],
            tr[2].strftime("%Y-%m-%dT%H:%M:%SZ"),
            tr[3].strftime("%Y-%m-%dT%H:%M:%SZ"),
            tr[4]
        ])

with open(UPLOADS_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["upload_id", "user_id", "item_id", "datetime"])
    for up in uploads:
        w.writerow([
            up[0],
            up[1],
            up[2],
            up[3].strftime("%Y-%m-%dT%H:%M:%SZ")
        ])

print("Done. Wrote:", USERS_CSV, ITEMS_CSV, TIERS_CSV, UPLOADS_CSV)


Done. Wrote: users.csv items.csv tiers.csv uploads.csv


In [22]:
users_df = pd.read_csv(USERS_CSV)
tiers_df = pd.read_csv(TIERS_CSV)
items_df = pd.read_csv(ITEMS_CSV)
uploads_df = pd.read_csv(UPLOADS_CSV)

In [29]:
display(users_df.head(3))
display(tiers_df.head(3))
display(items_df.head(3))
display(uploads_df.head(3))


Unnamed: 0,user_id,name
0,1,User 001
1,2,User 002
2,3,User 003


Unnamed: 0,tier_status_id,user_id,from_datetime,until_datetime,status
0,1,1,2025-03-01T00:00:00Z,2025-03-11T00:00:00Z,premium
1,2,2,2025-03-01T00:00:00Z,2025-03-11T00:00:00Z,premium
2,3,3,2025-03-01T00:00:00Z,2025-03-11T00:00:00Z,premium


Unnamed: 0,item_id,size_gb
0,1,5
1,2,5
2,3,5


Unnamed: 0,upload_id,user_id,item_id,datetime
0,1,1,35,2025-03-02T04:10:15Z
1,2,1,16,2025-03-05T08:33:38Z
2,3,1,11,2025-03-05T04:11:04Z


In [33]:
print(users_df.shape[0], users_df['user_id'].nunique())
print(tiers_df.shape[0], tiers_df['tier_status_id'].nunique())
print(items_df.shape[0], items_df['item_id'].nunique())
print(uploads_df.shape[0], uploads_df['upload_id'].nunique())


50 50
90 90
80 80
3181 3181


In [72]:
uid = 33
tiers_df.loc[tiers_df.user_id == uid]

Unnamed: 0,tier_status_id,user_id,from_datetime,until_datetime,status
44,45,33,2025-03-01T00:00:00Z,2025-03-02T09:15:26Z,free
45,46,33,2025-03-02T09:15:26Z,2025-03-11T00:00:00Z,premium


In [74]:
tmp = uploads_df.loc[uploads_df.user_id == uid].sort_values(by='datetime').copy()
tmp = tmp.merge(items_df)
tmp.head(40).tail(20)

Unnamed: 0,upload_id,user_id,item_id,datetime,size_gb
20,2028,33,7,2025-03-03T20:17:31Z,5
21,2034,33,27,2025-03-03T22:00:42Z,5
22,2071,33,78,2025-03-04T00:24:00Z,50
23,2027,33,7,2025-03-04T04:12:17Z,5
24,2042,33,3,2025-03-04T04:19:48Z,5
25,2031,33,19,2025-03-04T09:56:51Z,5
26,2030,33,7,2025-03-04T11:36:43Z,5
27,2080,33,61,2025-03-04T15:59:51Z,50
28,2033,33,23,2025-03-04T17:23:34Z,5
29,2040,33,39,2025-03-04T20:04:14Z,5


In [None]:
#tmp.head(80).tail(20)



Unnamed: 0,tier_status_id,user_id,from_datetime,until_datetime,status
20,21,21,2025-03-01T00:00:00Z,2025-03-09T18:14:47Z,premium
21,22,21,2025-03-09T18:14:47Z,2025-03-11T00:00:00Z,free


#### Notes:
- items is only 2 rows. We do not distinguish between items: just make it 

In [3]:
"""
Generate a small relational dataset (CSV) for graph + prediction testing.

Tables:
- users(user_id PK, name)
- tiers(user_id FK, from_datetime, until_datetime, tier)  # composite PK (user_id, from_datetime)
- items(item_id PK, size_gb)                              # exactly two rows: (1,5), (2,50)
- uploads(txn_id PK, user_id FK, item_id FK, datetime)

Cohorts (20 users each):
1) Always premium
2) Always free
3) Premium -> Free (once)
4) Free -> Premium (once)
5) Free -> Premium -> Free (twice; each change >= 24h apart)

Behavior:
- While free: uploads only 5GB (item_id=1).
- While premium: uploads both 5GB and 50GB (item_id ∈ {1,2}).
- Within 1 hour after becoming premium: very high chance (90%) of at least one 50GB upload.
- Exactly when/while free: 50GB uploads are impossible.
- Upload rates are user-specific and higher on premium than free.
"""

from __future__ import annotations
import csv
import math
from dataclasses import dataclass
from datetime import datetime, timezone, timedelta
import random
import numpy as np

# ------------------------
# Configuration
# ------------------------
SEED = 42
N_USERS = 50
START = datetime(2025, 3, 1, 0, 0, 0, tzinfo=timezone.utc)
END = datetime(2025, 3, 11, 0, 0, 0, tzinfo=timezone.utc)  # end-exclusive (covers Mar 1–10)
MIN_GAP_BETWEEN_TIER_CHANGES = timedelta(hours=24)

# Upload rate modeling (per-hour) via user-specific lognormal draws
FREE_RATE_DAY_MEAN = 1.0   # typical free uploads per day
FREE_RATE_DAY_STD = 0.7
PREM_RATE_DAY_MEAN = 3.0   # typical premium uploads per day
PREM_RATE_DAY_STD = 1.0

# Size selection while premium (outside the "first hour" spike)
BASE_P50_WHILE_PREMIUM = 0.4  # 40% chance of 50GB for non-spike premium uploads

# Spike rules when becoming premium
P_SPIKE_50GB_WITHIN_1H = 0.90
MAX_SPIKE_50GB = 2  # cap the number of spike 50GB uploads within the first hour

# Output files
USERS_CSV = "users.csv"
TIERS_CSV = "tiers.csv"
ITEMS_CSV = "items.csv"
UPLOADS_CSV = "uploads.csv"

# Fixed items table: exactly two rows
ITEMS = [
    {"item_id": 1, "size_gb": 5},
    {"item_id": 2, "size_gb": 50},
]
ITEM_SIZE_BY_ID = {1: 5, 2: 50}

random.seed(SEED)
np.random.seed(SEED)

@dataclass
class TierInterval:
    user_id: int
    from_dt: datetime
    until_dt: datetime  # end-exclusive
    tier: str           # 'free' | 'premium'

def iso(dt: datetime) -> str:
    return dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

def draw_user_rate(mean_per_day: float, std_per_day: float) -> float:
    """Draw a user-specific rate (per hour) using a lognormal around mean/std per day."""
    mean = max(0.01, mean_per_day)
    std = max(0.01, std_per_day)
    cv2 = (std / mean) ** 2
    sigma2 = math.log(cv2 + 1.0)
    sigma = math.sqrt(sigma2)
    mu = math.log(mean) - 0.5 * sigma2
    per_day = np.random.lognormal(mean=mu, sigma=sigma)
    return float(per_day / 24.0)

def bounded_random_time(start: datetime, end: datetime) -> datetime:
    """Uniform random time in [start, end)."""
    if end <= start:
        return start
    span = (end - start).total_seconds()
    offs = random.random() * span
    return start + timedelta(seconds=offs)

def generate_cohort_tiers(user_id: int, cohort: int) -> list[TierInterval]:
    """Create tier intervals for a single user according to cohort."""
    t0 = START
    tE = END

    def random_change_time(low: datetime, high: datetime) -> datetime:
        lo = low + MIN_GAP_BETWEEN_TIER_CHANGES
        hi = high - MIN_GAP_BETWEEN_TIER_CHANGES
        if hi <= lo:
            return low + (high - low) / 2
        return bounded_random_time(lo, hi)

    intervals: list[TierInterval] = []

    if cohort == 1:  # Always premium
        intervals.append(TierInterval(user_id, t0, tE, "premium"))

    elif cohort == 2:  # Always free
        intervals.append(TierInterval(user_id, t0, tE, "free"))

    elif cohort == 3:  # Premium -> Free (once)
        c1 = random_change_time(t0, tE)
        intervals.append(TierInterval(user_id, t0, c1, "premium"))
        intervals.append(TierInterval(user_id, c1, tE, "free"))

    elif cohort == 4:  # Free -> Premium (once)
        c1 = random_change_time(t0, tE)
        intervals.append(TierInterval(user_id, t0, c1, "free"))
        intervals.append(TierInterval(user_id, c1, tE, "premium"))

    elif cohort == 5:  # Free -> Premium -> Free
        c1 = random_change_time(t0, tE - MIN_GAP_BETWEEN_TIER_CHANGES)
        c2_low = c1 + MIN_GAP_BETWEEN_TIER_CHANGES
        c2_high = tE
        if c2_low + MIN_GAP_BETWEEN_TIER_CHANGES >= c2_high:
            c2 = c1 + (tE - c1) / 2
        else:
            c2 = random_change_time(c2_low, c2_high)
        intervals.append(TierInterval(user_id, t0, c1, "free"))
        intervals.append(TierInterval(user_id, c1, c2, "premium"))
        intervals.append(TierInterval(user_id, c2, tE, "free"))

    intervals.sort(key=lambda x: x.from_dt)
    fixed: list[TierInterval] = []
    prev_end = None
    for it in intervals:
        if prev_end and it.from_dt < prev_end:
            it = TierInterval(it.user_id, prev_end, it.until_dt, it.tier)
        prev_end = it.until_dt
        fixed.append(it)
    return fixed

def simulate_uploads_for_interval(user_id: int,
                                  interval: TierInterval,
                                  free_rate_per_hour: float,
                                  prem_rate_per_hour: float,
                                  next_txn_id: int) -> tuple[list[dict], int]:
    """
    Simulate uploads for a single interval.
    Returns uploads_rows, next_txn_id
    """
    uploads_rows: list[dict] = []
    tier = interval.tier
    rate_per_hour = free_rate_per_hour if tier == "free" else prem_rate_per_hour

    # Hourly grid across [from, until)
    cursor = interval.from_dt
    while cursor < interval.until_dt:
        hour_end = min(cursor + timedelta(hours=1), interval.until_dt)
        lam = rate_per_hour * (hour_end - cursor).total_seconds() / 3600.0
        n = np.random.poisson(lam=lam) if lam > 0 else 0
        for _ in range(n):
            ts = bounded_random_time(cursor, hour_end)
            if tier == "free":
                item_id = 1  # 5 GB only
            else:
                # Premium outside spike window: choose with base probability
                item_id = 2 if random.random() < BASE_P50_WHILE_PREMIUM else 1
            uploads_rows.append({
                "txn_id": next_txn_id,
                "user_id": user_id,
                "item_id": item_id,
                "datetime": iso(ts)
            })
            next_txn_id += 1
        cursor = hour_end

    # Spike: within first hour after becoming premium
    if tier == "premium":
        became_premium_now = interval.from_dt > START
        if became_premium_now and random.random() < P_SPIKE_50GB_WITHIN_1H:
            spike_window_end = min(interval.from_dt + timedelta(hours=1), interval.until_dt)
            if spike_window_end > interval.from_dt:
                n_spike = 1 + (1 if (MAX_SPIKE_50GB > 1 and random.random() < 0.15) else 0)
                for _ in range(n_spike):
                    ts = bounded_random_time(interval.from_dt, spike_window_end)
                    uploads_rows.append({
                        "txn_id": next_txn_id,
                        "user_id": user_id,
                        "item_id": 2,  # force 50 GB in spike
                        "datetime": iso(ts)
                    })
                    next_txn_id += 1

    return uploads_rows, next_txn_id

def main():
    # Users
    users = [{"user_id": uid, "name": f"User {uid:03d}"} for uid in range(1, N_USERS + 1)]

    # Cohorts by user_id blocks of 20
    def cohort_of(uid: int) -> int:
        return (uid - 1) // 20 + 1  # 1..5

    # User-specific rates
    free_rate_per_hour = {u["user_id"]: draw_user_rate(FREE_RATE_DAY_MEAN, FREE_RATE_DAY_STD) for u in users}
    prem_rate_per_hour = {u["user_id"]: draw_user_rate(PREM_RATE_DAY_MEAN, PREM_RATE_DAY_STD) for u in users}

    # Tier intervals
    tiers: list[TierInterval] = []
    for u in users:
        tiers.extend(generate_cohort_tiers(u["user_id"], cohort_of(u["user_id"])))
    tiers.sort(key=lambda t: (t.user_id, t.from_dt))

    # Simulate uploads
    uploads_rows: list[dict] = []
    next_txn_id = 1
    for t in tiers:
        ups, next_txn_id = simulate_uploads_for_interval(
            user_id=t.user_id,
            interval=t,
            free_rate_per_hour=free_rate_per_hour[t.user_id],
            prem_rate_per_hour=prem_rate_per_hour[t.user_id],
            next_txn_id=next_txn_id
        )
        uploads_rows.extend(ups)

    # Sort uploads by time (optional neatness)
    uploads_rows.sort(key=lambda r: (r["user_id"], r["datetime"]))
    # Re-assign txn_ids sequentially
    for i, r in enumerate(uploads_rows, start=1):
        r["txn_id"] = i

    # Write CSVs
    with open(USERS_CSV, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["user_id", "name"])
        w.writeheader()
        w.writerows(users)

    with open(TIERS_CSV, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["user_id", "from_datetime", "until_datetime", "tier"])
        w.writeheader()
        for t in tiers:
            w.writerow({
                "user_id": t.user_id,
                "from_datetime": iso(t.from_dt),
                "until_datetime": iso(t.until_dt),
                "tier": t.tier
            })

    with open(ITEMS_CSV, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["item_id", "size_gb"])
        w.writeheader()
        w.writerows(ITEMS)

    with open(UPLOADS_CSV, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["txn_id", "user_id", "item_id", "datetime"])
        w.writeheader()
        w.writerows(uploads_rows)

    # Integrity check: no 50GB while free
    by_user = {}
    for t in tiers:
        by_user.setdefault(t.user_id, []).append(t)

    def tier_at(user_id: int, dt_str: str) -> str:
        dt = datetime.strptime(dt_str, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
        for it in by_user[user_id]:
            if it.from_dt <= dt < it.until_dt:
                return it.tier
        return "unknown"

    violations = []
    for up in uploads_rows:
        t = tier_at(up["user_id"], up["datetime"])
        size = ITEM_SIZE_BY_ID[up["item_id"]]
        if t == "free" and size == 50:
            violations.append(up)

    print(f"Generated: {USERS_CSV}, {TIERS_CSV}, {ITEMS_CSV}, {UPLOADS_CSV}")
    if violations:
        print(f"[WARNING] Found {len(violations)} violations (50GB while free).")
    else:
        print("Integrity check passed: No 50GB uploads while free.")

if __name__ == "__main__":
    main()


Generated: users.csv, tiers.csv, items.csv, uploads.csv
Integrity check passed: No 50GB uploads while free.


In [4]:
def get_user_tier_summary(user_id: int, tiers_csv: str = "tiers.csv"):
    """
    Return a list of (start_iso, end_iso, tier) covering the user’s tier evolution.
    - start/end are ISO-8601 UTC strings: 'YYYY-MM-DDTHH:MM:SSZ'
    - tier is 'free' or 'premium'
    """
    # Load and filter
    tiers = pd.read_csv(tiers_csv)
    tiers = tiers[tiers["user_id"] == user_id].copy()
    if tiers.empty:
        return []  # unknown user_id

    # Parse/normalize
    tiers["from_datetime"] = pd.to_datetime(tiers["from_datetime"], utc=True)
    tiers["until_datetime"] = pd.to_datetime(tiers["until_datetime"], utc=True)
    tiers = tiers.sort_values(["from_datetime", "until_datetime", "tier"])

    # Optional: coalesce adjacent intervals with the same tier
    merged = []
    for _, row in tiers.iterrows():
        start, end, tier = row["from_datetime"], row["until_datetime"], row["tier"]
        if not merged:
            merged.append([start, end, tier])
        else:
            last_start, last_end, last_tier = merged[-1]
            # If same tier and touching (no gap), merge
            if tier == last_tier and start == last_end:
                merged[-1][1] = end
            else:
                merged.append([start, end, tier])

    # Format to tuples with ISO-8601 Zulu
    out = [(s.strftime("%Y-%m-%dT%H:%M:%SZ"),
            e.strftime("%Y-%m-%dT%H:%M:%SZ"),
            t) for s, e, t in merged]

    return out

# --- examples ---
if __name__ == "__main__":
    # Always-Free or Always-Premium users will yield one tuple
    # Mixed cohorts will yield 2 or 3 tuples

    print("User 5:", get_user_tier_summary(5))
    print("User 25:", get_user_tier_summary(25))
    print("User 45:", get_user_tier_summary(45))
    print("User 85:", get_user_tier_summary(85))


User 5: [('2025-03-01T00:00:00Z', '2025-03-11T00:00:00Z', 'premium')]
User 25: [('2025-03-01T00:00:00Z', '2025-03-11T00:00:00Z', 'free')]
User 45: [('2025-03-01T00:00:00Z', '2025-03-07T21:24:08Z', 'premium'), ('2025-03-07T21:24:08Z', '2025-03-11T00:00:00Z', 'free')]
User 85: []


In [169]:
# df = pd.read_csv("tiers.csv")
# df.insert(0, "tier_id", range(1, len(df) + 1))
# df.to_csv("tiers.csv", index=False)

In [5]:
users_df = pd.read_csv(USERS_CSV)
tiers_df = pd.read_csv(TIERS_CSV)
items_df = pd.read_csv(ITEMS_CSV)
uploads_df = pd.read_csv(UPLOADS_CSV)

In [6]:
users_df.head(3)

Unnamed: 0,user_id,name
0,1,User 001
1,2,User 002
2,3,User 003


In [7]:
items_df.head(2)

Unnamed: 0,item_id,size_gb
0,1,5
1,2,50


In [8]:
uploads_df.head(3)
uploads_df = uploads_df.merge(items_df, on="item_id", how="left")
uploads_df.head(3)


Unnamed: 0,txn_id,user_id,item_id,datetime,size_gb
0,1,1,1,2025-03-01T04:13:07Z,5
1,2,1,2,2025-03-01T08:01:35Z,50
2,3,1,1,2025-03-01T08:38:59Z,5


In [9]:
uploads_df.shape

(896, 5)

In [10]:
tiers_df.head()

Unnamed: 0,user_id,from_datetime,until_datetime,tier
0,1,2025-03-01T00:00:00Z,2025-03-11T00:00:00Z,premium
1,2,2025-03-01T00:00:00Z,2025-03-11T00:00:00Z,premium
2,3,2025-03-01T00:00:00Z,2025-03-11T00:00:00Z,premium
3,4,2025-03-01T00:00:00Z,2025-03-11T00:00:00Z,premium
4,5,2025-03-01T00:00:00Z,2025-03-11T00:00:00Z,premium


In [11]:
# Ensure datetime columns are parsed
uploads_df["datetime"] = pd.to_datetime(uploads_df["datetime"], utc=True)
tiers_df["from_datetime"] = pd.to_datetime(tiers_df["from_datetime"], utc=True)
tiers_df["until_datetime"] = pd.to_datetime(tiers_df["until_datetime"], utc=True)

# Merge on user_id
merged = uploads_df.merge(tiers_df[['user_id', 'from_datetime', 'until_datetime', 'tier']], on="user_id", how="left")

# Filter for intervals
uploads_df = merged[
    (merged["datetime"] >= merged["from_datetime"]) &
    (merged["datetime"] <  merged["until_datetime"])
]

In [12]:
uploads_df.drop(columns=["from_datetime", "until_datetime"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uploads_df.drop(columns=["from_datetime", "until_datetime"], inplace=True)


In [13]:
uploads_df.head(3)

Unnamed: 0,txn_id,user_id,item_id,datetime,size_gb,tier
0,1,1,1,2025-03-01 04:13:07+00:00,5,premium
1,2,1,2,2025-03-01 08:01:35+00:00,50,premium
2,3,1,1,2025-03-01 08:38:59+00:00,5,premium


In [14]:
users = rfm.LocalTable(users_df, name="users").infer_metadata()
#tiers = rfm.LocalTable(tiers_df, name="tiers").infer_metadata()
items = rfm.LocalTable(items_df, name="items").infer_metadata()
uploads = rfm.LocalTable(uploads_df, name="uploads").infer_metadata()


Detected primary key 'user_id' in table 'users'
Detected primary key 'item_id' in table 'items'
Detected time column 'datetime' in table 'uploads'


In [15]:
uploads['size_gb'].stype = "numerical"
uploads['txn_id'].stype = "ID"
# Set primary key:
uploads.primary_key = "txn_id"


In [16]:
users.print_metadata()
#tiers.print_metadata()
items.print_metadata()
uploads.print_metadata()

### 🏷️ Metadata of Table `users` (50 rows)

name,dtype,stype,is_primary_key,is_time_column
user_id,int,ID,True,False
name,string,text,False,False


### 🏷️ Metadata of Table `items` (2 rows)

name,dtype,stype,is_primary_key,is_time_column
item_id,int,ID,True,False
size_gb,int,categorical,False,False


### 🏷️ Metadata of Table `uploads` (896 rows)

name,dtype,stype,is_primary_key,is_time_column
txn_id,int,ID,True,False
user_id,int,ID,False,False
item_id,int,ID,False,False
datetime,date,timestamp,False,True
size_gb,int,numerical,False,False
tier,string,categorical,False,False


In [224]:
#graph = rfm.LocalGraph(tables=[users, tiers, items, uploads])
graph = rfm.LocalGraph(tables=[users, items, uploads])

In [225]:
#graph.link(src_table="tiers", fkey="user_id", dst_table="users");

In [226]:
graph.link(src_table="uploads", fkey="item_id", dst_table="items");

In [227]:
graph.link(src_table="uploads", fkey="user_id", dst_table="users");

In [228]:
graph.print_links()

### 🕸️ Graph Links (FK ↔️ PK)

- `uploads.item_id` ↔️ `items.item_id`
- `uploads.user_id` ↔️ `users.user_id`

In [128]:
print("User 10:", get_user_tier_summary(10))
print("User 30:", get_user_tier_summary(30))
print("User 50:", get_user_tier_summary(50))
print("User 70:", get_user_tier_summary(70))
print("User 90:", get_user_tier_summary(90))

User 10: [('2025-03-01T00:00:00Z', '2025-03-11T00:00:00Z', 'premium')]
User 30: [('2025-03-01T00:00:00Z', '2025-03-11T00:00:00Z', 'free')]
User 50: [('2025-03-01T00:00:00Z', '2025-03-02T05:43:15Z', 'premium'), ('2025-03-02T05:43:15Z', '2025-03-11T00:00:00Z', 'free')]
User 70: [('2025-03-01T00:00:00Z', '2025-03-06T19:54:55Z', 'free'), ('2025-03-06T19:54:55Z', '2025-03-11T00:00:00Z', 'premium')]
User 90: [('2025-03-01T00:00:00Z', '2025-03-03T03:27:05Z', 'free'), ('2025-03-03T03:27:05Z', '2025-03-06T23:40:35Z', 'premium'), ('2025-03-06T23:40:35Z', '2025-03-11T00:00:00Z', 'free')]


In [240]:
model = rfm.KumoRFM(graph, verbose=False)

In [244]:
query = """PREDICT MAX(uploads.size_gb, 0, 3, hours)=50 FOR users.user_id = 10"""
model.predict(query, anchor_time=pd.Timestamp("2025-03-08"), verbose=False)

Unnamed: 0,ENTITY,ANCHOR_TIMESTAMP,TARGET_PRED,False_PROB,True_PROB
0,10,2025-03-08T00:00:00,False,0.564697,0.435303


In [248]:
query = """PREDICT MAX(uploads.size_gb, 0, 3, hours)=50 FOR users.user_id = 30"""
model.predict(query, anchor_time=pd.Timestamp("2025-03-05"), verbose=False)

Unnamed: 0,ENTITY,ANCHOR_TIMESTAMP,TARGET_PRED,False_PROB,True_PROB
0,30,2025-03-05T00:00:00,False,0.834076,0.165924


In [265]:
query = """PREDICT MAX(uploads.size_gb, 0, 3, hours)=50 FOR users.user_id = 50"""
model.predict(query, anchor_time=pd.Timestamp("2025-03-01 05:43:15"), verbose=False)



Unnamed: 0,ENTITY,ANCHOR_TIMESTAMP,TARGET_PRED,False_PROB,True_PROB
0,50,2025-03-01T05:43:15,False,0.647296,0.352704


In [270]:

query = """PREDICT MAX(uploads.size_gb, 0, 1, hours)=50 FOR users.user_id = 70"""
model.predict(query, anchor_time=pd.Timestamp("2025-03-06 20:00:00"), verbose=False)

Unnamed: 0,ENTITY,ANCHOR_TIMESTAMP,TARGET_PRED,False_PROB,True_PROB
0,70,2025-03-06T20:00:00,False,0.655193,0.344807
