# The Art of Dataset Design: How to Build Tables That Support Any Analysis
## By William James

## Imports

In [1]:
import sqlite3
import random
import time
import pandas as pd
from datetime import datetime, timedelta

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Agenda
* Super engaging hook story about Spotify launching in the US
* Less engaging agenda walk through
* Short intro to data warehouse best practices
* Introducing `signals` - my favourite analytics table design
* How to create your very own `signals` dataset (warning technical!)

Notebook available at https://github.com/oldpa/presentations

## Step 0: Create mock event data

In [2]:
# Create a connection and a cursor
!rm events.db
conn = sqlite3.connect('events.db')
cursor = conn.cursor()

# Create the `facts.events` table

cursor.execute('''drop table if exists events''')
cursor.execute('''CREATE TABLE IF NOT EXISTS events
                 (timestamp TEXT, user_id INTEGER, event_name TEXT, value REAL)''')

# Generate events for 100 users
event_types = ["dau", "login", "create_todo", "complete_todo", "move_todo", "delete_todo", "view_ad", "revenue"]
num_users = 100
num_days = 180

# Helper function to generate random date within the last 180 days
def random_date():
    end_date = datetime.now()
    start_date = end_date - timedelta(days=num_days)
    random_seconds = random.randint(0, int((end_date - start_date).total_seconds()))
    return start_date + timedelta(seconds=random_seconds)

def random_time(date):
    return date + timedelta(seconds = random.randint(0, 3600 * 24))

for user_id in range(1, num_users + 1):
    # Simulate users having different engagement
    engagement = random.random()
    signup_date = random_date()
    cursor.execute("INSERT INTO events (timestamp, user_id, event_name, value) VALUES (?, ?, ?, ?)", (signup_date, user_id, 'login', None))
    for i in range(num_days):
        date = signup_date + timedelta(i)
        if random.random() > engagement:
            for _ in range(1 + int(engagement * 10)):
                timestamp = random_time(date).strftime("%Y-%m-%d %H:%M:%S")
                event_name = random.choice(event_types)

                if event_name == "revenue":
                    value = round(random.uniform(0.5, 50), 2)
                else:
                    value = None

                cursor.execute("INSERT INTO events (timestamp, user_id, event_name, value) VALUES (?, ?, ?, ?)", (timestamp, user_id, event_name, value))

# Commit the changes and close the connection
conn.commit()



## The starting point is event data, one row per event on user_id level

In [3]:
pd.read_sql_query("SELECT * FROM events limit 10", conn)

Unnamed: 0,timestamp,user_id,event_name,value
0,2022-11-15 21:31:00.511270,1,login,
1,2022-11-16 08:45:40,1,delete_todo,
2,2022-11-17 01:15:40,1,revenue,49.23
3,2022-11-18 10:18:06,1,move_todo,
4,2022-11-18 23:56:35,1,move_todo,
5,2022-11-20 09:14:16,1,complete_todo,
6,2022-11-21 01:47:13,1,move_todo,
7,2022-11-22 19:56:57,1,create_todo,
8,2022-11-23 11:19:14,1,login,
9,2022-11-24 18:04:55,1,view_ad,


## Step 1: Create aggregated events per day and user

In [4]:
conn.execute('drop table if exists daily_events')
conn.execute("""
create table daily_events as
SELECT
user_id,
date(timestamp) as date,
cast(count(case when event_name in ('create_todo', 'complete_todo') then 1 end) > 0 as integer) as dau,
count(case when event_name ='login' then 1 end) as login,
count(case when event_name ='create_todo' then 1 end) as create_todo,
count(case when event_name ='complete_todo' then 1 end) as complete_todo,
count(case when event_name ='move_todo' then 1 end) as move_todo,
count(case when event_name ='delete_todo' then 1 end) as delete_todo,
count(case when event_name ='view_ad' then 1 end) as view_ad,
coalesce(sum(case when event_name ='revenue' then value end), 0) as revenue
from events
group by 1,2
""");

In [5]:
pd.read_sql_query("select * from daily_events limit 10", conn)

Unnamed: 0,user_id,date,dau,login,create_todo,complete_todo,move_todo,delete_todo,view_ad,revenue
0,1,2022-11-15,0,1,0,0,0,0,0,0.0
1,1,2022-11-16,0,0,0,0,0,1,0,0.0
2,1,2022-11-17,0,0,0,0,0,0,0,49.23
3,1,2022-11-18,0,0,0,0,2,0,0,0.0
4,1,2022-11-20,1,0,0,1,0,0,0,0.0
5,1,2022-11-21,0,0,0,0,1,0,0,0.0
6,1,2022-11-22,1,0,1,0,0,0,0,0.0
7,1,2022-11-23,0,1,0,0,0,0,0,0.0
8,1,2022-11-24,0,0,0,0,0,0,1,0.0
9,1,2022-11-25,0,1,0,0,0,0,0,0.0


## Step 2: Create signals dataset

## First create a cross join between all dates and all users

In [6]:

SIGNALS_QUERY = """
create table signals as
WITH RECURSIVE
    -- Create a list of consecutive dates
    dates (date) AS (
        SELECT date(julianday('now') - 179)
        UNION ALL
        SELECT date(julianday(date) + 1)
        FROM dates
        WHERE date < date('now')
    ),
    -- Create a list of unique users
    users AS (
        SELECT user_id, min(date) as signup_date
        FROM daily_events group by 1
    -- Cross join all users and dates, one row per user_id and date
    ), user_dates as (
        SELECT users.user_id, dates.date, users.signup_date
        FROM users
        CROSS JOIN dates
        where dates.date >= users.signup_date
    )
"""

## Left join this data with the daily events

In [7]:
SIGNALS_QUERY += """
    SELECT
    ud.user_id,
    ud.date,
    round(julianday(ud.date) - julianday(ud.signup_date)) as days_since_signup,
    
    --<events go here>--
    
    current_timestamp as etl_at
    FROM user_dates ud
    left join daily_events e on e.user_id = ud.user_id and e.date = ud.date
"""

## Add login related events

In [8]:
partition = "partition by ud.user_id order by ud.date rows between"
EVENTS_QUERY = f"""
--login events
sum(e.login) over ({partition} current row and current row) as login_today,
sum(e.login) over ({partition} 1 preceding and 1 preceding) as login_yesterday,
sum(e.login) over ({partition} 6 preceding and current row) as login_last_7_days,
sum(e.login) over ({partition} 27 preceding and current row) as login_last_28_days,
sum(e.login) over ({partition} unbounded preceding and 0 following) as login_first_day,
sum(e.login) over ({partition} unbounded preceding and 6 following) as login_first_7_days,
"""

## Add for all events

In [9]:
ALL_EVENTS_QUERY = ""
event_types = ["dau", "login", "create_todo", "complete_todo", "move_todo", "delete_todo", "view_ad", "revenue"]
for event in event_types:
    ALL_EVENTS_QUERY += EVENTS_QUERY.replace('login', event)
SIGNALS_QUERY = SIGNALS_QUERY.replace('--<events go here>--', ALL_EVENTS_QUERY)
conn.execute('drop table if exists signals');
conn.execute(SIGNALS_QUERY);

# Easily calculate the basics: DAU, MAU

In [10]:
pd.read_sql_query("""
select
    date,
    count(case when dau_today > 0 then 1 end) as dau,
    count(case when dau_last_28_days > 0 then 1 end) as mau
from 
    signals
group by 1
order by 1 desc
""", conn).head(3)

Unnamed: 0,date,dau,mau
0,2023-03-28,34,92
1,2023-03-27,33,92
2,2023-03-26,27,92


# Easily filter out users based on a complex combination of signals
## Users that have dropped off in the funnel

In [11]:
pd.read_sql_query("""
select
    count(1)
from 
    signals
    where
    days_since_signup = 7 and
    create_todo_last_7_days > 0
    and complete_todo_last_7_days = 0
""", conn)

Unnamed: 0,count(1)
0,10


# Revenue (LTV) 28 days after signup

In [12]:
pd.read_sql_query("""
select
    date,
    sum(revenue_last_28_days) as ltv_d28
from 
    signals
    where
    days_since_signup = 27
group by 1 order by 1 desc
""", conn).head(3)

Unnamed: 0,date,ltv_d28
0,2023-03-28,291.05
1,2023-03-25,184.48
2,2023-03-24,0.0


# Questions?