In [None]:
import pandas as pd
import json

from google.cloud import bigquery

In [None]:
# BigQuery client used to run the query below; no project or credentials
# are passed explicitly here.
client = bigquery.Client()

In [None]:
# Pair every boost-creation event with the same user's redemption events that
# happened strictly before it.
#
# * `created_at` is converted with TIMESTAMP_MILLIS, i.e. it is treated as
#   epoch milliseconds.
# * The "redemption is earlier than the offer" condition lives in the ON clause
#   of the LEFT JOIN rather than in a WHERE clause. A WHERE filter on the
#   right-hand table would discard offers whose user only redeemed afterwards;
#   with the condition in ON, such offers are kept with NULL redemption columns.
# * Timestamps are compared directly instead of through the hour-truncated
#   TIMESTAMP_DIFF, so a redemption less than one hour before the offer still
#   counts as earlier. The hour difference is still returned as a column and is
#   therefore <= 0 (or NULL when there is no earlier redemption).
# * The CTEs select only the columns used by the final projection.
sql = """
with boost_offers as (
        select user_id, event_type, context, TIMESTAMP_MILLIS(created_at) as creation_timestamp
        from ops.all_user_events
        where event_type like 'BOOST_CREATED%'

), boost_redemptions as (
        select user_id, TIMESTAMP_MILLIS(created_at) as creation_timestamp
        from ops.all_user_events
        where event_type = 'BOOST_REDEEMED'
)
select boost_offers.user_id, boost_offers.event_type, boost_offers.context,
    boost_offers.creation_timestamp as boost_creation_time, boost_redemptions.creation_timestamp as boost_redemption_time,
    TIMESTAMP_DIFF(boost_redemptions.creation_timestamp, boost_offers.creation_timestamp, HOUR) as time_from_boost_to_last_redeem
from boost_offers left join boost_redemptions
    on boost_offers.user_id = boost_redemptions.user_id
    and boost_redemptions.creation_timestamp < boost_offers.creation_timestamp
"""

# Run the query and materialise the full result as a pandas DataFrame.
df = client.query(sql).to_dataframe()

In [None]:
# Non-null count per column of the query result.
df.count()

In [None]:
# Preview the first rows of the query result.
df.head()

In [None]:
# Number of rows with no hour difference, i.e. offers that came back from the
# left join without a joined redemption.
df["time_from_boost_to_last_redeem"].isna().sum()

In [None]:
# Decode the JSON text in `context` into Python objects, then read the
# "boostId" entry out of each decoded payload.
df["parsed_context"] = df["context"].apply(json.loads)
df["boost_id"] = df["parsed_context"].apply(lambda payload: payload["boostId"])

In [None]:
# Composite key for one boost offered to one user: "<boost_id>::<user_id>".
df["boost_user_id"] = df["boost_id"].str.cat(df["user_id"], sep="::")

In [None]:
# Per-column non-null counts again, now including the derived columns added above.
df.count()

In [None]:
# Collapse to one row per boost_user_id. Rows are ordered by the hour
# difference ascending (missing values sort to the end by default), and
# GroupBy.last() then takes the last non-null entry of every column in each
# group, so the largest available hour difference is the one retained.
adjusted_df = (
    df.sort_values(by="time_from_boost_to_last_redeem")
    .groupby("boost_user_id", as_index=False)
    .last()
)

In [None]:
# Per-column non-null counts after collapsing to one row per boost_user_id.
adjusted_df.count()

In [None]:
# Distinct boost/user keys; the groupby above yields one row per key, so this
# should match the row count.
adjusted_df["boost_user_id"].nunique()

In [None]:
# Flag the pairs whose retained row carries a redemption timestamp; rows
# without a joined redemption have it missing and are flagged False.
adjusted_df["has_prior_redeemed"] = adjusted_df["boost_redemption_time"].notna()

In [None]:
# Tally of boost/user pairs with and without a joined redemption.
adjusted_df["has_prior_redeemed"].value_counts()