In [10]:
from google.cloud import bigquery
import pandas as pd
import plotly.express as px

In [2]:
# Service-account key file; the relative path is resolved against the
# directory the notebook kernel runs in.
key_path = '../credentials/dbt_service_account.json'

# Authenticated BigQuery client built from that key file.
client = bigquery.Client.from_service_account_json(
    json_credentials_path=key_path,
)

In [6]:
# Data-quality check: count NULLs in the sessions, purchases and revenue
# columns of the cohort fact table, alongside the total row count.
# COUNTIF states the intent directly and, unlike SUM(CASE ... END), yields 0
# instead of NULL when the table has no rows.
query = """
SELECT
    COUNTIF(sessions IS NULL) AS null_sessions,
    COUNTIF(purchases IS NULL) AS null_purchases,
    COUNTIF(revenue IS NULL) AS null_revenue,
    COUNT(*) AS total_rows
FROM `amiable-dynamo-461319-g1.ga4_demo.fact_ab_cohort`
"""

# Run the query and show the single-row summary as the cell output.
nulls_df = client.query(query).to_dataframe()
nulls_df

Unnamed: 0,null_sessions,null_purchases,null_revenue,total_rows
0,0,0,167940,179085


Explore null revenue cases

In [7]:
# Restrict to rows whose revenue is NULL and count them per
# (sessions, purchases) combination, most frequent combination first.
query_null_revenue = """
SELECT
    sessions,
    purchases,
    COUNT(*) AS users_with_null_revenue
FROM `amiable-dynamo-461319-g1.ga4_demo.fact_ab_cohort`
WHERE revenue IS NULL
GROUP BY sessions, purchases
ORDER BY users_with_null_revenue DESC
"""

# Submit the query, then materialise its result as a DataFrame.
null_revenue_job = client.query(query_null_revenue)
null_revenue_exploration_df = null_revenue_job.to_dataframe()

In [8]:
# Display the NULL-revenue row counts per (sessions, purchases) combination.
null_revenue_exploration_df

Unnamed: 0,sessions,purchases,users_with_null_revenue
0,1,0,140434
1,2,0,18683
2,3,0,4627
3,4,0,1951
4,5,0,959
5,6,0,498
6,7,0,311
7,8,0,184
8,9,0,89
9,10,0,70


In [9]:
# Split the NULL-revenue rows into two buckets (no purchases vs. the rest)
# and compute each bucket's row count and its share of all NULL-revenue rows.
query_null_revenue_overview = """
WITH null_rev AS (
    SELECT
        -- NOTE: the ELSE branch also absorbs rows where purchases IS NULL.
        CASE
            WHEN purchases = 0 THEN '0 purchases'
            ELSE '> 0 purchases'
        END AS purchase_bucket
    FROM `amiable-dynamo-461319-g1.ga4_demo.fact_ab_cohort`
    WHERE revenue IS NULL
)

SELECT
    purchase_bucket,
    COUNT(*) AS users,
    -- The window SUM over the per-bucket counts is the total number of
    -- NULL-revenue rows, so the ratio is each bucket's share of that total.
    ROUND(COUNT(*) / SUM(COUNT(*)) OVER(), 4) AS pct_nulls
FROM null_rev
GROUP BY purchase_bucket
ORDER BY purchase_bucket
"""

# Run the query and show the two-bucket summary as the cell output.
null_revenue_overview_df = client.query(query_null_revenue_overview).to_dataframe()
null_revenue_overview_df

Unnamed: 0,purchase_bucket,users,pct_nulls
0,0 purchases,167912,0.9998
1,> purchases,28,0.0002


In [15]:
# Text shown on each bar: the bucket's pct_nulls value formatted as a
# percentage with two decimals.
bar_text = null_revenue_overview_df["pct_nulls"].apply(lambda v: f"{v:.2%}")
bar_labels = {"pct_nulls": "Percent of NULL-revenue users", "purchase_bucket": ""}

# One bar per purchase bucket; bar height is the number of users.
fig = px.bar(
    data_frame=null_revenue_overview_df,
    x='purchase_bucket',
    y='users',
    text=bar_text,
    title='Percentage of Users with Null Revenue by Purchase Bucket',
    labels=bar_labels,
)

# Thousands-separated integer ticks and an explicit title on the y axis.
fig.update_layout(
    yaxis_tickformat=',.0f',
    yaxis_title='Number of Users',
)
fig.show()

In [19]:
pie_labels = {"pct_nulls": "Percent of NULL-revenue users", "purchase_bucket": ""}

# Pie chart with one slice per purchase bucket, sized by the number of users.
fig_pie = px.pie(
    data_frame=null_revenue_overview_df,
    names='purchase_bucket',
    values='users',
    title='Percentage of Users with Null Revenue by Purchase Bucket',
    labels=pie_labels,
)

# Each slice shows its bucket label and, on a second line, its share
# formatted as a percentage with two decimals.
fig_pie.update_traces(texttemplate='%{label}<br>%{percent:.2%}')
fig_pie.show()


Most null-revenue cases correspond to users with 0 purchases. In these cases, revenue should be 0 rather than NULL. In the remaining cases a purchase has occurred, so further investigation of these cases is needed.