In [3]:
from google.cloud import bigquery, bigquery_storage_v1
import pandas as pd, plotly.express as px

In [16]:
key_path = "../credentials/dbt_service_account.json"
client = bigquery.Client.from_service_account_json(key_path)

### 1. Shard summary

In [24]:
shards = client.query(open("../analysis/01_shard_inventory.sql").read()).to_dataframe()
shards["size_cum_gb"] = shards["size_mb"].cumsum() / 1024
fig = px.line(shards, x="shard", y="size_cum_gb", title="Cumulative table size (GB)")
fig.update_layout(
    xaxis_title="Shard",
    yaxis_title="Cumulative Size (GB)",
    xaxis_tickangle=-45,
    yaxis=dict(tickformat=".2f")
    )
fig.show()

In [25]:
# Number of shards
print(f"Number of shards: {len(shards)}")
# Total size in GB
print(f"Total size: {shards['size_cum_gb'].max():.2f} GB")
# Average size per shard in GB
print(f"Average size per shard: {shards['size_cum_gb'].mean():.2f} GB")
# Largest shard
largest_shard = shards.loc[shards['size_cum_gb'].idxmax()]
print(f"Largest shard: {largest_shard['shard']} with size {largest_shard['size_cum_gb']:.2f} GB")
# Smallest shard
smallest_shard = shards.loc[shards['size_cum_gb'].idxmin()]
print(f"Smallest shard: {smallest_shard['shard']} with size {smallest_shard['size_cum_gb']:.2f} GB")

Number of shards: 92
Total size: 3.50 GB
Average size per shard: 1.88 GB
Largest shard: events_20210131 with size 3.50 GB
Smallest shard: events_20201101 with size 0.02 GB


### 2. Column dictionary

In [19]:
cols = client.query(open("../analysis/02_column_dictionary.sql").read()).to_dataframe()
cols

Unnamed: 0,column_name,data_type
0,app_info,"STRUCT<id STRING, version STRING, install_stor..."
1,device,"STRUCT<category STRING, mobile_brand_name STRI..."
2,ecommerce,"STRUCT<total_item_quantity INT64, purchase_rev..."
3,event_bundle_sequence_id,INT64
4,event_date,STRING
5,event_dimensions,STRUCT<hostname STRING>
6,event_name,STRING
7,event_params,"ARRAY<STRUCT<key STRING, value STRUCT<string_v..."
8,event_previous_timestamp,INT64
9,event_server_timestamp_offset,INT64


### 3. Event histogram

In [22]:
events = client.query(open("../analysis/03_event_histogram.sql").read()).to_dataframe()
px.bar(events, x="event_name", y="approx_hits", title="Event Inventory").show()

In [27]:
events.event_name.to_list()

['page_view',
 'user_engagement',
 'scroll',
 'view_item',
 'session_start',
 'first_visit',
 'view_promotion',
 'add_to_cart',
 'begin_checkout',
 'select_item',
 'view_search_results',
 'add_shipping_info',
 'add_payment_info',
 'select_promotion',
 'purchase',
 'click',
 'view_item_list']