In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from google.cloud import bigquery

In [50]:
def estimate_bigquery_query_cost(bq_client, query):
    
    job_config = bigquery.QueryJobConfig()
    job_config.dry_run = True
    job_config.use_query_cache = False
    query_job = bq_client.query(

        (
           query
        ),
        job_config=job_config,
    )
    
    cost_euros = (query_job.total_bytes_processed / 1024 ** 4) * 6

    print(f"{query_job.total_bytes_processed} bytes will be processed , cost ~{cost_euros}$")

In [3]:
bigquery_client= bigquery.Client(project="ingka-energy-analytics-dev") # ingka-energy-solar-dev

In [51]:
# Download query results.
query_string = """
    SELECT 
        internal_search_type
        , count(1) as count_cat
    FROM `ingka-web-analytics-prod.web_data_v2.hits_events_and_pages` 
    WHERE 
        date_hit = '2022-07-02' 
        AND internal_search_type is not Null
    GROUP BY internal_search_type
    ORDER BY count_cat DESC
    LIMIT 1000
"""

estimate_bigquery_query_cost(bigquery_client, query_string) 

2510418036 bytes will be processed , cost ~0.013699271417863201$


In [31]:
df = (
    bigquery_client.query(query_string)
    .result()
    .to_dataframe(
        # Optionally, explicitly request to use the BigQuery Storage API. As of
        # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
        # API is used by default.
        create_bqstorage_client=True,
    )
)

df

Unnamed: 0,internal_search_type,count_cat
0,hard_search,12228316
1,autocomplete,1847696
2,product_recent,448228
3,product_suggest,325364
4,related_searches,258568
5,category_suggest,192140
6,historical_searches,101270
7,popular_searches,30281
8,content_suggest,19775
9,planner_suggest,17032


In [27]:
indexing = df['internal_search_type'].str.contains('search') & ~df['internal_search_type'].str.contains('search').isna()
df.loc[indexing , 'event_action']

9                                            hard_search
57                                        search_actions
84                                      related_searches
102                               stock_check_search_box
149                                  historical_searches
171                                     popular_searches
180                                    loc_search_result
189                             collect_check_search_box
222                                     refined_searches
257                                      visual_searches
336                                               search
538                                       product_search
833        https://app.ltl.xpo.com/appjs/tracking/search
988    https://www.seeacareerwithus.com/search-jobs#s...
Name: event_action, dtype: object

In [53]:
# Download query results.
query_string = """
SELECT *
FROM (
    SELECT 
        date_hit
        , visitor_id
        , session_id
        , website_market_short
        , website_language_short
        , STRING_AGG(DISTINCT internal_search_term) as internal_search_terms
        -- , page_title
    FROM `ingka-web-analytics-prod.web_data_v2.hits_events_and_pages` 
    WHERE 
        date_hit = '2022-07-02' 
        AND website_market_short in ('us', 'se')
        AND event_category like '%search%'
        AND internal_search_type = 'hard_search'
        AND internal_search_term is not Null
    GROUP BY
        date_hit
        , visitor_id
        , session_id
        , website_market_short
        , website_language_short
    ORDER BY visitor_id
    LIMIT 1000
) searches
WHERE 
    "solar" in UNNEST(searches.internal_search_terms)
    
"""

estimate_bigquery_query_cost(bigquery_client, query_string) 

24252107253 bytes will be processed , cost ~0.13234297832059383$


In [37]:
df = (
    bigquery_client.query(query_string)
    .result()
    .to_dataframe(
        # Optionally, explicitly request to use the BigQuery Storage API. As of
        # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
        # API is used by default.
        create_bqstorage_client=True,
    )
)

df

Unnamed: 0,hit_timestamp,visitor_id,session_id,event_category,website_market_short,website_language_short,page_title,internal_search_type,internal_search_term
0,2022-07-02 07:24:08+00:00,1000001759760574127,10000017597605741271656746648,search,se,sv,hemlagad - Sök - IKEA,hard_search,hemlagad
1,2022-07-02 19:29:09+00:00,100000335093935525,1000003350939355251656790149,search,pl,pl,komoda - Wyszukiwarka - IKEA,hard_search,komoda
2,2022-07-02 09:28:01+00:00,1000005307427983223,10000053074279832231656754039,search,ch,en,mattress - Search - IKEA,hard_search,mattress
3,2022-07-02 09:28:27+00:00,1000005307427983223,10000053074279832231656754039,search,ch,en,bookcase - Search - IKEA,hard_search,bookcase
4,2022-07-02 09:35:01+00:00,1000005307427983223,10000053074279832231656754039,search,ch,en,knife block - Search - IKEA,hard_search,knife block
...,...,...,...,...,...,...,...,...,...
995,2022-07-02 13:32:27+00:00,1001302709673934291,10013027096739342911656768427,search,gb,en,plant - Search - IKEA,hard_search,plant
996,2022-07-02 13:32:31+00:00,1001302709673934291,10013027096739342911656768427,search,gb,en,plant - Search - IKEA,hard_search,plant
997,2022-07-02 01:58:17+00:00,1001308860064116925,10013088600641169251656727093,search,ca,en,risbyn - Search - IKEA,hard_search,risbyn
998,2022-07-02 08:04:02+00:00,1001322698451712980,10013226984517129801656749011,search,ro,ro,ceas de perete - Caută - IKEA,hard_search,ceas de perete
