In [59]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from google.cloud import bigquery

In [2]:
def estimate_bigquery_query_cost(bq_client, query):
    
    job_config = bigquery.QueryJobConfig()
    job_config.dry_run = True
    job_config.use_query_cache = False
    query_job = bq_client.query(

        (
           query
        ),
        job_config=job_config,
    )
    
    cost_euros = (query_job.total_bytes_processed / 1024 ** 4) * 6

    print(f"{query_job.total_bytes_processed} bytes will be processed , cost ~{cost_euros}$")

In [3]:
bigquery_client= bigquery.Client(project="ingka-energy-analytics-dev") # ingka-energy-solar-dev

## Landings data

In [57]:
# Visited pages
query_string = """
SELECT 
    date_hit
    , visitor_id
    , session_id
    , page_urls
FROM (
    SELECT 
    date_hit
    , visitor_id
    , session_id
    , page_urls
    , EXISTS(SELECT * FROM UNNEST(page_urls) AS x WHERE REGEXP_CONTAINS(x, ".+?clean-energy.+")) as exist
    FROM (
        SELECT 
            date_hit
            , visitor_id
            , session_id
            , ARRAY_AGG(page_url) as page_urls
        FROM `ingka-web-analytics-prod.web_data_v2.hits_events_and_pages` 
        WHERE 
            date_hit = '2022-07-01' 
            -- AND website_market_short in ('se')
            AND event_category like '%page%'
            AND page_url is not NULL
        GROUP BY
            date_hit
            , visitor_id
            , session_id
        ORDER BY visitor_id
    ) pages
)
WHERE exist is TRUE
LIMIT 1000
"""

estimate_bigquery_query_cost(bigquery_client, query_string) 

35755206733 bytes will be processed , cost ~0.19511502650675538$


## Searches data

In [61]:
# Get SQL string filter condition
product_string_queries = [
    'solar', 'sol', 'sunpower', 'svea', 'solstrale', 'SOLSTRÅLE', 'solpaneler',
]
string_exp = ' OR '.join([f'"{exp.lower()}" in UNNEST(internal_search_terms)' for exp in product_string_queries])
print(string_exp,'\n')


# Download query results.
query_string = """
SELECT *
FROM (
    SELECT 
        date_hit
        , visitor_id
        , session_id
        , website_market_short
        , website_language_short
        , ARRAY_AGG(DISTINCT LOWER(internal_search_term)) as internal_search_terms
    FROM `ingka-web-analytics-prod.web_data_v2.hits_events_and_pages` 
    WHERE 
        date_hit >= '2022-07-02' 
        AND website_market_short in ('us','se')
        AND event_category like '%search%'
        AND internal_search_type = 'hard_search'
        AND internal_search_term is not Null
    GROUP BY
        date_hit
        , visitor_id
        , session_id
        , website_market_short
        , website_language_short
    ORDER BY visitor_id
) searches
WHERE 
    {exp}
""".format(exp=string_exp)

estimate_bigquery_query_cost(bigquery_client, query_string) 

"solar" in UNNEST(internal_search_terms) OR "sol" in UNNEST(internal_search_terms) OR "sunpower" in UNNEST(internal_search_terms) OR "svea" in UNNEST(internal_search_terms) OR "solstrale" in UNNEST(internal_search_terms) OR "solstråle" in UNNEST(internal_search_terms) OR "solpaneler" in UNNEST(internal_search_terms) 

294315395349 bytes will be processed , cost ~1.6060697563207214$


In [17]:
# Write to table
table_id="ingka-energy-analytics-dev.ces_da_playground.searches_temp"

job_config = bigquery.QueryJobConfig(
    allow_large_results=True, destination=table_id, use_legacy_sql=False,
    write_disposition = "WRITE_TRUNCATE"
)


bigquery_client.query(
    query_string, 
    job_config=job_config
).result()

<google.cloud.bigquery.table.RowIterator at 0x17f36b0a0>

In [58]:
job_config = bigquery.QueryJobConfig(
    use_legacy_sql=False
)

df = (
    bigquery_client.query(query_string, job_config)
    .result()
    .to_dataframe(
        # Optionally, explicitly request to use the BigQuery Storage API. As of
        # google-cloud-bigquery version 1.26.0 and above, the BigQuery Storage
        # API is used by default.
        create_bqstorage_client=True,
    )
)

df

BadRequest: 400 Resources exceeded during query execution: The query could not be executed in the allotted memory. Peak usage: 123% of limit.
Top memory consumer(s):
  ORDER BY operations: 99%
  other/unattributed: 1%


Location: EU
Job ID: d7a36c40-02a1-441e-a781-e02b192b843a
