In [None]:
# prep imports
from google.cloud import bigquery
import requests
import datetime
import warnings
import pandas as pd
warnings.filterwarnings('ignore')


In [None]:
# Configuration: BigQuery destination for the query log
project_id = 'atscale'
dataset_id = 'atscale_dataset'
table_queries = 'atscale_queries'
# Fully qualified table id in the form project.dataset.table
table_queries_full = '.'.join([project_id, dataset_id, table_queries])

# Configuration: AtScale API (placeholders; replace before running and keep real tokens out of version control)
hostname = "{https://atscale}"
bearer_token = '{api token }'
# Look-back window in hours used when no previous load exists; the original note says AtScale's maximum is below 168
full_load_hours = 167

In [None]:
# Create the BigQuery client shared by the query and load cells below.
# No arguments are passed (not even `project_id`), so project and credentials
# are left to the library's defaults.
client = bigquery.Client()

In [None]:
# Get the newest main_startTime already stored in BigQuery.
# `lastload` ends up as the newest row, or None when there is nothing to resume from.

sql_query = f'SELECT main_startTime FROM `{table_queries_full}` order by main_startTime DESC limit 1;'

query_job = client.query(sql_query)

# Default first: a table that exists but is empty returns no rows, so the loop
# below would otherwise never assign `lastload`.
lastload = None
try:
    for row in query_job.result():
        lastload = row
except Exception as exc:
    # Best effort, as before: any failure to read the table means "no previous
    # load". `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
    # through, and the reason is printed instead of being silently dropped.
    print(f'Could not read the last load time ({exc!r}); falling back to a full load')
    lastload = None
lastload

In [None]:
# Turn `lastload` into the text that is appended to the API URL as startDate.
# NOTE(review): the two branches produce different text formats (the strftime
# pattern contains a comma, str(datetime) does not) - confirm which one the API expects.
if lastload:
    # Resume from the newest timestamp already stored.
    lastload = lastload['main_startTime'].strftime("%Y-%m-%d, %H:%M:%S.%f")
else:
    # Nothing stored yet: start `full_load_hours` before the current local time.
    window_start = datetime.datetime.now() - datetime.timedelta(hours=full_load_hours)
    lastload = str(window_start)


In [None]:
# Load the query log from AtScale, requesting page after page until the last one.
url = hostname + "/api/queries?showCanaries=false&startDate="+ lastload
print(url)
headers = {
    'Accept': 'application/json',
    'Authorization': 'Bearer ' + bearer_token
}

records = []
# Start at page 1
current_page = 1

while True:
    # NOTE(review): verify=False disables TLS certificate verification while a
    # bearer token is being sent - confirm this is intentional.
    response = requests.get(url, headers=headers, verify=False, params={'page': current_page})
    if response.status_code > 299:
        print(response.text)
        raise SystemExit("Stop notebook due to API error")

    # Parse the body once and reuse it for both the results and the page count.
    payload = response.json()
    records += payload['results']

    # `>=` instead of `==`: if the API reports 0 pages (nothing new since the
    # last load) an equality check never matches and the loop would not end.
    if current_page >= payload['totalPages']:
        break

    current_page += 1

# Show only the count; the full payload is large and may contain user details.
len(records)


In [None]:
# Flatten records -> events -> subqueries into one row per event, then build `df`.

main_fields = ['traceId','queryId','startTime','duration','modelId','catalogId','userId','queryType','catalogName','modelName','user']
subquery_fields = ['name','duration','subqueryId']

rows = []
for data in records:
    for event in data['events']:
        # Work on a copy so the dicts inside `records` are not modified.
        row = dict(event)

        # Copy the parent query's fields onto the row under a 'main_' prefix.
        # A missing field is skipped on its own; previously the first missing
        # field also dropped every field listed after it.
        for main_field in main_fields:
            if main_field in data:
                row['main_' + main_field] = data[main_field]

        subqueries = row.get('subqueries') or []
        if subqueries:
            # NOTE(review): every subquery writes the same 'subquery_*' keys, so
            # when an event has several subqueries only the last one is kept.
            # Confirm whether one row per subquery was intended.
            for subquery in subqueries:
                for subquery_field in subquery_fields:
                    if subquery_field in subquery:
                        row['subquery_' + subquery_field] = subquery[subquery_field]
        else:
            # No subqueries: still create the id column so it exists in the frame.
            row['subquery_subqueryId'] = None

        rows.append(row)

# Nothing new to load: stop here with a clear message instead of failing below
# on columns that do not exist in an empty frame.
if not rows:
    raise SystemExit("Stop notebook: no new query events to load")

df = pd.DataFrame(rows)

# The nested list has been flattened above; errors='ignore' covers the case
# where no event carried a 'subqueries' key at all.
df = df.drop(columns=['subqueries'], errors='ignore')
df = df.rename(columns={'name': 'event_name', 'startTime': 'event_startTime', 'duration': 'event_duration', 'queryId': 'event_queryId'})

# Epoch milliseconds -> datetime.
# NOTE(review): fromtimestamp() without a tz returns naive local time - confirm
# that this matches what the destination table expects.
for time_column in ('main_startTime', 'event_startTime'):
    df[time_column] = df[time_column].apply(lambda x: datetime.datetime.fromtimestamp(x/1000.0))


In [None]:
#Create Big Query Schema
from google.cloud import bigquery
from google.cloud.bigquery.schema import SchemaField
from typing import List

def generate_bigquery_schema(df: pd.DataFrame) -> List[SchemaField]:
    """Build a BigQuery schema from a DataFrame's dtypes and sample values.

    For each column the first non-null value decides the shape: a list makes
    the field REPEATED, and a dict (or a list of dicts) makes it a RECORD whose
    sub-fields are derived recursively via ``pd.json_normalize``. Every other
    column is typed from its NumPy dtype kind.

    Args:
        df: Frame to describe. It may have zero rows; columns are then typed
            from their dtype alone.

    Returns:
        One ``SchemaField`` per column, in column order.

    Raises:
        ValueError: If a column's dtype kind has no BigQuery mapping.
    """
    TYPE_MAPPING = {
        "i": "INTEGER",
        "u": "NUMERIC",
        "b": "BOOLEAN",
        "f": "FLOAT",
        "O": "STRING",
        "S": "STRING",
        "U": "STRING",
        "M": "TIMESTAMP",
    }
    schema = []
    for column, dtype in df.dtypes.items():
        # Sample the first non-null value; a null first row (or an empty frame)
        # must not hide nested data or raise IndexError.
        non_null = df[column].dropna()
        val = non_null.iloc[0] if len(non_null) else None
        mode = "REPEATED" if isinstance(val, list) else "NULLABLE"

        # Guard `val[0]` so an empty list is treated as a plain repeated column.
        holds_records = isinstance(val, dict) or (
            mode == "REPEATED" and len(val) > 0 and isinstance(val[0], dict)
        )
        if holds_records:
            fields = generate_bigquery_schema(pd.json_normalize(val))
        else:
            fields = ()

        if fields:
            field_type = "RECORD"
        else:
            field_type = TYPE_MAPPING.get(dtype.kind)
            if field_type is None:
                # Fail with a readable message rather than passing None along.
                raise ValueError(
                    f"No BigQuery type mapping for column {column!r} with dtype {dtype}"
                )

        schema.append(
            SchemaField(
                name=column,
                field_type=field_type,
                mode=mode,
                fields=fields,
            )
        )
    return schema
bigquery_schema = generate_bigquery_schema(df)


In [None]:
# Write the query log records to BigQuery.

job_config = bigquery.LoadJobConfig(
    # Explicit schema generated from the DataFrame, so column types are not
    # left to inference.
    schema=bigquery_schema,
    # Append to the existing table; this is what lets the next run resume from
    # the newest stored main_startTime.
    write_disposition="WRITE_APPEND",
)

job = client.load_table_from_dataframe(
    df, table_queries_full, job_config=job_config
)  # Make an API request.
job.result()  # Wait for the job to complete.

table = client.get_table(table_queries_full)  # Make an API request.
# With WRITE_APPEND, table.num_rows is the table's total, not what this run
# added, so report the job's own row count and the total separately.
print(
    "Loaded {} rows to {} (table now has {} rows and {} columns)".format(
        job.output_rows, table_queries_full, table.num_rows, len(table.schema)
    )
)


In [None]:
# Remove the `df` name from the notebook namespace; the DataFrame can then be
# freed if nothing else still references it.
del df