This is for the Power BI API

Admin - Reports GetReportsAsAdmin

https://learn.microsoft.com/en-us/rest/api/power-bi/admin/reports-get-reports-as-admin



In [44]:
import requests
import json
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit, from_json, when, isnotnull, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BooleanType
import logging
from typing import Dict, List, Optional
from delta.tables import DeltaTable
import random
import time
from datetime import datetime

StatementMeta(, 91348d4f-537b-489b-87ac-8aad18f36676, 46, Finished, Available, Finished)

In [45]:
# Name of the Lakehouse (Delta) table that receives the extracted report list.
lakehouseTableName = "pbi_admin_reports"

# Power BI REST endpoint: Admin - Reports GetReportsAsAdmin.
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
url = "https://api.powerbi.com/v1.0/myorg/admin/reports"

# Schema of the DataFrame written to the Lakehouse.
# Every field is nullable, because the API may omit a field for a given report.
# The date/time fields use TimestampType so the Lakehouse stores real timestamps
# rather than strings; their values must be converted to Python datetimes before
# the DataFrame is created.
# "extraction_timestamp" is not an API field: it is stamped onto every row by
# this notebook.
schema = StructType([
    StructField("appId", StringType(), True),
    StructField("createdBy", StringType(), True),
    StructField("createdDateTime", TimestampType(), True),
    StructField("datasetId", StringType(), True),
    StructField("description", StringType(), True),
    StructField("embedUrl", StringType(), True),
    StructField("id", StringType(), True),
    StructField("modifiedBy", StringType(), True),
    StructField("modifiedDateTime", TimestampType(), True),
    StructField("name", StringType(), True),
    StructField("originalReportId", StringType(), True),
    StructField("reportType", StringType(), True),
    StructField("webUrl", StringType(), True),
    StructField("workspaceId", StringType(), True),
    StructField("extraction_timestamp", TimestampType(), True),
])


StatementMeta(, 91348d4f-537b-489b-87ac-8aad18f36676, 47, Finished, Available, Finished)

In [46]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Ordered list of the field names declared in the schema above.
# This will be used:
#   As the list of "required fields" that every row must carry
#   To keep the keys of each normalized row in the same order as the schema
# A list (rather than a set) is used on purpose: a set does not keep a
# defined order, while a list preserves the order in which the schema
# declares its fields.
columns = [field.name for field in schema]

# ==================================================================================================
# The reason for this function, is that I wanted the Lakehouse to actually use a date/time datatype for dates and times rather than just strings.
# Because of that, the dataframes used to put data into the Lakehouse need to have their schema be something other than all StringType()
# TimestampType() works, but expects the string date to be able to convert by using something like to_timestamp()
# to_timestamp() has expectations as well.
# The string date coming out of the API is not in a good and standard format for these things.
# So, this function is to try and clean up the data BEFORE it gets to the dataframes
# It uses standard python functions to try and clean up the strings. Like:
#   Removing "Z" at the end of the date string (00:00:00.00Z -> 00:00:00.00)
#   Making sure the fraction of seconds:
#     Is 6 numbers long, so it pads zeros (00:00:00.00 -> 00:00:00.000000)
#     Exists by adding in missing zeros (00:00:00 -> 00:00:00.000000)
# Finally, it converts that cleaned up string to a python datetime type and returns it.
def clean_for_timestamp(dtstring):
    """Convert an API date/time string into a naive Python ``datetime``.

    The string is expected to look like ``YYYY-MM-DDTHH:MM:SS[.fraction][Z]``.
    Any trailing "Z" is removed, and the fractional seconds are normalized to
    exactly six digits so that ``%f`` can parse them:

    * a missing fraction becomes ``000000``
    * a short fraction is right-padded with zeros (``.64`` -> ``.640000``)
    * a fraction longer than six digits is truncated to microseconds
      (``.1234567`` -> ``.123456``), because ``%f`` accepts at most six digits

    Args:
        dtstring: The date/time string, or ``None``.

    Returns:
        The parsed ``datetime`` (without tzinfo), or ``None`` when ``dtstring``
        is ``None`` or an empty string.

    Raises:
        ValueError: If the cleaned string does not match
            ``%Y-%m-%dT%H:%M:%S.%f``.
    """
    # Treat both None and "" as "no value" instead of failing inside strptime.
    if not dtstring:
        return None

    parts = dtstring.rstrip("Z").split(".")
    fraction = parts[1] if len(parts) > 1 else ""

    # Normalize the fraction to exactly six digits (truncate, then pad).
    clean = f"{parts[0]}.{fraction[:6].ljust(6, '0')}"

    return datetime.strptime(clean, "%Y-%m-%dT%H:%M:%S.%f")
# ==================================================================================================
# This function is used to prepare the response data by converting strings to types before loading a dataframe
def clean_response_data(input_list):
    """Normalize API rows and convert their timestamp strings to datetimes.

    Each item of ``input_list`` is a dictionary (one row returned by the API).
    For every row a new dictionary is built that contains exactly the keys
    listed in ``columns``; a key the API did not return is filled with None.
    The value of every field that the schema declares as TimestampType is then
    passed through ``clean_for_timestamp``.

    Returns a new list of dictionaries; the input dictionaries are not modified.
    """
    # Names of the schema fields that need conversion.
    # For now, TimestampType is the only non-string type being handled.
    timestamp_names = [
        struct_field.name
        for struct_field in schema
        if isinstance(struct_field.dataType, TimestampType)
    ]

    rows = []
    for record in input_list:
        # Sometimes an API might not return all possible fields.
        row = {column: record.get(column) for column in columns}
        for name in timestamp_names:
            row[name] = clean_for_timestamp(row[name])
        rows.append(row)

    return rows

StatementMeta(, 91348d4f-537b-489b-87ac-8aad18f36676, 48, Finished, Available, Finished)

In [47]:
# Keeping the actual API call in a separate cell
# That way, during debugging, you can skip running this cell and prevent hitting an API call limit
# Just manually run each cell needed and skip this one
# The data in the response variable will stay there for a while (until the session goes idle??)

token = notebookutils.credentials.getToken("pbi")
headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}

# Without a timeout, requests waits forever on an unresponsive server.
# The tuple is (connect timeout, read timeout) in seconds.
response = requests.get(url, headers=headers, timeout=(10, 120))

# Fail here, with the HTTP status in the error, instead of later when the
# body of an error response turns out not to contain the expected data.
response.raise_for_status()

StatementMeta(, 91348d4f-537b-489b-87ac-8aad18f36676, 49, Finished, Available, Finished)

In [48]:
# Stop early on an HTTP error (4xx/5xx). Otherwise an error body would only
# show up below as a confusing KeyError on "value".
response.raise_for_status()

# response.json() returns a dictionary {}
response_dictionary = response.json()

# get the "value" item from the dictionary. "value" is a list of dictionaries [{},{},{},...]
response_list = response_dictionary["value"]

# Take the extraction time once, so every row of this run carries exactly the
# same value (calling datetime.now() per row can straddle a second boundary).
extraction_timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
for item in response_list:
    item["extraction_timestamp"] = extraction_timestamp

# clean up and convert any data while keeping it list/dictionary form
cleaned_data = clean_response_data(response_list)

# create the dataframe
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(cleaned_data, schema=schema)

# Show the dataframe or overwrite the table in the lakehouse.
#df.show()
logger.info("Writing %d rows to %s", len(cleaned_data), lakehouseTableName)
df.write.mode("overwrite").format("delta").saveAsTable(lakehouseTableName)


StatementMeta(, 91348d4f-537b-489b-87ac-8aad18f36676, 50, Finished, Available, Finished)