## Grab Power BI usage data from REST API */admin*

- Purpose: 
    - query the timestamp of the latest data point from the staging table,
    - obtain a service principal OAuth token,
    - for each day from the last data point until today: gather transactional JSON data (paginated),
    - concatenate and process the data,
    - insert the data into the interface (staging) table,
    - add loading success/failure and stats to the ETL log table.
- Author: vsm
- Date: 2025-06-30
- Team: GS - BI/ERP

### Requirements

The Python package `pyodbc` needs an ODBC driver manager preinstalled on Linux (e.g., on Ubuntu: `sudo apt install unixodbc`)
as well as the actual Microsoft ODBC Driver 17 for SQL Server:
```bash
# 1. Get underlying packages
sudo apt-get update
sudo apt-get install -y curl apt-transport-https software-properties-common

# 2. Save Microsoft GPG-Key
curl -sSL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/keyrings/microsoft.gpg > /dev/null

# 3. Add repository (for Ubuntu 22.04 "jammy")
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/microsoft.gpg] https://packages.microsoft.com/ubuntu/22.04/prod jammy main" | sudo tee /etc/apt/sources.list.d/mssql-release.list

# 4. Reload package list
sudo apt-get update

# 5. Install
sudo ACCEPT_EULA=Y apt-get install -y msodbcsql17 unixodbc-dev

```

In [None]:
import requests
import json
import hashlib
import pyodbc
from datetime import datetime, timedelta, timezone

from dotenv import dotenv_values

import sys
import logging

# Drop handlers left over from an earlier run of this cell; otherwise
# logging.basicConfig() below would be a no-op on an already configured root.
for handler in list(logging.root.handlers):
    logging.root.removeHandler(handler)

logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format="%(asctime)s %(message)s",
)

# All settings are read from the .env file in the working directory.
env_dict = dotenv_values("./.env")

# OAuth and REST endpoint settings. A missing key raises KeyError right here.
API_CREDS = {
    key: env_dict[key]
    for key in ("TENANT_ID", "CLIENT_ID", "CLIENT_SECRET", "SCOPE", "OAUTH_URL", "REST_URL")
}
# The "$TENANT_ID" placeholder inside OAUTH_URL is replaced by the tenant id.
API_CREDS["OAUTH_URL"] = API_CREDS["OAUTH_URL"].replace('$TENANT_ID', API_CREDS["TENANT_ID"])

# Service name, table names and the JSON key that holds the event timestamp.
BI_META = {
    key: env_dict[key]
    for key in ("BI_SERVICE_NAME", "BI_STAGING_TABLE", "BI_LOG_TABLE", "BI_INGEST_TS")
}

# SQL Server connection settings.
SQL_CREDS = {
    key: env_dict[key]
    for key in ("SQL_SERVER", "SQL_PORT", "SQL_DB", "SQL_SCHEMA", "SQL_USER", "SQL_PWD")
}

logging.info("Loading Modules")


In [None]:
# === SQL Server Connection ===
def get_sql_conn():
    """Open a pyodbc connection to the configured SQL Server database.

    The connection string is assembled from the module-level ``SQL_CREDS``
    mapping and names the "ODBC Driver 17 for SQL Server" driver.

    Returns:
        The connection object returned by ``pyodbc.connect``.
    """
    logging.info("Connecting to DB")
    server = SQL_CREDS['SQL_SERVER']
    port = SQL_CREDS['SQL_PORT']
    conn_parts = [
        "DRIVER={ODBC Driver 17 for SQL Server}",
        f"SERVER={server},{port}",
        f"PORT={port}",
        f"DATABASE={SQL_CREDS['SQL_DB']}",
        f"UID={SQL_CREDS['SQL_USER']}",
        f"PWD={SQL_CREDS['SQL_PWD']}",
    ]
    # Segments are ';'-separated, with no trailing separator.
    return pyodbc.connect(";".join(conn_parts))

# === Get latest timestamp ===
def get_latest_timestamp(conn):
    """Return the newest ``event_time`` stored in the staging table.

    The query falls back to '2025-06-27' when ``MAX(event_time)`` is NULL;
    if no row is returned at all, ``datetime(2025, 6, 27)`` is returned.

    Args:
        conn: Open database connection providing ``cursor()``.

    Returns:
        The first column of the single result row, or the fallback datetime.
    """
    logging.info("Getting maximum event timestamp")
    cur = conn.cursor()
    staging_table = f"{SQL_CREDS['SQL_SCHEMA']}.{BI_META['BI_STAGING_TABLE']}"
    query = f"SELECT COALESCE(MAX(event_time), '2025-06-27') FROM {staging_table}"
    cur.execute(query)
    first_row = cur.fetchone()
    if first_row:
        return first_row[0]
    return datetime(2025, 6, 27)

# === Get Service Principal Token ===
def get_access_token():
    """Request an OAuth access token via the client-credentials grant.

    Posts the client id, client secret and scope from ``API_CREDS`` to
    ``API_CREDS['OAUTH_URL']``.

    Returns:
        The value of the ``access_token`` field of the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the endpoint does not answer in time.
        KeyError: If the response contains no ``access_token`` field.
    """
    logging.info("Getting OAuth")
    payload = {
        'grant_type': 'client_credentials',
        'client_id': API_CREDS['CLIENT_ID'],
        'client_secret': API_CREDS['CLIENT_SECRET'],
        'scope': API_CREDS['SCOPE'],
    }
    # requests has no default timeout: without one a stalled endpoint would
    # block the ETL run forever. Values are (connect, read) seconds.
    res = requests.post(API_CREDS['OAUTH_URL'], data=payload, timeout=(10, 30))
    res.raise_for_status()
    return res.json()['access_token']

# === Hash JSON row ===
def hash_row(row):
    """Return the SHA-256 hex digest of *row* serialised as key-sorted JSON.

    Sorting the keys makes the digest independent of dict insertion order.
    """
    canonical_json = json.dumps(row, sort_keys=True)
    digest = hashlib.sha256()
    digest.update(canonical_json.encode())
    return digest.hexdigest()

# === Fetch paginated PowerBI usage data ===
def fetch_usage_data(date_str, hour_str, token):
    """Download all activity events for one day, following pagination.

    The first request covers ``{date_str}T{hour_str}:00:00Z`` up to
    ``{date_str}T23:59:59Z``. Each response contributes its
    ``activityEventEntities`` list; paging continues for as long as the
    response carries a non-empty ``continuationUri``.

    Args:
        date_str: Day to query, formatted as ``YYYY-MM-DD``.
        hour_str: Two-digit start hour within that day.
        token: Bearer token sent in the ``Authorization`` header.

    Returns:
        A list with the event dicts of all pages, in the order received.

    Raises:
        requests.HTTPError: If any page answers with an error status.
        requests.Timeout: If any page does not answer in time.
    """
    logging.info("Getting REST Data")
    rows = []
    url = f"{API_CREDS['REST_URL']}?startDateTime='{date_str}T{hour_str}:00:00Z'&endDateTime='{date_str}T23:59:59Z'"
    logging.info(f"""Getting REST Data for url:\n"""
                 f"""{url}""")
    headers = {"Authorization": f"Bearer {token}"}
    counter = 1
    while url:
        logging.info(f"Page_{counter:02}")
        # requests has no default timeout: without one a single stalled page
        # would block the ETL run forever. Values are (connect, read) seconds.
        resp = requests.get(url, headers=headers, timeout=(10, 120))
        resp.raise_for_status()
        data = resp.json()
        rows.extend(data.get('activityEventEntities', []))
        # A missing or empty continuation URI ends the loop.
        url = data.get('continuationUri')
        counter += 1

    return rows

# === Insert new data (with hashdiff) ===
def insert_new_data(conn, rows):
    """Insert rows that are not yet in the staging table and commit.

    Every row is hashed with ``hash_row``; rows whose hash already exists in
    the staging table are counted as duplicates and skipped. Rows without a
    usable timestamp under the ``BI_META['BI_INGEST_TS']`` key are skipped
    with a warning.

    Timestamps are normalised to naive UTC datetimes, so they compare safely
    with each other and with the naive initial maximum regardless of whether
    the source string carries a "Z" or an explicit offset.

    Args:
        conn: Open database connection providing ``cursor()`` and ``commit()``.
        rows: Iterable of event dicts as returned by the REST API.

    Returns:
        Tuple ``(inserted_count, max_tf_ts)``: the number of inserted rows and
        the largest inserted timestamp (``datetime(1900, 1, 1)`` when nothing
        was inserted).
    """
    logging.info("Inserting relevant data to DB")
    inserted_count = 0
    duplicate_count = 0

    ts_key = BI_META['BI_INGEST_TS']
    table = f"{SQL_CREDS['SQL_SCHEMA']}.{BI_META['BI_STAGING_TABLE']}"
    select_sql = f"SELECT 1 FROM {table} WHERE hash = ?"
    insert_sql = f"INSERT INTO {table} (event_time, data_json, hash) VALUES (?, ?, ?)"

    cursor = conn.cursor()

    max_tf_ts = datetime(1900, 1, 1)

    for row_number, row in enumerate(rows, start=1):
        tf_ts_str = row.get(ts_key)
        if not tf_ts_str:
            logging.warning(f"JSON payload does not contain {BI_META['BI_INGEST_TS']} key")
            continue  # skip if missing transaction timestamp

        try:
            # datetime.fromisoformat() accepts a trailing "Z" only from
            # Python 3.11 on; spell it as an explicit UTC offset instead.
            if tf_ts_str[-1:] in ("Z", "z"):
                iso_str = tf_ts_str[:-1] + "+00:00"
            else:
                iso_str = tf_ts_str
            tf_ts = datetime.fromisoformat(iso_str)
        except (TypeError, ValueError):
            # row_number is the position in the payload, not the insert count.
            logging.warning(f"Row:{row_number}: Timestamp {BI_META['BI_INGEST_TS']} malformed")
            continue  # skip malformed timestamp

        # Aware and naive datetimes cannot be compared; convert to naive UTC.
        if tf_ts.tzinfo is not None:
            tf_ts = tf_ts.astimezone(timezone.utc).replace(tzinfo=None)

        hash_val = hash_row(row)
        data_str = json.dumps(row)

        # Skip duplicates
        cursor.execute(select_sql, (hash_val,))
        if cursor.fetchone():
            duplicate_count += 1
            continue

        # Insert new record
        cursor.execute(insert_sql, (tf_ts, data_str, hash_val))
        inserted_count += 1

        # Track the newest inserted timestamp
        if tf_ts > max_tf_ts:
            max_tf_ts = tf_ts

    conn.commit()
    logging.info(f"{inserted_count} rows inserted to DB, {duplicate_count} duplicates skipped")
    return inserted_count, max_tf_ts

# === Log ETL run result ===
def log_etl_result(conn, success, inserted_rows, max_ts):
    """Write one record describing this ETL run to the log table and commit.

    Args:
        conn: Open database connection providing ``cursor()`` and ``commit()``.
        success: Outcome flag stored in the ``success`` column.
        inserted_rows: Value stored in the ``inserted_rows`` column.
        max_ts: Value stored in the ``max_event_time`` column.
    """
    logging.info("Logging ETL run to DB")
    cur = conn.cursor()
    log_table = f"{SQL_CREDS['SQL_SCHEMA']}.{BI_META['BI_LOG_TABLE']}"
    statement = (
        f"INSERT INTO {log_table} "
        "(run_time, service_name, success, inserted_rows, max_event_time) "
        "VALUES (?, ?, ?, ?, ?)"
    )
    # run_time is the current UTC time at the moment of logging.
    values = (
        datetime.now(timezone.utc),
        BI_META['BI_SERVICE_NAME'],
        success,
        inserted_rows,
        max_ts,
    )
    cur.execute(statement, values)
    conn.commit()

In [None]:
# === Main ETL logic ===
def run_etl():
    """Run the full extract/load cycle from the last stored event until today.

    Steps:
      1. Read the newest event timestamp from the staging table.
      2. Obtain an access token.
      3. For every UTC day from that timestamp's date through today, fetch
         the activity events and insert the new ones.
      4. Write a success or failure record to the ETL log table.

    Errors raised during steps 2-4 are logged and recorded, not re-raised.
    The database connection is always closed.
    """
    conn = get_sql_conn()

    try:
        # read maximum timestamp in staging_table
        latest_ts = get_latest_timestamp(conn)
        today = datetime.now(timezone.utc).date()
        start_date = latest_ts.date()
        logging.info(f"Latest timestamp is: {latest_ts}")

        inserted_total = 0
        max_data_ts = latest_ts

        try:
            token = get_access_token()

            for day in range((today - start_date).days + 1):
                # Only the first day resumes at the hour of the last stored
                # event; every later day must be fetched from midnight,
                # otherwise its early hours would never be loaded.
                hour_str = f"{latest_ts.hour:02d}" if day == 0 else "00"

                date_str = (start_date + timedelta(days=day)).isoformat()

                logging.info(f"Daily extract for: {date_str}, with offset {hour_str} hours.")
                daily_rows = fetch_usage_data(date_str, hour_str, token)

                if daily_rows:
                    inserted, max_tf_ts = insert_new_data(conn, daily_rows)
                    inserted_total += inserted
                    if max_tf_ts and max_tf_ts > max_data_ts:
                        max_data_ts = max_tf_ts

            log_etl_result(conn, True, inserted_total, max_data_ts)
            logging.info(f"✅ Success: Inserted {inserted_total} records.")

        except Exception as e:
            # Log first so the original error is never lost, traceback included.
            logging.exception(f"❌ Failure: {e}")
            try:
                # Discard rows of a half-finished batch; otherwise the commit
                # inside log_etl_result() would persist them and the logged
                # row count would not match the table content.
                conn.rollback()
                log_etl_result(conn, False, inserted_total, max_data_ts)
            except Exception:
                logging.exception("Could not write the failure record to the ETL log table")

    finally:
        conn.close()

In [None]:
# Execute the ETL pipeline when this cell is run.
run_etl()