# Reverse ETL and Personalization Model Building

In [None]:
# Install the Azure SDK packages imported by the next cell.
# %pip (rather than !pip) targets the environment of the running kernel.
%pip install azure-cosmos
%pip install azure-core

In [None]:
#Imports and config values
import base64, json
from typing import Any, Optional

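# from azure.cosmos.aio import CosmosClient  # TODO: decide whether the async (aio) client is needed; the synchronous client imported below is what this notebook uses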
from azure.cosmos import CosmosClient, PartitionKey, ThroughputProperties
from azure.core.credentials import TokenCredential, AccessToken


# Connection settings for the Cosmos DB container this notebook reads and patches.
COSMOS_ENDPOINT = '' # The Cosmos DB endpoint, found in Settings (gear icon) > Connection. Left blank here; fill it in before creating the client.
COSMOS_DATABASE_NAME = 'fc_commerce_cosmos'  # Database name passed to get_database_client()
COSMOS_CONTAINER_NAME = 'customers'  # Container name passed to get_container_client()

In [None]:
class FabricTokenCredential(TokenCredential):
    """Supply the Fabric notebook's Cosmos DB access token to Azure SDK clients.

    Every get_token() call asks ``notebookutils`` for a token and reports the
    expiry taken from the token's own "exp" claim. This class does not cache,
    schedule refreshes, or retry; it simply fetches a token each time it is
    asked for one.
    """

    # Audience requested from the Fabric runtime on every call. The ``scopes``
    # passed to get_token() are accepted for protocol compatibility but not used.
    _COSMOS_SCOPE = "https://cosmos.azure.com/.default"

    @staticmethod
    def _expiry_from_jwt(jwt: str) -> int:
        """Return the "exp" claim of a JWT as an integer.

        Only the payload segment is decoded; the signature is not verified.

        Raises:
            ValueError: if the token has fewer than two dot-separated segments,
                the payload is not base64url-encoded JSON object, or the "exp"
                claim is missing or not numeric.
        """
        segments = jwt.split(".")
        if len(segments) < 2:
            raise ValueError("Invalid JWT format")

        payload_b64 = segments[1]
        # JWT segments are base64url without "=" padding; restore it for the decoder.
        payload_b64 += "=" * (-len(payload_b64) % 4)

        try:
            raw = base64.urlsafe_b64decode(payload_b64.encode("utf-8"))
            payload = json.loads(raw.decode("utf-8"))
        except ValueError as err:
            # binascii.Error, UnicodeError and JSONDecodeError are all ValueError
            # subclasses; surface them under one descriptive message.
            raise ValueError("Invalid JWT payload") from err

        if not isinstance(payload, dict):
            raise ValueError("Invalid JWT payload")

        exp = payload.get("exp")
        if exp is None:
            raise ValueError("exp claim missing in token")
        try:
            # Coerce to int so a float or numeric-string claim is not passed on as-is.
            return int(exp)
        except (TypeError, ValueError) as err:
            raise ValueError("exp claim is not numeric") from err

    def get_token(self, *scopes: str, claims: Optional[str] = None, tenant_id: Optional[str] = None,
                  enable_cae: bool = False, **kwargs: Any) -> AccessToken:
        """Fetch a Cosmos DB token from the Fabric runtime.

        Args:
            *scopes, claims, tenant_id, enable_cae, **kwargs: accepted so the
                signature matches what SDK clients pass; none of them influence
                the request, which always uses the Cosmos DB audience.

        Returns:
            AccessToken carrying the raw token and its "exp" claim as ``expires_on``.

        Raises:
            ValueError: if the returned token cannot be parsed (see _expiry_from_jwt).
        """
        access_token = notebookutils.credentials.getToken(self._COSMOS_SCOPE)
        return AccessToken(token=access_token, expires_on=self._expiry_from_jwt(access_token))

In [None]:
# Fail fast when the endpoint placeholder in the config cell was left blank,
# instead of handing an empty URL to the SDK.
if not COSMOS_ENDPOINT:
    raise ValueError("COSMOS_ENDPOINT is empty. Set it to your Cosmos DB endpoint before running this cell.")

# Account-level client, authenticated through the Fabric token credential
COSMOS_CLIENT = CosmosClient(COSMOS_ENDPOINT, FabricTokenCredential())

# Database client for COSMOS_DATABASE_NAME
DATABASE_CLIENT = COSMOS_CLIENT.get_database_client(COSMOS_DATABASE_NAME)

# Container client for COSMOS_CONTAINER_NAME; every query and patch below goes through it
CONTAINER_CLIENT = DATABASE_CLIENT.get_container_client(COSMOS_CONTAINER_NAME)
     

In [None]:
import pandas as pd

# Per-customer summary of the `recommendations` array on each document:
# number of recommendation sets, total recommended menu items, average score
# (rounded to 4 places) and the earliest expiresAt value.
queryText = """SELECT c.customerId,
        (SELECT VALUE COUNT(1) FROM r IN c.recommendations) AS recommendationSets,
        (SELECT VALUE COUNT(1) FROM r IN c.recommendations JOIN mi IN r.menuItems) AS totalRecommendedItems,
        (SELECT VALUE ROUND(AVG(r.score), 4) FROM r IN c.recommendations) AS avgRecScore,
        (SELECT VALUE MIN(r.expiresAt) FROM r IN c.recommendations) AS nextExpiryUtc
       FROM customers c"""

results = CONTAINER_CLIENT.query_items(
    query=queryText,
    parameters=[],
    enable_cross_partition_query=True,
)

# Materialise the paged iterator once; everything below works on this list.
results_list = list(results)

# Spark cannot infer a schema from an empty list, so report that case explicitly
# rather than surfacing the schema-inference error.
if not results_list:
    raise ValueError("The Cosmos query returned no documents; cannot build a DataFrame from an empty result.")

df = spark.createDataFrame(results_list)

# One DataFrame row is created per returned item, so the list length is the
# record count and no Spark job is needed to obtain it.
print("Retrieved", len(results_list), "records from Cosmos.")

    

In [None]:
from pyspark.sql import functions as F

WAREHOUSE = "fc_commerce_warehouse"
SERVER    = "" # go to warehouse > settings > copy sql endpoint (e.g., "abcd1234-...-workspace.z01.datawarehouse.fabric.microsoft.com")

# Fail fast when the SQL endpoint placeholder was left blank; otherwise the URL
# below becomes "jdbc:sqlserver://:1433;..." and the failure only shows up later
# inside the JDBC read.
if not SERVER:
    raise ValueError("SERVER is empty. Set it to the warehouse SQL endpoint before running this cell.")

# 1) Get an Entra ID token from the Fabric runtime
from notebookutils.mssparkutils import credentials
# The token is fetched once here and reused by every later JDBC read
# (original note: ~60–90 min lifetime). Re-run this cell if reads start failing on auth.
token = credentials.getToken("pbi")

# 2) Build JDBC URL & properties
jdbc_url = (
    f"jdbc:sqlserver://{SERVER}:1433;"
    f"database={WAREHOUSE};"
    "encrypt=true;"
    "trustServerCertificate=false;"
    "hostNameInCertificate=*.datawarehouse.fabric.microsoft.com;"
    "loginTimeout=30"
)
# Connection properties handed to spark.read.jdbc: driver class plus the bearer token.
props = {
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
    "accessToken": token
}



# Schema-qualified names of the warehouse tables this notebook reads.
dw_fact_sales = "dbo.FactSales"
dw_fact_sales_line = "dbo.FactSalesLineItems"
dw_dim_customer = "dbo.DimCustomer"


def read_dw_table(table: str):
    """Return a Spark DataFrame for `table`, read over JDBC.

    Uses the notebook-level `jdbc_url` and `props` for the connection.
    """
    reader = spark.read
    return reader.jdbc(url=jdbc_url, table=table, properties=props)

# Build each fact DataFrame once. The original printed a separate, throwaway
# read of FactSales, which meant an extra JDBC DataFrame for the same table.
fact_sales = read_dw_table(dw_fact_sales)
print(fact_sales)  # prints the DataFrame repr as a quick connectivity/schema check

fact_line_items = read_dw_table(dw_fact_sales_line)

In [None]:


# Loyalty totals per CustomerKey over the FactSales rows: points earned, points
# redeemed (each 0 when the sum is NULL), the latest CreatedAt, and the net
# balance NewLoyaltyPoints = PointsEarned - PointsRedeemed.
loyalty_agg = (
    fact_sales
    .groupBy("CustomerKey")
    .agg(
        F.coalesce(F.sum(F.col("LoyaltyPointsEarned")), F.lit(0)).alias("PointsEarned"),
        F.coalesce(F.sum(F.col("LoyaltyPointsRedeemed")), F.lit(0)).alias("PointsRedeemed"),
        F.max(F.col("CreatedAt")).alias("LastPurchaseUtc"),
    )
    .select(
        "*",
        (F.col("PointsEarned") - F.col("PointsRedeemed")).alias("NewLoyaltyPoints"),
    )
)

In [None]:
# Pull every customer document in full; later cells join these against the
# warehouse loyalty aggregates.
queryText = "select * from c"

results = CONTAINER_CLIENT.query_items(
    query=queryText,
    parameters=[],
    enable_cross_partition_query=True,
)

# Materialise the paged iterator once; everything below works on this list.
results_list = list(results)

# Spark cannot infer a schema from an empty list, so report that case explicitly
# rather than surfacing the schema-inference error.
if not results_list:
    raise ValueError("The Cosmos query returned no customer documents; cannot build customers_df from an empty result.")

customers_df = spark.createDataFrame(results_list)

# One DataFrame row is created per returned document, so the list length is the
# record count and no Spark job is needed to obtain it.
print("Retrieved", len(results_list), "records from Cosmos.")
customers_df.select("customerId","name","loyaltyPoints","lastPurchaseDate").show(5)

In [None]:
# DimCustomer maps the warehouse CustomerKey to the CustomerId that is later
# compared with customerId on the Cosmos documents.
dim_customer = read_dw_table(dw_dim_customer).select("CustomerKey", "CustomerId")

# Attach CustomerId to each aggregate row; the inner join keeps only keys
# present in DimCustomer.
loyalty_by_id = (
    loyalty_agg
    .join(dim_customer, "CustomerKey", "inner")
    .select("CustomerId", "NewLoyaltyPoints", "LastPurchaseUtc")
)

# Preview of the customer documents with refreshed loyalty fields.
# Every document column is kept; only loyaltyPoints / lastPurchaseDate / updatedAt
# are replaced, and only where the warehouse has a value for that customer
# (left join: customers without sales keep their current values).
# NOTE(review): lastPurchaseDate mixes a timestamp with the document's own value;
# confirm the Cosmos field's inferred type is compatible.
cust_with_loyalty = (customers_df.alias("c")
    .join(loyalty_by_id.alias("l"), F.col("c.customerId") == F.col("l.CustomerId"), "left")
    .withColumn("loyaltyPoints", F.when(F.col("l.NewLoyaltyPoints").isNotNull(), F.col("l.NewLoyaltyPoints")).otherwise(F.col("c.loyaltyPoints")))
    .withColumn("lastPurchaseDate", F.when(F.col("l.LastPurchaseUtc").isNotNull(), F.col("l.LastPurchaseUtc").cast("timestamp")).otherwise(F.col("c.lastPurchaseDate")))
    .withColumn("updatedAt", F.current_timestamp())
    # The join leaves both c.customerId and l.CustomerId in the result. Spark
    # resolves names case-insensitively by default, so any later by-name
    # reference to customerId would be ambiguous. Drop the right-hand copy.
    .drop(F.col("l.CustomerId"))
    .drop("NewLoyaltyPoints","LastPurchaseUtc","CustomerKey")
)

display(cust_with_loyalty.limit(10))



In [None]:
# Upsert back to Cosmos
from datetime import datetime
from pyspark.sql import functions as F

# 1) Keep only customers whose latest purchase date equals the current date.
#    NOTE(review): both sides of the comparison are evaluated by Spark; this is
#    "today in UTC" only if the Spark session time zone is UTC — confirm.
today_updates = (
    loyalty_by_id
    .select("CustomerId", "NewLoyaltyPoints", "LastPurchaseUtc")
    .where(F.to_date(F.col("LastPurchaseUtc")) == F.current_date())
)

print("Customers with purchases today:", today_updates.count())



In [None]:
# 2) Patch each customer document in Cosmos
#    Adjust partition_key below to match your container (e.g., /customerId or /id).
#    As written, CustomerId is used both as the item id and as the partition key value.

# Imported here so this cell does not rely on another cell's import having run.
from datetime import datetime, timezone

patched, failed = 0, 0

# toLocalIterator streams rows to the driver instead of collecting them all at once.
for row in today_updates.toLocalIterator():
    cust_id  = row["CustomerId"]
    points   = int(row["NewLoyaltyPoints"] or 0)   # NULL points are written as 0
    last_dt  = row["LastPurchaseUtc"]              # Python datetime from Spark
    # Render with a trailing "Z". An aware UTC value has its "+00:00" offset
    # swapped for "Z"; a naive value gets "Z" appended without conversion.
    # NOTE(review): that is only correct if naive values are already UTC — confirm.
    last_iso = last_dt.isoformat().replace("+00:00","Z") if last_dt.tzinfo else last_dt.isoformat() + "Z"

    # datetime.utcnow() is deprecated; an aware "now" rendered the same way
    # yields the identical "...Z" string.
    updated_iso = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    ops = [
        {"op":"set", "path":"/loyaltyPoints", "value":points},
        {"op":"set", "path":"/lastPurchaseDate", "value":last_iso},
        {"op":"set", "path":"/updatedAt", "value":updated_iso}
    ]

    # Best effort: a failure on one customer is counted and reported, and the
    # loop moves on to the next row.
    try:
        CONTAINER_CLIENT.patch_item(item=cust_id, partition_key=cust_id, patch_operations=ops)
        patched += 1
    except Exception as e:
        failed += 1
        print(f"⚠️ Patch failed for {cust_id}: {e}")

print(f"✅ Patched: {patched} | ❌ Failed: {failed}")


In [None]:
# Read back the ten most recently updated customers to confirm the patches landed.
check_query = """
SELECT TOP 10 c.customerId, c.name, c.loyaltyPoints, c.lastPurchaseDate, c.updatedAt
FROM c
WHERE IS_DEFINED(c.updatedAt)
ORDER BY c.updatedAt DESC
"""

try:
    results_iter = CONTAINER_CLIENT.query_items(
        query=check_query,
        enable_cross_partition_query=True
    )
    results_list = list(results_iter)
    if results_list:
        df_check = spark.createDataFrame(results_list)
        # The row count equals the list length, so no Spark job is needed for it.
        print(f"✅ Retrieved {len(results_list)} recently updated customers")
        display(df_check)  # Fabric notebook pretty table
    else:
        # An empty result is not a failure. Building a DataFrame from an empty
        # list would raise and be misreported as "Query failed" below.
        print("ℹ️ No customers with an updatedAt value were found.")
except Exception as e:
    print("⚠️ Query failed:", e)
