In [0]:
from pyspark.sql.functions import col, avg, stddev, count, lag, unix_timestamp
from pyspark.sql.window import Window
from delta.tables import *

# Tables
raw_table = "default.raw_transactions"  # source table, loaded into raw_df just below
core_table = "default.user_behavior_profiles"  # destination table name used by the write cell further down

# Load the raw transactions table as a Spark DataFrame.
# NOTE(review): `spark` is not created in this notebook — presumably the session
# injected by the runtime (e.g. Databricks); confirm before running elsewhere.
raw_df = spark.read.table(raw_table)

In [0]:

# Per-user window ordered by timestamp_utc, so each row can look back at the
# row that precedes it for the same user_id.
windowSpec = Window.partitionBy("user_id").orderBy("timestamp_utc")

# Timestamp of the preceding row in the window (null when there is none).
previous_ts_col = lag("timestamp_utc", 1).over(windowSpec)
df_with_previous_tx = raw_df.withColumn("previous_timestamp", previous_ts_col)

# Cast both timestamps explicitly and convert them to epoch seconds so the
# subtraction yields a gap expressed in whole seconds.
current_epoch_sec = unix_timestamp(col("timestamp_utc").cast("timestamp"))
previous_epoch_sec = unix_timestamp(col("previous_timestamp").cast("timestamp"))

df_with_time_diff = df_with_previous_tx.withColumn(
    "time_since_last_tx_sec",
    current_epoch_sec - previous_epoch_sec,
)

# Spot-check the derived columns for one hard-coded user id.
sample_user_rows = df_with_time_diff.filter("user_id = 2002")
display(sample_user_rows)

In [0]:
# One output row per user_id: amount statistics, a transaction count and the
# mean gap between consecutive transactions.
profile_aggregations = [
    avg("amount_brl").alias("avg_amount"),
    stddev("amount_brl").alias("stddev_amount"),
    count("transaction_id").alias("transaction_count"),
    avg("time_since_last_tx_sec").alias("avg_time_between_tx_sec"),
]

user_profiles_df = (
    df_with_time_diff
    .groupBy("user_id")
    .agg(*profile_aggregations)
    # Replace missing standard deviations with 0.0 (presumably users with too
    # few transactions for a sample stddev — confirm). Only stddev_amount is
    # filled; the other columns keep their nulls.
    .fillna(0.0, subset=["stddev_amount"])
)

display(user_profiles_df)

In [0]:
print(f"Salvando perfis atualizados na tabela '{core_table}'...")

# Replace the whole table on every run; overwriteSchema is set so the stored
# schema is replaced together with the data.
profile_writer = (
    user_profiles_df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
profile_writer.saveAsTable(core_table)

print(f"Tabela de perfis '{core_table}' atualizada com sucesso.")

In [0]:
%sql SELECT * FROM default.user_behavior_profiles
-- Reads back the profile table; the name is hard-coded here and matches the value of core_table.