In [0]:
from pyspark.sql.functions import col, array_contains, when, count, col, lit, current_date

In [0]:
# Load every column of the processed account table; later cells derive the
# per-product flags from this frame.
account_query = """
  SELECT * FROM processed.ct.account
  """
df = spark.sql(account_query)


id,account_id,limit,products,ingest_date,create_date
5ca4bbc7a2dd94ee581625eb,50948,10000,"List(Derivatives, InvestmentStock)",2024-05-03T18:59:50.65Z,2024-05-13T05:42:53.026Z
5ca4bbc7a2dd94ee58162602,51080,10000,"List(Commodity, InvestmentStock)",2024-05-03T18:59:50.651Z,2024-05-13T05:42:53.026Z
5ca4bbc7a2dd94ee58162881,51253,10000,"List(Derivatives, InvestmentStock, Brokerage, CurrencyService)",2024-05-03T18:59:50.668Z,2024-05-13T05:42:53.026Z
5ca4bbc7a2dd94ee581624e4,51474,10000,"List(Brokerage, Commodity, InvestmentStock, InvestmentFund)",2024-05-03T18:59:50.642Z,2024-05-13T05:42:53.026Z
5ca4bbc7a2dd94ee58162980,51617,10000,"List(InvestmentStock, Brokerage, InvestmentFund, CurrencyService)",2024-05-03T18:59:50.672Z,2024-05-13T05:42:53.026Z
5ca4bbc7a2dd94ee5816281a,51645,10000,"List(InvestmentFund, InvestmentStock, Commodity, Derivatives)",2024-05-03T18:59:50.665Z,2024-05-13T05:42:53.026Z
5ca4bbc7a2dd94ee58162740,51822,10000,"List(Derivatives, Brokerage, Commodity, CurrencyService, InvestmentStock)",2024-05-03T18:59:50.66Z,2024-05-13T05:42:53.026Z
5ca4bbc7a2dd94ee581623ef,53124,10000,"List(Commodity, InvestmentStock)",2024-05-03T18:59:50.55Z,2024-05-13T05:42:53.026Z
5ca4bbc7a2dd94ee5816248d,54368,10000,"List(Derivatives, Commodity, InvestmentFund, Brokerage, InvestmentStock)",2024-05-03T18:59:50.636Z,2024-05-13T05:42:53.026Z
5ca4bbc7a2dd94ee58162734,54685,10000,"List(Brokerage, InvestmentStock)",2024-05-03T18:59:50.66Z,2024-05-13T05:42:53.026Z


In [0]:
# Turn the `products` array into one boolean flag column per product and keep
# only the columns the account dimension needs.
#
# Maps each output flag column to the product value it looks for inside
# `account.products`. The values are the product names as they appear in the
# source data (e.g. "Derivatives"), NOT the flag column names.
# NOTE: the column name "has_dervatives" is misspelled, but it is kept as-is
# because the target table presentation.ct.dim_account and the MERGE cell
# reference the column under exactly this name.
product_by_flag = {
    "has_investment_stock": "InvestmentStock",
    "has_brokerage": "Brokerage",
    "has_commodity": "Commodity",
    "has_investment_fund": "InvestmentFund",
    "has_currency_service": "CurrencyService",
    "has_dervatives": "Derivatives",
}

df_defined = df.select(
    col("account_id"),
    col("limit"),
    *[
        array_contains(col("account.products"), product).alias(flag)
        for flag, product in product_by_flag.items()
    ],
)
    

In [0]:
# Build the staging set for the slowly-changing account dimension and expose it
# as the temp view "source_data", which the MERGE cell reads.
#
# Result (df_final) contains:
#   * one is_current=True row per incoming account (df_new_records), and
#   * for every account whose product flags changed, a copy of its existing
#     current dimension row closed out with is_current=False and
#     end_date=today (df_hist_records).

# Incoming snapshot decorated with the dimension's housekeeping columns.
df_updated = (
    df_defined
    .withColumn("start_date", current_date())
    # Cast so the column has a concrete date type instead of an untyped NULL.
    .withColumn("end_date", lit(None).cast("date"))
    .withColumn("is_current", lit(True))
    .withColumn("source_system", lit("mongoDB_1"))
)

# Rows currently flagged as active in the dimension (surrogate key excluded so
# the column set lines up with df_updated).
df_current = spark.sql(
    """
    SELECT * EXCEPT (account_sk)
    FROM presentation.ct.dim_account
    WHERE is_current = 1
    """
)

# limit(1) lets Spark stop after the first row instead of counting everything.
if df_current.limit(1).count() == 0:
    # Nothing active in the dimension yet: every incoming row is simply new.
    df_final = df_updated
else:
    # Left join: keep every incoming account, attach its current dimension row
    # when one exists. Accounts that exist only in the dimension are not part
    # of this load and must not produce all-NULL "upd" rows.
    df_upd_j_cur = df_updated.alias("upd").join(
        df_current.alias("cur"),
        col("upd.account_id") == col("cur.account_id"),
        "left",
    )

    # Flags whose change triggers a new version of the account row.
    tracked_flags = [
        "has_investment_stock",
        "has_brokerage",
        "has_commodity",
        "has_investment_fund",
        "has_currency_service",
        "has_dervatives",
    ]
    any_flag_differs = lit(False)
    for flag in tracked_flags:
        any_flag_differs = any_flag_differs | (col(f"upd.{flag}") != col(f"cur.{flag}"))

    # True only for accounts that already have a current row AND differ in at
    # least one tracked flag. Evaluates to NULL/False otherwise, which both
    # filter() and when() treat as "not changed".
    is_changed = col("cur.account_id").isNotNull() & any_flag_differs

    # Previous versions of changed accounts, closed out as of today.
    df_hist_records = (
        df_upd_j_cur
        .filter(is_changed)
        .select("cur.*")
        .withColumn("is_current", lit(False))
        .withColumn("end_date", current_date())
    )

    # A changed or brand-new account starts a new version today; an unchanged
    # account keeps the start_date of its existing current row.
    effective_start_date = (
        when(is_changed | col("cur.start_date").isNull(), col("upd.start_date"))
        .otherwise(col("cur.start_date"))
    )

    # Same columns, same order as df_updated, with start_date resolved above.
    df_new_records = df_upd_j_cur.select(
        *[
            effective_start_date.alias("start_date")
            if name == "start_date"
            else col(f"upd.{name}")
            for name in df_updated.columns
        ]
    )

    # Match columns by name: the dimension table's column order is not
    # guaranteed to equal df_updated's.
    df_final = df_new_records.unionByName(df_hist_records)

df_final.createOrReplaceTempView("source_data")


In [0]:
# Spot-check the staged row for a single account.
sample_account_id = 627788
display(df_updated.where(col("account_id") == sample_account_id))

account_id,limit,has_investment_stock,has_brokerage,has_commodity,has_investment_fund,has_currency_service,has_dervatives,start_date,end_date,is_current,source_system
627788,10000,True,True,True,False,True,False,2024-05-13,,True,mongoDB_1


In [0]:
%sql
-- Apply the staged rows in the temp view source_data to the account dimension.
--
-- Matching is restricted to the CURRENT row on both sides:
--   * a current source row overwrites only the account's current target row;
--     historical target rows (is_current = 0) are never touched;
--   * closed-out source rows (is_current = 0) never match and are therefore
--     inserted as new history rows;
--   * accounts with no current target row are inserted.
MERGE INTO presentation.ct.dim_account AS trgt
USING source_data AS src
ON trgt.account_id = src.account_id
  AND trgt.is_current = 1
  AND src.is_current = 1
WHEN MATCHED THEN UPDATE
SET
  account_id = src.account_id,
  limit = src.limit,
  has_investment_stock = src.has_investment_stock,
  has_brokerage = src.has_brokerage,
  has_commodity = src.has_commodity,
  has_investment_fund = src.has_investment_fund,
  has_currency_service = src.has_currency_service,
  has_dervatives = src.has_dervatives,
  start_date = src.start_date,
  end_date = src.end_date,
  is_current = src.is_current,
  source_system = src.source_system

WHEN NOT MATCHED THEN
INSERT(
  account_id,
  limit,
  has_investment_stock,
  has_brokerage,
  has_commodity,
  has_investment_fund,
  has_currency_service,
  has_dervatives,
  start_date,
  end_date,
  is_current,
  source_system
)
VALUES(
  src.account_id,
  src.limit,
  src.has_investment_stock,
  src.has_brokerage,
  src.has_commodity,
  src.has_investment_fund,
  src.has_currency_service,
  src.has_dervatives,
  src.start_date,
  src.end_date,
  src.is_current,
  src.source_system
)



num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
1746,0,0,1746
