In [0]:
from datetime import datetime
import json
from azure.storage.blob import BlobServiceClient

class Tracker:
    def __init__(self, jobname, config):
        self.jobname = jobname
        self.config = config

        # Get values from config.ini or variables
        self.storage_account_name = config.get("azure", "storage_account_name")
        self.account_key = config.get("azure", "storage_account_key")  # NEW: account key
        self.container_name = config.get("job_tracking", "storage_container")
        self.blob_name = config.get("job_tracking", "tracking_file")

        # Construct connection string
        connection_str = (
            f"DefaultEndpointsProtocol=https;"
            f"AccountName={self.storage_account_name};"
            f"AccountKey={self.account_key};"
            f"EndpointSuffix=core.windows.net"
        )
        self.blob_service_client = BlobServiceClient.from_connection_string(connection_str)

    def assign_job_id(self):
        return f"{self.jobname}_{datetime.now().strftime('%Y%m%d%H%M%S')}"

    def update_job_status(self, status):
        job_id = self.assign_job_id()
        update_time = datetime.now().isoformat()
        blob_client = self.blob_service_client.get_blob_client(container=self.container_name, blob=self.blob_name)

        try:
            job_data = json.loads(blob_client.download_blob().readall().decode("utf-8"))
        except:
            job_data = {}

        job_data[job_id] = {"status": status, "updated_time": update_time}
        blob_client.upload_blob(json.dumps(job_data, indent=4), overwrite=True)
        print(f"Job {job_id} updated to {status}")



In [0]:
# Cell 2
from configparser import ConfigParser

def run_reporter_etl(config):
    trade_date = config.get("production", "processing_date")
    eod_dir = config.get("azure", "eod_directory")  # You must define this in config.ini
    tracker = Tracker("analytical_etl", config)

    try:
        # Your actual ETL logic here (e.g., reading from staging tables, creating final outputs)
        # Example:
        print("Running ETL for:", trade_date)
        tracker.update_job_status("success")
    except Exception as e:
        print(f"ETL Failed: {e}")
        tracker.update_job_status("failed")


In [0]:
from datetime import datetime, timedelta, date
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, avg, broadcast
from pyspark.sql.window import Window

# Initialize Spark session
spark = SparkSession.builder.appName("AnalyticalETL").enableHiveSupport().getOrCreate()

# Azure Storage Account Setup
storage_account_name = "trial25"
container = "equity-data"
storage_account_key = ""

# Set Spark Configuration for Azure Blob Storage
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net",storage_account_key)

trade_file_location = "wasbs://equity-data@trial25.blob.core.windows.net/output_dir/partition=T/"
quote_file_location = "wasbs://equity-data@trial25.blob.core.windows.net/output_dir/partition=Q/"
'''
trade_file_locations = [
                         "wasbs://equity-data@trial25.blob.core.windows.net/output_dir/partition=T/part-00000-tid-8466613106631209932-64d5144f-a949-4968-b37b-bbfdf22332fb-192-2.c000.snappy.parquet",
                        "wasbs://equity-data@trial25.blob.core.windows.net/output_dir/partition=T/part-00001-tid-8466613106631209932-64d5144f-a949-4968-b37b-bbfdf22332fb-193-2.c000.snappy.parquet"
                       ]
quote_file_locations = [
                        "wasbs://equity-data@trial25.blob.core.windows.net/output_dir/partition=Q/part-00000-tid-8466613106631209932-64d5144f-a949-4968-b37b-bbfdf22332fb-192-1.c000.snappy.parquet",
                        "wasbs://equity-data@trial25.blob.core.windows.net/output_dir/partition=Q/                       part-00001-tid-8466613106631209932-64d5144f-a949-4968-b37b-bbfdf22332fb-193-1.c000.snappy.parquet"
                       ]
'''

# Define trade date (Example: 2020-08-06)
#trade_date = "2020-08-06"


# trade_file_locations = [
#                          "wasbs://equity-data@trial25.blob.core.windows.net/trade/trade_dt={}".format("2020-08-06"),
#                          "wasbs://equity-data@trial25.blob.core.windows.net/trade/trade_dt={}".format("2020-08-05")
#                        ]

# Read Parquet Files From Azure Blob Storage Partition
# Read Trade Parquet
trade_df = spark.read.parquet("wasbs://equity-data@trial25.blob.core.windows.net/trade")
#"wasbs://equity-data@trial25.blob.core.windows.net/trade/trade_dt={}".format("2020-08-06")

# Read Quote Parquet
quote_df = spark.read.parquet("wasbs://equity-data@trial25.blob.core.windows.net/output_dir/partition=Q/")

In [0]:
trade_df.printSchema()

root
 |-- trade_dt: date (nullable = true)
 |-- symbol: string (nullable = true)
 |-- exchange: string (nullable = true)
 |-- event_tm: timestamp (nullable = true)
 |-- event_seq_nb: integer (nullable = true)
 |-- arrival_tm: timestamp (nullable = true)
 |-- trade_pr: decimal(10,2) (nullable = true)



In [0]:
spark.sql("show tables").show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| default|     temp_last_trade|      false|
| default|temp_trade_moving...|      false|
|        |              quotes|       true|
|        |      tmp_last_trade|       true|
|        |tmp_trade_moving_avg|       true|
|        |              trades|       true|
+--------+--------------------+-----------+



In [0]:
trade_df.createOrReplaceTempView("trades")

In [0]:
quote_df.createOrReplaceTempView("quotes")

In [0]:
spark.sql("show tables").show(truncate=False)

+--------+---------------------+-----------+
|database|tableName            |isTemporary|
+--------+---------------------+-----------+
|default |temp_last_trade      |false      |
|default |temp_trade_moving_avg|false      |
|        |quotes               |true       |
|        |tmp_last_trade       |true       |
|        |tmp_trade_moving_avg |true       |
|        |trades               |true       |
+--------+---------------------+-----------+



In [0]:
# Create Trade Staging Table
# Use Spark To Read The Trade Table With Date Partition “2020-08-06”
trade_df = spark.sql("""
                     SELECT    trade_dt,
                               symbol,
                               exchange,
                               event_tm,
                               event_seq_nb,
                               trade_pr 
                        FROM trades 
                        WHERE trade_dt = '2020-08-06'
                        ORDER BY symbol,exchange,trade_dt,event_tm
                     """)
display(trade_df)

trade_dt,symbol,exchange,event_tm,event_seq_nb,trade_pr
2020-08-06,SYMA,NYSE,2020-08-06T10:49:37.345Z,10,74.49
2020-08-06,SYMA,NYSE,2020-08-06T12:00:11.545Z,20,76.16
2020-08-06,SYMA,NYSE,2020-08-06T13:11:57.308Z,30,76.9
2020-08-06,SYMA,NYSE,2020-08-06T14:27:13.014Z,40,77.12
2020-08-06,SYMA,NYSE,2020-08-06T15:39:08.521Z,50,76.37
2020-08-06,SYMA,NYSE,2020-08-06T16:58:58.633Z,60,78.32
2020-08-06,SYMA,NYSE,2020-08-06T18:14:28.899Z,70,78.23
2020-08-06,SYMA,NYSE,2020-08-06T19:28:22.629Z,80,76.53
2020-08-06,SYMA,NYSE,2020-08-06T20:49:10.946Z,90,75.71
2020-08-06,SYMA,NYSE,2020-08-06T22:00:18.406Z,100,76.31


In [0]:
# Create A Spark Temporary View
trade_df.createOrReplaceTempView("tmp_trade_moving_avg")

In [0]:
# Calculate The 30-min Moving Average Using The Spark Temp View
mov_avg_df = spark.sql("""
    SELECT 
        trade_dt,
        symbol, 
        exchange, 
        event_tm, 
        event_seq_nb, 
        trade_pr,
        -- Compute 30-minute moving average based on time
        AVG(trade_pr) OVER (
            PARTITION BY symbol, exchange 
            ORDER BY event_tm 
            RANGE BETWEEN INTERVAL 30 MINUTES PRECEDING AND CURRENT ROW
        ) AS mov_avg_pr
    FROM tmp_trade_moving_avg
""")

display(mov_avg_df.take(mov_avg_df.count()))

trade_dt,symbol,exchange,event_tm,event_seq_nb,trade_pr,mov_avg_pr
2020-08-06,SYMA,NYSE,2020-08-06T10:49:37.345Z,10,74.49,74.49
2020-08-06,SYMA,NYSE,2020-08-06T12:00:11.545Z,20,76.16,76.16
2020-08-06,SYMA,NYSE,2020-08-06T13:11:57.308Z,30,76.9,76.9
2020-08-06,SYMA,NYSE,2020-08-06T14:27:13.014Z,40,77.12,77.12
2020-08-06,SYMA,NYSE,2020-08-06T15:39:08.521Z,50,76.37,76.37
2020-08-06,SYMA,NYSE,2020-08-06T16:58:58.633Z,60,78.32,78.32
2020-08-06,SYMA,NYSE,2020-08-06T18:14:28.899Z,70,78.23,78.23
2020-08-06,SYMA,NYSE,2020-08-06T19:28:22.629Z,80,76.53,76.53
2020-08-06,SYMA,NYSE,2020-08-06T20:49:10.946Z,90,75.71,75.71
2020-08-06,SYMA,NYSE,2020-08-06T22:00:18.406Z,100,76.31,76.31


In [0]:
# Save The Temporary View Into Hive Table For Staging
mov_avg_df.write.mode("overwrite").saveAsTable("temp_trade_moving_avg")

In [0]:
trade_date2 = date(2020, 8, 6).isoformat()
print(trade_date2)

2020-08-06


In [0]:
# Create Staging Table For The Prior Day’s Last Trade
# Get The Previous Date Value
trade_date = date(2020, 8, 6)#datetime.strptime('2020-08-06', '%Y-%m-%d')
#[use datetime utility to calculate previous date]
prev_date_str = (trade_date - timedelta(days=1)).isoformat()#.strftime("%Y-%m-%d")

print("Trade_date:",trade_date)
print("Prev_Date_Str:",prev_date_str)

Trade_date: 2020-08-06
Prev_Date_Str: 2020-08-05


In [0]:
# Use Spark To Read The Trade Table With Date Partition “2020-08-05”
prev_trade_df = spark.sql(f"""
    SELECT symbol, exchange, event_tm, event_seq_nb, trade_pr 
    FROM trades 
    WHERE trade_dt = '{prev_date_str}'
""")
prev_trade_df.show()

+------+--------+--------------------+------------+--------+
|symbol|exchange|            event_tm|event_seq_nb|trade_pr|
+------+--------+--------------------+------------+--------+
|  SYMA|    NYSE|2020-08-05 10:37:...|          10|   79.19|
|  SYMA|    NYSE|2020-08-05 11:56:...|          20|   76.49|
|  SYMA|    NYSE|2020-08-05 13:09:...|          30|   75.05|
|  SYMA|    NYSE|2020-08-05 14:24:...|          40|   78.43|
|  SYMA|    NYSE|2020-08-05 15:31:...|          50|   78.15|
|  SYMA|    NYSE|2020-08-05 16:37:...|          60|   79.19|
|  SYMA|    NYSE|2020-08-05 17:49:...|          70|   77.07|
|  SYMA|    NYSE|2020-08-05 19:04:...|          80|   75.48|
|  SYMA|    NYSE|2020-08-05 20:21:...|          90|   74.60|
|  SYMA|    NYSE|2020-08-05 21:30:...|         100|   77.79|
|  SYMB|    NYSE|2020-08-05 10:43:...|          10|   34.98|
|  SYMB|    NYSE|2020-08-05 12:02:...|          20|   33.18|
|  SYMB|    NYSE|2020-08-05 13:10:...|          30|   34.18|
|  SYMB|    NYSE|2020-08

In [0]:
# Create Spark Temporary View
prev_trade_df.createOrReplaceTempView("tmp_last_trade")

In [0]:
display(spark.sql("select * from temp_trade_moving_avg"))

symbol,exchange,event_tm,event_seq_nb,trade_pr,mov_avg_pr,trade_dt
SYMA,NYSE,2020-08-06T10:49:37.345Z,10,74.49,74.49,2020-08-06
SYMA,NYSE,2020-08-06T12:00:11.545Z,20,76.16,76.16,2020-08-06
SYMA,NYSE,2020-08-06T13:11:57.308Z,30,76.9,76.9,2020-08-06
SYMA,NYSE,2020-08-06T14:27:13.014Z,40,77.12,77.12,2020-08-06
SYMA,NYSE,2020-08-06T15:39:08.521Z,50,76.37,76.37,2020-08-06
SYMA,NYSE,2020-08-06T16:58:58.633Z,60,78.32,78.32,2020-08-06
SYMA,NYSE,2020-08-06T18:14:28.899Z,70,78.23,78.23,2020-08-06
SYMA,NYSE,2020-08-06T19:28:22.629Z,80,76.53,76.53,2020-08-06
SYMA,NYSE,2020-08-06T20:49:10.946Z,90,75.71,75.71,2020-08-06
SYMA,NYSE,2020-08-06T22:00:18.406Z,100,76.31,76.31,2020-08-06


In [0]:
display(mov_avg_df)

trade_dt,symbol,exchange,event_tm,event_seq_nb,trade_pr,mov_avg_pr
2020-08-06,SYMA,NYSE,2020-08-06T10:49:37.345Z,10,74.49,74.49
2020-08-06,SYMA,NYSE,2020-08-06T12:00:11.545Z,20,76.16,76.16
2020-08-06,SYMA,NYSE,2020-08-06T13:11:57.308Z,30,76.9,76.9
2020-08-06,SYMA,NYSE,2020-08-06T14:27:13.014Z,40,77.12,77.12
2020-08-06,SYMA,NYSE,2020-08-06T15:39:08.521Z,50,76.37,76.37
2020-08-06,SYMA,NYSE,2020-08-06T16:58:58.633Z,60,78.32,78.32
2020-08-06,SYMA,NYSE,2020-08-06T18:14:28.899Z,70,78.23,78.23
2020-08-06,SYMA,NYSE,2020-08-06T19:28:22.629Z,80,76.53,76.53
2020-08-06,SYMA,NYSE,2020-08-06T20:49:10.946Z,90,75.71,75.71
2020-08-06,SYMA,NYSE,2020-08-06T22:00:18.406Z,100,76.31,76.31


In [0]:
'''
last_pr_df = spark.sql("""select symbol, exchange, last_pr from (select
symbol, exchange, event_tm, event_seq_nb, trade_pr,
# [logic to derive last 30 min moving average price] AS last_pr
FROM tmp_trade_moving_avg) a
""")
'''
'''
last_pr_df = spark.sql("""
                       select symbol,
                              exchange,
                              last_pr 
                        from (select symbol,
                                     exchange,
                                     event_tm,
                                     event_seq_nb, 
                                     trade_pr
                                    --# [logic to derive last 30 min moving average price] AS last_pr
                                    FROM tmp_trade_moving_avg) a
                        """)
'''


{"ts": "2025-03-29 00:27:20,627", "level": "ERROR", "logger": "SQLQueryContextLogger", "msg": "[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `last_pr` cannot be resolved. Did you mean one of the following? [`trade_pr`, `event_tm`, `symbol`, `exchange`, `event_seq_nb`]. SQLSTATE: 42703", "context": {"errorClass": "UNRESOLVED_COLUMN.WITH_SUGGESTION"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o404.sql.\n: org.apache.spark.sql.catalyst.ExtendedAnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `last_pr` cannot be resolved. Did you mean one of the following? [`trade_pr`, `event_tm`, `symbol`, `exchange`, `event_seq_nb`]. SQLSTATE: 42703; line 4 pos 30;\n'Project [symbol#1020, exchange#1021, 'last_pr]\n+- SubqueryAlias a\n   +- Project [symbol#1020, exchange#1021, event_tm#1022, event_seq_nb#1023, trade_pr#1025]\n      +- SubqueryAlias tmp_trade_moving_avg\

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-8857559167695151>, line 9[0m
[1;32m      1[0m [38;5;124;03m'''[39;00m
[1;32m      2[0m [38;5;124;03mlast_pr_df = spark.sql("""select symbol, exchange, last_pr from (select[39;00m
[1;32m      3[0m [38;5;124;03msymbol, exchange, event_tm, event_seq_nb, trade_pr,[39;00m
[0;32m   (...)[0m
[1;32m      6[0m [38;5;124;03m""")[39;00m
[1;32m      7[0m [38;5;124;03m'''[39;00m
[0;32m----> 9[0m last_pr_df [38;5;241m=[39m spark[38;5;241m.[39msql([38;5;124m"""[39m
[1;32m     10[0m [38;5;124m                       select symbol,[39m
[1;32m     11[0m [38;5;124m                              exchange,[39m
[1;32m     12[0m [38;5;124m                              last_pr [39m
[1;32m     13[0m [38;5;124m                        from (select symbol,[39m
[1;32m     14[0

In [0]:
spark.sql("""
    SELECT 
        *
        FROM temp_trade_moving_avg
        ORDER BY event_tm DESC
        LIMIT 1
""").show()

+------+--------+--------------------+------------+--------+----------+----------+
|symbol|exchange|            event_tm|event_seq_nb|trade_pr|mov_avg_pr|  trade_dt|
+------+--------+--------------------+------------+--------+----------+----------+
|  SYMA|    NYSE|2020-08-06 22:00:...|         100|   76.31| 76.310000|2020-08-06|
+------+--------+--------------------+------------+--------+----------+----------+



In [0]:
#Calculate Last Trade Price Using The Spark Temp View

# Need to MODIFY this query for calculating last trade price
last_pr_df = spark.sql("""
    SELECT 
        symbol, 
        exchange, 
        last_pr 
    FROM (
        SELECT 
            symbol, 
            exchange, 
            event_tm, 
            event_seq_nb, 
            trade_pr, 
            LAST(trade_pr) OVER (
                PARTITION BY symbol, exchange 
                ORDER BY event_tm
            ) AS last_pr
        FROM temp_trade_moving_avg
    ) a
""")


last_pr_df.show()

+------+--------+-------+
|symbol|exchange|last_pr|
+------+--------+-------+
|  SYMA|    NYSE|  74.49|
|  SYMA|    NYSE|  76.16|
|  SYMA|    NYSE|  76.90|
|  SYMA|    NYSE|  77.12|
|  SYMA|    NYSE|  76.37|
|  SYMA|    NYSE|  78.32|
|  SYMA|    NYSE|  78.23|
|  SYMA|    NYSE|  76.53|
|  SYMA|    NYSE|  75.71|
|  SYMA|    NYSE|  76.31|
|  SYMB|    NYSE|  33.86|
|  SYMB|    NYSE|  32.93|
|  SYMB|    NYSE|  33.69|
|  SYMB|    NYSE|  35.07|
|  SYMB|    NYSE|  34.83|
|  SYMB|    NYSE|  33.32|
|  SYMB|    NYSE|  33.57|
|  SYMB|    NYSE|  33.11|
|  SYMB|    NYSE|  32.64|
|  SYMB|    NYSE|  35.92|
+------+--------+-------+
only showing top 20 rows


In [0]:
#Calculate Last Trade Price Using The Spark Temp View

# Need to MODIFY this query for caculating last trade price
last_pr_df = spark.sql("""
    SELECT 
        symbol, 
        exchange, 
        last_pr 
    FROM (
        SELECT 
            symbol, 
            exchange, 
            event_tm, 
            event_seq_nb, 
            trade_pr, 
            FIRST_VALUE(trade_pr) OVER (
                PARTITION BY symbol, exchange 
                ORDER BY event_tm DESC
            ) AS last_pr
        FROM tmp_last_trade
    ) a
""")

last_pr_df.show()

+------+--------+-------+
|symbol|exchange|last_pr|
+------+--------+-------+
|  SYMA|    NYSE|  77.79|
|  SYMA|    NYSE|  77.79|
|  SYMA|    NYSE|  77.79|
|  SYMA|    NYSE|  77.79|
|  SYMA|    NYSE|  77.79|
|  SYMA|    NYSE|  77.79|
|  SYMA|    NYSE|  77.79|
|  SYMA|    NYSE|  77.79|
|  SYMA|    NYSE|  77.79|
|  SYMA|    NYSE|  77.79|
|  SYMB|    NYSE|  33.96|
|  SYMB|    NYSE|  33.96|
|  SYMB|    NYSE|  33.96|
|  SYMB|    NYSE|  33.96|
|  SYMB|    NYSE|  33.96|
|  SYMB|    NYSE|  33.96|
|  SYMB|    NYSE|  33.96|
|  SYMB|    NYSE|  33.96|
|  SYMB|    NYSE|  33.96|
|  SYMB|    NYSE|  33.96|
+------+--------+-------+
only showing top 20 rows


In [0]:
# Save The Temporary View Into Hive Table For Staging
last_pr_df.write.mode("overwrite").saveAsTable("temp_last_trade")

In [0]:
display(spark.sql("SELECT trade_dt From temp_trade_moving_avg LIMIT 5"))

trade_dt
2020-08-06
2020-08-06
2020-08-06
2020-08-06
2020-08-06


In [0]:
'''
4.4 Populate The Latest Trade and Latest Moving Average Trade Price To The Quote
Records
Now that you’ve produced both staging tables, join them with the main table “quotes” to
populate trade related information.
'''

'\n4.4 Populate The Latest Trade and Latest Moving Average Trade Price To The Quote\nRecords\nNow that you’ve produced both staging tables, join them with the main table “quotes” to\npopulate trade related information.\n'

In [0]:
#display(spark.sql("SELECT * FROM trades_casted LIMIT 5"))

{"ts": "2025-03-29 00:28:36,835", "level": "ERROR", "logger": "SQLQueryContextLogger", "msg": "[TABLE_OR_VIEW_NOT_FOUND] The table or view `trades_casted` cannot be found. Verify the spelling and correctness of the schema and catalog.\nIf you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.\nTo tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS. SQLSTATE: 42P01", "context": {"errorClass": "TABLE_OR_VIEW_NOT_FOUND"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o404.sql.\n: org.apache.spark.sql.catalyst.ExtendedAnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `trades_casted` cannot be found. Verify the spelling and correctness of the schema and catalog.\nIf you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.\nTo tolerate the error on drop use DROP V

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-8046733653995269>, line 1[0m
[0;32m----> 1[0m display(spark[38;5;241m.[39msql([38;5;124m"[39m[38;5;124mSELECT * FROM trades_casted LIMIT 5[39m[38;5;124m"[39m))

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:47[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     45[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     46[0m [38;5;28;01mtry[39;00m:
[0;32m---> 47[0m     res [38;5;241m=[39m func([38;5;241m*[39margs, [38;5;241m*[39m[38;5;241m*[39mkwargs)
[1;32m     48[0m     logger[38;5;241m.[39mlog_success(
[1;32m     49[0m         module_name, class_name, function_name, time[38;5;241m.[39mperf_counter() [38;5;241m-[39m start, signature
[1;32m     50[0m     )
[1;32m     51[0m     

In [0]:
display(spark.sql("DESCRIBE temp_trade_moving_avg"))
display(spark.sql("DESCRIBE quotes"))

col_name,data_type,comment
symbol,string,
exchange,string,
event_tm,timestamp,
event_seq_nb,int,
trade_pr,"decimal(10,2)",
mov_avg_pr,"decimal(14,6)",
trade_dt,date,


col_name,data_type,comment
trade_dt,date,
rec_type,string,
symbol,string,
exchange,string,
event_tm,timestamp,
event_seq_nb,int,
arrival_tm,timestamp,
trade_pr,"decimal(10,2)",
bid_pr,"decimal(10,2)",
bid_size,int,


In [0]:
'''4.4.1 Join With Table temp_trade_moving_avg
You need to join “quotes” and “temp_trade_moving_avg” to populate trade_pr and mov_avg_pr
into quotes. However, you cannot use equality join in this case; trade events don’t happen at the
same quote time. You want the latest in time sequence. This is a typical time sequence
analytical use case. A good method for this problem is to merge both tables in a common time sequence.'''

'''4.4.1.1 Define A Common Schema Holding “quotes” and “temp_trade_moving_avg”
Records
This is a necessary step before the union of two datasets which have a different schema
(denormalization). The schema needs to include all the fields of quotes and
temp_trade_mov_avg so that no information gets lost.'''
display(spark.sql("DESCRIBE quotes"))
display(spark.sql("DESCRIBE temp_trade_moving_avg"))

from pyspark.sql.types import *

unified_schema = StructType([
    StructField("trade_dt", DateType(), True),
    StructField("symbol", StringType(), True),
    StructField("exchange", StringType(), True),
    StructField("event_tm", TimestampType(), True),
    StructField("event_seq_nb", IntegerType(), True),
    StructField("bid_pr", DecimalType(10, 2), True),
    StructField("bid_size", IntegerType(), True),
    StructField("ask_pr", DecimalType(10, 2), True),
    StructField("ask_size", IntegerType(), True),
    StructField("arrival_tm", TimestampType(), True),
    StructField("trade_pr", DecimalType(10, 2), True),
    StructField("mov_avg_pr", DecimalType(14, 6), True),
    StructField("rec_type", StringType(), True)
    
])

quote_schema = StructType([
    StructField("trade_dt", DateType(), True),
    StructField("rec_type", StringType(), True),
    StructField("symbol", StringType(), True),
    StructField("exchange", StringType(), True),
    StructField("event_tm", TimestampType(), True),
    StructField("event_seq_nb", IntegerType(), True),
    StructField("arrival_tm", TimestampType(), True),
    StructField("trade_pr", DecimalType(10, 2), True),
    StructField("bid_pr", DecimalType(10, 2), True),
    StructField("bid_size", IntegerType(), True),
    StructField("ask_pr", DecimalType(10, 2), True),
    StructField("ask_size", IntegerType(), True),
    #StructField("rec_type", StringType(), True),=
    # StructField("trade_pr", DecimalType(10, 2), True),
    # StructField("mov_avg_pr", DecimalType(14, 6), True)    
])

temp_trade_moving_avg_schema = StructType([
    StructField("symbol", StringType(), True),
    StructField("exchange", StringType(), True),
    StructField("event_tm", TimestampType(), True),
    StructField("event_seq_nb", IntegerType(), True),
    StructField("trade_pr", DecimalType(10, 2), True),
    StructField("mov_avg_pr", DecimalType(14, 6), True),
    StructField("trade_dt", DateType(), True)
    #StructField("rec_type", StringType(), True)
])


display(spark.sql("DESCRIBE quotes"))

quotes_df = spark.sql("SELECT * FROM quotes")
quotes_casted = spark.createDataFrame(quotes_df.collect(), schema=quote_schema)
quotes_casted.createOrReplaceTempView("quotes_casted")

trades_df = spark.sql("SELECT * FROM temp_trade_moving_avg")
trades_casted = spark.createDataFrame(trades_df.collect(), schema=temp_trade_moving_avg_schema)
trades_casted.createOrReplaceTempView("trades_casted")


col_name,data_type,comment
trade_dt,date,
rec_type,string,
symbol,string,
exchange,string,
event_tm,timestamp,
event_seq_nb,int,
arrival_tm,timestamp,
trade_pr,"decimal(10,2)",
bid_pr,"decimal(10,2)",
bid_size,int,


col_name,data_type,comment
symbol,string,
exchange,string,
event_tm,timestamp,
event_seq_nb,int,
trade_pr,"decimal(10,2)",
mov_avg_pr,"decimal(14,6)",
trade_dt,date,


col_name,data_type,comment
trade_dt,date,
rec_type,string,
symbol,string,
exchange,string,
event_tm,timestamp,
event_seq_nb,int,
arrival_tm,timestamp,
trade_pr,"decimal(10,2)",
bid_pr,"decimal(10,2)",
bid_size,int,


In [0]:
# 4.4.1.2 Create Spark Temp View To Union Both Tables
from pyspark.sql.functions import last
# Step 1: Create a unified time-ordered table with all quote and trade fields
quote_union = spark.sql("""
    SELECT * FROM (
        SELECT
            trade_dt,
            symbol,
            exchange,
            event_tm,
            event_seq_nb,
            bid_pr,
            bid_size,
            ask_pr,
            ask_size,
            arrival_tm,
            NULL AS trade_pr,
            NULL AS mov_avg_pr,
            'Q' AS rec_type
        FROM quotes_casted
        WHERE trade_dt IN (SELECT DISTINCT(trade_dt) FROM temp_trade_moving_avg)

        UNION ALL

        SELECT
            trade_dt,
            symbol,
            exchange,
            event_tm,
            NULL AS event_seq_nb,
            NULL AS bid_pr,
            NULL AS bid_size,
            NULL AS ask_pr,
            NULL AS ask_size,
            NULL AS arrival_tm,
            trade_pr,
            mov_avg_pr,
            'T' AS rec_type
        FROM trades_casted
    )
    ORDER BY trade_dt, event_tm
    
""")

quote_union.createOrReplaceTempView("quote_union")
#display(quote_union)

# Step 2: Apply window function to fill forward trade_pr and mov_avg_pr
window_spec = Window.partitionBy("symbol", "exchange") \
                    .orderBy("trade_dt", "event_tm") \
                    .rowsBetween(Window.unboundedPreceding, Window.currentRow)

# 4.4.1.3 Populate The Latest trade_pr and mov_avg_pr
quote_union_update = quote_union \
    .withColumn("latest_trade_pr", last("trade_pr", ignorenulls=True).over(window_spec)) \
    .withColumn("latest_mov_avg_pr", last("mov_avg_pr", ignorenulls=True).over(window_spec)) 

quote_union_update.createOrReplaceTempView("quote_union_update")

#4.4.1.4 Filter For Quote Records
quote_update = quote_union_update.filter("rec_type = 'Q'")  # keep only quotes

quote_update.createOrReplaceTempView("quote_update")

# Step 3: Show results or use as needed
display(quote_update.select(
    "trade_dt","symbol", "exchange", "event_tm", "bid_pr", "ask_pr", "arrival_tm",
    "latest_trade_pr", "latest_mov_avg_pr"))

trade_dt,symbol,exchange,event_tm,bid_pr,ask_pr,arrival_tm,latest_trade_pr,latest_mov_avg_pr
2020-08-06,SYMA,NYSE,2020-08-06T09:39:01.293Z,77.68,78.44,2020-08-06T09:30:00Z,,
2020-08-06,SYMA,NYSE,2020-08-06T09:47:20.398Z,76.53,76.94,2020-08-06T09:30:00Z,,
2020-08-06,SYMA,NYSE,2020-08-06T09:56:26.402Z,75.12,75.39,2020-08-06T09:30:00Z,,
2020-08-06,SYMA,NYSE,2020-08-06T10:03:59.522Z,74.86,75.77,2020-08-06T09:30:00Z,,
2020-08-06,SYMA,NYSE,2020-08-06T10:09:53.165Z,77.78,78.8,2020-08-06T09:30:00Z,,
2020-08-06,SYMA,NYSE,2020-08-06T10:18:55.618Z,75.03,75.69,2020-08-06T09:30:00Z,,
2020-08-06,SYMA,NYSE,2020-08-06T10:25:54.959Z,75.36,75.46,2020-08-06T09:30:00Z,,
2020-08-06,SYMA,NYSE,2020-08-06T10:34:34.929Z,78.59,78.6,2020-08-06T09:30:00Z,,
2020-08-06,SYMA,NYSE,2020-08-06T10:41:24.664Z,75.71,76.89,2020-08-06T09:30:00Z,,
2020-08-06,SYMA,NYSE,2020-08-06T10:55:42.576Z,78.31,79.48,2020-08-06T09:30:00Z,74.49,74.49


In [0]:
display(spark.sql('SELECT event_tm FROM quotes LIMIT 5'))

event_tm
2020-08-06T09:39:01.293Z
2020-08-06T09:47:20.398Z
2020-08-06T09:56:26.402Z
2020-08-06T10:03:59.522Z
2020-08-06T10:09:53.165Z


In [0]:
display(spark.sql('SELECT event_tm FROM tmp_trade_moving_avg LIMIT 5'))

event_tm
2020-08-06T10:49:37.345Z
2020-08-06T12:00:11.545Z
2020-08-06T13:11:57.308Z
2020-08-06T14:27:13.014Z
2020-08-06T15:39:08.521Z


In [0]:

display(spark.sql("DESCRIBE quote_union_update").show(truncate=False))

+-----------------+-------------+-------+
|col_name         |data_type    |comment|
+-----------------+-------------+-------+
|trade_dt         |date         |NULL   |
|symbol           |string       |NULL   |
|exchange         |string       |NULL   |
|event_tm         |timestamp    |NULL   |
|event_seq_nb     |int          |NULL   |
|bid_pr           |decimal(10,2)|NULL   |
|bid_size         |int          |NULL   |
|ask_pr           |decimal(10,2)|NULL   |
|ask_size         |int          |NULL   |
|arrival_tm       |timestamp    |NULL   |
|trade_pr         |decimal(10,2)|NULL   |
|mov_avg_pr       |decimal(14,6)|NULL   |
|rec_type         |string       |NULL   |
|latest_trade_pr  |decimal(10,2)|NULL   |
|latest_mov_avg_pr|decimal(14,6)|NULL   |
+-----------------+-------------+-------+



In [0]:
display(spark.sql("DESCRIBE quote_union"))

col_name,data_type,comment
trade_dt,date,
symbol,string,
exchange,string,
event_tm,timestamp,
event_seq_nb,int,
bid_pr,"decimal(10,2)",
bid_size,int,
ask_pr,"decimal(10,2)",
ask_size,int,
arrival_tm,timestamp,


In [0]:
#4.4.2 Join With Table temp_last_trade To Get The Prior Day Close Price
from pyspark.sql.functions import broadcast, expr

# Load the prior day close price table
last_trade_df = spark.sql("SELECT * FROM temp_last_trade")

# Broadcast join with corrected column name
quote_final = quote_update.alias("q").join(
    broadcast(last_trade_df).alias("lt"),
    on=["symbol", "exchange"],
    how="left"
).select(
    "q.trade_dt",
    "q.symbol",
    "q.event_tm",
    "q.event_seq_nb",
    "q.exchange",
    "q.bid_pr",
    "q.bid_size",
    "q.ask_pr",
    "q.ask_size",
    "q.latest_trade_pr",
    "q.latest_mov_avg_pr",
    expr("q.bid_pr - lt.last_pr").alias("bid_pr_mv"),
    expr("q.ask_pr - lt.last_pr").alias("ask_pr_mv")
)

quote_final.createOrReplaceTempView("quote_final")

# Preview the final result
display(quote_final)


trade_dt,symbol,event_tm,event_seq_nb,exchange,bid_pr,bid_size,ask_pr,ask_size,latest_trade_pr,latest_mov_avg_pr,bid_pr_mv,ask_pr_mv
2020-08-06,SYMA,2020-08-06T09:39:01.293Z,1,NYSE,77.68,100,78.44,100,,,-0.11,0.65
2020-08-06,SYMA,2020-08-06T09:39:01.293Z,1,NYSE,77.68,100,78.44,100,,,-0.11,0.65
2020-08-06,SYMA,2020-08-06T09:39:01.293Z,1,NYSE,77.68,100,78.44,100,,,-0.11,0.65
2020-08-06,SYMA,2020-08-06T09:39:01.293Z,1,NYSE,77.68,100,78.44,100,,,-0.11,0.65
2020-08-06,SYMA,2020-08-06T09:39:01.293Z,1,NYSE,77.68,100,78.44,100,,,-0.11,0.65
2020-08-06,SYMA,2020-08-06T09:39:01.293Z,1,NYSE,77.68,100,78.44,100,,,-0.11,0.65
2020-08-06,SYMA,2020-08-06T09:39:01.293Z,1,NYSE,77.68,100,78.44,100,,,-0.11,0.65
2020-08-06,SYMA,2020-08-06T09:39:01.293Z,1,NYSE,77.68,100,78.44,100,,,-0.11,0.65
2020-08-06,SYMA,2020-08-06T09:39:01.293Z,1,NYSE,77.68,100,78.44,100,,,-0.11,0.65
2020-08-06,SYMA,2020-08-06T09:39:01.293Z,1,NYSE,77.68,100,78.44,100,,,-0.11,0.65


In [0]:
# Write The Final Dataframe Into Azure Blob Storage At Corresponding Partition
#trade_date_str = trade_date.strftime("%Y-%m-%d")
output_path = f"wasbs://{container}@{storage_account_name}.blob.core.windows.net/quote-trade-analytical/date={trade_date}"
quote_final.write.mode("overwrite").parquet(output_path)

In [0]:
# Cell 3
from configparser import ConfigParser

# Read config file from workspace path (upload via UI if needed)
config_path = "/Workspace/Users/marchuai@outlook.com/Equity Project/capstone_project/config/config.ini"
config = ConfigParser()
config.read(config_path)

run_reporter_etl(config)


Running ETL for: 2020-08-06
Job analytical_etl_20250403050415 updated to success
