In [1]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Build the Spark session used by every later cell; getOrCreate() reuses a
# session that is already running instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("EOD Data Load")
    .getOrCreate()
)

In [None]:
# Step 3.1.1: Read Trade Partition Dataset From Its Temporary Location
#trade_common = spark.read.parquet("output_dir/partition=T")

In [16]:
# Step 1: Load the trade CSV. Column names come from the header row and
# column types are inferred by Spark from the data.
csv_file_path = r"C:\Users\march\Dropbox\DE Springboard\Guided Capstone\output_dir\partition=T\sample_trade_data_with_tm.csv"  # Adjust the path accordingly
csv_read_options = {"header": True, "inferSchema": True}
df = spark.read.csv(csv_file_path, **csv_read_options)

# Preview up to five rows to confirm the file was parsed as expected
df.show(n=5)

+----------+--------+------+--------+-------------------+------------+-------------------+--------+-------------------+---------+
|  trade_dt|rec_type|symbol|exchange|           event_tm|event_seq_nb|         arrival_tm|trade_pr|            file_tm|partition|
+----------+--------+------+--------+-------------------+------------+-------------------+--------+-------------------+---------+
|2020-07-29|       T|  AAPL|  NASDAQ|2020-07-29 09:30:00|           1|2020-07-29 09:30:05|   150.0|2020-07-29 09:30:10|        T|
|2020-07-29|       T|  MSFT|  NASDAQ|2020-07-29 09:31:00|           2|2020-07-29 09:31:05|   230.0|2020-07-29 09:31:10|        T|
|2020-07-29|       T|  GOOG|  NASDAQ|2020-07-29 09:32:00|           3|2020-07-29 09:32:05|  2750.0|2020-07-29 09:32:10|        T|
+----------+--------+------+--------+-------------------+------------+-------------------+--------+-------------------+---------+



In [17]:
# Step 2: Persist the CSV-derived DataFrame as Parquet, replacing any earlier output
output_parquet_path = r"C:\Users\march\Downloads\trades.parquet"  # Adjust the path accordingly
parquet_writer = df.write.mode("overwrite")
parquet_writer.parquet(output_parquet_path)

# Step 3: Load the Parquet output back as the common trade dataset
trade_common = spark.read.parquet(output_parquet_path)

# Preview up to five rows to confirm the Parquet round trip
trade_common.show(n=5)

+----------+--------+------+--------+-------------------+------------+-------------------+--------+-------------------+---------+
|  trade_dt|rec_type|symbol|exchange|           event_tm|event_seq_nb|         arrival_tm|trade_pr|            file_tm|partition|
+----------+--------+------+--------+-------------------+------------+-------------------+--------+-------------------+---------+
|2020-07-29|       T|  AAPL|  NASDAQ|2020-07-29 09:30:00|           1|2020-07-29 09:30:05|   150.0|2020-07-29 09:30:10|        T|
|2020-07-29|       T|  MSFT|  NASDAQ|2020-07-29 09:31:00|           2|2020-07-29 09:31:05|   230.0|2020-07-29 09:31:10|        T|
|2020-07-29|       T|  GOOG|  NASDAQ|2020-07-29 09:32:00|           3|2020-07-29 09:32:05|  2750.0|2020-07-29 09:32:10|        T|
+----------+--------+------+--------+-------------------+------------+-------------------+--------+-------------------+---------+



In [18]:
# Step 3.1.2: Keep only the columns needed for trade records
trade_columns = [
    "trade_dt",
    "symbol",
    "exchange",
    "event_tm",
    "event_seq_nb",
    "file_tm",
    "trade_pr",
]
trade = trade_common.select(*trade_columns)

# Print the schema to confirm the projected columns and their types
trade.printSchema()

root
 |-- trade_dt: date (nullable = true)
 |-- symbol: string (nullable = true)
 |-- exchange: string (nullable = true)
 |-- event_tm: timestamp (nullable = true)
 |-- event_seq_nb: integer (nullable = true)
 |-- file_tm: timestamp (nullable = true)
 |-- trade_pr: double (nullable = true)



In [19]:
# Step 3.1.3: Apply Data Correction
# Define the function that keeps only the latest version of each trade record
def applyLatest(df):
    """Return one row per trade key, taken from the most recent file.

    Rows that share (trade_dt, symbol, exchange, event_tm, event_seq_nb) are
    treated as versions of the same record. The version with the greatest
    ``file_tm`` is kept, and its ``trade_pr`` is returned with it.

    Ordering uses ``file_tm`` because that is the timestamp column this
    function aggregates on; to order by arrival time instead, the caller
    would need to pass that column through and this function would need to
    sort on it.

    Args:
        df: Spark DataFrame containing the five key columns plus ``file_tm``
            and ``trade_pr``.

    Returns:
        Spark DataFrame with the columns ``trade_dt``, ``symbol``,
        ``exchange``, ``event_tm``, ``event_seq_nb``, ``file_tm``,
        ``trade_pr`` (in that order), one row per key.
    """
    key_columns = ["trade_dt", "symbol", "exchange", "event_tm", "event_seq_nb"]

    # F.last() inside a groupBy has no defined ordering: it returns whichever
    # row the shuffle happens to deliver last, so a stale record could win.
    # max() over a struct compares fields left to right, so file_tm decides
    # the winner and trade_pr is carried from that same row (it also breaks
    # exact file_tm ties deterministically).
    newest = F.max(F.struct("file_tm", "trade_pr")).alias("newest")

    return (
        df.groupBy(*key_columns)
        .agg(newest)
        .select(
            *key_columns,
            F.col("newest.file_tm").alias("file_tm"),
            F.col("newest.trade_pr").alias("trade_pr"),
        )
    )

# Collapse duplicate trade keys so each record keeps only its latest version
trade_corrected = applyLatest(df=trade)

# Display the corrected records (same as show()'s defaults: up to 20 rows, long values truncated)
trade_corrected.show(n=20, truncate=True)

+----------+------+--------+-------------------+------------+-------------------+--------+
|  trade_dt|symbol|exchange|           event_tm|event_seq_nb|            file_tm|trade_pr|
+----------+------+--------+-------------------+------------+-------------------+--------+
|2020-07-29|  GOOG|  NASDAQ|2020-07-29 09:32:00|           3|2020-07-29 09:32:10|  2750.0|
|2020-07-29|  MSFT|  NASDAQ|2020-07-29 09:31:00|           2|2020-07-29 09:31:10|   230.0|
|2020-07-29|  AAPL|  NASDAQ|2020-07-29 09:30:00|           1|2020-07-29 09:30:10|   150.0|
+----------+------+--------+-------------------+------------+-------------------+--------+



In [20]:
# Step 3.1.4: Write The Trade Dataset Back To Parquet On Azure Blob Storage
trade_date = "2020-07-29"  # Example date for partitioning

# NOTE: this currently points at a local folder. For Azure Blob Storage, use the
# container path instead, e.g. "cloud-storage-path/trade/trade_dt={}".format(trade_date)
cloud_storage_path = r"C:\Users\march\Downloads\trade_dt={}".format(trade_date)

# Write the corrected trade DataFrame. Overwrite mode replaces the output of a
# previous run; without it the default save mode raises an error as soon as the
# target directory exists, so this cell could only ever be run once.
trade_corrected.write.mode("overwrite").parquet(cloud_storage_path)

# Summary
# In this notebook, we have practiced data normalization and using cloud storage with Spark output.

In [None]:
# If you want to run SQL queries against trade and quote data on Azure, how would you
# do that?

In [22]:
# Run a Spark SQL query against the corrected trade data.
# The view must be registered (Step 2) before it can be queried (Step 3).

# Step 2: Create a temporary view so SQL can refer to the DataFrame by name
trade_corrected.createOrReplaceTempView("trades_view")

# Step 3: Query the view. As written this returns every trade, because the
# price filter below sits outside the query string and is therefore disabled.
query_result = spark.sql("""
    SELECT *
    FROM trades_view 
""")
# Disabled filter: move the next line inside the query string to keep only trades priced above 200.
#WHERE trade_pr > 200

# Show the results of the query
# NOTE(review): the output recorded under this cell lists only rows with
# trade_pr > 200, so it appears to come from a run with the filter enabled;
# re-run the cell to refresh it.
query_result.show()

+----------+------+--------+-------------------+------------+-------------------+--------+
|  trade_dt|symbol|exchange|           event_tm|event_seq_nb|            file_tm|trade_pr|
+----------+------+--------+-------------------+------------+-------------------+--------+
|2020-07-29|  GOOG|  NASDAQ|2020-07-29 09:32:00|           3|2020-07-29 09:32:10|  2750.0|
|2020-07-29|  MSFT|  NASDAQ|2020-07-29 09:31:00|           2|2020-07-29 09:31:10|   230.0|
+----------+------+--------+-------------------+------------+-------------------+--------+



In [None]:
%sql
-- Run this cell in an Azure Databricks notebook; the %sql magic must be the
-- first line of the cell so the rest is interpreted as SQL.
-- Replace trade_table with the name of the table that holds the trade data.
SELECT *
FROM trade_table
WHERE exchange = 'NASDAQ';