In [0]:
!pip install findspark
!pip install pyspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
Collecting pyspark
  Downloading pyspark-3.5.5.tar.gz (317.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/317.2 MB[0m [31m?[0m eta [36m-:--:--[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/317.2 MB[0m [31m7.1 MB/s[0m eta [36m0:00:45[0m
[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/317.2 MB[0m [31m73.3 MB/s[0m eta [36m0:00:05[0m
[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/317.2 MB[0m [31m148.5 MB/s[0m eta [36m0:00:03[0m
[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [0]:
from datetime import datetime
import json
from azure.storage.blob import BlobServiceClient

class Tracker:
    """Record job status entries in a JSON blob held in Azure Blob Storage.

    The tracking blob contains a single JSON object that maps a job id to
    ``{"status": ..., "updated_time": ...}``.
    """

    def __init__(self, jobname, config):
        """Read the storage settings from ``config`` and build the blob service client.

        Args:
            jobname: Prefix used when generating job ids.
            config: Object exposing ``get(section, option)``; the options
                ``azure/storage_account_name``, ``azure/storage_account_key``,
                ``job_tracking/storage_container`` and
                ``job_tracking/tracking_file`` are read.
        """
        self.jobname = jobname
        self.config = config

        # Get values from config.ini or variables
        self.storage_account_name = config.get("azure", "storage_account_name")
        self.account_key = config.get("azure", "storage_account_key")
        self.container_name = config.get("job_tracking", "storage_container")
        self.blob_name = config.get("job_tracking", "tracking_file")

        # Construct connection string
        connection_str = (
            f"DefaultEndpointsProtocol=https;"
            f"AccountName={self.storage_account_name};"
            f"AccountKey={self.account_key};"
            f"EndpointSuffix=core.windows.net"
        )
        self.blob_service_client = BlobServiceClient.from_connection_string(connection_str)

    def assign_job_id(self):
        """Return a new job id of the form ``<jobname>_<YYYYmmddHHMMSS>`` based on the current time."""
        return f"{self.jobname}_{datetime.now().strftime('%Y%m%d%H%M%S')}"

    def _load_job_data(self, blob_client):
        """Return the tracking history stored in ``blob_client`` as a dict.

        A missing blob, or one whose content is not a JSON object, yields an
        empty history. Download failures (network, authentication, ...) are
        NOT swallowed: treating them as "no history" would make the caller
        overwrite the blob and lose every earlier entry.
        """
        if not blob_client.exists():
            return {}

        payload = blob_client.download_blob().readall()
        try:
            job_data = json.loads(payload.decode("utf-8"))
        except ValueError as exc:
            # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
            print(f"Tracking file {self.blob_name} is not valid JSON ({exc}); starting a new history")
            return {}

        if not isinstance(job_data, dict):
            print(f"Tracking file {self.blob_name} does not contain a JSON object; starting a new history")
            return {}
        return job_data

    def update_job_status(self, status, job_id=None):
        """Write ``status`` for a job into the tracking blob.

        Args:
            status: Status value stored under the job id.
            job_id: Id of the entry to create or update. When omitted a new
                id is generated with ``assign_job_id`` (the original
                behaviour); pass the id of an earlier call to update that
                same entry instead of adding a new one.

        Raises:
            Whatever the storage client raises when the existing blob cannot
            be downloaded or the updated blob cannot be uploaded.
        """
        if job_id is None:
            job_id = self.assign_job_id()
        update_time = datetime.now().isoformat()
        blob_client = self.blob_service_client.get_blob_client(container=self.container_name, blob=self.blob_name)

        job_data = self._load_job_data(blob_client)
        job_data[job_id] = {"status": status, "updated_time": update_time}
        blob_client.upload_blob(json.dumps(job_data, indent=4), overwrite=True)
        print(f"Job {job_id} updated to {status}")



In [0]:
import findspark
# findspark.init() runs before `import pyspark` — presumably so the Spark
# installation is on sys.path first; keep this order (TODO confirm it is
# still needed on the target cluster).
findspark.init()
import pyspark
from pyspark.sql import SparkSession
# Reuse the active session if there is one, otherwise build one with master
# "local" and application name "app".
spark = SparkSession.builder.master("local").appName("app").getOrCreate()
#spark = SparkSession.builder.getOrCreate()
# NOTE(review): this wildcard import brings in names such as `sum`, `min`,
# `max` and `round`, which shadow the Python built-ins in later cells.
from pyspark.sql.functions import *

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DateType, DecimalType

# Target layout for parsed records, as (column name, Spark type) pairs listed
# in the same order as the tuples produced by the CSV parser.
# The DecimalType(10, 2) precision and scale are examples; adjust as needed.
_schema_columns = [
    ("trade_dt", DateType()),
    ("rec_type", StringType()),
    ("symbol", StringType()),
    ("exchange", StringType()),
    ("event_tm", TimestampType()),
    ("event_seq_nb", IntegerType()),
    ("arrival_tm", TimestampType()),
    ("trade_pr", DecimalType(10, 2)),
    ("bid_pr", DecimalType(10, 2)),
    ("bid_size", IntegerType()),
    ("ask_pr", DecimalType(10, 2)),
    ("ask_size", IntegerType()),
    ("partition", StringType()),
]

# Every column is nullable, so unparseable lines can be stored as all-NULL rows.
schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _schema_columns]
)


com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:136)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:136)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:133)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:133)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

# Commented code

Column names in order

columns_tuple = ("trade_dt", "rec_type", "symbol", "exchange", "event_tm", "event_seq_nb", 
                 "arrival_tm", "trade_pr", "bid_pr", "bid_size", "ask_pr", "ask_size", "partition")

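Sample data (one trade record, then one quote record; the field order in the file differs from the column order above)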

2020-08-05,2020-08-05 09:30:00.0,T,SYMA,2020-08-05 10:37:21.581,10,NYSE,79.19488165597565,912

2020-08-05,2020-08-05 09:30:00.0,Q,SYMA,2020-08-05 09:34:51.505,1,NYSE,75.30254839137037,100,75.35916738004924,100

# Start here

In [0]:
# Earlier draft of parse_csv, kept for reference only: the whole cell is a
# string literal, so nothing in it is defined or executed.
# In this draft the date/timestamp fields are hard-coded placeholder strings
# and field 8 of a trade record is stored in ask_size.
'''
from datetime import datetime
from decimal import Decimal

def parse_csv(line: str):
    record_type_pos = 2  # Position of record type in CSV
    record = line.split(",")

    try:
        # Extract common fields
        trade_dt ="2020-02-01"# record[0]  # Trade date
        arrival_tm = "2020-02-01 00:00:00"#record[1]  # Arrival timestamp
        rec_type = record[2]  # Record type ("T" for trade, "Q" for quote)
        symbol = record[3]  # Stock symbol
        event_tm = "2020-02-01 00:00:00"#record[4]  # Event timestamp
        event_seq_nb = int(record[5])  # Event sequence number
        exchange = record[6]  # Exchange name

        # [logic to parse records]
        if rec_type == "T":  # If record type is trade
            trade_pr = Decimal(record[7])  # Trade price
            bid_pr = None  # No bid price for trade records
            bid_size = None  # No bid size for trade records
            ask_pr = None  # No ask price for trade records
            ask_size = int(record[8])  # Ask size 
            partition = "T"  # Partition indicator for trade
            return (trade_dt, rec_type, symbol, exchange, event_tm, event_seq_nb, 
                                arrival_tm, trade_pr, bid_pr, bid_size, ask_pr, ask_size, partition)

        elif rec_type == "Q":  # If record type is quote
            bid_pr = Decimal(record[7])  # Bid price
            bid_size = int(record[8])  # Bid size
            ask_pr = Decimal(record[9])  # Ask price
            ask_size = int(record[10])  # Ask size
            trade_pr = None  # No trade price for quote records
            partition = "Q"  # Partition indicator for quote
            return (trade_dt, rec_type, symbol, exchange, event_tm, event_seq_nb, 
                                arrival_tm, trade_pr, bid_pr, bid_size, ask_pr, ask_size, partition)

    except Exception as e:
        # [save record to dummy event in bad partition]
        # [fill in the fields as None or empty string]
        return (None, "B", None, None, None, None, None, None, None, None, None, None, "BAD_RECORD")
'''

In [0]:
from datetime import datetime
from decimal import Decimal

# Tuple returned for any line that cannot be parsed: twelve NULL columns plus
# the "B" (bad record) partition marker.
_BAD_RECORD = (None,) * 12 + ("B",)

# Accepted timestamp layouts, tried in order.
_TIMESTAMP_FORMATS = ("%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d %H:%M:%S")


def _parse_timestamp(text):
    """Parse a timestamp with or without fractional seconds; empty text gives None."""
    if not text:
        return None
    for fmt in _TIMESTAMP_FORMATS:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    raise ValueError(f"unrecognised timestamp: {text!r}")


def _to_int(text):
    """Return ``int(text)`` when ``text`` consists only of digits, otherwise None."""
    return int(text) if text.isdigit() else None


def parse_csv(line: str):
    """Parse one comma-separated trade ("T") or quote ("Q") line.

    Input field order: trade_dt, arrival_tm, rec_type, symbol, event_tm,
    event_seq_nb, exchange, followed by ``trade_pr`` for trades (any further
    fields are ignored) or ``bid_pr, bid_size, ask_pr, ask_size`` for quotes.
    Surrounding whitespace (including a trailing CR/LF) is stripped from every
    field before conversion.

    Args:
        line: The raw text line.

    Returns:
        A 13-tuple ordered as (trade_dt, rec_type, symbol, exchange, event_tm,
        event_seq_nb, arrival_tm, trade_pr, bid_pr, bid_size, ask_pr,
        ask_size, partition). ``partition`` is "T" or "Q" for parsed records.
        Any line that cannot be parsed (wrong type, too few fields, malformed
        value, or ``None``) yields twelve ``None`` values and partition "B".
        The function never raises.
    """
    try:
        # Splitting inside the try block means a None line becomes a bad
        # record instead of raising.
        record = [field.strip() for field in line.split(",")]

        # Convert Date and Timestamp fields correctly
        trade_dt = datetime.strptime(record[0], "%Y-%m-%d").date() if record[0] else None
        arrival_tm = _parse_timestamp(record[1])
        event_tm = _parse_timestamp(record[4])

        rec_type = record[2]
        symbol = record[3]
        event_seq_nb = _to_int(record[5])
        exchange = record[6]

        if rec_type == "T":
            trade_pr = Decimal(record[7]) if record[7] else None
            bid_pr = bid_size = ask_pr = ask_size = None
        elif rec_type == "Q" and len(record) >= 11:
            trade_pr = None
            bid_pr = Decimal(record[7]) if record[7] else None
            bid_size = _to_int(record[8])
            ask_pr = Decimal(record[9]) if record[9] else None
            ask_size = _to_int(record[10])
        else:
            return _BAD_RECORD

        # The partition marker of a good record is its record type.
        partition = rec_type
        return (trade_dt, rec_type, symbol, exchange, event_tm, event_seq_nb, arrival_tm,
                trade_pr, bid_pr, bid_size, ask_pr, ask_size, partition)

    except Exception:
        # Deliberately broad: a single malformed line must not abort the job.
        return _BAD_RECORD


In [0]:
import os

# Storage account that holds the equity data files.
storage_account_name = "trial25"
# Do not paste the account key into the notebook. It is read from the
# AZURE_STORAGE_ACCOUNT_KEY environment variable and falls back to the empty
# string when the variable is not set.
storage_account_key = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY", "")

In [0]:
# Set Spark Configuration for Azure Blob Storage
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key)

# Build every path from the account name so the paths always point at the
# same storage account the key was registered for.
container_name = "equity-data"
file_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/"
file_paths = [
    file_path + "part-00000-214fff0a-f408-466c-bb15-095cd8b648dc-c000.txt",
    file_path + "part-00000-5e4ced0a-66e2-442a-b020-347d0df4df8f-c000.txt",
]
# Read the files as plain text: one row per line, in a single `value` column.
#sc = spark.sparkContext
raw = spark.read.text(file_paths)

In [0]:
# Peek at the first five raw rows to confirm the files were read.
first_rows = raw.take(5)
print(first_rows)

[Row(value='2020-08-06,2020-08-06 09:30:00.0,Q,SYMA,2020-08-06 09:39:01.293,1,NYSE,77.67912845315918,100,78.43735795018011,100'), Row(value='2020-08-06,2020-08-06 09:30:00.0,Q,SYMA,2020-08-06 09:47:20.398,2,NYSE,76.53373290281257,100,76.94424897980778,100'), Row(value='2020-08-06,2020-08-06 09:30:00.0,Q,SYMA,2020-08-06 09:56:26.402,3,NYSE,75.12060723753581,100,75.39408189677017,100'), Row(value='2020-08-06,2020-08-06 09:30:00.0,Q,SYMA,2020-08-06 10:03:59.522,4,NYSE,74.86368997537707,100,75.76860454779845,100'), Row(value='2020-08-06,2020-08-06 09:30:00.0,Q,SYMA,2020-08-06 10:09:53.165,5,NYSE,77.77650010059894,100,78.8010914613886,100')]


In [0]:
# Check that the connection to the data container works by listing its contents.
# Reuses `file_path` instead of repeating the container URL.
display(dbutils.fs.ls(file_path))

path,name,size,modificationTime
wasbs://equity-data@trial25.blob.core.windows.net/output_dir/,output_dir/,0,1742414019000
wasbs://equity-data@trial25.blob.core.windows.net/part-00000-092ec1db-39ab-4079-9580-f7c7b516a283-c000.txt,part-00000-092ec1db-39ab-4079-9580-f7c7b516a283-c000.txt,75613,1742413265000
wasbs://equity-data@trial25.blob.core.windows.net/part-00000-214fff0a-f408-466c-bb15-095cd8b648dc-c000.txt,part-00000-214fff0a-f408-466c-bb15-095cd8b648dc-c000.txt,34241,1742323921000
wasbs://equity-data@trial25.blob.core.windows.net/part-00000-5e4ced0a-66e2-442a-b020-347d0df4df8f-c000.txt,part-00000-5e4ced0a-66e2-442a-b020-347d0df4df8f-c000.txt,34241,1741739184000
wasbs://equity-data@trial25.blob.core.windows.net/part-00000-c6c48831-3d45-4887-ba5f-82060885fc6c-c000.txt,part-00000-c6c48831-3d45-4887-ba5f-82060885fc6c-c000.txt,75600,1742413302000
wasbs://equity-data@trial25.blob.core.windows.net/trade/,trade/,0,1742328836000


In [0]:
# Extract the text of every row, then turn each line into a schema-ordered
# tuple with `parse_csv`.
line_rdd = raw.rdd.map(lambda row: row.value)
parsed_rdd = line_rdd.map(parse_csv)
parsed_rdd.take(1)

[(datetime.date(2020, 8, 6),
  'Q',
  'SYMA',
  'NYSE',
  datetime.datetime(2020, 8, 6, 9, 39, 1, 293000),
  1,
  datetime.datetime(2020, 8, 6, 9, 30),
  None,
  Decimal('77.67912845315918'),
  100,
  Decimal('78.43735795018011'),
  100,
  'Q')]

In [0]:
# Apply the explicit schema to the parsed tuples.
data = spark.createDataFrame(data=parsed_rdd, schema=schema)

# Preview the parsed rows (first 20, long values truncated).
data.show(n=20, truncate=True)

+----------+--------+------+--------+--------------------+------------+-------------------+--------+------+--------+------+--------+---------+
|  trade_dt|rec_type|symbol|exchange|            event_tm|event_seq_nb|         arrival_tm|trade_pr|bid_pr|bid_size|ask_pr|ask_size|partition|
+----------+--------+------+--------+--------------------+------------+-------------------+--------+------+--------+------+--------+---------+
|2020-08-06|       Q|  SYMA|    NYSE|2020-08-06 09:39:...|           1|2020-08-06 09:30:00|    NULL| 77.68|     100| 78.44|     100|        Q|
|2020-08-06|       Q|  SYMA|    NYSE|2020-08-06 09:47:...|           2|2020-08-06 09:30:00|    NULL| 76.53|     100| 76.94|     100|        Q|
|2020-08-06|       Q|  SYMA|    NYSE|2020-08-06 09:56:...|           3|2020-08-06 09:30:00|    NULL| 75.12|     100| 75.39|     100|        Q|
|2020-08-06|       Q|  SYMA|    NYSE|2020-08-06 10:03:...|           4|2020-08-06 09:30:00|    NULL| 74.86|     100| 75.77|     100|        Q|

In [0]:
# Number of raw input lines; compare with the parsed row count in the next cell.
raw.rdd.count()

600

In [0]:
# Number of parsed rows. Unparseable lines are kept as "B" rows rather than
# dropped, so this should equal the raw line count.
data.count()

600

In [0]:
# Disabled JSON counterpart of parse_csv: the whole cell is a string literal,
# so parse_json is NOT defined when this cell runs.
# NOTE(review): the .isdigit() calls assume the numeric JSON values arrive as
# strings — confirm against the data before re-enabling.
'''
import json

def parse_json(line: str):
    try:
        record = json.loads(line)  # Parse JSON record

        # Extract common fields
        trade_dt = datetime.strptime(record.get("trade_dt", ""), "%Y-%m-%d").date() if record.get("trade_dt") else None
        arrival_tm = datetime.strptime(record.get("arrival_tm", ""), "%Y-%m-%d %H:%M:%S.%f") if record.get("arrival_tm") else None
        event_tm = datetime.strptime(record.get("event_tm", ""), "%Y-%m-%d %H:%M:%S.%f") if record.get("event_tm") else None

        rec_type = record.get("event_type")
        symbol = record.get("symbol")
        event_seq_nb = int(record.get("event_seq_nb")) if record.get("event_seq_nb") and record.get("event_seq_nb").isdigit() else None
        exchange = record.get("exchange")

        if rec_type == "T":
            trade_pr = Decimal(record.get("trade_pr")) if record.get("trade_pr") else None
            bid_pr, bid_size, ask_pr, ask_size = None, None, None, None
            partition = "T"

        elif rec_type == "Q":
            trade_pr = None
            bid_pr = Decimal(record.get("bid_pr")) if record.get("bid_pr") else None
            bid_size = int(record.get("bid_size")) if record.get("bid_size") and record.get("bid_size").isdigit() else None
            ask_pr = Decimal(record.get("ask_pr")) if record.get("ask_pr") else None
            ask_size = int(record.get("ask_size")) if record.get("ask_size") and record.get("ask_size").isdigit() else None
            partition = "Q"

        else:
            return (None, None, None, None, None, None, None, None, None, None, None, None, "B")

        return (trade_dt, rec_type, symbol, exchange, event_tm, event_seq_nb, arrival_tm, 
                trade_pr, bid_pr, bid_size, ask_pr, ask_size, partition)

    except Exception:
        return (None, None, None, None, None, None, None, None, None, None, None, None, "B")
    '''


In [0]:
# Disabled JSON load: the whole cell is a string literal, so none of the
# names below are created when this cell runs.
# NOTE(review): inside the disabled code, data_json is built from parsed_rdd
# (the CSV records) rather than parsed_json — fix before re-enabling.
'''
# Set Spark Configuration for Azure Blob Storage
file_paths_json = [
    "wasbs://equity-data@trial25.blob.core.windows.net/part-00000-c6c48831-3d45-4887-ba5f-82060885fc6c-c000.txt",
    "wasbs://equity-data@trial25.blob.core.windows.net/part-00000-092ec1db-39ab-4079-9580-f7c7b516a283-c000.txt"
]
# Read the file
raw_json = spark.read.text(file_paths_json)

parsed_json = raw_json.rdd.map(lambda row: parse_json(row.value))

# Create DataFrame from RDD
data_json = spark.createDataFrame(parsed_rdd, schema=schema)

# Show parsed data
data_json.show()
'''

+----------+--------+------+--------+--------------------+------------+-------------------+--------+------+--------+------+--------+---------+
|  trade_dt|rec_type|symbol|exchange|            event_tm|event_seq_nb|         arrival_tm|trade_pr|bid_pr|bid_size|ask_pr|ask_size|partition|
+----------+--------+------+--------+--------------------+------------+-------------------+--------+------+--------+------+--------+---------+
|2020-08-06|       Q|  SYMA|    NYSE|2020-08-06 09:39:...|           1|2020-08-06 09:30:00|    NULL| 77.68|     100| 78.44|     100|        Q|
|2020-08-06|       Q|  SYMA|    NYSE|2020-08-06 09:47:...|           2|2020-08-06 09:30:00|    NULL| 76.53|     100| 76.94|     100|        Q|
|2020-08-06|       Q|  SYMA|    NYSE|2020-08-06 09:56:...|           3|2020-08-06 09:30:00|    NULL| 75.12|     100| 75.39|     100|        Q|
|2020-08-06|       Q|  SYMA|    NYSE|2020-08-06 10:03:...|           4|2020-08-06 09:30:00|    NULL| 74.86|     100| 75.77|     100|        Q|

In [0]:
# Row count of the parsed CSV DataFrame (the JSON count below is disabled).
print(data.count())
#print(data_json.count())

600


In [0]:
# Union the data_json to original data
#data_union = data.union(data_json)

In [0]:
#print(data_union.count())

1200


In [0]:
# Check that the DataFrame columns carry the types declared in `schema`.
data.dtypes

[('trade_dt', 'date'),
 ('rec_type', 'string'),
 ('symbol', 'string'),
 ('exchange', 'string'),
 ('event_tm', 'timestamp'),
 ('event_seq_nb', 'int'),
 ('arrival_tm', 'timestamp'),
 ('trade_pr', 'decimal(10,2)'),
 ('bid_pr', 'decimal(10,2)'),
 ('bid_size', 'int'),
 ('ask_pr', 'decimal(10,2)'),
 ('ask_size', 'int'),
 ('partition', 'string')]

In [0]:
#data_union.dtypes

[('trade_dt', 'date'),
 ('rec_type', 'string'),
 ('symbol', 'string'),
 ('exchange', 'string'),
 ('event_tm', 'timestamp'),
 ('event_seq_nb', 'int'),
 ('arrival_tm', 'timestamp'),
 ('trade_pr', 'decimal(10,2)'),
 ('bid_pr', 'decimal(10,2)'),
 ('bid_size', 'int'),
 ('ask_pr', 'decimal(10,2)'),
 ('ask_size', 'int'),
 ('partition', 'string')]

In [0]:
# Count the rows of each record type as a quick sanity check on the parse.
records_by_type = data.groupBy("rec_type")
event_type_counts = records_by_type.count()
display(event_type_counts)

rec_type,count
Q,540
T,60


In [0]:
#event_type_counts_json = data_json.groupBy("rec_type").count()
#display(event_type_counts_json)

rec_type,count
Q,540
T,60


In [0]:
#event_type_counts_total = data_union.groupBy("rec_type").count()
#display(event_type_counts_total)

rec_type,count
Q,1080
T,120


In [0]:
# Write the parsed data as parquet under output_dir, one sub-folder per
# value of the `partition` column, replacing any previous output.
output_path = f"{file_path}output_dir"
partitioned_writer = data.write.partitionBy("partition").mode("overwrite")
partitioned_writer.parquet(output_path)