In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import (
    ArrayType,
    DateType,
    DecimalType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
)

In [2]:
spark: SparkSession = (
    SparkSession.builder.appName("StockStream")
    .master("local")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0,com.amazonaws:aws-java-sdk:1.12.183,org.apache.hadoop:hadoop-aws:3.2.2",
    )
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

In [26]:
hadoopConfig = spark.sparkContext._jsc.hadoopConfiguration()
hadoopConfig.set(
    "fs.s3a.aws.credentials.provider",
    "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
)
hadoopConfig.set("fs.s3a.access.key", "AKIAXYKJUHUXZ7Q4WJFQ")
hadoopConfig.set("fs.s3a.secret.key", "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")

In [5]:
kafka_df = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "stock-data")
    .option("startingOffsets", "earliest")
    .option("group.id", "stock-stream-consumer")
    .option("failOnDataLoss", False)
    .load()
)

In [6]:
kafka_df.show()

+-------------+--------------------+----------+---------+------+--------------------+-------------+
|          key|               value|     topic|partition|offset|           timestamp|timestampType|
+-------------+--------------------+----------+---------+------+--------------------+-------------+
|[41 41 50 4C]|[7B 22 63 6F 6D 7...|stock-data|        0|     0|2025-01-04 13:19:...|            0|
+-------------+--------------------+----------+---------+------+--------------------+-------------+



In [30]:
# kafka_df.writeStream.format("console").outputMode("append").start().awaitTermination()

In [7]:
schema = StructType(
    [
        StructField("type", StringType()),
        StructField(
            "data",
            ArrayType(
                StructType(
                    [
                        StructField("p", DecimalType()),
                        StructField("s", StringType()),
                        StructField("t", LongType()),
                        StructField("v", DecimalType()),
                        StructField("c", IntegerType()),
                    ]
                )
            ),
        ),
        StructField(
            "companyProfile",
            StructType(
                [
                    StructField("country", StringType()),
                    StructField("currency", StringType()),
                    StructField("exchange", StringType()),
                    StructField("ipo", DateType()),
                    StructField("marketCapitalization", DecimalType()),
                    StructField("name", StringType()),
                    StructField("phone", StringType()),
                    StructField("shareOutstanding", DecimalType()),
                    StructField("ticker", StringType()),
                    StructField("weburl", StringType()),
                    StructField("logo", StringType()),
                    StructField("finnhubIndustry", StringType()),
                ]
            ),
        ),
    ]
)

In [8]:
kafka_df = kafka_df\
    .withColumn("key", f.col("key").cast(StringType()))\
    .withColumn("value", f.col("value").cast(StringType()))

In [9]:
raw_df = kafka_df.withColumn("symbol",f.col("key")).withColumn("value",f.from_json(f.col("value"),schema)).select("symbol","value.*")

In [10]:
raw_df.show()

+------+-----+--------------------+--------------------+
|symbol| type|                data|      companyProfile|
+------+-----+--------------------+--------------------+
|  AAPL|trade|[{256, AAPL, 1735...|{US, USD, NASDAQ/...|
+------+-----+--------------------+--------------------+



In [11]:
raw_df = raw_df.filter(f.col("symbol").isNotNull())

In [12]:
raw_df.show()

+------+-----+--------------------+--------------------+
|symbol| type|                data|      companyProfile|
+------+-----+--------------------+--------------------+
|  AAPL|trade|[{256, AAPL, 1735...|{US, USD, NASDAQ/...|
+------+-----+--------------------+--------------------+



In [18]:
company_data = raw_df.select("companyProfile.*")

In [19]:
company_data.show()

+-------+--------+--------------------+----------+--------------------+---------+-----------+----------------+------+--------------------+--------------------+---------------+
|country|currency|            exchange|       ipo|marketCapitalization|     name|      phone|shareOutstanding|ticker|              weburl|                logo|finnhubIndustry|
+-------+--------+--------------------+----------+--------------------+---------+-----------+----------------+------+--------------------+--------------------+---------------+
|     US|     USD|NASDAQ/NMS (GLOBA...|1980-12-12|             1415993|Apple Inc|14089961010|            4375|  AAPL|https://www.apple...|https://static.fi...|     Technology|
+-------+--------+--------------------+----------+--------------------+---------+-----------+----------------+------+--------------------+--------------------+---------------+



In [16]:
company_data.printSchema()

root
 |-- country: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- exchange: string (nullable = true)
 |-- ipo: date (nullable = true)
 |-- marketCapitalization: decimal(10,0) (nullable = true)
 |-- name: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- shareOutstanding: decimal(10,0) (nullable = true)
 |-- ticker: string (nullable = true)
 |-- weburl: string (nullable = true)
 |-- logo: string (nullable = true)
 |-- finnhubIndustry: string (nullable = true)



In [21]:
final_company_data = company_data.select(
    "ticker",
    "name",
    "phone",
    "weburl",
    "logo",
    f.col("finnhubIndustry").alias("industry"),
    "marketCapitalization",
    "country",
    "exchange",
    "ipo",
    "shareOutstanding",
)

In [22]:
final_company_data.show()

+------+---------+-----------+--------------------+--------------------+----------+--------------------+-------+--------------------+----------+----------------+
|ticker|     name|      phone|              weburl|                logo|  industry|marketCapitalization|country|            exchange|       ipo|shareOutstanding|
+------+---------+-----------+--------------------+--------------------+----------+--------------------+-------+--------------------+----------+----------------+
|  AAPL|Apple Inc|14089961010|https://www.apple...|https://static.fi...|Technology|             1415993|     US|NASDAQ/NMS (GLOBA...|1980-12-12|            4375|
+------+---------+-----------+--------------------+--------------------+----------+--------------------+-------+--------------------+----------+----------------+



In [28]:
final_company_data.write.mode("append").parquet(
    "s3a://stock-data-output/company-data/final"
)

In [29]:
raw_df.show()

+------+-----+--------------------+--------------------+
|symbol| type|                data|      companyProfile|
+------+-----+--------------------+--------------------+
|  AAPL|trade|[{256, AAPL, 1735...|{US, USD, NASDAQ/...|
+------+-----+--------------------+--------------------+



In [57]:
price_data = raw_df\
    .select("symbol", f.posexplode("data").alias("pos", "data"))\
    .drop("pos")

In [58]:
price_data.show()

+------+--------------------+
|symbol|                data|
+------+--------------------+
|  AAPL|{256, AAPL, 17354...|
|  AAPL|{256, AAPL, 17354...|
+------+--------------------+



In [59]:
price_data = price_data.select(f.col("symbol").alias("ticker"),"data.*")

In [60]:
price_data.show()

+------+---+----+-------------+----+----+
|ticker|  p|   s|            t|   v|   c|
+------+---+----+-------------+----+----+
|  AAPL|256|AAPL|1735451912469|1300|NULL|
|  AAPL|256|AAPL|1735451912469|1300|NULL|
+------+---+----+-------------+----+----+



In [None]:
final_price_data = 