## Import Libraries 

In [14]:
import findspark

# findspark.init() has to run BEFORE anything from `pyspark` is imported: its job
# is to locate the Spark installation and make its Python bindings importable.
# Calling it after the pyspark imports (as this cell used to) cannot affect which
# pyspark gets imported.
findspark.init()

import os
from datetime import datetime, timedelta

import pandas as pd
import yfinance as yf
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql import Window
# NOTE: these star imports bring Spark column helpers (col, lag, round, ...) into
# the notebook namespace, and `round` replaces Python's builtin of the same name.
# The feature-engineering cells rely on that, so the star imports are kept and
# their relative order (functions first, then types) is preserved.
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Last expression of the cell: shows the Spark home that findspark resolved.
findspark.find()

'/opt/homebrew/Cellar/apache-spark/4.0.0/libexec'

In [None]:
# Session settings: app name, local master, and the two Delta Lake hooks
# (SQL extension + catalog implementation) registered as Spark configs.
builder = (
    SparkSession.builder
    .appName("DeltaLake4App")
    .master("local[*]")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Pass the builder through the Delta helper, then create the session
# (getOrCreate() returns an already-running session if there is one).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

25/07/28 20:43:17 WARN SparkContext: Another SparkContext is being constructed (or threw an exception in its constructor). This may indicate an error, since only one SparkContext should be running in this JVM (see SPARK-2243). The other SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:59)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:481)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.command

25/07/28 20:43:36 ERROR Utils: Uncaught exception in thread driver-heartbeater
java.lang.NoClassDefFoundError: Could not initialize class org.apache.spark.executor.ProcfsMetricsGetter$
	at org.apache.spark.metrics.ProcessTreeMetrics$.getMetricValues(ExecutorMetricType.scala:94)
	at org.apache.spark.executor.ExecutorMetrics$.$anonfun$getCurrentMetrics$1(ExecutorMetrics.scala:103)
	at org.apache.spark.executor.ExecutorMetrics$.$anonfun$getCurrentMetrics$1$adapted(ExecutorMetrics.scala:102)
	at scala.collection.immutable.Vector.foreach(Vector.scala:2125)
	at org.apache.spark.executor.ExecutorMetrics$.getCurrentMetrics(ExecutorMetrics.scala:102)
	at org.apache.spark.SparkContext.reportHeartBeat(SparkContext.scala:2952)
	at org.apache.spark.SparkContext.$anonfun$new$25(SparkContext.scala:615)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1937)
	at org.apache.spark.Heartbeater$$anon$1.run(Heartbe

In [4]:
# Bare expression: the notebook renders the SparkSession object as this cell's output.
spark

## Get and Process Stock Market Data

In [9]:
# Anchor the download window on yesterday's date
yesterday = datetime.now() - timedelta(days=1)
end_date = yesterday.strftime("%Y-%m-%d")

# Window start: 3 * 365 days before yesterday (roughly three years; leap days are not accounted for)
start_date = (yesterday - timedelta(days=3*365)).strftime("%Y-%m-%d")

tickers = [
    "AAPL", "MSFT", "NVDA", "GOOGL", "AMZN", "META", "AVGO", "BRK-B", "TSLA", "TSM",
    "JPM", "WMT", "LLY", "ORCL", "V", "MA", "NFLX", "XOM", "COST", "JNJ",
    "ABBV", "SAP", "BABA", "GS", "HD", "VRTX", "UNH", "MRK", "PEP", "TMO"
]

# Download daily bars for every ticker in one request; group_by='ticker' makes
# the ticker symbol the top level of the returned column index.
#
# auto_adjust is passed explicitly rather than left to the library default, which
# yfinance has changed between releases (and warns about when it is omitted).
# True matches the data this notebook already works with: Open/High/Low/Close/Volume
# and no separate "Adj Close" column.
#
# NOTE(review): yfinance documents `end` as exclusive, so the last bar returned is
# the trading day before `end_date` -- confirm that excluding yesterday is intended.
raw = yf.download(
    tickers,
    start=start_date,
    end=end_date,
    group_by='ticker',
    auto_adjust=True,
)

  raw = yf.download(tickers, start=start_date, end=end_date, group_by='ticker')
[*********************100%***********************]  30 of 30 completed


In [12]:
# Flatten the ticker-grouped download into one long table: one frame per ticker,
# with the date index turned into a regular column and the symbol added as 'Ticker'.
df_list = [
    raw[ticker].reset_index().assign(Ticker=ticker)
    for ticker in tickers
]

flat_df = pd.concat(df_list, ignore_index=True)

# Identifier columns first, everything else in its existing order
leading = ['Ticker', 'Date']
cols = leading + [name for name in flat_df.columns if name not in leading]
flat_df = flat_df[cols]

# Preview 10 randomly chosen rows
flat_df.sample(10)

Price,Ticker,Date,Open,High,Low,Close,Volume
3055,AMZN,2022-10-10,115.099998,116.25,112.43,113.669998,42339700
1546,NVDA,2022-09-29,12.433765,12.485706,11.932339,12.206026,532763000
6687,TSLA,2025-04-11,251.839996,257.73999,241.360001,252.309998,128948100
11048,V,2024-09-12,282.515735,284.505283,279.998959,283.878571,4160800
2778,GOOGL,2024-08-29,163.517909,165.169911,159.477484,161.000107,19699800
14955,JNJ,2025-04-23,155.159831,155.764646,153.018187,154.059265,9099400
9613,LLY,2024-12-17,774.933279,785.035142,769.523717,775.69043,3924800
8080,JPM,2024-11-01,219.989947,222.538583,219.143679,219.379852,6923500
13441,XOM,2025-04-04,108.867318,109.818556,102.882439,103.387779,30841300
22393,TMO,2025-01-07,538.506028,550.353714,537.268368,544.624512,1839300


In [13]:
# Hand the flattened pandas frame over to Spark and print the resulting schema
spark_df = spark.createDataFrame(data=flat_df)
spark_df.printSchema()

root
 |-- Ticker: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: long (nullable = true)



### Feature Engineering

In [15]:
# Per-ticker window in date order, so lag() looks at the previous row of the same ticker
window_spec = Window.partitionBy('Ticker').orderBy('Date')

# Prev_close  : Close of the previous row within the ticker's window
# Daily_return: percentage change of Close versus Prev_close, rounded to 2 decimals
# (`round` here is the Spark column function brought in by the star import,
# not Python's builtin)
spark_df = (
    spark_df
    .withColumn('Prev_close', lag('Close').over(window_spec))
    .withColumn(
        'Daily_return',
        round((col('Close') - col('Prev_close')) / col('Prev_close') * 100, 2),
    )
)

In [17]:
# Daily volatility: the High-Low range expressed as a percentage of Low
high_low_pct = (col('High') - col('Low')) / col('Low') * 100

# Store it rounded to 2 decimals (Spark's `round`, via the star import)
spark_df = spark_df.withColumn('Volatility', round(high_low_pct, 2))