In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType, DoubleType, ArrayType
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql.window import Window

In [2]:
spark = SparkSession.builder.appName("StockDataCleaning") \
    .config("spark.executor.memory", "4g").getOrCreate()

23/11/13 19:56:00 WARN Utils: Your hostname, michaelwoan-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
23/11/13 19:56:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/13 19:56:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/13 19:56:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
schema = StructType([ \
    StructField("transaction_id", StringType(), True), \
    StructField("stock_id", StringType(), True), \
    StructField("ticker_symbol", StringType(), True), \
    StructField("date", DateType(), True), \
    StructField("low", FloatType(), True), \
    StructField("open", FloatType(), True), \
    StructField("high", FloatType(), True), \
    StructField("volume", IntegerType(), True), \
    StructField("close", FloatType(), True)])

In [4]:
stockData = spark.read.schema(schema).csv("./stocks-data/combined-stocks-data.csv", header=True)

In [5]:
stockData.printSchema()

root
 |-- transaction_id: string (nullable = true)
 |-- stock_id: string (nullable = true)
 |-- ticker_symbol: string (nullable = true)
 |-- date: date (nullable = true)
 |-- low: float (nullable = true)
 |-- open: float (nullable = true)
 |-- high: float (nullable = true)
 |-- volume: integer (nullable = true)
 |-- close: float (nullable = true)



In [6]:
stockData.show()

+--------------------+--------------------+-------------+----------+--------+--------+--------+---------+--------+
|      transaction_id|            stock_id|ticker_symbol|      date|     low|    open|    high|   volume|   close|
+--------------------+--------------------+-------------+----------+--------+--------+--------+---------+--------+
|b91319ee-7e00-422...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|c25c9b76-bba2-42d...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|eb207f83-6265-4e6...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|4b83d021-0fc5-465...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|06a5cd4a-53ff-49b...|c1fe7393-9e77-407...|         TSLA|2010-06-30|1.553333|1.719333|   2.028|257806500|1.588667|
|9fc036ba-1f88-48c...|c1fe7393-9e77-407...|         TSLA|2010-06-30|1.553333|1.7

In [7]:
# Missing Values
missing_values = stockData.select([func.sum(func.col(c).isNull().cast("int")).alias(c + '_missing') for c in stockData.columns]).collect()

print("Missing Values:")
for row in missing_values[0].asDict():
    print(f"{row}: {missing_values[0][row]}")




Missing Values:
transaction_id_missing: 0
stock_id_missing: 0
ticker_symbol_missing: 0
date_missing: 0
low_missing: 0
open_missing: 0
high_missing: 0
volume_missing: 140
close_missing: 0


                                                                                

In [8]:
# Filter rows where 'volume' is missing
missing_volume_rows = stockData.filter(stockData['volume'].isNull())

# Show the missing values
missing_volume_rows.show()


+--------------------+--------------------+-------------+----------+--------+--------+--------+------+--------+
|      transaction_id|            stock_id|ticker_symbol|      date|     low|    open|    high|volume|   close|
+--------------------+--------------------+-------------+----------+--------+--------+--------+------+--------+
|56bd39f1-60c3-41a...|c5e95634-4871-49f...|         AAPL|1983-09-23| 0.09933|0.111607|0.111607|  NULL|0.108259|
|869ea8de-1017-40f...|c5e95634-4871-49f...|         AAPL|1983-09-23| 0.09933|0.111607|0.111607|  NULL|0.108259|
|f1a92d79-219f-424...|c5e95634-4871-49f...|         AAPL|1983-09-23| 0.09933|0.111607|0.111607|  NULL|0.108259|
|8829ded0-09be-431...|c5e95634-4871-49f...|         AAPL|1983-09-23| 0.09933|0.111607|0.111607|  NULL|0.108259|
|1f356f8d-6d7b-4d8...|c5e95634-4871-49f...|         AAPL|1997-08-06|0.223214|0.225446|0.247768|  NULL|0.234933|
|8b6b3e2b-f50d-4b6...|c5e95634-4871-49f...|         AAPL|1997-08-06|0.223214|0.225446|0.247768|  NULL|0.

In [9]:
# Fill missing values in the 'volume' column with 0
stockData = stockData.na.fill(0, subset=['volume'])

# Show the DataFrame after filling missing values
stockData.show()


+--------------------+--------------------+-------------+----------+--------+--------+--------+---------+--------+
|      transaction_id|            stock_id|ticker_symbol|      date|     low|    open|    high|   volume|   close|
+--------------------+--------------------+-------------+----------+--------+--------+--------+---------+--------+
|b91319ee-7e00-422...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|c25c9b76-bba2-42d...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|eb207f83-6265-4e6...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|4b83d021-0fc5-465...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|06a5cd4a-53ff-49b...|c1fe7393-9e77-407...|         TSLA|2010-06-30|1.553333|1.719333|   2.028|257806500|1.588667|
|9fc036ba-1f88-48c...|c1fe7393-9e77-407...|         TSLA|2010-06-30|1.553333|1.7

In [11]:
# Drop duplicate rows based on 'date' and 'close' columns
cleanedStockData = stockData.dropDuplicates(['date', 'close'])

# Check for duplicate values in the 'date' column again
duplicate_rows = cleanedStockData.groupBy('date', 'close').count().filter('count > 1')

# Show the duplicate dates and close prices, if any
if duplicate_rows.count() > 0:
    print("Duplicate dates and close prices found after deduplication:")
    duplicate_rows.show()
else:
    print("No duplicate dates and close prices found.")


No duplicate dates and close prices found.


In [12]:
cleanedStockData.orderBy(func.desc("date")).show()



+--------------------+--------------------+-------------+----------+--------+------+------+---------+------+
|      transaction_id|            stock_id|ticker_symbol|      date|     low|  open|  high|   volume| close|
+--------------------+--------------------+-------------+----------+--------+------+------+---------+------+
|f38ed296-ff46-4c1...|0cb4048a-7fe6-441...|         GOOG|2023-11-13|  132.77|133.36|134.11| 14786236|133.64|
|8cbdb4d4-889d-487...|c5e95634-4871-49f...|         AAPL|2023-11-13|  184.21|185.82|186.03| 43553315| 184.8|
|50787f93-b075-47c...|c1fe7393-9e77-407...|         TSLA|2023-11-13|211.6101| 215.6| 225.4|137561292|223.71|
|113f51fd-6da3-4cd...|1ec4324e-7d11-49b...|         MSFT|2023-11-13| 365.915|368.22|368.46| 19318985|366.68|
|bca35b47-8911-453...|0cb4048a-7fe6-441...|         GOOG|2023-11-10|  130.87|131.53|134.27| 20872900|134.06|
|f8a125fa-728b-4b0...|c5e95634-4871-49f...|         AAPL|2023-11-10|  183.53|183.97|186.57| 66133400| 186.4|
|aae8bec2-423c-4c2.

                                                                                

In [15]:
# Updated function to calculate exponential moving average
def calculate_ema(data, alpha):
    ema = data[0]  # Set the initial value as the first data point
    for i in range(1, len(data)):
        ema = alpha * data[i] + (1 - alpha) * ema
    
    return ema

# Convert the calculate_ema_udf function to a Spark UDF
calculate_ema_udf = F.udf(lambda data, alpha: float(calculate_ema(data, alpha)), FloatType())

# Define the window specifications for different moving averages
short_term_window = Window().partitionBy("transaction_id", "stock_id").orderBy(F.desc("date")).rowsBetween(0, 4)
intermediate_term_window = Window().partitionBy("transaction_id", "stock_id").orderBy(F.desc("date")).rowsBetween(0, 19)
fifty_day_window = Window().partitionBy("transaction_id", "stock_id").orderBy(F.desc("date")).rowsBetween(0, 49)
long_term_window = Window().partitionBy("transaction_id", "stock_id").orderBy(F.desc("date")).rowsBetween(0, 199)

# Calculate moving averages for each window
cleanedStockData = cleanedStockData.withColumn("5_days_sma", F.round(F.avg("close").over(short_term_window), 2))
cleanedStockData = cleanedStockData.withColumn("20_days_sma", F.round(F.avg("close").over(intermediate_term_window), 2))
cleanedStockData = cleanedStockData.withColumn("50_days_sma", F.round(F.avg("close").over(fifty_day_window), 2))
cleanedStockData = cleanedStockData.withColumn("200_days_sma", F.round(F.avg("close").over(long_term_window), 2))

# Define the periods and alpha values for exponential moving averages
short_term_period = 5
intermediate_term_period = 20
fifty_day_period = 50
long_term_period = 200
alpha_short = 2 / (short_term_period + 1)
alpha_intermediate = 2 / (intermediate_term_period + 1)
alpha_fifty = 2 / (fifty_day_period + 1)
alpha_long = 2 / (long_term_period + 1)

# Apply the UDF to calculate EMAs
cleanedStockData = cleanedStockData.withColumn("5_days_ema", F.round(calculate_ema_udf(F.collect_list("close").over(Window().orderBy(F.desc("date")).rowsBetween(0, short_term_period - 1)), F.lit(alpha_short)), 2))
cleanedStockData = cleanedStockData.withColumn("20_days_ema", F.round(calculate_ema_udf(F.collect_list("close").over(Window().orderBy(F.desc("date")).rowsBetween(0, intermediate_term_period - 1)), F.lit(alpha_intermediate)), 2))
cleanedStockData = cleanedStockData.withColumn("50_days_ema", F.round(calculate_ema_udf(F.collect_list("close").over(Window().orderBy(F.desc("date")).rowsBetween(0, fifty_day_period - 1)), F.lit(alpha_fifty)), 2))
cleanedStockData = cleanedStockData.withColumn("200_days_ema", F.round(calculate_ema_udf(F.collect_list("close").over(Window().orderBy(F.desc("date")).rowsBetween(0, long_term_period - 1)), F.lit(alpha_long)), 2))

# Show the result
cleanedStockData.select(['date', 'close', '5_days_sma', '20_days_sma', '50_days_sma', '200_days_sma', '5_days_ema', '20_days_ema', '50_days_ema', '200_days_ema']).orderBy(F.desc("date")).show()

23/11/13 20:04:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/13 20:04:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/13 20:04:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/13 20:04:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/13 20:04:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/11/13 20:04:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 49:

+----------+------+----------+-----------+-----------+------------+----------+-----------+-----------+------------+
|      date| close|5_days_sma|20_days_sma|50_days_sma|200_days_sma|5_days_ema|20_days_ema|50_days_ema|200_days_ema|
+----------+------+----------+-----------+-----------+------------+----------+-----------+-----------+------------+
|2023-11-13|366.68|    366.68|     366.68|     366.68|      366.68|    221.31|     224.78|     225.42|       242.4|
|2023-11-13|223.71|    223.71|     223.71|     223.71|      223.71|    209.45|     222.65|     224.57|      222.83|
|2023-11-13| 184.8|     184.8|      184.8|      184.8|       184.8|    219.45|     226.15|     225.99|      217.85|
|2023-11-13|133.64|    133.64|     133.64|     133.64|      133.64|    260.78|     235.55|     229.61|      211.67|
|2023-11-10|214.65|    214.65|     214.65|     214.65|      214.65|    207.41|     221.83|     224.32|      221.65|
|2023-11-10|134.06|    134.06|     134.06|     134.06|      134.06|    2

                                                                                

In [13]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# minus 1 cuz of 0-based index
short_term_date = 4
intermediate_term_date = 19
long_term_date = 199
fifty_day_date = 49
round_to_decimal = 2

partition_cols = ["transaction_id", "stock_id"]

# Define the window specifications for different moving averages
short_term_window = Window().partitionBy(partition_cols).orderBy(F.desc("date")).rowsBetween(0, short_term_date)
intermediate_term_window = Window().partitionBy(partition_cols).orderBy(F.desc("date")).rowsBetween(0, intermediate_term_date)
fifty_day_window = Window().partitionBy(partition_cols).orderBy(F.desc("date")).rowsBetween(0, fifty_day_date)
long_term_window = Window().partitionBy(partition_cols).orderBy(F.desc("date")).rowsBetween(0, long_term_date)

# Calculate moving averages for each window
cleanedStockData = cleanedStockData.withColumn("5_days_sma", F.round(F.avg("close").over(short_term_window), round_to_decimal))
cleanedStockData = cleanedStockData.withColumn("20_days_sma", F.round(F.avg("close").over(intermediate_term_window), round_to_decimal))
cleanedStockData = cleanedStockData.withColumn("50_days_sma", F.round(F.avg("close").over(fifty_day_window), round_to_decimal))
cleanedStockData = cleanedStockData.withColumn("200_days_sma", F.round(F.avg("close").over(long_term_window), round_to_decimal))

# Assuming 'cleanedStockData' is your DataFrame
bottom_200_rows = cleanedStockData.select(['ticker_symbol','date', 'close', "5_days_sma", "20_days_sma", "50_days_sma", "200_days_sma"]) \
                                  .orderBy(F.desc("date")).limit(200)

# Show the bottom 200 rows
bottom_200_rows.show()


+-------------+----------+------+----------+-----------+-----------+------------+
|ticker_symbol|      date| close|5_days_sma|20_days_sma|50_days_sma|200_days_sma|
+-------------+----------+------+----------+-----------+-----------+------------+
|         MSFT|2023-11-13|366.68|    366.68|     366.68|     366.68|      366.68|
|         TSLA|2023-11-13|223.71|    223.71|     223.71|     223.71|      223.71|
|         AAPL|2023-11-13| 184.8|     184.8|      184.8|      184.8|       184.8|
|         GOOG|2023-11-13|133.64|    133.64|     133.64|     133.64|      133.64|
|         TSLA|2023-11-10|214.65|    214.65|     214.65|     214.65|      214.65|
|         GOOG|2023-11-10|134.06|    134.06|     134.06|     134.06|      134.06|
|         MSFT|2023-11-10|369.67|    369.67|     369.67|     369.67|      369.67|
|         AAPL|2023-11-10| 186.4|     186.4|      186.4|      186.4|       186.4|
|         TSLA|2023-11-09|209.98|    209.98|     209.98|     209.98|      209.98|
|         AAPL|2

In [None]:
#  Bollinger Bands with SMAs
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

round_to_decimal = 2

# Assuming you have a SparkSession named spark and your DataFrame is named cleanedStockData
intermediate_term_date = 20

# Define the window specification
bollinger_window = Window().orderBy(F.desc("date")).rowsBetween(0, intermediate_term_date - 1)  # Adjust the window size as needed

# Calculate the 20-day simple moving average (SMA)
cleanedStockData = cleanedStockData.withColumn("20_days_sma", F.round(F.avg("close").over(bollinger_window), round_to_decimal))

# Calculate the standard deviation of the close prices over the same window
cleanedStockData = cleanedStockData.withColumn("20_days_stddev", F.round(F.stddev("close").over(bollinger_window), round_to_decimal))

# Calculate the upper and lower Bollinger Bands
cleanedStockData = cleanedStockData.withColumn("upper_band", F.round(F.col("20_days_sma") + 2 * F.col("20_days_stddev"), round_to_decimal))
cleanedStockData = cleanedStockData.withColumn("lower_band", F.round(F.col("20_days_sma") - 2 * F.col("20_days_stddev"), round_to_decimal))

# Show the resulting DataFrame
cleanedStockData.select("date", "close", "20_days_sma", "upper_band", "lower_band").show()


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.window import Window

# Updated function to calculate exponential moving average
def calculate_ema(data, alpha):
    ema = data[0]  # Set the initial value as the first data point
    for i in range(1, len(data)):
        ema = alpha * data[i] + (1 - alpha) * ema
    
    return ema

# Convert the calculate_ema_udf function to a Spark UDF
calculate_ema_udf = func.udf(lambda data, alpha: float(calculate_ema(data, alpha)), FloatType())

# Define the periods and alpha values
short_term_period = 5
intermediate_term_period = 20
fifty_day_period = 50
long_term_period = 200
alpha_short = 2 / (short_term_period + 1)
alpha_intermediate = 2 / (intermediate_term_period + 1)
alpha_fifty = 2 / (fifty_day_period + 1)
alpha_long = 2 / (long_term_period + 1)

round_to_decimal = 2

# Apply the UDF to calculate EMAs
cleanedStockData = cleanedStockData.withColumn("5_days_ema", func.round(calculate_ema_udf(func.collect_list("close"). \
                                                                               over(Window().orderBy(func.desc("date")). \
                                                                                    rowsBetween(0, short_term_period - 1)), func.lit(alpha_short)), round_to_decimal))
cleanedStockData = cleanedStockData.withColumn("20_days_ema", func.round(calculate_ema_udf(func.collect_list("close").over(Window().orderBy(func.desc("date")). \
                                                                                                                rowsBetween(0, intermediate_term_period - 1)), func.lit(alpha_intermediate)), round_to_decimal))
cleanedStockData = cleanedStockData.withColumn("50_days_ema", func.round(calculate_ema_udf(func.collect_list("close").over(Window().orderBy(func.desc("date")). \
                                                                                                                rowsBetween(0, fifty_day_period - 1)), func.lit(alpha_fifty)), round_to_decimal))
cleanedStockData = cleanedStockData.withColumn("200_days_ema", func.round(calculate_ema_udf(func.collect_list("close").over(Window().orderBy(func.desc("date")). \
                                                                                                                 rowsBetween(0, long_term_period - 1)), func.lit(alpha_long)), round_to_decimal))

# Show the result
cleanedStockData.select(['date', 'close', '5_days_ema', '20_days_ema', '50_days_ema', '200_days_ema']).orderBy(func.desc("date")).show()



In [None]:
#  Bollinger Bands with EMAs
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Updated function to calculate exponential moving average
def calculate_ema(data, alpha):
    ema = data[0]  # Set the initial value as the first data point
    for i in range(1, len(data)):
        ema = alpha * data[i] + (1 - alpha) * ema
    
    return ema

# Convert the calculate_ema_udf function to a Spark UDF
calculate_ema_udf = F.udf(lambda data, alpha: float(calculate_ema(data, alpha)), FloatType())


round_to_decimal = 2

# Assuming you have a SparkSession named spark and your DataFrame is named cleanedStockData
intermediate_term_date = 20

# alpha value for 20 days
alpha_intermediate = 2 / (intermediate_term_period + 1)

# Define the window specification
bollinger_window = Window().orderBy(F.desc("date")).rowsBetween(0, intermediate_term_date - 1)  # Adjust the window size as needed

# Calculate the 20-day Exponential moving average (SMA)
cleanedStockData = cleanedStockData.withColumn("20_days_ema", F.round(calculate_ema_udf(F.collect_list("close").over(Window().orderBy(F.desc("date")). \
                                                                                                                rowsBetween(0, intermediate_term_period - 1)), func.lit(alpha_intermediate)), round_to_decimal))

# Calculate the standard deviation of the close prices over the same window
cleanedStockData = cleanedStockData.withColumn("20_days_stddev", F.round(F.stddev("close").over(bollinger_window), round_to_decimal))

# Calculate the upper and lower Bollinger Bands
cleanedStockData = cleanedStockData.withColumn("upper_band", F.round(F.col("20_days_ema") + 2 * F.col("20_days_stddev"), round_to_decimal))
cleanedStockData = cleanedStockData.withColumn("lower_band", F.round(F.col("20_days_ema") - 2 * F.col("20_days_stddev"), round_to_decimal))

# Show the resulting DataFrame
cleanedStockData.select("date", "close", "20_days_ema", "upper_band", "lower_band").show()


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Assuming cleanedStockData is your DataFrame
# Add a unique identifier column "ID"
cleanedStockData = cleanedStockData.withColumn("ID", F.monotonically_increasing_id())

# Define the window specification
partition_cols = ["ID"]
rsi_window = Window().partitionBy(partition_cols).orderBy(F.desc("date")).rowsBetween(0, 13)  # Assuming a 14-day RSI, adjust as needed

# Calculate lag without window specification
cleanedStockData = cleanedStockData.withColumn("lag_close", F.lag("close").over(Window().orderBy("ID")))
cleanedStockData = cleanedStockData.withColumn("price_change", F.when(F.isnull(F.col("lag_close")), 0).otherwise(F.col("close") - F.col("lag_close")))

# Calculate the daily gains and losses
gains = F.when(F.col("price_change") > 0, F.col("price_change")).otherwise(0)
losses = F.when(F.col("price_change") < 0, F.abs(F.col("price_change"))).otherwise(0)

# Calculate the average gains and losses over the window
avg_gains = F.avg(gains).over(rsi_window)
avg_losses = F.avg(losses).over(rsi_window)

# Calculate the relative strength (RS)
rs = avg_gains / avg_losses

# Calculate the RSI
rsi = 100 - (100 / (1 + rs))

# Add the RSI column to the DataFrame
cleanedStockData = cleanedStockData.withColumn("rsi_14", F.when(F.isnull(rsi), 0).otherwise(rsi))

# Filter for the latest 200 days
latest_200_days = cleanedStockData.orderBy(F.desc("date")).limit(200)

# Show the resulting DataFrame
latest_200_days.select("ID", "date", "close", "rsi_14").show()
