In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType, DoubleType, ArrayType
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql.window import Window
from pyspark.sql import functions as F

In [2]:
spark = SparkSession.builder.appName("StockDataCleaning") \
    .config("spark.executor.memory", "4g").getOrCreate()

23/11/15 13:26:46 WARN Utils: Your hostname, michaelwoan-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
23/11/15 13:26:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/15 13:26:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/15 13:26:47 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
schema = StructType([ \
    StructField("transaction_id", StringType(), True), \
    StructField("stock_id", StringType(), True), \
    StructField("ticker_symbol", StringType(), True), \
    StructField("date", DateType(), True), \
    StructField("low", FloatType(), True), \
    StructField("open", FloatType(), True), \
    StructField("high", FloatType(), True), \
    StructField("volume", IntegerType(), True), \
    StructField("close", FloatType(), True)])

In [4]:
stockData = spark.read.schema(schema).csv("./stocks-data/combined-stocks-data.csv", header=True)

In [5]:
stockData.printSchema()

root
 |-- transaction_id: string (nullable = true)
 |-- stock_id: string (nullable = true)
 |-- ticker_symbol: string (nullable = true)
 |-- date: date (nullable = true)
 |-- low: float (nullable = true)
 |-- open: float (nullable = true)
 |-- high: float (nullable = true)
 |-- volume: integer (nullable = true)
 |-- close: float (nullable = true)



In [6]:
stockData.show()

+--------------------+--------------------+-------------+----------+--------+--------+--------+---------+--------+
|      transaction_id|            stock_id|ticker_symbol|      date|     low|    open|    high|   volume|   close|
+--------------------+--------------------+-------------+----------+--------+--------+--------+---------+--------+
|b91319ee-7e00-422...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|c25c9b76-bba2-42d...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|eb207f83-6265-4e6...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|4b83d021-0fc5-465...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|06a5cd4a-53ff-49b...|c1fe7393-9e77-407...|         TSLA|2010-06-30|1.553333|1.719333|   2.028|257806500|1.588667|
|9fc036ba-1f88-48c...|c1fe7393-9e77-407...|         TSLA|2010-06-30|1.553333|1.7

In [7]:
# Missing Values
missing_values = stockData.select([func.sum(func.col(c).isNull().cast("int")).alias(c + '_missing') for c in stockData.columns]).collect()

print("Missing Values:")
for row in missing_values[0].asDict():
    print(f"{row}: {missing_values[0][row]}")




Missing Values:
transaction_id_missing: 0
stock_id_missing: 0
ticker_symbol_missing: 0
date_missing: 0
low_missing: 0
open_missing: 0
high_missing: 0
volume_missing: 140
close_missing: 0


                                                                                

In [8]:
# Filter rows where 'volume' is missing
missing_volume_rows = stockData.filter(stockData['volume'].isNull())

# Show the missing values
missing_volume_rows.show()


+--------------------+--------------------+-------------+----------+--------+--------+--------+------+--------+
|      transaction_id|            stock_id|ticker_symbol|      date|     low|    open|    high|volume|   close|
+--------------------+--------------------+-------------+----------+--------+--------+--------+------+--------+
|56bd39f1-60c3-41a...|c5e95634-4871-49f...|         AAPL|1983-09-23| 0.09933|0.111607|0.111607|  NULL|0.108259|
|869ea8de-1017-40f...|c5e95634-4871-49f...|         AAPL|1983-09-23| 0.09933|0.111607|0.111607|  NULL|0.108259|
|f1a92d79-219f-424...|c5e95634-4871-49f...|         AAPL|1983-09-23| 0.09933|0.111607|0.111607|  NULL|0.108259|
|8829ded0-09be-431...|c5e95634-4871-49f...|         AAPL|1983-09-23| 0.09933|0.111607|0.111607|  NULL|0.108259|
|1f356f8d-6d7b-4d8...|c5e95634-4871-49f...|         AAPL|1997-08-06|0.223214|0.225446|0.247768|  NULL|0.234933|
|8b6b3e2b-f50d-4b6...|c5e95634-4871-49f...|         AAPL|1997-08-06|0.223214|0.225446|0.247768|  NULL|0.

In [9]:
# Fill missing values in the 'volume' column with 0
stockData = stockData.na.fill(0, subset=['volume'])

# Show the DataFrame after filling missing values
stockData.show()


+--------------------+--------------------+-------------+----------+--------+--------+--------+---------+--------+
|      transaction_id|            stock_id|ticker_symbol|      date|     low|    open|    high|   volume|   close|
+--------------------+--------------------+-------------+----------+--------+--------+--------+---------+--------+
|b91319ee-7e00-422...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|c25c9b76-bba2-42d...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|eb207f83-6265-4e6...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|4b83d021-0fc5-465...|c1fe7393-9e77-407...|         TSLA|2010-06-29|1.169333|1.266667|1.666667|281494500|1.592667|
|06a5cd4a-53ff-49b...|c1fe7393-9e77-407...|         TSLA|2010-06-30|1.553333|1.719333|   2.028|257806500|1.588667|
|9fc036ba-1f88-48c...|c1fe7393-9e77-407...|         TSLA|2010-06-30|1.553333|1.7

In [10]:
# Drop duplicate rows based on 'date' and 'close' columns
cleanedStockData = stockData.dropDuplicates(['date', 'close'])

# Check for duplicate values in the 'date' column again
duplicate_rows = cleanedStockData.groupBy('date', 'close').count().filter('count > 1')

# Show the duplicate dates and close prices, if any
if duplicate_rows.count() > 0:
    print("Duplicate dates and close prices found after deduplication:")
    duplicate_rows.show()
else:
    print("No duplicate dates and close prices found.")


No duplicate dates and close prices found.


In [11]:
cleanedStockData.orderBy(func.desc("date")).show()

+--------------------+--------------------+-------------+----------+--------+------+------+---------+------+
|      transaction_id|            stock_id|ticker_symbol|      date|     low|  open|  high|   volume| close|
+--------------------+--------------------+-------------+----------+--------+------+------+---------+------+
|8cbdb4d4-889d-487...|c5e95634-4871-49f...|         AAPL|2023-11-13|  184.21|185.82|186.03| 43553315| 184.8|
|f38ed296-ff46-4c1...|0cb4048a-7fe6-441...|         GOOG|2023-11-13|  132.77|133.36|134.11| 14786236|133.64|
|50787f93-b075-47c...|c1fe7393-9e77-407...|         TSLA|2023-11-13|211.6101| 215.6| 225.4|137561292|223.71|
|113f51fd-6da3-4cd...|1ec4324e-7d11-49b...|         MSFT|2023-11-13| 365.915|368.22|368.46| 19318985|366.68|
|bca35b47-8911-453...|0cb4048a-7fe6-441...|         GOOG|2023-11-10|  130.87|131.53|134.27| 20872900|134.06|
|f8a125fa-728b-4b0...|c5e95634-4871-49f...|         AAPL|2023-11-10|  183.53|183.97|186.57| 66133400| 186.4|
|aae8bec2-423c-4c2.

## Stock Data Moving Averages Analysis

This PySpark script calculates Simple Moving Averages (SMA) and Exponential Moving Averages (EMA) for different periods on stock data. The analysis includes importing libraries, defining functions, setting parameters, and displaying the results.

- **Moving Averages:**
  - Simple Moving Averages (SMA) for periods: 5, 20, 50, 200.
  - Exponential Moving Averages (EMA) with corresponding alpha values.

- **Data Manipulation:**
  - Utilizes PySpark functions and windows for efficient data processing.

- **Result Display:**
  - Presents the DataFrame with date, close price, SMAs, and EMAs in descending order.


In [12]:
round_to_decimal = 2

def calculate_ema(data, alpha):
    ema = data[0]
    for i in range(1, len(data)):
        ema = alpha * data[i] + (1 - alpha) * ema
    return ema

calculate_ema_udf = F.udf(lambda data, alpha: float(calculate_ema(data, alpha)), FloatType())

periods = [5, 20, 50, 200]
alpha_values = [2 / (p + 1) for p in periods]

partition_cols = ["stock_id", "ticker_symbol"]

windows = [Window().partitionBy(partition_cols).orderBy(F.desc("date")).rowsBetween(0, p - 1) for p in periods]

# Calculate simple moving averages
for p in periods:
    cleanedStockData = cleanedStockData.withColumn(f"{p}_days_sma", F.round(F.avg("close").over(windows[periods.index(p)]), 2))

# Calculate exponential moving averages using UDF
for p, alpha in zip(periods, alpha_values):
    cleanedStockData = cleanedStockData.withColumn(f"{p}_days_ema", F.round(calculate_ema_udf(F.collect_list("close").over(windows[periods.index(p)]), F.lit(alpha)), round_to_decimal))

# Show the result
cleanedStockData.select(['date', 'close'] + [f"{p}_days_sma" for p in periods] + [f"{p}_days_ema" for p in periods]).orderBy(F.desc("date")).show()


[Stage 21:>                                                         (0 + 1) / 1]

+----------+------+----------+-----------+-----------+------------+----------+-----------+-----------+------------+
|      date| close|5_days_sma|20_days_sma|50_days_sma|200_days_sma|5_days_ema|20_days_ema|50_days_ema|200_days_ema|
+----------+------+----------+-----------+-----------+------------+----------+-----------+-----------+------------+
|2023-11-13|366.68|    364.15|     343.91|     333.07|      311.35|    363.26|     341.24|     334.66|      305.37|
|2023-11-13|223.71|    218.53|      216.9|     241.25|       221.1|    219.92|     220.72|     246.85|       210.2|
|2023-11-13|133.64|    133.01|     132.08|     134.56|      119.17|    132.89|     133.69|     135.18|      114.48|
|2023-11-13| 184.8|    183.66|     176.04|     176.38|      172.73|    183.19|     175.99|     177.81|      169.21|
|2023-11-10|134.06|    132.57|     132.42|     134.62|       119.0|    132.47|      134.4|      135.3|       114.4|
|2023-11-10|369.67|    362.12|     342.21|     332.31|      310.76|    3

                                                                                

## Bollinger Bands Calculation Explanation

This Jupiter Notebook cell performs the computation of Bollinger Bands on stock data for volatility analysis. The breakdown includes critical steps and considerations:

- **Decimal Rounding:**
  - All numerical values are rounded to two decimal places for consistency and readability.

- **Bollinger Bands Periods:**
  - The Bollinger Bands are computed for four distinct periods: 5, 20, 50, and 200 days, providing insights into short-term and long-term volatility.

- **Partitioning for Accuracy:**
  - The data is partitioned by "stock_id" and "ticker_symbol" to ensure accurate calculations for individual stocks. This is crucial for meaningful stock market analysis.

- **Reuse of Exponential Moving Averages (EMAs):**
  - Existing EMA values, previously calculated, are reused in the Bollinger Bands computation. This approach optimizes computational efficiency and maintains consistency with prior analyses.

- **Upper and Lower Band Calculation:**
  - The upper and lower bands are determined by adding and subtracting twice the standard deviation of closing prices from the corresponding EMAs. This methodology aligns with the standard Bollinger Bands formula.

- **Result Presentation:**
  - The final DataFrame includes the date, close price, upper bands, and lower bands for each specified period, providing a comprehensive view of the stock's volatility.

This code enhances the dataset with Bollinger Bands, aiding in the identification of potential market trends and volatility patterns.

In [13]:
# Number of decimal places
round_to_decimal = 2

# Define the Bollinger Bands periods
bollinger_periods = [5, 20, 50, 200]

partition_cols = ["stock_id", "ticker_symbol"]

# Define the windows for Bollinger Bands
windows = [Window().partitionBy(partition_cols).orderBy(F.desc("date")).rowsBetween(0, p - 1) for p in bollinger_periods]

# Reuse the existing EMA values for Bollinger Bands
for p in bollinger_periods:
    upper_band_col = F.col(f"{p}_days_ema") + (2 * F.stddev("close").over(windows[bollinger_periods.index(p)]))
    lower_band_col = F.col(f"{p}_days_ema") - (2 * F.stddev("close").over(windows[bollinger_periods.index(p)]))

    cleanedStockData = cleanedStockData.withColumn(f"upper_band_{p}", F.round(upper_band_col, round_to_decimal))
    cleanedStockData = cleanedStockData.withColumn(f"lower_band_{p}", F.round(lower_band_col, round_to_decimal))

# Show the result
selected_columns = ['date', 'close'] + [f"upper_band_{p}" for p in bollinger_periods] + [f"lower_band_{p}" for p in bollinger_periods]
cleanedStockData.select(selected_columns).orderBy(F.desc("date")).show()


[Stage 31:>                                                         (0 + 1) / 1]

+----------+------+------------+-------------+-------------+--------------+------------+-------------+-------------+--------------+
|      date| close|upper_band_5|upper_band_20|upper_band_50|upper_band_200|lower_band_5|lower_band_20|lower_band_50|lower_band_200|
+----------+------+------------+-------------+-------------+--------------+------------+-------------+-------------+--------------+
|2023-11-13|366.68|      371.19|       370.61|       363.13|        367.55|      355.33|       311.87|       306.19|        243.19|
|2023-11-13|223.71|       231.8|       247.22|       292.76|        283.59|      208.04|       194.22|       200.94|        136.81|
|2023-11-13|133.64|      134.81|       145.01|       144.72|        144.18|      130.97|       122.37|       125.64|         84.78|
|2023-11-13| 184.8|      186.98|       187.04|       187.03|         195.9|       179.4|       164.94|       168.59|        142.52|
|2023-11-10|134.06|      134.65|       146.31|       144.86|        144.15| 

                                                                                

In [29]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import FloatType

def calculate_rsi(data, n, default_rsi=14, max_rsi=70, epsilon=1e-6):
    # Calculate price changes
    price_diff = F.col("close") - F.lag("close", n).over(Window().partitionBy("stock_id", "ticker_symbol").orderBy("date"))
    
    # Separate gains and losses
    gains = F.when(price_diff > 0, price_diff).otherwise(0)
    losses = F.when(price_diff < 0, -price_diff).otherwise(0)
    
    # Calculate average gains and losses over n periods
    avg_gains = F.avg(gains).over(Window().partitionBy("stock_id", "ticker_symbol").orderBy("date").rowsBetween(-n+1, 0))
    avg_losses = F.avg(losses).over(Window().partitionBy("stock_id", "ticker_symbol").orderBy("date").rowsBetween(-n+1, 0))
    
    # Handle NULL values
    rs = F.when(avg_gains.isNull(), default_rsi).otherwise(avg_gains / (avg_losses + epsilon))

    # Handle 0 values
    rs = F.when((avg_gains == 0), default_rsi).otherwise(rs)

    # Calculate RSI and cap at max_rsi
    rsi = 100 - (100 / (1 + rs))
    
    return rsi

rsi_periods = [14, 20, 50, 200]

# Calculate RSI for each period with default values
for n in rsi_periods:
    cleanedStockData = cleanedStockData.withColumn(f"{n}_days_rsi", F.round(calculate_rsi(cleanedStockData, n), 2))

# Show the result
cleanedStockData.select(['ticker_symbol', 'date', 'close'] + [f"{n}_days_rsi" for n in rsi_periods]).orderBy(F.desc("date")).show()


[Stage 91:>                                                         (0 + 1) / 1]

+-------------+----------+------+-----------+-----------+-----------+------------+
|ticker_symbol|      date| close|14_days_rsi|20_days_rsi|50_days_rsi|200_days_rsi|
+-------------+----------+------+-----------+-----------+-----------+------------+
|         MSFT|2023-11-13|366.68|       70.0|       70.0|      51.09|        70.0|
|         GOOG|2023-11-13|133.64|       70.0|      25.17|       70.0|        70.0|
|         TSLA|2023-11-13|223.71|       2.49|       70.0|       5.77|       41.15|
|         AAPL|2023-11-13| 184.8|      48.72|      59.71|       1.05|        70.0|
|         MSFT|2023-11-10|369.67|       70.0|       70.0|      48.06|        70.0|
|         GOOG|2023-11-10|134.06|       2.78|      27.24|       70.0|        70.0|
|         TSLA|2023-11-10|214.65|        0.9|       70.0|       5.78|       40.51|
|         AAPL|2023-11-10| 186.4|       42.2|      54.76|       1.52|        70.0|
|         GOOG|2023-11-09|131.69|       6.08|       28.3|       70.0|        70.0|
|   

                                                                                