In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType, DoubleType, ArrayType
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql.window import Window

In [2]:
SparkSession.stop()
spark = SparkSession.builder.appName("StockDataCleaning") \
    .config("spark.executor.memory", "4g").getOrCreate()

NameError: name 'spark' is not defined

In [None]:
schema = StructType([ \
    StructField("transaction_id", StringType(), True), \
    StructField("stock_id", StringType(), True), \
    StructField("ticker_symbol", StringType(), True), \
    StructField("date", DateType(), True), \
    StructField("low", FloatType(), True), \
    StructField("open", FloatType(), True), \
    StructField("high", FloatType(), True), \
    StructField("volume", IntegerType(), True), \
    StructField("close", FloatType(), True)])

In [None]:
stockData = spark.read.schema(schema).csv("./stocks-data/combined-stocks-data.csv", header=True)

In [None]:
stockData.printSchema()

In [None]:
stockData.show()

In [None]:
# Missing Values
missing_values = stockData.select([func.sum(func.col(c).isNull().cast("int")).alias(c + '_missing') for c in stockData.columns]).collect()

print("Missing Values:")
for row in missing_values[0].asDict():
    print(f"{row}: {missing_values[0][row]}")


In [None]:
# Filter rows where 'volume' is missing
missing_volume_rows = stockData.filter(stockData['volume'].isNull())

# Show the missing values
missing_volume_rows.show()


In [None]:
# Fill missing values in the 'volume' column with 0
stockData = stockData.na.fill(0, subset=['volume'])

# Show the DataFrame after filling missing values
stockData.show()


In [None]:
# Drop duplicate rows based on 'date' and 'close' columns
cleanedStockData = stockData.dropDuplicates(['date', 'close'])

# Check for duplicate values in the 'date' column again
duplicate_rows = cleanedStockData.groupBy('date', 'close').count().filter('count > 1')

# Show the duplicate dates and close prices, if any
if duplicate_rows.count() > 0:
    print("Duplicate dates and close prices found after deduplication:")
    duplicate_rows.show()
else:
    print("No duplicate dates and close prices found.")


In [None]:
cleanedStockData.orderBy(func.desc("date")).show()

## Stock Data Moving Averages Analysis

This PySpark script calculates Simple Moving Averages (SMA) and Exponential Moving Averages (EMA) for different periods on stock data. The analysis includes importing libraries, defining functions, setting parameters, and displaying the results.

- **Moving Averages:**
  - Simple Moving Averages (SMA) for periods: 5, 20, 50, 200.
  - Exponential Moving Averages (EMA) with corresponding alpha values.

- **Data Manipulation:**
  - Utilizes PySpark functions and windows for efficient data processing.

- **Result Display:**
  - Presents the DataFrame with date, close price, SMAs, and EMAs in descending order.


In [None]:
round_to_decimal = 2

def calculate_ema(data, alpha):
    ema = data[0]
    for i in range(1, len(data)):
        ema = alpha * data[i] + (1 - alpha) * ema
    return ema

calculate_ema_udf = F.udf(lambda data, alpha: float(calculate_ema(data, alpha)), FloatType())

periods = [5, 20, 50, 200]
alpha_values = [2 / (p + 1) for p in periods]

partition_cols = ["stock_id", "ticker_symbol"]

windows = [Window().partitionBy(partition_cols).orderBy(F.desc("date")).rowsBetween(0, p - 1) for p in periods]

# Calculate simple moving averages
for p in periods:
    cleanedStockData = cleanedStockData.withColumn(f"{p}_days_sma", F.round(F.avg("close").over(windows[periods.index(p)]), 2))

# Calculate exponential moving averages using UDF
for p, alpha in zip(periods, alpha_values):
    cleanedStockData = cleanedStockData.withColumn(f"{p}_days_ema", F.round(calculate_ema_udf(F.collect_list("close").over(windows[periods.index(p)]), F.lit(alpha)), round_to_decimal))

# Show the result
cleanedStockData.select(['date', 'close'] + [f"{p}_days_sma" for p in periods] + [f"{p}_days_ema" for p in periods]).orderBy(F.desc("date")).show()


## Bollinger Bands Calculation Explanation

This Jupiter Notebook cell performs the computation of Bollinger Bands on stock data for volatility analysis. The breakdown includes critical steps and considerations:

- **Decimal Rounding:**
  - All numerical values are rounded to two decimal places for consistency and readability.

- **Bollinger Bands Periods:**
  - The Bollinger Bands are computed for four distinct periods: 5, 20, 50, and 200 days, providing insights into short-term and long-term volatility.

- **Partitioning for Accuracy:**
  - The data is partitioned by "stock_id" and "ticker_symbol" to ensure accurate calculations for individual stocks. This is crucial for meaningful stock market analysis.

- **Reuse of Exponential Moving Averages (EMAs):**
  - Existing EMA values, previously calculated, are reused in the Bollinger Bands computation. This approach optimizes computational efficiency and maintains consistency with prior analyses.

- **Upper and Lower Band Calculation:**
  - The upper and lower bands are determined by adding and subtracting twice the standard deviation of closing prices from the corresponding EMAs. This methodology aligns with the standard Bollinger Bands formula.

- **Result Presentation:**
  - The final DataFrame includes the date, close price, upper bands, and lower bands for each specified period, providing a comprehensive view of the stock's volatility.

This code enhances the dataset with Bollinger Bands, aiding in the identification of potential market trends and volatility patterns.

In [None]:
# Number of decimal places
round_to_decimal = 2

# Define the Bollinger Bands periods
bollinger_periods = [5, 20, 50, 200]

partition_cols = ["stock_id", "ticker_symbol"]

# Define the windows for Bollinger Bands
windows = [Window().partitionBy(partition_cols).orderBy(F.desc("date")).rowsBetween(0, p - 1) for p in bollinger_periods]

# Reuse the existing EMA values for Bollinger Bands
for p in bollinger_periods:
    upper_band_col = F.col(f"{p}_days_ema") + (2 * F.stddev("close").over(windows[bollinger_periods.index(p)]))
    lower_band_col = F.col(f"{p}_days_ema") - (2 * F.stddev("close").over(windows[bollinger_periods.index(p)]))

    cleanedStockData = cleanedStockData.withColumn(f"upper_band_{p}", F.round(upper_band_col, round_to_decimal))
    cleanedStockData = cleanedStockData.withColumn(f"lower_band_{p}", F.round(lower_band_col, round_to_decimal))

# Show the result
selected_columns = ['date', 'close'] + [f"upper_band_{p}" for p in bollinger_periods] + [f"lower_band_{p}" for p in bollinger_periods]
cleanedStockData.select(selected_columns).orderBy(F.desc("date")).show()


In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import FloatType

# Assuming you already have a SparkSession named spark

# Define a function to calculate RSI
def calculate_rsi(data, period):
    # Calculate daily price changes
    price_diff = F.col("close") - F.lag("close", 1).over(Window().orderBy("date"))

    # Separate gains and losses
    gains = F.when(price_diff > 0, price_diff).otherwise(0)
    losses = F.when(price_diff < 0, -price_diff).otherwise(0)

    # Calculate average gains and losses over the specified period
    avg_gain = F.avg(gains).over(Window().orderBy("date").rowsBetween(-period, Window.currentRow))
    avg_loss = F.avg(losses).over(Window().orderBy("date").rowsBetween(-period, Window.currentRow))

    # Calculate relative strength (RS)
    rs = F.when(avg_loss != 0, avg_gain / avg_loss).otherwise(float("inf"))

    # Calculate RSI
    rsi = 100 - (100 / (1 + rs))

    return F.round(rsi, 2)

# Register the RSI UDF
calculate_rsi_udf = F.udf(lambda data, period: float(calculate_rsi(data, period)), FloatType())

# Define the periods for RSI calculation
rsi_periods = [5, 20, 50, 200]

# Calculate RSI for each period
for period in rsi_periods:
    cleanedStockData = cleanedStockData.withColumn(f"{period}_days_rsi", calculate_rsi_udf(F.collect_list("close").over(Window().orderBy("date")).alias("close_list"), F.lit(period)))

# Show the result
cleanedStockData.select(['ticker_symbol', 'date', 'close'] + [f"{p}_days_rsi" for p in rsi_periods]).orderBy(F.desc("date")).show()
