# Analyze Parquet Files with PySpark

This notebook demonstrates how to use local PySpark to analyze Parquet files generated by the predictive trading app.

In [10]:
# Section 1: Set Up Local PySpark Environment
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("ParquetAnalysis") \
    .master("local[*]") \
    .getOrCreate()

print(f"Spark version: {spark.version}")

Spark version: 4.0.1


In [11]:
# Section 2: Read Parquet Files
import os
from glob import glob

parquet_files = glob(os.path.join('output', '*.parquet'))
if not parquet_files:
    print('No Parquet files found in output/.')
else:
    print(f'Found Parquet files: {parquet_files}')
    df = spark.read.parquet(parquet_files[0])  # Load first file for demo

Found Parquet files: ['output/AAPL_facts.parquet']


In [12]:
# Section 3: Explore Data Schema
if 'df' in locals():
    df.printSchema()
    df.show(5)
else:
    print('No DataFrame loaded.')

root
 |-- fact: string (nullable = true)
 |-- value: string (nullable = true)

+--------------------+-----------------+
|                fact|            value|
+--------------------+-----------------+
| Title of each class|Trading symbol(s)|
|Common Stock, $0....|             AAPL|
|0.000% Notes due ...|                —|
|1.625% Notes due ...|                —|
|2.000% Notes due ...|                —|
+--------------------+-----------------+
only showing top 5 rows


In [13]:
# Section 4: Perform Basic Data Analysis
if 'df' in locals():
    print('Total facts:', df.count())
    if 'fact' in df.columns:
        df.groupBy('fact').count().show()
    df.describe().show()
else:
    print('No DataFrame loaded.')

Total facts: 10
+--------------------+-----+
|                fact|count|
+--------------------+-----+
|3.600% Notes due ...|    1|
|             Item 1.|    1|
| Title of each class|    1|
|0.000% Notes due ...|    1|
|0.500% Notes due ...|    1|
|1.625% Notes due ...|    1|
|2.000% Notes due ...|    1|
|Common Stock, $0....|    1|
|1.375% Notes due ...|    1|
|3.050% Notes due ...|    1|
+--------------------+-----+

+--------------------+-----+
|                fact|count|
+--------------------+-----+
|3.600% Notes due ...|    1|
|             Item 1.|    1|
| Title of each class|    1|
|0.000% Notes due ...|    1|
|0.500% Notes due ...|    1|
|1.625% Notes due ...|    1|
|2.000% Notes due ...|    1|
|Common Stock, $0....|    1|
|1.375% Notes due ...|    1|
|3.050% Notes due ...|    1|
+--------------------+-----+

+-------+--------------------+-----+
|summary|                fact|value|
+-------+--------------------+-----+
|  count|                  10|   10|
|   mean|             

In [14]:
# Section 5: Visualize Results
import matplotlib.pyplot as plt
import seaborn as sns

if 'df' in locals():
    pdf = df.toPandas()
    if 'fact' in pdf.columns:
        plt.figure(figsize=(10,6))
        sns.countplot(y='fact', data=pdf, order=pdf['fact'].value_counts().index)
        plt.title('Top Facts Distribution')
        plt.xlabel('Count')
        plt.ylabel('Fact')
        plt.show()
else:
    print('No DataFrame loaded.')

ModuleNotFoundError: No module named 'matplotlib'