In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.functions import col, from_json

# ✅ Step 1: Initialize Spark Session
spark = SparkSession.builder \
    .appName("RealTimeBankingTransactions") \
    .config("spark.sql.streaming.schemaInference", "false") \
    .getOrCreate()

# ✅ Step 2: Define Explicit JSON Schema (Matches File Structure)
schema = StructType([
    StructField("account_id", StringType(), True),
    StructField("transaction_amount", DoubleType(), True),
    StructField("transaction_type", StringType(), True),
    StructField("location", StringType(), True),
    StructField("timestamp", StringType(), True)
])

# ✅ Step 3: Define Input Directory for Streaming JSON Files
input_path = "/home/labuser/Documents/Level3/Day3/StreamingData/"  # Update if needed

# ✅ Step 4: Read Streaming JSON Data
streaming_df = spark.readStream \
    .format("json") \
    .schema(schema) \
    .option("maxFilesPerTrigger", 1) \
    .option("multiline", "true") \
    .load(input_path)

# ✅ Step 5: Print Schema for Debugging
print("✅ Detected Schema:")
streaming_df.printSchema()

# ✅ Step 6: Debug If Any Data Is Being Read
def check_data_count(df, epoch_id):
    count = df.count()
    print(f"✅ Number of records in batch {epoch_id}: {count}")

streaming_df.writeStream \
    .foreachBatch(check_data_count) \
    .start()

# ✅ Step 7: Write Output to Console for Debugging
query = streaming_df.writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", False) \
    .start()

# ✅ Step 8: Keep Streaming Running
query.awaitTermination()


✅ Detected Schema:
root
 |-- account_id: string (nullable = true)
 |-- transaction_amount: double (nullable = true)
 |-- transaction_type: string (nullable = true)
 |-- location: string (nullable = true)
 |-- timestamp: string (nullable = true)

✅ Number of records in batch 0: 1
✅ Number of records in batch 184: 1
✅ Number of records in batch 1: 1
✅ Number of records in batch 2: 1
✅ Number of records in batch 3: 1
✅ Number of records in batch 4: 1
✅ Number of records in batch 5: 1
✅ Number of records in batch 6: 1
✅ Number of records in batch 7: 1
✅ Number of records in batch 8: 1
✅ Number of records in batch 9: 1
✅ Number of records in batch 10: 1
✅ Number of records in batch 11: 1
✅ Number of records in batch 12: 1
✅ Number of records in batch 13: 1
✅ Number of records in batch 14: 1
✅ Number of records in batch 15: 1
✅ Number of records in batch 16: 1
✅ Number of records in batch 17: 1
✅ Number of records in batch 18: 1
✅ Number of records in batch 19: 1
✅ Number of records in batch

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/anaconda3/envs/sp3/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/envs/sp3/lib/python3.9/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/anaconda3/envs/sp3/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [9]:

schema = StructType([
    StructField("account_id", StringType(), True),
    StructField("transaction_amount", DoubleType(), True),
    StructField("transaction_type", StringType(), True),
    StructField("location", StringType(), True),
    StructField("timestamp", StringType(), True)  # Kept as String to prevent parsing errors
])   
    
df = spark.read.schema(schema).json("/home/labuser/Documents/Level3/Day3/StreamingData/transaction_*.json")
df.show(truncate=False)
df.printSchema()



+----------+------------------+----------------+--------+---------+
|account_id|transaction_amount|transaction_type|location|timestamp|
+----------+------------------+----------------+--------+---------+
|null      |null              |null            |null    |null     |
|null      |null              |null            |null    |null     |
|null      |null              |null            |null    |null     |
|null      |null              |null            |null    |null     |
|null      |null              |null            |null    |null     |
|null      |null              |null            |null    |null     |
|null      |null              |null            |null    |null     |
|null      |null              |null            |null    |null     |
|null      |null              |null            |null    |null     |
|null      |null              |null            |null    |null     |
|null      |null              |null            |null    |null     |
|null      |null              |null            |