In [1]:
pip install numpy pymongo

Collecting numpy
  Downloading numpy-2.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pymongo
  Downloading pymongo-4.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading numpy-2.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading pymongo-4.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
Installing collected packages: numpy, dnspython, pymongo
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [2]:
#Imports
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, TimestampType
import signal
import time
import sys

In [None]:
# Build (or reuse) the Spark session for the streaming job.
# The Kafka source and MongoDB connector artifacts are requested through
# spark.jars.packages; the MongoDB output URI and a session-wide streaming
# checkpoint location are set as session configs.
spark = (
    SparkSession.builder
    .appName("DockerSparkToLocalMongo")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,"
        "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1",
    )
    .config(
        "spark.mongodb.output.uri",
        "mongodb://host.docker.internal:27017/movie_lens.recommendations",
    )
    .config("spark.sql.streaming.checkpointLocation", "/tmp/checkpoint")
    .getOrCreate()
)


# Schema applied to the JSON payload of each Kafka message value.
# Field names are camelCase (userId / movieId), not snake_case.
# Every field is nullable, which is the default for StructType.add.
rating_schema = (
    StructType()
    .add("userId", IntegerType())
    .add("movieId", IntegerType())
    .add("rating", FloatType())
    .add("timestamp", TimestampType())
)

# Function to process each batch of data
def process_batch(batch_df, batch_id):
    """Write one micro-batch to MongoDB, tagged with processing metadata.

    Intended to be used as a ``foreachBatch`` callback.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch; stored in a ``batch_id``
            column and used in the log messages.

    Returns:
        None. Empty batches are skipped without writing anything.

    Errors raised while building or writing the batch are caught and reported
    on stderr; they are not re-raised.
    """
    # Cache the batch so the row count and the MongoDB write share a single
    # evaluation of the batch instead of recomputing it for every action.
    batch_df.persist()
    try:
        record_count = batch_df.count()
        if record_count == 0:
            return

        try:
            # Add processing metadata
            result_df = batch_df.withColumn("processing_time", F.current_timestamp()) \
                                .withColumn("batch_id", F.lit(batch_id))

            # Write to MongoDB
            (result_df.write
                .format("mongo")
                .mode("append")
                .option("database", "movie_lens")
                .option("collection", "recommendations")
                .save())

            print(f"Successfully processed batch {batch_id} with {record_count} records")
        except Exception as e:
            # NOTE(review): the error is swallowed, so the streaming query
            # presumably treats this batch as done and moves on, i.e. a failed
            # write drops the batch. Confirm this best-effort behaviour is intended.
            print(f"Error processing batch {batch_id}: {str(e)}", file=sys.stderr)
    finally:
        # Always release the cached batch, including on the early return.
        batch_df.unpersist()

# Streaming source: subscribe to the Kafka topic, starting from the latest
# offsets and reading at most 1000 offsets per trigger.
df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "namenode:9092",
        "subscribe": "movie_rating",
        "startingOffsets": "latest",
        "failOnDataLoss": "false",
        "maxOffsetsPerTrigger": "1000",
    })
    .load()
)

# Decode the Kafka message value as a string, parse it as JSON with
# rating_schema, and flatten the parsed struct into top-level columns.
processed_df = (
    df.selectExpr("CAST(value AS STRING)")
    .select(F.from_json(F.col("value"), rating_schema).alias("data"))
    .select("data.*")
)

# Launch the streaming query: every 10 seconds a micro-batch is handed to
# process_batch, with progress tracked under the given checkpoint directory.
query = (
    processed_df.writeStream
    .foreachBatch(process_batch)
    .outputMode("update")
    .option("checkpointLocation", "file:///tmp/checkpoint_movies")
    .trigger(processingTime="10 seconds")
    .start()
)

# Graceful shutdown handler
def handle_shutdown(signum, frame):
    """Stop the streaming query and the Spark session, then exit.

    Has the ``(signum, frame)`` signature expected of a signal handler; both
    arguments are ignored, so it can also be called directly with
    ``(None, None)``.

    Args:
        signum: Signal number (unused).
        frame: Current stack frame (unused).

    Raises:
        SystemExit: Always, with exit status 0, once both stops have returned.
    """
    print("\nShutting down gracefully...")
    try:
        query.stop()
    finally:
        # Release the Spark session even if stopping the query raised.
        spark.stop()
    sys.exit(0)

# Route Ctrl-C / termination signals through the graceful shutdown handler.
signal.signal(signal.SIGINT, handle_shutdown)
signal.signal(signal.SIGTERM, handle_shutdown)

# Monitoring loop: poll the query every 5 seconds and print its latest progress.
try:
    while query.isActive:
        progress = query.lastProgress
        if progress:
            # NOTE(review): Spark appears to omit non-finite rates from the
            # progress report; default to 0.0 so a missing metric cannot
            # abort the job through the except branch below. Confirm.
            rows_per_sec = progress.get('processedRowsPerSecond', 0.0)
            print(f"Batch ID: {progress['batchId']}, "
                  f"Input rows: {progress['numInputRows']}, "
                  f"Processed: {rows_per_sec:.1f} rows/sec")
        time.sleep(5)

    # The loop also ends when the query terminates on its own. If it died
    # with an error, surface it instead of finishing silently.
    query_failure = query.exception()
    if query_failure is not None:
        raise query_failure
except Exception as e:
    print(f"Streaming query failed: {str(e)}", file=sys.stderr)
    handle_shutdown(None, None)

25/05/02 10:15:05 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/05/02 10:15:08 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
25/05/02 10:15:16 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 10019 milliseconds
[Stage 0:>                                                          (0 + 1) / 1]

Batch ID: 0, Input rows: 0, Processed: 0.0 rows/sec


[Stage 1:>                                                          (0 + 1) / 1]

Batch ID: 0, Input rows: 0, Processed: 0.0 rows/sec
Batch ID: 0, Input rows: 0, Processed: 0.0 rows/sec
Batch ID: 0, Input rows: 0, Processed: 0.0 rows/sec
Batch ID: 0, Input rows: 0, Processed: 0.0 rows/sec
Batch ID: 0, Input rows: 0, Processed: 0.0 rows/sec
Batch ID: 0, Input rows: 0, Processed: 0.0 rows/sec


25/05/02 10:15:56 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1)
com.mongodb.MongoTimeoutException: Timed out after 30000 ms while waiting to connect. Client view of cluster state is {type=UNKNOWN, servers=[{address=host.docker.internal:27017, type=UNKNOWN, state=CONNECTING, exception={com.mongodb.MongoSocketOpenException: Exception opening socket}, caused by {java.net.ConnectException: Connection refused (Connection refused)}}]
	at com.mongodb.internal.connection.BaseCluster.getDescription(BaseCluster.java:177)
	at com.mongodb.internal.connection.SingleServerCluster.getDescription(SingleServerCluster.java:41)
	at com.mongodb.client.internal.MongoClientDelegate.getConnectedClusterDescription(MongoClientDelegate.java:147)
	at com.mongodb.client.internal.MongoClientDelegate.createClientSession(MongoClientDelegate.java:98)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.getClientSession(MongoClientDelegate.java:278)
	at com.mongodb.client.internal

Batch ID: 0, Input rows: 0, Processed: 0.0 rows/sec


25/05/02 10:15:56 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 40133 milliseconds
[Stage 3:>                                                          (0 + 1) / 1]

Batch ID: 1, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 1, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 1, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 1, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 1, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 1, Input rows: 1, Processed: 0.0 rows/sec


25/05/02 10:16:28 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 3)
com.mongodb.MongoTimeoutException: Timed out after 30000 ms while waiting to connect. Client view of cluster state is {type=UNKNOWN, servers=[{address=host.docker.internal:27017, type=UNKNOWN, state=CONNECTING, exception={com.mongodb.MongoSocketOpenException: Exception opening socket}, caused by {java.net.ConnectException: Connection refused (Connection refused)}}]
	at com.mongodb.internal.connection.BaseCluster.getDescription(BaseCluster.java:177)
	at com.mongodb.internal.connection.SingleServerCluster.getDescription(SingleServerCluster.java:41)
	at com.mongodb.client.internal.MongoClientDelegate.getConnectedClusterDescription(MongoClientDelegate.java:147)
	at com.mongodb.client.internal.MongoClientDelegate.createClientSession(MongoClientDelegate.java:98)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.getClientSession(MongoClientDelegate.java:278)
	at com.mongodb.client.internal

Batch ID: 2, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 2, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 2, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 2, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 2, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 2, Input rows: 1, Processed: 0.0 rows/sec


25/05/02 10:17:01 ERROR Executor: Exception in task 0.0 in stage 5.0 (TID 5)
com.mongodb.MongoTimeoutException: Timed out after 30000 ms while waiting to connect. Client view of cluster state is {type=UNKNOWN, servers=[{address=host.docker.internal:27017, type=UNKNOWN, state=CONNECTING, exception={com.mongodb.MongoSocketOpenException: Exception opening socket}, caused by {java.net.ConnectException: Connection refused (Connection refused)}}]
	at com.mongodb.internal.connection.BaseCluster.getDescription(BaseCluster.java:177)
	at com.mongodb.internal.connection.SingleServerCluster.getDescription(SingleServerCluster.java:41)
	at com.mongodb.client.internal.MongoClientDelegate.getConnectedClusterDescription(MongoClientDelegate.java:147)
	at com.mongodb.client.internal.MongoClientDelegate.createClientSession(MongoClientDelegate.java:98)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.getClientSession(MongoClientDelegate.java:278)
	at com.mongodb.client.internal

Batch ID: 2, Input rows: 1, Processed: 0.0 rows/sec


[Stage 7:>                                                          (0 + 1) / 1]

Batch ID: 3, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 3, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 3, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 3, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 3, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 3, Input rows: 1, Processed: 0.0 rows/sec


25/05/02 10:17:33 ERROR Executor: Exception in task 0.0 in stage 7.0 (TID 7)
com.mongodb.MongoTimeoutException: Timed out after 30000 ms while waiting to connect. Client view of cluster state is {type=UNKNOWN, servers=[{address=host.docker.internal:27017, type=UNKNOWN, state=CONNECTING, exception={com.mongodb.MongoSocketOpenException: Exception opening socket}, caused by {java.net.ConnectException: Connection refused (Connection refused)}}]
	at com.mongodb.internal.connection.BaseCluster.getDescription(BaseCluster.java:177)
	at com.mongodb.internal.connection.SingleServerCluster.getDescription(SingleServerCluster.java:41)
	at com.mongodb.client.internal.MongoClientDelegate.getConnectedClusterDescription(MongoClientDelegate.java:147)
	at com.mongodb.client.internal.MongoClientDelegate.createClientSession(MongoClientDelegate.java:98)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.getClientSession(MongoClientDelegate.java:278)
	at com.mongodb.client.internal

Batch ID: 4, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 4, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 4, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 4, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 4, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 4, Input rows: 1, Processed: 0.0 rows/sec


25/05/02 10:18:05 ERROR Executor: Exception in task 0.0 in stage 9.0 (TID 9)
com.mongodb.MongoTimeoutException: Timed out after 30000 ms while waiting to connect. Client view of cluster state is {type=UNKNOWN, servers=[{address=host.docker.internal:27017, type=UNKNOWN, state=CONNECTING, exception={com.mongodb.MongoSocketOpenException: Exception opening socket}, caused by {java.net.ConnectException: Connection refused (Connection refused)}}]
	at com.mongodb.internal.connection.BaseCluster.getDescription(BaseCluster.java:177)
	at com.mongodb.internal.connection.SingleServerCluster.getDescription(SingleServerCluster.java:41)
	at com.mongodb.client.internal.MongoClientDelegate.getConnectedClusterDescription(MongoClientDelegate.java:147)
	at com.mongodb.client.internal.MongoClientDelegate.createClientSession(MongoClientDelegate.java:98)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.getClientSession(MongoClientDelegate.java:278)
	at com.mongodb.client.internal

Batch ID: 5, Input rows: 1, Processed: 0.0 rows/sec


[Stage 11:>                                                         (0 + 1) / 1]

Batch ID: 5, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 5, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 5, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 5, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 5, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 5, Input rows: 1, Processed: 0.0 rows/sec


25/05/02 10:18:37 ERROR Executor: Exception in task 0.0 in stage 11.0 (TID 11)
com.mongodb.MongoTimeoutException: Timed out after 30000 ms while waiting to connect. Client view of cluster state is {type=UNKNOWN, servers=[{address=host.docker.internal:27017, type=UNKNOWN, state=CONNECTING, exception={com.mongodb.MongoSocketOpenException: Exception opening socket}, caused by {java.net.ConnectException: Connection refused (Connection refused)}}]
	at com.mongodb.internal.connection.BaseCluster.getDescription(BaseCluster.java:177)
	at com.mongodb.internal.connection.SingleServerCluster.getDescription(SingleServerCluster.java:41)
	at com.mongodb.client.internal.MongoClientDelegate.getConnectedClusterDescription(MongoClientDelegate.java:147)
	at com.mongodb.client.internal.MongoClientDelegate.createClientSession(MongoClientDelegate.java:98)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.getClientSession(MongoClientDelegate.java:278)
	at com.mongodb.client.intern

Batch ID: 6, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 6, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 6, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 6, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 6, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 6, Input rows: 1, Processed: 0.0 rows/sec


25/05/02 10:19:10 ERROR Executor: Exception in task 0.0 in stage 13.0 (TID 13)
com.mongodb.MongoTimeoutException: Timed out after 30000 ms while waiting to connect. Client view of cluster state is {type=UNKNOWN, servers=[{address=host.docker.internal:27017, type=UNKNOWN, state=CONNECTING, exception={com.mongodb.MongoSocketOpenException: Exception opening socket}, caused by {java.net.ConnectException: Connection refused (Connection refused)}}]
	at com.mongodb.internal.connection.BaseCluster.getDescription(BaseCluster.java:177)
	at com.mongodb.internal.connection.SingleServerCluster.getDescription(SingleServerCluster.java:41)
	at com.mongodb.client.internal.MongoClientDelegate.getConnectedClusterDescription(MongoClientDelegate.java:147)
	at com.mongodb.client.internal.MongoClientDelegate.createClientSession(MongoClientDelegate.java:98)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.getClientSession(MongoClientDelegate.java:278)
	at com.mongodb.client.intern

Batch ID: 7, Input rows: 1, Processed: 0.0 rows/sec


[Stage 15:>                                                         (0 + 1) / 1]

Batch ID: 7, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 7, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 7, Input rows: 1, Processed: 0.0 rows/sec
Batch ID: 7, Input rows: 1, Processed: 0.0 rows/sec


25/05/02 10:19:35 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 24806 milliseconds


Batch ID: 8, Input rows: 33, Processed: 1.3 rows/sec


[Stage 23:>                                                         (0 + 1) / 1]

Batch ID: 9, Input rows: 25, Processed: 7.8 rows/sec
Batch ID: 9, Input rows: 25, Processed: 7.8 rows/sec


25/05/02 10:19:51 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 11205 milliseconds


Batch ID: 10, Input rows: 5, Processed: 0.4 rows/sec


                                                                                

Batch ID: 11, Input rows: 11, Processed: 5.5 rows/sec


                                                                                

Batch ID: 11, Input rows: 11, Processed: 5.5 rows/sec


                                                                                

Batch ID: 12, Input rows: 11, Processed: 2.9 rows/sec


                                                                                

Batch ID: 12, Input rows: 11, Processed: 2.9 rows/sec


                                                                                

Batch ID: 13, Input rows: 11, Processed: 3.7 rows/sec


                                                                                

Batch ID: 13, Input rows: 11, Processed: 3.7 rows/sec


                                                                                

Batch ID: 14, Input rows: 9, Processed: 2.9 rows/sec


                                                                                

Batch ID: 14, Input rows: 9, Processed: 2.9 rows/sec
Batch ID: 15, Input rows: 11, Processed: 3.5 rows/sec
