In [4]:
pip install numpy pymongo

Collecting pymongo
  Downloading pymongo-4.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
Installing collected packages: dnspython, pymongo
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [pymongo]m1/2[0m [pymongo]
[1A[2KSuccessfully installed dnspython-2.7.0 pymongo-4.12.1
[0mNote: you may need to restart the kernel to use updated packages.


In [8]:
#Imports
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, TimestampType
import signal
import time
import sys

In [9]:
# Initialize Spark Session with proper configurations
spark = SparkSession.builder \
    .appName("DockerSparkToLocalMongo") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,"
            "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
    .config("spark.mongodb.output.uri", "mongodb://host.docker.internal:27017/movie_lens.recommendations") \
    .config("spark.sql.streaming.checkpointLocation", "/tmp/checkpoint") \
    .getOrCreate()


# Define schema for incoming Kafka messages
rating_schema = StructType([
    StructField("userId", IntegerType()),  # Changed from user_id
    StructField("movieId", IntegerType()),  # Changed from movie_id
    StructField("rating", FloatType()),
    StructField("timestamp", TimestampType())
])

# Function to process each batch of data
def process_batch(batch_df, batch_id):
    if not batch_df.isEmpty():
        try:
            # Add processing metadata
            result_df = batch_df.withColumn("processing_time", F.current_timestamp()) \
                              .withColumn("batch_id", F.lit(batch_id))
            
            # Write to MongoDB
            (result_df.write
                .format("mongo")
                .mode("append")
                .option("database", "movie_lens")
                .option("collection", "recommendations")
                .save())
            
            print(f"Successfully processed batch {batch_id} with {batch_df.count()} records")
        except Exception as e:
            print(f"Error processing batch {batch_id}: {str(e)}", file=sys.stderr)

# Create Kafka source stream
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "namenode:9092") \
    .option("subscribe", "movie_rating") \
    .option("startingOffsets", "latest") \
    .option("failOnDataLoss", "false") \
    .option("maxOffsetsPerTrigger", "1000") \
    .load()

# Parse the JSON data from Kafka
processed_df = df.selectExpr("CAST(value AS STRING)") \
                .select(F.from_json(F.col("value"), rating_schema).alias("data")) \
                .select("data.*")

# Start the streaming query
query = processed_df.writeStream \
    .foreachBatch(process_batch) \
    .outputMode("update") \
    .option("checkpointLocation", "/tmp/checkpoint_movies") \
    .trigger(processingTime='10 seconds') \
    .start()

# Graceful shutdown handler
def handle_shutdown(signum, frame):
    print("\nShutting down gracefully...")
    query.stop()
    spark.stop()
    sys.exit(0)

signal.signal(signal.SIGINT, handle_shutdown)
signal.signal(signal.SIGTERM, handle_shutdown)

# Monitoring loop
try:
    while query.isActive:
        progress = query.lastProgress
        if progress:
            print(f"Batch ID: {progress['batchId']}, "
                  f"Input rows: {progress['numInputRows']}, "
                  f"Processed: {progress['processedRowsPerSecond']:.1f} rows/sec")
        time.sleep(5)
except Exception as e:
    print(f"Streaming query failed: {str(e)}", file=sys.stderr)
    handle_shutdown(None, None)

25/05/02 09:56:25 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/02 09:56:27 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/05/02 09:56:27 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

Batch ID: 19, Input rows: 13, Processed: 3.7 rows/sec
Batch ID: 20, Input rows: 3, Processed: 1.6 rows/sec
Batch ID: 21, Input rows: 11, Processed: 4.4 rows/sec
Batch ID: 21, Input rows: 11, Processed: 4.4 rows/sec


                                                                                

Batch ID: 22, Input rows: 11, Processed: 4.2 rows/sec
Batch ID: 22, Input rows: 11, Processed: 4.2 rows/sec


                                                                                

Batch ID: 23, Input rows: 11, Processed: 6.7 rows/sec
Batch ID: 23, Input rows: 11, Processed: 6.7 rows/sec


                                                                                

Batch ID: 24, Input rows: 11, Processed: 5.9 rows/sec
Batch ID: 24, Input rows: 11, Processed: 5.9 rows/sec
Batch ID: 25, Input rows: 11, Processed: 3.9 rows/sec
Batch ID: 25, Input rows: 11, Processed: 3.9 rows/sec
Batch ID: 26, Input rows: 11, Processed: 4.0 rows/sec
Batch ID: 26, Input rows: 11, Processed: 4.0 rows/sec
Batch ID: 27, Input rows: 11, Processed: 3.9 rows/sec
Batch ID: 27, Input rows: 11, Processed: 3.9 rows/sec
Batch ID: 28, Input rows: 11, Processed: 4.8 rows/sec
Batch ID: 28, Input rows: 11, Processed: 4.8 rows/sec
Batch ID: 29, Input rows: 11, Processed: 4.9 rows/sec
Batch ID: 29, Input rows: 11, Processed: 4.9 rows/sec


                                                                                

Batch ID: 30, Input rows: 11, Processed: 4.4 rows/sec
Batch ID: 30, Input rows: 11, Processed: 4.4 rows/sec


                                                                                

Batch ID: 31, Input rows: 11, Processed: 4.5 rows/sec
Batch ID: 31, Input rows: 11, Processed: 4.5 rows/sec
Batch ID: 32, Input rows: 9, Processed: 4.0 rows/sec
Batch ID: 32, Input rows: 9, Processed: 4.0 rows/sec


                                                                                

Batch ID: 33, Input rows: 11, Processed: 4.8 rows/sec
Batch ID: 33, Input rows: 11, Processed: 4.8 rows/sec
Batch ID: 34, Input rows: 11, Processed: 4.9 rows/sec
Batch ID: 34, Input rows: 11, Processed: 4.9 rows/sec


                                                                                

Batch ID: 35, Input rows: 11, Processed: 6.2 rows/sec
Batch ID: 35, Input rows: 11, Processed: 6.2 rows/sec
Batch ID: 36, Input rows: 11, Processed: 4.4 rows/sec
Batch ID: 36, Input rows: 11, Processed: 4.4 rows/sec
Batch ID: 37, Input rows: 11, Processed: 5.3 rows/sec
Batch ID: 37, Input rows: 11, Processed: 5.3 rows/sec


                                                                                

Batch ID: 38, Input rows: 11, Processed: 4.9 rows/sec
Batch ID: 38, Input rows: 11, Processed: 4.9 rows/sec
Batch ID: 39, Input rows: 11, Processed: 4.8 rows/sec
Batch ID: 39, Input rows: 11, Processed: 4.8 rows/sec
Batch ID: 40, Input rows: 11, Processed: 4.4 rows/sec
Batch ID: 40, Input rows: 11, Processed: 4.4 rows/sec
Batch ID: 41, Input rows: 11, Processed: 4.1 rows/sec

Shutting down gracefully...


SystemExit: 0