In [1]:
from pyspark.sql import SparkSession

# Connector coordinates. The Scala suffix must match the Scala build of the
# installed PySpark jars -- TODO: confirm 2.12 for this environment.
scala_version = '2.12'
spark_version = '3.5.5'
# Keep kafka-clients aligned with the version the Spark Kafka connector pulls
# in (Ivy resolved 3.4.1 for spark-sql-kafka 3.5.5); requesting an older
# version only creates a dependency conflict that Ivy has to override.
kafka_clients_version = '3.4.1'
mongo_connector_version = '10.4.1'

packages = [
    f'org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}',
    f'org.apache.kafka:kafka-clients:{kafka_clients_version}',
    f'org.mongodb.spark:mongo-spark-connector_{scala_version}:{mongo_connector_version}',
]

# Single local session; the connector jars are downloaded via spark.jars.packages
# the first time the session is created.
spark = (
    SparkSession.builder
    .master("local")
    .appName("realtime-rag-ingestion-pipeline")
    .config("spark.jars.packages", ",".join(packages))
    .getOrCreate()
)

spark

:: loading settings :: url = jar:file:/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/mudasser.shaik/.ivy2/cache
The jars for the packages stored in: /Users/mudasser.shaik/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-14be8bd1-f749-40c1-8379-f7acd6fe1b4a;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.5 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.5 in central
	found org.apache.kafka#kafka-clients;3.4.1 in local-m2-cache
	found org.lz4#lz4-java;1.8.0 in local-m2-cache
	found org.xerial.snappy#snappy-java;1.1.10.5 in local-m2-cache
	found org.slf4j#slf4j-api;2.0.7 in local-m2-cache
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in local-m2-cache
	fo

In [2]:
from pyspark.sql.functions import col, concat, lit, from_json
from pyspark.sql.types import ArrayType, FloatType, StringType, StructType, StructField, IntegerType, LongType
from pyspark.sql.functions import from_json, schema_of_json

kafka_topic_name = "product_events"
kafka_bootstrap_servers = "0.0.0.0:29092"

# Options for a one-off *batch* read of the topic.
# Only Spark source options (e.g. "subscribe", "startingOffsets") and
# "kafka."-prefixed client settings are honoured. Un-prefixed consumer settings
# such as "auto.offset.reset" or "enable.auto.commit" are not source options,
# and Spark manages offsets itself, so the read range is stated explicitly.
kafka_options = {
    "kafka.bootstrap.servers": kafka_bootstrap_servers,
    "subscribe": kafka_topic_name,
    "startingOffsets": "earliest",
    "endingOffsets": "latest",
}

# Batch read: key and value arrive as binary columns.
kafkaDf = (
    spark.read.format("kafka")
    .options(**kafka_options)
    .load()
)

# Expected layout of the JSON payload carried in the Kafka message value.
schema_product = StructType([
    StructField("store_id", IntegerType()),
    StructField("product_id", IntegerType()),
    StructField("count", IntegerType()),
    StructField("price", FloatType()),
    StructField("size", StringType()),
    StructField("ageGroup", StringType()),
    StructField("gender", StringType()),
    StructField("season", StringType()),
    StructField("fashionType", StringType()),
    StructField("brandName", StringType()),
    StructField("baseColor", StringType()),
    StructField("articleType", StringType())
])

# Decode key/value to strings, then parse the JSON value into a struct column.
jsonDf = kafkaDf.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
jsonDf_new = jsonDf.withColumn("value", from_json(col("value"), schema_product))
jsonDf_new.printSchema()
jsonDf_new.show(truncate=False)

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- store_id: integer (nullable = true)
 |    |-- product_id: integer (nullable = true)
 |    |-- count: integer (nullable = true)
 |    |-- price: float (nullable = true)
 |    |-- size: string (nullable = true)
 |    |-- ageGroup: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- season: string (nullable = true)
 |    |-- fashionType: string (nullable = true)
 |    |-- brandName: string (nullable = true)
 |    |-- baseColor: string (nullable = true)
 |    |-- articleType: string (nullable = true)


25/04/03 13:02:48 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
[Stage 0:>                                                          (0 + 1) / 1]

+---+-------------------------------------------------------------------------------------------------+
|key|value                                                                                            |
+---+-------------------------------------------------------------------------------------------------+
|0  |{17, 196444, 5, 71.13, extra large, adult, female, summer, fashion, Hagar, black, shorts}        |
|1  |{27, 878070, 6, 22.87, small, adult, female, spring, core, Rockhill, red, gloves}                |
|2  |{18, 241319, 6, 55.27, large, adult, female, summer, core, Rockhill, blue, gloves}               |
|3  |{24, 450096, 12, 59.93, extra large, adult, male, fall, core, Calvin Klein, green, socks}        |
|4  |{8, 863998, 3, 49.62, medium, infant, male, fall, formal, Calvin Klein, black, shoes}            |
|5  |{8, 163142, 16, 14.42, small, adult, male, summer, formal, Calvin Klein, red, sweater}           |
|6  |{41, 745531, 16, 53.3, medium, infant, female, summer, fash

                                                                                

In [3]:
# Flatten the parsed struct: keep the Kafka key and promote each product field
# of `value` to a top-level column, in the order listed here.
product_fields = (
    "store_id",
    "product_id",
    "count",
    "price",
    "size",
    "ageGroup",
    "gender",
    "season",
    "fashionType",
    "brandName",
    "baseColor",
    "articleType",
)
queryableDf = jsonDf_new.select(
    col("key"),
    *(col(f"value.{field_name}") for field_name in product_fields),
)

# Inspect the flattened schema and a sample of rows.
queryableDf.printSchema()
queryableDf.show(truncate=False)

root
 |-- key: string (nullable = true)
 |-- store_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- price: float (nullable = true)
 |-- size: string (nullable = true)
 |-- ageGroup: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- season: string (nullable = true)
 |-- fashionType: string (nullable = true)
 |-- brandName: string (nullable = true)
 |-- baseColor: string (nullable = true)
 |-- articleType: string (nullable = true)

+---+--------+----------+-----+-----+-----------+--------+------+------+---------------+------------+---------+-----------+
|key|store_id|product_id|count|price|size       |ageGroup|gender|season|fashionType    |brandName   |baseColor|articleType|
+---+--------+----------+-----+-----+-----------+--------+------+------+---------------+------------+---------+-----------+
|0  |17      |196444    |5    |71.13|extra large|adult   |female|summer|fashion        |Hagar       |black   

25/04/03 13:02:52 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


In [7]:
# Serialize every row to a JSON string and print the first one as a sanity check.
json_rows = queryableDf.toJSON()
json_data = json_rows.first()
print(json_data)

25/04/03 13:03:44 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


{"key":"0","store_id":17,"product_id":196444,"count":5,"price":71.13,"size":"extra large","ageGroup":"adult","gender":"female","season":"summer","fashionType":"fashion","brandName":"Hagar","baseColor":"black","articleType":"shorts"}


25/04/03 13:03:45 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/04/03 13:03:45 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/04/03 13:03:45 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/04/03 13:03:45 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/04/03 13:03:45 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/04/03 13:03:45 WARN KafkaDataConsumer: KafkaDataConsumer is not running in Un

In [4]:
# Same parsing as before but without the Kafka key: decode the value, parse it
# with the product schema, then expand the struct into top-level columns.
raw_values = kafkaDf.selectExpr("CAST(value AS STRING)")
parsed_values = raw_values.select(
    from_json(col("value"), schema_product).alias("data")
)
jsonDf_transformed = parsed_values.select("data.*")

jsonDf_transformed.printSchema()
jsonDf_transformed.show(truncate=False)


root
 |-- store_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- price: float (nullable = true)
 |-- size: string (nullable = true)
 |-- ageGroup: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- season: string (nullable = true)
 |-- fashionType: string (nullable = true)
 |-- brandName: string (nullable = true)
 |-- baseColor: string (nullable = true)
 |-- articleType: string (nullable = true)

+--------+----------+-----+-----+-----------+--------+------+------+---------------+------------+---------+-----------+
|store_id|product_id|count|price|size       |ageGroup|gender|season|fashionType    |brandName   |baseColor|articleType|
+--------+----------+-----+-----+-----------+--------+------+------+---------------+------------+---------+-----------+
|17      |196444    |5    |71.13|extra large|adult   |female|summer|fashion        |Hagar       |black    |shorts     |
|27      |878070    |6    |22.87|sma

25/04/02 20:24:11 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


In [5]:
from pyspark.sql.functions import initcap, concat_ws

# Build the text that is sent to the embedding model.
# 1) Free-text attributes are joined with single spaces (nulls are skipped).
attribute_text = concat_ws(
    ' ',
    col("size"),
    col("ageGroup"),
    col("gender"),
    col("season"),
    col("fashionType"),
    col("brandName"),
    col("baseColor"),
    col("articleType"),
)

# 2) Labelled facts. `concat` yields NULL when the value is NULL, so a missing
#    field drops its label as well instead of leaving a dangling "price: ".
labelled_facts = [
    concat(lit('price: '), col("price").cast("string")),
    concat(lit('store number: '), col("store_id").cast("string")),
    concat(lit('product id: '), col("product_id").cast("string")),
]

# 3) Join the parts with ", " so no space precedes a comma
#    ("... Shorts, Price: 71.13, Store Number: 17, ..."), then title-case.
df = jsonDf_transformed.withColumn(
    "content",
    initcap(concat_ws(', ', attribute_text, *labelled_facts)),
)

In [6]:
# Confirm the `content` column was appended, then preview the first 20 rows
# without truncating long strings.
df.printSchema()
df.show(n=20, truncate=False)

root
 |-- store_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- price: float (nullable = true)
 |-- size: string (nullable = true)
 |-- ageGroup: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- season: string (nullable = true)
 |-- fashionType: string (nullable = true)
 |-- brandName: string (nullable = true)
 |-- baseColor: string (nullable = true)
 |-- articleType: string (nullable = true)
 |-- content: string (nullable = false)

+--------+----------+-----+-----+-----------+--------+------+------+---------------+------------+---------+-----------+-------------------------------------------------------------------------------------------------------------------------+
|store_id|product_id|count|price|size       |ageGroup|gender|season|fashionType    |brandName   |baseColor|articleType|content                                                                                                            

25/04/02 20:24:17 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


### Embedding Model
In a RAG (retrieval-augmented generation) pipeline, embeddings are dense vector representations of data (such as text or images) that capture its semantic meaning. They allow the system to find and retrieve information based on similarity, even if the search terms aren’t exact matches.

Adding extra data, like keywords, tags, or content summaries, can improve these embeddings. Combining this additional context with the main text increases the chances of retrieving relevant information when users ask questions.

**For example:**
If a user searches for “pain relief”, an embedding that includes related keywords—even if the original text doesn’t mention them—can help them find the correct information. This method ensures that the embeddings reflect a broader context, improving the RAG system's ability to provide accurate and relevant answers.


***Using OpenAI API to generate embeddings***
Embedding API: https://platform.openai.com/docs/guides/embeddings

In [None]:
!pip install --upgrade openai

In [7]:
from pyspark.sql.functions import udf
import openai
import os

print(openai.__version__)

# Read the API key from the environment; never paste the key into the notebook.
# Fail here, with a clear message, rather than later inside a Spark task.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the notebook kernel."
    )
openai.api_key = openai_api_key

1.66.3


In [8]:
def generate_embedding(text: str, model: str = "text-embedding-ada-002") -> "list[float] | None":
    """Return the OpenAI embedding vector for ``text``.

    Args:
        text: The text to embed.
        model: Name of the OpenAI embedding model. The default model yields a
            1536-dimensional vector.

    Returns:
        The embedding as a list of floats, or ``None`` when ``text`` is
        ``None``, empty or whitespace-only (no API call is made in that case,
        so such a row gets a null embedding instead of failing the Spark job).

    Raises:
        Any error raised by the OpenAI client (e.g. ``openai.APIConnectionError``)
        is propagated to the caller.
    """
    # Nothing to embed: skip the network round-trip entirely.
    if text is None or not text.strip():
        return None

    response = openai.embeddings.create(input=text, model=model)
    # A single input was sent, so the vector is the first (only) data item.
    return response.data[0].embedding

# Wrap the embedding function as a Spark UDF that returns array<float>.
get_embeddings_udf = udf(generate_embedding, ArrayType(FloatType()))

# Add the embedding column. The result is cached because every Spark action
# re-evaluates an un-cached plan: without it, the preview below and any later
# write would each call the (paid, rate-limited) OpenAI API for the same rows.
df_with_embeddings = df.withColumn(
    "embedding", get_embeddings_udf(col("content"))
).cache()

df_with_embeddings.printSchema()
# Small, truncated preview: full 1536-float vectors are unreadable in cell output.
df_with_embeddings.show(5, truncate=80)

root
 |-- store_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- price: float (nullable = true)
 |-- size: string (nullable = true)
 |-- ageGroup: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- season: string (nullable = true)
 |-- fashionType: string (nullable = true)
 |-- brandName: string (nullable = true)
 |-- baseColor: string (nullable = true)
 |-- articleType: string (nullable = true)
 |-- content: string (nullable = false)
 |-- embedding: array (nullable = true)
 |    |-- element: float (containsNull = true)


25/04/02 20:24:31 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
25/04/02 20:24:31 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/04/02 20:24:31 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/04/02 20:24:31 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/04/02 20:24:31 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/04/02 20:24:31 WARN KafkaDataConsumer: KafkaDataConsumer is not 

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/var/folders/0l/rqz97gdn35395fdfr3l201c00000gq/T/ipykernel_88292/3332438107.py", line 3, in generate_embedding
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/resources/embeddings.py", line 128, in create
    return self._post(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 1242, in post
    return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 919, in request
    return self._request(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 979, in _request
    return self._retry_request(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 1057, in _retry_request
    return self._request(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 979, in _request
    return self._retry_request(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 1057, in _retry_request
    return self._request(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 989, in _request
    raise APIConnectionError(request=request) from err
openai.APIConnectionError: Connection error.


### Write to MongoDB

https://www.mongodb.com/docs/spark-connector/current/streaming-mode/streaming-write-config/


In [9]:
# Save the data to the vector database (MongoDB Atlas).
# The connection string carries the database user and password, so it is read
# from the environment instead of being stored in the notebook.
# NOTE(review): the credential that was previously committed here should be
# rotated in Atlas, since it is present in the notebook's history.
mongo_db_uri = os.environ.get("MONGODB_URI")
if not mongo_db_uri:
    raise RuntimeError(
        "MONGODB_URI is not set; export the MongoDB connection string "
        "before starting the notebook kernel."
    )
mongo_db_name = "retail"
mongo_collection_name = "product"

# Batch write: append every row (including its embedding) to the collection.
(
    df_with_embeddings.write
    .format("mongodb")
    .mode("append")
    .option("spark.mongodb.write.connection.uri", mongo_db_uri)
    .option("spark.mongodb.write.database", mongo_db_name)
    .option("spark.mongodb.write.collection", mongo_collection_name)
    .save()
)




25/04/02 20:24:35 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
25/04/02 20:24:35 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/04/02 20:24:35 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/04/02 20:24:35 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/04/02 20:24:35 WARN KafkaDataConsumer: KafkaDataConsumer is not running in UninterruptibleThread. It may hang when KafkaDataConsumer's methods are interrupted because of KAFKA-1894
25/04/02 20:24:35 WARN KafkaDataConsumer: KafkaDataConsumer is not 

Py4JJavaError: An error occurred while calling o225.save.
: org.apache.spark.SparkException: Writing job failed.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.writingJobFailedError(QueryExecutionErrors.scala:903)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:416)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2$(WriteToDataSourceV2Exec.scala:364)
	at org.apache.spark.sql.execution.datasources.v2.AppendDataExec.writeWithV2(WriteToDataSourceV2Exec.scala:230)
	at org.apache.spark.sql.execution.datasources.v2.V2ExistingTableWriteExec.run(WriteToDataSourceV2Exec.scala:342)
	at org.apache.spark.sql.execution.datasources.v2.V2ExistingTableWriteExec.run$(WriteToDataSourceV2Exec.scala:341)
	at org.apache.spark.sql.execution.datasources.v2.AppendDataExec.run(WriteToDataSourceV2Exec.scala:230)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:315)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:251)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 5.0 failed 1 times, most recent failure: Lost task 0.0 in stage 5.0 (TID 5) (192.168.7.218 executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/var/folders/0l/rqz97gdn35395fdfr3l201c00000gq/T/ipykernel_88292/3332438107.py", line 3, in generate_embedding
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/resources/embeddings.py", line 128, in create
    return self._post(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 1242, in post
    return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 919, in request
    return self._request(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 979, in _request
    return self._retry_request(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 1057, in _retry_request
    return self._request(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 979, in _request
    return self._retry_request(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 1057, in _retry_request
    return self._request(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 989, in _request
    raise APIConnectionError(request=request) from err
openai.APIConnectionError: Connection error.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.$anonfun$run$5(WriteToDataSourceV2Exec.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run(WriteToDataSourceV2Exec.scala:491)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run$(WriteToDataSourceV2Exec.scala:430)
	at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:496)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:393)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:840)
	Suppressed: com.mongodb.spark.sql.connector.exceptions.DataException: Write aborted for: PartitionId: 0, TaskId: 5. Manual data clean up may be required.
		at com.mongodb.spark.sql.connector.write.MongoDataWriter.abort(MongoDataWriter.java:121)
		at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.$anonfun$run$10(WriteToDataSourceV2Exec.scala:487)
		at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1408)
		... 15 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:390)
	... 44 more
	Suppressed: com.mongodb.spark.sql.connector.exceptions.DataException: Write aborted for: 286f5ae0-807c-4397-be2d-27040b83ee2f. 0/1 tasks completed.
		at com.mongodb.spark.sql.connector.write.MongoBatchWrite.abort(MongoBatchWrite.java:91)
		at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:411)
		... 44 more
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/var/folders/0l/rqz97gdn35395fdfr3l201c00000gq/T/ipykernel_88292/3332438107.py", line 3, in generate_embedding
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/resources/embeddings.py", line 128, in create
    return self._post(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 1242, in post
    return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 919, in request
    return self._request(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 979, in _request
    return self._retry_request(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 1057, in _retry_request
    return self._request(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 979, in _request
    return self._retry_request(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 1057, in _retry_request
    return self._request(
  File "/Users/mudasser.shaik/IdeaProjects/RealTime_VectorEmbedding_RAG_pipeline/venv/lib/python3.10/site-packages/openai/_base_client.py", line 989, in _request
    raise APIConnectionError(request=request) from err
openai.APIConnectionError: Connection error.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.$anonfun$run$5(WriteToDataSourceV2Exec.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run(WriteToDataSourceV2Exec.scala:491)
	at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run$(WriteToDataSourceV2Exec.scala:430)
	at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:496)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:393)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more
	Suppressed: com.mongodb.spark.sql.connector.exceptions.DataException: Write aborted for: PartitionId: 0, TaskId: 5. Manual data clean up may be required.
		at com.mongodb.spark.sql.connector.write.MongoDataWriter.abort(MongoDataWriter.java:121)
		at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.$anonfun$run$10(WriteToDataSourceV2Exec.scala:487)
		at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1408)
		... 15 more
