### RAG

In [1]:
import requests 

# Source of the FAQ data: a JSON list with one entry per course, each entry
# holding the course name under 'course' and its FAQ entries under 'documents'.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# Fail fast: the timeout keeps a stalled download from hanging the kernel, and
# raise_for_status() turns an HTTP error response into an exception here rather
# than into a confusing JSON decoding error on the next line.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to a single list of FAQ entries, tagging each with its course name.
# Entries are copied, so `documents_raw` stays untouched and this cell can be
# re-run without side effects on the downloaded payload.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [2]:
from minsearch import AppendableIndex

# Build the in-memory search index over the flattened FAQ entries.
# "question", "text" and "section" are registered as text fields and "course"
# as a keyword field; presumably text fields are scored against the query and
# keyword fields are used for exact-match filtering (see minsearch docs to confirm).
index = AppendableIndex(
    text_fields = ["question", "text", "section"],
    keyword_fields = ["course"]
)

# Index the documents; as the last expression, the fitted index object is
# what the cell displays.
index.fit(documents)

<minsearch.append.AppendableIndex at 0x7eae24859220>

In [3]:
index.search("how to use kafka with spark")

[{'text': 'While following tutorial 13.2 , when running ./spark-submit.sh streaming.py, encountered the following error:\n…\n24/03/11 09:48:36 INFO StandaloneAppClient$ClientEndpoint: Connecting to master spark://localhost:7077...\n24/03/11 09:48:36 INFO TransportClientFactory: Successfully created connection to localhost/127.0.0.1:7077 after 10 ms (0 ms spent in bootstraps)\n24/03/11 09:48:54 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors\n24/03/11 09:48:56 INFO StandaloneAppClient$ClientEndpoint: Connecting to master spark://localhost:7077…\n24/03/11 09:49:16 INFO StandaloneAppClient$ClientEndpoint: Connecting to master spark://localhost:7077...\n24/03/11 09:49:36 WARN StandaloneSchedulerBackend: Application ID is not initialized yet.\n24/03/11 09:49:36 ERROR StandaloneSchedulerBacke

In [4]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Search the FAQ index for entries relevant to ``query``.

    Args:
        query: Free-text search query.
        course: Value the ``course`` keyword field has to match. Pass ``None``
            to search without a course filter. The default is the course that
            used to be hard-coded, so plain ``search(query)`` calls behave
            exactly as before.
        num_results: Maximum number of entries requested from the index
            (default 5, the previously hard-coded value).

    Returns:
        The result of ``index.search`` for these arguments. ``output_ids=True``
        is requested; presumably this is what adds the ``_id`` key that the
        de-duplication step of the notebook relies on.
    """
    # Boost weights handed to the index: 3.0 for `question`, 0.5 for `section`.
    boost = {'question': 3.0, 'section': 0.5}
    filter_dict = {} if course is None else {'course': course}

    return index.search(
        query=query,
        filter_dict=filter_dict,
        boost_dict=boost,
        num_results=num_results,
        output_ids=True
    )

In [5]:
question = "how to use kafka with spark"

In [6]:
# Prompt for the basic RAG flow; filled with str.format(question=..., context=...).
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(query, search_results, template=prompt_template):
    """Build the basic RAG prompt from a question and retrieved FAQ entries.

    Args:
        query: The student's question.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys.
        template: Format string with ``{question}`` and ``{context}``
            placeholders. The default is bound when this function is defined:
            the notebook later rebinds the global ``prompt_template`` to
            templates with additional placeholders, and looking the global up
            at call time would make this function (and ``rag``) fail with a
            ``KeyError`` after those cells have run.

    Returns:
        The formatted prompt with surrounding whitespace stripped.

    Raises:
        KeyError: If a search result lacks one of the expected keys.
    """
    # One labelled paragraph per entry, each followed by a blank line.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    return template.format(question=query, context=context).strip()

In [7]:
search_results = search(question)

In [8]:
prompt = build_prompt(question, search_results)

In [9]:
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

<QUESTION>
how to use kafka with spark
</QUESTION>

<CONTEXT>
section: Module 6: streaming with kafka
question: Python Kafka: ./spark-submit.sh streaming.py - ERROR StandaloneSchedulerBackend: Application has been killed. Reason: All masters are unresponsive! Giving up.
answer: While following tutorial 13.2 , when running ./spark-submit.sh streaming.py, encountered the following error:
…
24/03/11 09:48:36 INFO StandaloneAppClient$ClientEndpoint: Connecting to master spark://localhost:7077...
24/03/11 09:48:36 INFO TransportClientFactory: Successfully created connection to localhost/127.0.0.1:7077 after 10 ms (0 ms spent in bootstraps)
24/03/11 09:48:54 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGeneratio

In [10]:
from openai import OpenAI
from dotenv import load_dotenv
import os

load_dotenv()

client = OpenAI()

def llm(prompt):
    """Send ``prompt`` as a single user message and return the model's reply text.

    Uses the module-level ``client`` and the ``gpt-4o-mini`` model; the content
    of the first returned choice is handed back unchanged.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def rag(query):
    """Answer ``query`` with the basic RAG flow: retrieve, build the prompt, ask the LLM."""
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [11]:
answer = llm(prompt)

In [12]:
print(answer)

To use Kafka with Spark, you should follow the steps in the relevant module that covers streaming with Kafka. Make sure you have the appropriate dependencies set up, and check for any version mismatches between your local PySpark and the intended version (3.3.1 as indicated).

Additionally, if you encounter connection issues to the Spark master, you can troubleshoot by running the following commands in your terminal:

1. Start a new terminal.
2. Run `docker ps` to check active containers.
3. Copy the CONTAINER ID of the spark-master container.
4. Run `docker exec -it <spark_master_container_id> bash` to access the container.
5. Run `cat logs/spark-master.out` to view the logs and identify errors.

If you have issues with Kafka, such as the error `kafka.errors.NoBrokersAvailable`, ensure that your Kafka broker Docker container is up and running by confirming with `docker ps`. If it's not running, navigate to the folder with your Docker compose yaml file and run `docker compose up -d` to

In [13]:
rag(question)

"To use Kafka with Spark, you should ensure that your Kafka broker is running properly and that Spark can connect to it without any issues. Here are some steps to help you set up and troubleshoot your environment:\n\n1. **Check Kafka Broker Status**: \n   - Use the command `docker ps` to confirm that your Kafka broker Docker container is running. \n\n2. **Start Kafka Broker if Necessary**: \n   - If the Kafka broker is not running, navigate to the directory containing your Docker Compose YAML file and run the command:\n     ```\n     docker compose up -d\n     ```\n   - This will start all the required instances, including Kafka.\n\n3. **Spark Submission**:\n   - You'll typically submit your Spark streaming application using a command similar to:\n     ```\n     ./spark-submit.sh streaming.py\n     ```\n   - Make sure you are running this command while your Kafka broker is active.\n\n4. **Version Compatibility**:\n   - Check that your local PySpark version matches the version specified

#### 'Agentic' RAG

In [14]:
# Prompt for the first "agentic" variant: the model decides on its own whether
# to ask for a FAQ search (action SEARCH) or to answer (action ANSWER), and
# replies with one of the JSON templates below.
# The template is filled with str.format(question=..., context=...), so the
# literal braces of the JSON examples are doubled ({{ and }}) to survive
# formatting.
# NOTE: this rebinds the name `prompt_template` used by the basic RAG prompt
# defined earlier in the notebook.
prompt_template = """
You are a course teaching assistant.

You are given a Question from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>

If CONTEXT is EMPTY, you can use your FAQ database.
In this case, use the following output template:

{{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action": "ANSWER",
"answer": "<your answer>",
"source" : "CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action" : "ANSWER",
"answer" : "<your answer>",
"source" : "OWN_KNOWLEDGE"
}}
""".strip()

In [15]:
question = "Can i still join the course?"
context = "EMPTY"

In [16]:
prompt = prompt_template.format(question=question, context=context)
print(prompt)

You are a course teaching assistant.

You are given a Question from a course student and that you need to answer with your own knowledge and provided CONTEXT.
At the beginning the context is EMPTY.

<QUESTION>
Can i still join the course?
</QUESTION>

<CONTEXT>
EMPTY
</CONTEXT>

If CONTEXT is EMPTY, you can use your FAQ database.
In this case, use the following output template:

{
"action": "SEARCH",
"reasoning": "<add your reasoning here>"
}

If you can answer the QUESTION using CONTEXT, use this template:

{
"action": "ANSWER",
"answer": "<your answer>",
"source" : "CONTEXT"
}

If the context doesn't contain the answer, use your own knowledge to answer the question

{
"action" : "ANSWER",
"answer" : "<your answer>",
"source" : "OWN_KNOWLEDGE"
}


In [17]:
answer_json = llm(prompt)

In [18]:
import json

In [19]:
answer = json.loads(answer_json)

In [20]:
answer["action"]

'SEARCH'

In [21]:
def build_context(search_results):
    """Render search results as the text placed inside the prompt's CONTEXT.

    Every entry contributes three labelled lines (section, question, answer)
    and entries are separated by a blank line. Leading and trailing whitespace
    of the combined text is stripped, so an empty input yields ``""``.
    """
    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]
    return "".join(entries).strip()

In [22]:
search_results = search(question)
context = build_context(search_results)
prompt = prompt_template.format(question=question, context=context)

In [23]:
answer_json = llm(prompt)

In [24]:
print(answer_json)

{
"action": "ANSWER",
"answer": "Yes, you can still join the course after the start date. Even if you don't register, you're allowed to submit your homeworks. However, be mindful of deadlines for the final projects, so try not to leave everything for the last minute.",
"source": "CONTEXT"
}


In [25]:
def agentic_rag_v1(question):
    """Two-step agentic RAG: let the model decide whether a FAQ search is needed.

    The model is first asked with the context set to the literal ``"EMPTY"``.
    If its JSON reply has the action ``SEARCH``, the FAQ is searched with the
    original question, the results become the context, and the model is asked
    once more. Each parsed reply is printed as it arrives.

    Returns:
        The last parsed JSON reply (a dict).
    """
    def ask(context):
        # One round trip: fill the template, query the model, parse and echo.
        reply = json.loads(
            llm(prompt_template.format(question=question, context=context))
        )
        print(reply)
        return reply

    first_reply = ask("EMPTY")
    if first_reply["action"] != "SEARCH":
        return first_reply

    print("need to perform search...")
    return ask(build_context(search(question)))

In [26]:
%%time
agentic_rag_v1("how do i join the course?")

{'action': 'SEARCH', 'reasoning': 'The question is about how to join the course, which is a general inquiry that typically would be found in an FAQ database.'}
need to perform search...
{'action': 'ANSWER', 'answer': "To join the course, you need to register before the course starts using the provided registration link. Although you can still submit homeworks after the course has started, it's important to register to ensure you get all necessary updates and can participate fully in live sessions.", 'source': 'CONTEXT'}
CPU times: user 21.3 ms, sys: 1.79 ms, total: 23.1 ms
Wall time: 3.24 s


{'action': 'ANSWER',
 'answer': "To join the course, you need to register before the course starts using the provided registration link. Although you can still submit homeworks after the course has started, it's important to register to ensure you get all necessary updates and can participate fully in live sessions.",
 'source': 'CONTEXT'}

#### Agentic search

In [46]:
def dedub(seq):
    """Return the elements of ``seq`` with duplicate ``'_id'`` values removed.

    The first element seen for each ``'_id'`` is kept, later ones are dropped,
    and the original order is preserved. ``seq`` itself is not modified.
    """
    first_by_id = {}
    for item in seq:
        # setdefault stores the item only for an id that has not been seen yet.
        first_by_id.setdefault(item['_id'], item)
    return list(first_by_id.values())

In [47]:
# Prompt for the iterative "agentic search" loop. It is filled with str.format
# using six placeholders: question, context, search_queries, previous_actions,
# max_iterations and iteration_number. The literal braces of the JSON examples
# are doubled ({{ and }}) so that they survive formatting.
# The model may reply with the action SEARCH (plus a "keywords" list),
# ANSWER_CONTEXT or ANSWER; the driver code in this notebook only distinguishes
# SEARCH from everything else.
# NOTE: this rebinds the name `prompt_template` once more; code that formats it
# with only question/context will raise KeyError after this cell has run.
prompt_template = """
You are a course teaching assistant.

You are given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic.

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than {max_iterations} iterations for a given student question.
The current iteration number: {iteration_number}. If we exceed the allowed number
of iterations, give the best possible answer with the provided information.

Output templates:

If you want to perform search, use this template:

{{
"action" : "SEARCH",
"reasoning" : "<add your reasoning here>",
"keywords" : ["search query 1", "search query 2", ...]
}}

If you can answer the QUESTION using CONTEXT, use this template:

{{
"action" : "ANSWER_CONTEXT",
"answer" : "<your answer>",
"source" :"CONTEXT"
}}

If the context doesn't contain the answer, use your own knowledge to answer the question

{{
"action" : "ANSWER",
"answer" : "<your answer>",
"source" : "OWN_KNOWLEDGE"
}}

<QUESTION>
{question}
</QUESTION>

<SEARCH_QUERIES>
{search_queries}
</SEARCH_QUERIES>

<CONTEXT>
{context}
</CONTEXT>

<PREVIOUS_ACTIONS>
{previous_actions}
</PREVIOUS_ACTIONS>
""".strip()

In [48]:
question = "how do I do well on module 1"

max_iterations = 3
iteration_number = 0
search_queries = []
search_results = []
previous_actions = []

In [49]:
context = build_context(search_results)

prompt = prompt_template.format(
    question = question,
    context = context,
    search_queries = "\n".join(search_queries),
    previous_actions = "\n".join([json.dumps(a) for a in previous_actions]),
    max_iterations=max_iterations,
    iteration_number=iteration_number,
)

In [50]:
answer_json = llm(prompt)

In [64]:
answer = json.loads(answer_json)

In [65]:
answer

{'action': 'SEARCH',
 'reasoning': 'The context currently lacks specific tips or strategies for succeeding in Module 1. I will search for particular advice on how to excel in that module to provide a comprehensive answer.',
 'keywords': ['success in Module 1',
  'Module 1 study tips',
  'how to do well in Module 1']}

In [66]:
previous_actions.append(answer)

In [67]:
previous_actions

[{'action': 'SEARCH',
  'reasoning': 'To provide the best advice on excelling in module 1, I need to gather specific strategies or tips related to that module from the FAQ database.',
  'keywords': ['module 1 success tips',
   'how to excel in module 1',
   'module 1 study strategies']},
 {'action': 'SEARCH',
  'reasoning': 'The context currently lacks specific tips or strategies for succeeding in Module 1. I will search for particular advice on how to excel in that module to provide a comprehensive answer.',
  'keywords': ['success in Module 1',
   'Module 1 study tips',
   'how to do well in Module 1']}]

In [68]:
keywords = answer['keywords']

In [69]:
print(keywords)

['success in Module 1', 'Module 1 study tips', 'how to do well in Module 1']


In [70]:
for kw in keywords:
    search_queries.append(kw)
    sr = search(kw)
    search_results.extend(sr)

In [71]:
search_results = dedub(search_results)

In [72]:
len(search_results)

6

In [73]:
iteration_number = 2

context = build_context(search_results)

prompt = prompt_template.format(
    question = question,
    context = context,
    search_queries = "\n".join(search_queries),
    previous_actions = "\n".join([json.dumps(a) for a in previous_actions]),
    max_iterations=max_iterations,
    iteration_number=iteration_number,
)

In [74]:
print(prompt)

You are a course teaching assistant.

You are given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic.

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current iteration numbe

In [75]:
answer_json = llm(prompt)

In [78]:
print(answer_json)

{
"action": "ANSWER",
"answer": "To do well in Module 1, which focuses on Docker and Terraform, it's important to grasp the foundational concepts of containerization and infrastructure as code. Ensure you understand how Docker works, including commands like `docker build`, `docker run`, and how images and containers interact. Familiarize yourself with Docker files and the syntax as you will use them to create your containers. Additionally, when it comes to Terraform, focus on understanding the configuration language, how to define infrastructure as code, and the `terraform apply` command. Practice building sample projects to reinforce your learning. Also, make sure to troubleshoot any issues you encounter, as this is a great way to deepen your understanding. Lastly, make use of online resources and community forums for additional help and insights from peers.","source": "OWN_KNOWLEDGE"
}


In [79]:
answer = json.loads(answer_json)

In [80]:
print(answer['answer'])

To do well in Module 1, which focuses on Docker and Terraform, it's important to grasp the foundational concepts of containerization and infrastructure as code. Ensure you understand how Docker works, including commands like `docker build`, `docker run`, and how images and containers interact. Familiarize yourself with Docker files and the syntax as you will use them to create your containers. Additionally, when it comes to Terraform, focus on understanding the configuration language, how to define infrastructure as code, and the `terraform apply` command. Practice building sample projects to reinforce your learning. Also, make sure to troubleshoot any issues you encounter, as this is a great way to deepen your understanding. Lastly, make use of online resources and community forums for additional help and insights from peers.


#### Automating in a loop

In [82]:
# Inline version of the agent loop: keep asking the model until it answers
# (any action other than SEARCH) or the iteration cap is hit.
question = "what do I need to do to be succesful at module 1?"

# State carried across iterations:
#   search_queries   - queries issued so far (shown to the model)
#   search_results   - FAQ entries collected so far (de-duplicated by dedub)
#   previous_actions - parsed model replies, oldest first
search_queries = []
search_results = []
previous_actions = []

iteration = 0

while True:
    print(f'ITERATION #{iteration}...')

    # Rebuild the prompt from the current state on every round.
    context = build_context(search_results)
    prompt = prompt_template.format(
        question = question,
        context = context,
        search_queries = "\n".join(search_queries),
        previous_actions = "\n".join([json.dumps(a) for a in previous_actions]),
        max_iterations = 3,
        iteration_number = iteration
    )

    print(prompt)

    # The reply is expected to be a bare JSON object; json.loads raises otherwise.
    answer_json = llm(prompt)
    answer = json.loads(answer_json)
    print(json.dumps(answer, indent=2))

    previous_actions.append(answer)

    # Anything but SEARCH (ANSWER, ANSWER_CONTEXT, ...) ends the loop.
    action = answer['action']
    if action != 'SEARCH':
        break

    keywords = answer['keywords']
    # NOTE(review): the set union removes duplicates but does not preserve the
    # order in which the queries were issued.
    search_queries = list(set(search_queries) | set(keywords))

    for k in keywords:
        res = search(k)
        search_results.extend(res)

    search_results = dedub(search_results)

    # NOTE(review): the prompt announces a limit of 3 while this cap allows
    # iterations 0..3; if the cap is what stops the loop, `answer` still holds
    # the last SEARCH reply rather than an answer.
    iteration = iteration + 1
    if iteration >= 4:
        break
    print()


ITERATION #0...
You are a course teaching assistant.

You are given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic.

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current

In [84]:
answer

{'action': 'ANSWER',
 'answer': "To be successful in Module 1 (Docker and Terraform), consider these strategies:\n\n1. **Hands-On Practice**: Engage with the hands-on labs and projects in the module. Docker and Terraform are practical tools that require operational knowledge, so practice building and deploying applications using these technologies.\n\n2. **Utilize Resources**: Make use of the resources provided in the course, such as video lectures, reading materials, and any supplementary materials.\n\n3. **Join Discussions**: Participate in forums or discussion groups with peers. This can enhance understanding and expose you to different perspectives on problem-solving.\n\n4. **Stay Organized**: Keep comprehensive notes on concepts and commands you interact with, as both Docker and Terraform might require remembering various commands and configurations.\n\n5. **Troubleshooting Skills**: Familiarize yourself with common problems and their solutions, as shown in the FAQs. For example, 

In [85]:
iteration

2

In [86]:
def agentic_search(question, max_iterations=3, verbose=True):
    """Answer ``question`` by letting the model search the FAQ iteratively.

    On every round the model receives the question, the context built from all
    FAQ entries collected so far, the queries already issued and its previous
    actions. A reply with the action ``SEARCH`` triggers one FAQ search per new
    keyword; any other action ends the loop.

    Args:
        question: The student's question.
        max_iterations: Iteration limit announced to the model in the prompt.
            Rounds are numbered ``0..max_iterations`` inclusive, so the model
            gets one more round after reaching the limit in which it is asked
            to give its best answer (with the default of 3 this is the same
            four-round cap as before). Expected to be >= 0.
        verbose: When true (default), print the iteration header, the full
            prompt and the parsed reply of every round.

    Returns:
        The last parsed JSON reply (a dict). If the model is still requesting
        searches when the rounds run out, this is that ``SEARCH`` action and
        has no ``answer`` key. ``None`` if ``max_iterations`` is negative.

    Raises:
        json.JSONDecodeError: If a model reply is not valid JSON.
        KeyError: If a reply has no ``action`` key.
    """
    search_queries = []
    search_results = []
    previous_actions = []
    answer = None

    for iteration in range(max_iterations + 1):
        if verbose:
            if iteration > 0:
                print()  # blank line between rounds
            print(f'ITERATION #{iteration}...')

        context = build_context(search_results)
        prompt = prompt_template.format(
            question=question,
            context=context,
            search_queries="\n".join(search_queries),
            previous_actions="\n".join(json.dumps(a) for a in previous_actions),
            max_iterations=max_iterations,
            iteration_number=iteration,
        )

        if verbose:
            print(prompt)

        answer = json.loads(llm(prompt))
        if verbose:
            print(json.dumps(answer, indent=2))

        previous_actions.append(answer)

        if answer['action'] != 'SEARCH':
            break

        # Keep the queries in the order they were issued so that the prompt is
        # deterministic, and skip keywords that were already searched: their
        # results are in search_results and would only be de-duplicated away.
        # A SEARCH reply without keywords is tolerated and adds nothing.
        for keyword in answer.get('keywords', []):
            if keyword in search_queries:
                continue
            search_queries.append(keyword)
            search_results.extend(search(keyword))

        search_results = dedub(search_results)

    return answer

In [87]:
agentic_search("how do I prepare for the course?")

ITERATION #0...
You are a course teaching assistant.

You are given a QUESTION from a course student and that you need to answer with your own knowledge and provided CONTEXT.

The CONTEXT is build with the documents from our FAQ database.
SEARCH_QUERIES contains the queries that were used to retrieve the documents
from FAQ to and add them to the context.
PREVIOUS_ACTIONS contains the actions you already performed.

At the beginning the CONTEXT is empty.

You can perform the following actions:

- Search in the FAQ database to get more data for the CONTEXT
- Answer the question using the CONTEXT
- Answer the question using your own knowledge

For the SEARCH action, build search requests based on the CONTEXT and the QUESTION.
Carefully analyze the CONTEXT and generate the requests to deeply explore the topic.

Don't use search queries used at the previous iterations.

Don't repeat previously performed actions.

Don't perform more than 3 iterations for a given student question.
The current

{'action': 'ANSWER',
 'answer': "To prepare for the course effectively, here are some general tips: \n1. **Register Early**: Make sure to register for the course before it starts, using the provided link. \n2. **Join Communication Channels**: Subscribe to the Google Calendar for course schedules and join the Telegram channel for announcements. Additionally, registering on the DataTalks.Club's Slack and joining relevant channels will keep you updated. \n3. **Familiarize Yourself with Required Tools**: Ensure you have access to the tools and software required for the course. Review any documentation or tutorials related to the tools mentioned in the course structure. \n4. **Establish a Study Schedule**: Allocate dedicated times each week for coursework, reviewing materials, and participating in discussions during live sessions. \n5. **Prepare Your Workspace**: Set up a comfortable and distraction-free environment for study sessions. \n6. **Engage with Peers**: Connecting with fellow stud

In [88]:
print(_['answer'])

To prepare for the course effectively, here are some general tips: 
1. **Register Early**: Make sure to register for the course before it starts, using the provided link. 
2. **Join Communication Channels**: Subscribe to the Google Calendar for course schedules and join the Telegram channel for announcements. Additionally, registering on the DataTalks.Club's Slack and joining relevant channels will keep you updated. 
3. **Familiarize Yourself with Required Tools**: Ensure you have access to the tools and software required for the course. Review any documentation or tutorials related to the tools mentioned in the course structure. 
4. **Establish a Study Schedule**: Allocate dedicated times each week for coursework, reviewing materials, and participating in discussions during live sessions. 
5. **Prepare Your Workspace**: Set up a comfortable and distraction-free environment for study sessions. 
6. **Engage with Peers**: Connecting with fellow students can provide support and motivation

#### Function calling ("tool use")

In [90]:
# NOTE: `search` is also defined in the RAG section at the top of the notebook;
# it is repeated here so that the tool-calling section can be read on its own.
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Search the FAQ index for entries relevant to ``query``.

    Args:
        query: Free-text search query.
        course: Value the ``course`` keyword field has to match. Pass ``None``
            to search without a course filter. The default is the course that
            used to be hard-coded, so plain ``search(query)`` calls behave
            exactly as before.
        num_results: Maximum number of entries requested from the index
            (default 5, the previously hard-coded value).

    Returns:
        The result of ``index.search`` for these arguments, requested with
        ``output_ids=True``.
    """
    # Boost weights handed to the index: 3.0 for `question`, 0.5 for `section`.
    boost = {'question': 3.0, 'section': 0.5}
    filter_dict = {} if course is None else {'course': course}

    return index.search(
        query=query,
        filter_dict=filter_dict,
        boost_dict=boost,
        num_results=num_results,
        output_ids=True
    )

In [91]:
# Tool description handed to the model via the `tools` argument of
# client.responses.create. "name" matches the Python function `search`, and the
# JSON schema under "parameters" declares its single required string argument
# "query"; no other properties are allowed.
search_tool = {
    "type" : "function",
    "name" : "search",
    "description" : "Search the FAQ database",
    "parameters" : {
        "type" : "object",
        "properties" : {
            "query" : {
                "type" : "string",
                "description" : "Search query text to look up in the course FAQ."
            }
        },
        "required" : ["query"],
        "additionalProperties": False
    }
}

In [94]:
question = "what do I do well in module 1?"

developer_prompt = """
You're a course teaching assistant.
You're given a question from a course student and your task is to answer it.
""".strip()

tools = [search_tool]

chat_messages = [
    {"role" : "developer", "content" : developer_prompt},
    {"role" : "user", "content" : question}
]

response = client.responses.create(
    model = 'gpt-4o-mini',
    input = chat_messages,
    tools = tools
)
response.output

[ResponseFunctionToolCall(arguments='{"query":"module 1 strengths"}', call_id='call_VzH6cLNCGKH9D7Flc0s3ZCZg', name='search', type='function_call', id='fc_689ff2b3e18481908683cecd7de4cc050ca86754c69b442f', status='completed')]

In [95]:
calls = response.output

In [97]:
call = calls[0]

In [100]:
f_name = call.name

In [101]:
arguments = json.loads(call.arguments)

In [102]:
arguments

{'query': 'module 1 strengths'}

In [103]:
globals()[f_name]

<function __main__.search(query)>

In [104]:
globals()['search']

<function __main__.search(query)>

In [105]:
globals()['search_tool']

{'type': 'function',
 'name': 'search',
 'description': 'Search the FAQ database',
 'parameters': {'type': 'object',
  'properties': {'query': {'type': 'string',
    'description': 'Search query text to look up in the course FAQ.'}},
  'required': ['query'],
  'additionalProperties': False}}

In [106]:
# Resolve the requested tool from an explicit allow-list instead of globals():
# the name comes from the model's response, so it must not be able to select
# arbitrary objects defined in this notebook. An unknown name still raises
# KeyError, as the globals() lookup did.
available_tools = {"search": search}
f = available_tools[f_name]

In [108]:
results = f(**arguments)

In [109]:
results

[{'text': 'You need to look for the Py4J file and note the version of the filename. Once you know the version, you can update the export command accordingly, this is how you check yours:\n` ls ${SPARK_HOME}/python/lib/ ` and then you add it in the export command, mine was:\nexport PYTHONPATH=”${SPARK_HOME}/python/lib/Py4J-0.10.9.5-src.zip:${PYTHONPATH}”\nMake sure that the version under `${SPARK_HOME}/python/lib/` matches the filename of py4j or you will encounter `ModuleNotFoundError: No module named \'py4j\'` while executing `import pyspark`.\nFor instance, if the file under `${SPARK_HOME}/python/lib/` was `py4j-0.10.9.3-src.zip`.\nThen the export PYTHONPATH statement above should be changed to `export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.3-src.zip:$PYTHONPATH"` appropriately.\nAdditionally, you can check for the version of ‘py4j’ of the spark you’re using from here and update as mentioned above.\n~ Abhijit Chakraborty: Sometimes, even with adding the correct version of p