<a href="https://colab.research.google.com/github/mambabhi/indic-learn/blob/main/indic_story_english_quiz_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install haystack-ai langfuse-haystack langfuse groq surf json-repair datasets

Collecting haystack-ai
  Downloading haystack_ai-2.12.1-py3-none-any.whl.metadata (14 kB)
Collecting langfuse-haystack
  Downloading langfuse_haystack-1.0.0-py3-none-any.whl.metadata (5.7 kB)
Collecting langfuse
  Downloading langfuse-2.60.2-py3-none-any.whl.metadata (3.2 kB)
Collecting groq
  Downloading groq-0.22.0-py3-none-any.whl.metadata (15 kB)
Collecting surf
  Downloading SuRF-1.1.9.tar.gz (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting json-repair
  Downloading json_repair-0.41.0-py3-none-any.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting haystack-experimental (from haystack-ai)
  Downloading haystack_experimental-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting lazy-imports (from haystack-ai)
  Downloading lazy_imports-0.4.0-py3-none-any.whl.metadata (10 kB)
C

In [2]:
import json
import json_repair
import os
from pathlib import Path
from typing import List, Dict
from getpass import getpass

In [3]:
from haystack import Pipeline, component
from haystack.components.generators import OpenAIGenerator
from haystack.components.builders import PromptBuilder
from haystack.components.websearch.serper_dev import SerperDevWebSearch
from haystack.utils import Secret

import requests
import pandas as pd
from pprint import pprint

Gets API keys from environment variables or user input.

In [4]:
if "GROQ_API_KEY" not in os.environ:
    os.environ["GROQ_API_KEY"] = getpass("Enter Groq API key:")

Enter Groq API key:··········


In [5]:
if "SERPERDEV_API_KEY" not in os.environ:
    os.environ["SERPERDEV_API_KEY"] = getpass("Enter Serper Dev key:")

Enter Serper Dev key:··········


In [6]:
@component
class QuizParser:
    """Parses the quiz JSON out of the LLM's response."""
    @component.output_types(quiz=Dict)
    def run(self, replies: List[str]):
        reply = replies[0]
        first_index = min(reply.find("{"), reply.find("["))
        last_index = max(reply.rfind("}"), reply.rfind("]")) + 1
        json_portion = reply[first_index:last_index]

        try:
            quiz = json.loads(json_portion)
        except json.JSONDecodeError:
            quiz = json_repair.loads(json_portion)
        if isinstance(quiz, list):
            quiz = quiz[0]
        return quiz

In [7]:
def build_english_quiz_pipeline():
    pipeline = Pipeline()
    pipeline.add_component("websearch", SerperDevWebSearch(top_k=5))
    pipeline.add_component(
        "prompt_builder",
        PromptBuilder(
            template="""
            Given the following - {{text}} - in English language, create 5 multiple choice quizzes in JSON format in English language.
            Each question should have 4 different options, and only one of them should be correct.
            The options should be unambiguous.
            Each option should begin with a letter followed by a period and a space (e.g., "a. king").
            The question should also briefly mention the general topic of the text so that it can be understood in isolation.
            Each question should not give hints to answer the other questions.
            Include challenging questions, which require reasoning.
            Generate simple english sentences so that they can be easily converted to Sanskrit.

            Respond with JSON only, no markdown or descriptions.
            The JSON should be entirely in English.

            Note that you are able to provide more accurate english sentences because you can understand additional context from web sources.
            You are able to use the snippets extracted from the web to excel in your translation.
            Only, if you aren't able to find sources on the web that matches the text, then use your knowledge of English grammar.
            Provide the link to the source from the web that you might have used.

            Example JSON format you should absolutely follow, including the reasoning:

            {
              "quiz": {
                "topic": "a sentence explaining the topic of the text",
                "questions": [
                  {
                    "question": "text of the question",
                    "options": ["a. 1st option", "b. 2nd option", "c. 3rd option", "d. 4th option"],
                    "right_option": "c",
                    "source": "I found a source: <paste_actual_link_here> which provided the context for me to properly generate english quiz"
                  }
                ]
              }
            }

            Snippets:
            {% for doc in documents %}
            - snippet: "{{ doc.content }}"
              link: "{{ doc.meta.link or doc.meta.url or 'unknown' }}"
            {% endfor %}
            """
        ),
    )
    pipeline.add_component(
        "generator",
        OpenAIGenerator(
            api_key=Secret.from_env_var("GROQ_API_KEY"),
            api_base_url="https://api.groq.com/openai/v1",
            model="llama3-70b-8192",
            generation_kwargs={
                "max_tokens": 5000,
                "temperature": 0.5,
                "top_p": 1,
            },
        ),
    )
    pipeline.connect("websearch.documents", "prompt_builder.documents")
    pipeline.connect("prompt_builder", "generator")
    return pipeline

In [17]:
english_quiz_generation= (
    build_english_quiz_pipeline()
)



In [7]:
indic_topic = "The King’s Monkey Servant"

In [8]:
indic_text = """Title: The King’s Monkey Servant
Moral: A king wishing long life should never keep foolish servants.”
Story: A king had a monkey as his body-guard. He was very fond of the king, and as he was very much trusted by the king, he could go into the kings’ bed room without being stopped by anyone.
Once when the king was sleeping the monkey started breezing the king with a fan. While doing this a fly came and sat on the king’s chest. The monkey tried to ward off the fly with the fan. But the fly would come again and sit on the same place.
The monkey due to its foolish nature became angry, got a sharp sword and hit the fly to kill it. The fly flew away but, the king’s chest was divided into two, and the king died."""

In [9]:
search = SerperDevWebSearch(top_k=5)
results = search.run(query='"The King’s Monkey Servant" full story text')

for doc in results["documents"]:
    print("---")
    print("Content:", doc.content)
    print("Meta:", doc.meta)

---
Content: H Stories from Panchatantra 3 - The King's Monkey Servant H. = l. U ~ *\ ~. 7] U 7 ~. 9 7. 7 -. ~ ~ ~. \ U { 3 , H Stories from Panchatantra 4 "A king wishing ...
Meta: {'title': 'Stories From Panchatantra | PDF | Nature - Scribd', 'link': 'https://ro.scribd.com/doc/66866724/Stories-From-Panchatantra?BWrQpWATE4ftR=709UQ2sssl7sgYw', 'position': 1}
---
Content: पंचतकथासंमहः <strong>Stories</strong> <strong>from</strong> <strong>Panchatantra</strong><br />. नृपसेवकवानर-कथा<br />. The King's Monkey Servant<br />.
Meta: {'title': 'Stories_from_Panchatantra_-_Sanskrit_English - YUMPU', 'link': 'https://www.yumpu.com/en/document/view/36448455/stories-from-panchatantra-sanskrit-english', 'date': 'Jan 29, 2015', 'position': 2}
---
Content: 4. पंचत कथासंमहः Stories from Panchatantra 3 नृपसेवकवानर-कथा The King's Monkey Servant त ाि रायुिर त नृपेण मूख ऽनुचरो न र िणयः ।
Meta: {'title': 'Stories from panchatantra (sanskrit text &amp; english - SlideShare', 'link': 'https://www.slideshar

In [10]:
pipeline = Pipeline()
pipeline.add_component("websearch", SerperDevWebSearch(top_k=5))
pipeline.add_component("prompt_builder", PromptBuilder(template="""
            Given the following - {{text}} - in English language, create 5 multiple choice quizzes in JSON format in English language.
            Each question should have 4 different options, and only one of them should be correct.
            The options should be unambiguous.
            Each option should begin with a letter followed by a period and a space (e.g., "a. king").
            The question should also briefly mention the general topic of the text so that it can be understood in isolation.
            Each question should not give hints to answer the other questions.
            Include challenging questions, which require reasoning.
            Generate simple english sentences so that they can be easily converted to Sanskrit.

            Respond with JSON only, no markdown or descriptions.
            The JSON should be entirely in English.

            Note that you are able to provide more accurate english sentences because you can understand additional context from web sources.
            You are able to use the snippets extracted from the web to excel in your translation.
            Only, if you aren't able to find sources on the web that matches the text, then use your knowledge of English grammar.
            Provide the link to the source from the web that you might have used.

            Example JSON format you should absolutely follow, including the reasoning:

            {
              "quiz": {
                "topic": "a sentence explaining the topic of the text",
                "questions": [
                  {
                    "question": "text of the question",
                    "options": ["a. 1st option", "b. 2nd option", "c. 3rd option", "d. 4th option"],
                    "right_option": "c",
                    "source": "I found a source: <paste_actual_link_here> which provided the context for me to properly generate english quiz"
                  }
                ]
              }
            }

            Snippets:
            {% for doc in documents %}
            - snippet: "{{ doc.content }}"
              link: "{{ doc.meta.link or doc.meta.url or 'unknown' }}"
            {% endfor %}
            """))

pipeline.connect("websearch.documents", "prompt_builder.documents")

result = pipeline.run({
    "websearch": {"query": '"The King’s Monkey Servant" full story text'},
    "prompt_builder": {"text": indic_text}
})

print("\nGenerated Prompt:\n")
print(result["prompt_builder"]["prompt"])




Generated Prompt:


            Given the following - Title: The King’s Monkey Servant
Moral: A king wishing long life should never keep foolish servants.”
Story: A king had a monkey as his body-guard. He was very fond of the king, and as he was very much trusted by the king, he could go into the kings’ bed room without being stopped by anyone.
Once when the king was sleeping the monkey started breezing the king with a fan. While doing this a fly came and sat on the king’s chest. The monkey tried to ward off the fly with the fan. But the fly would come again and sit on the same place.
The monkey due to its foolish nature became angry, got a sharp sword and hit the fly to kill it. The fly flew away but, the king’s chest was divided into two, and the king died. - in English language, create 5 multiple choice quizzes in JSON format in English language.
            Each question should have 4 different options, and only one of them should be correct.
            The options should be unam

In [23]:
for doc in results["documents"]:
    print(doc.__dict__)

{'id': '10cccbdfb99bc6ed56374251006aeb3e6f154745cdfbb734aa1a6edd720cf211', 'content': "Published by: http://Sanskritebooks.wordpress. com (Sanskrit Text & English Translation) ... The King's Monkey Servant H. = l. U ~ *\\ ~. 7] U 7 ~. 9 7. 7 -. ~ ~ ~.", 'blob': None, 'meta': {'title': 'Stories From Panchatantra | PDF | Nature - Scribd', 'link': 'https://ro.scribd.com/doc/66866724/Stories-From-Panchatantra?BWrQpWATE4ftR=709UQ2sssl7sgYw', 'position': 1}, 'score': None, 'embedding': None, 'sparse_embedding': None}
{'id': '090155ab205e5bcd81b32ea0c01e10bc25c12cc321788dbeb81c8dfad65c82b9', 'content': "पंचतकथासंमहः <strong>Stories</strong> <strong>from</strong> <strong>Panchatantra</strong><br />. नृपसेवकवानर-कथा<br />. The King's Monkey Servant<br />.", 'blob': None, 'meta': {'title': 'Stories_from_Panchatantra_-_Sanskrit_English - YUMPU', 'link': 'https://www.yumpu.com/en/document/view/36448455/stories-from-panchatantra-sanskrit-english', 'date': 'Jan 29, 2015', 'position': 2}, 'score': No

In [19]:
english_quiz = english_quiz_generation.run(
    data={
        "websearch": {"query": f"""{indic_topic} full story text."""},
        "prompt_builder": {
            "text": indic_text
        }
    },
    debug=True,
)

TypeError: Pipeline.run() got an unexpected keyword argument 'debug'

In [15]:
parser = QuizParser()

In [16]:
english_quiz_text = parser.run(replies=english_quiz['generator']['replies'])
pprint(english_quiz_text)

{'quiz': {'questions': [{'options': ['a. The king was afraid of the monkey',
                                     'b. The king was very fond of the monkey',
                                     'c. The king hated the monkey',
                                     'd. The king was indifferent to the '
                                     'monkey'],
                         'question': "What was the king's relationship with "
                                     'the monkey?',
                         'right_option': 'b',
                         'source': 'I used my knowledge of English grammar to '
                                   'generate this question'},
                        {'options': ['a. Playing with the king',
                                     'b. Breezing the king with a fan',
                                     'c. Eating food',
                                     'd. Sleeping'],
                         'question': 'What was the monkey doing when the fly '
         