In [69]:
from dotenv import load_dotenv
import os 
import getpass

load_dotenv()
!export LANGSMITH_TRACING=true
!export LANGSMITH_API_KEY=os.getenv("LANGSMITH_API_KEY")

# os.environ["LANGSMITH_API_KEY"] = getpass.getpass("Enter your LangSmith API key: ")
# os.environ["LANGSMITH_TRACING"] = "true"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [65]:
# just for reference

C_PROGRAMMING_TOPICS = [
    "programming_fundamentals", 
    "algorithm_and_flowchart", 
    "introduction_c_programming", 
    "data_and_expressions", 
    "input_output", 
    "control_structures", 
    "arrays_strings_pointers", 
    "functions", 
    "structures", 
    "file_handling", 
    "oop_overview"
]

In [85]:
from typing import Literal, Optional, List, Tuple 
from pydantic import BaseModel, Field

class QuestionSearch(BaseModel):
    """Search over the json file about the question of particular year or some particular metadata... ."""

    id: Optional[str] = Field(
        None, # Default to None for optional fields
        description="ID of a particular question. Will hold a value like 'subject_code+question_number'."
    )

    subject: Literal["computer programming"] = Field(
        description="Subject the question belongs to."
    )

    # year_ad: Optional[int] = Field(
    #     None,
    #     description="Year in AD that the question appeared."
    # )

    # year_bs: Optional[int] = Field( # Made optional for flexibility
    #      None,
    #     description="Year in BS the question appeared."
    # )

    # Modified to accept list of years
    year_ad: Optional[List[int]] = Field(
        None,
        description="List of years in AD that the questions appeared."
    )

    year_bs: Optional[List[int]] = Field(
        None,
        description="List of years in BS that the questions appeared."
    )

    # question_text: Optional[str] = Field( # Added field for searching question text
    #     None,
    #     description="Keywords or full text to search within the question itself."
    # )

    type: Optional[Literal["theory", "programming"]] = Field( 
        None,
        description="Type of the question." 
    )

    format: Optional[Literal["short", "long"]] = Field(
        None,
        description="Format of the question (e.g., short answer, long answer)." 
    )

    marks: Optional[int] = Field(
        None,
        description="Marks allocated to the question being searched."
    )

    topic: Optional[Literal["programming_fundamentals", "algorithm_and_flowchart", "introduction_c_programming", "data_and_expressions", "input_output", "control_structures", "arrays_strings_pointers", "functions", "structures", "file_handling", "oop_overview"]] = Field(
        None,
        description="Topic that the question is from."
    )

    unit: Optional[int] = Field(
        None,
        description="Unit the question is from."
    )

    tags: Optional[List[str]] = Field(
        None,
        description="Tags associated with the question."
    )

    question_number: Optional[str] = Field(
        None,
        description="Question number (e.g., '1a', '2b', 4, 5)."
    )

    source: Optional[Literal["regular", "back"]] = Field(
        None,
        description="Source exam type (regular or back paper)." 
    )

    semester: Optional[Literal["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth"]] = Field(
         None,
         description="Semester the question is for." 
    )



In [86]:
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv

load_dotenv()

llm = ChatGroq(
            api_key=os.getenv('GROQ_API_KEY'),
            model_name="llama-3.3-70b-versatile"
        )

### Structured Question Output

In [87]:
from langchain_core.prompts import ChatPromptTemplate

# TODO adjust prompt for handling mutiple years list...
# XXX year filter is only after year 2075
system = """You are an expert at converting user questions about past exam papers into structured JSON queries.
You have access to a database (JSON file) containing information about past exam questions for subjects like Computer Programming, Mathematics, and Digital Logic from various years and semesters.
Given a user's question, your goal is to construct a JSON query object that conforms to the `QuestionSearch` schema to retrieve the most relevant question(s) from the database.

When users mention multiple years, collect them into a list. For example:
- "questions from 2075, 2076 BS" → year_bs: [2075, 2076]
- "questions between 2019 and 2021 AD" → year_ad: [2019, 2020, 2021]
- "questions before year 2076" → year_bs: [2075, 2076] (DO NOT provide year before 2075)

You must identify key information in the user's request, such as:
- Subject name (e.g., "computer programming")
- Year (Specify BS or AD, e.g., "2080 BS", "2023 AD")
- Question type ("theory" or "programming")
- Question format ("short" or "long")
- Marks
- Topic
- Unit number
- Specific tags
- Question number (e.g., "1a", "5b")
- Source ("regular" or "back" exam)
- Semester ("first", "second", etc.)
- Keywords within the question text itself.

Map this extracted information accurately to the corresponding fields in the `QuestionSearch` JSON schema.
- Pay close attention to the required fields and the allowed values for fields with `Literal` types (like `subject`, `type`, `format`, `source`, `semester`).
- Use `null` or omit optional fields if the information is not provided in the user's query.
- Do not invent information or assume details not explicitly stated by the user.
- If the user uses specific terms, acronyms, or numbers, preserve them accurately in the query values.
"""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)
structured_llm = llm.with_structured_output(QuestionSearch)
structured_chain = prompt | structured_llm

In [88]:
# structured_chain.invoke({"question": "history of C 2024?"})

## Data Loading

In [81]:
from langchain_community.document_loaders import JSONLoader

loader = JSONLoader(
    file_path="formatted_data/c_question.json",
    jq_schema=".[]",
    text_content=False,
)
docs = loader.load()

# Vector Store

In [82]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_store = InMemoryVectorStore(embedding=embeddings)

_ = vector_store.add_documents(docs)

In [83]:
vector_store.similarity_search("algorithms and flowchart.")

[Document(id='7cc67a76-1b46-475b-bb9d-f9606458dad2', metadata={'source': '/Users/nirajanpaudel17/Documents/Projects/CBOT/formatted_data/c_question.json', 'seq_num': 60}, page_content='{"id": "CT401_1b", "subject": "computer Programming", "year_bs": 2078, "year_ad": 2021, "question": "What is an algorithm? Explain how does algorithm and flowchart helps in computer Programming.", "type": "theory", "format": "short", "marks": 4, "topic": "algorithm_and_flowchart", "unit": 1, "tags": ["algorithm", "flowchart", "programming"], "question_number": "1b", "source": "regular", "semester": "first"}'),
 Document(id='77150066-4d25-485f-bb54-494c64b87264', metadata={'source': '/Users/nirajanpaudel17/Documents/Projects/CBOT/formatted_data/c_question.json', 'seq_num': 86}, page_content='{"id": "CT401_1b", "subject": "computer Programming", "year_bs": 2076, "year_ad": 2019, "question": "List the basic step of problem solving using computer. Write an algorithm and draw a flowchart to find the sum of N n

### try::

In [91]:
def create_dynamic_filter(query_result: QuestionSearch) -> dict:
    filter_dict = {}
    
    print(f"[INFO]\n ---------------\n{query_result.model_dump().items()}\n---------------\n")
    for field_name, value in query_result.model_dump().items():
        if value is not None:
            # Special handling for case-insensitive subject matching
            if field_name == 'subject':
                filter_dict[field_name] = value.lower()
            elif field_name in ['year_ad', 'year_bs'] and isinstance(value, list):
                # For year fields, create an "OR" condition
                if value:  # Only if the list is not empty
                    filter_dict[field_name] = {"$in": value}  # Using $in operator for list matching
            # Handle list type fields (like tags) if needed        
            elif isinstance(value, list):
                if value:  # Only add if list is not empty
                    filter_dict[field_name] = value
            else:
                filter_dict[field_name] = value
    
    return filter_dict

def get_filtered_questions(question: str, k: int = 5):
    """
    Get filtered questions based on the user's natural language query.
    
    Args:
        question: Natural language question from user
        k: Maximum number of results to retrieve
    
    Returns:
        List of relevant documents that match the filter criteria exactly
    """
    # Get structured query from the chain
    query_result = structured_chain.invoke({"question": question})
    
    # Create dynamic filter
    filter_dict = create_dynamic_filter(query_result)
    
    print(f"[INFO] -> FILTER DICTIONARY \n{filter_dict}\n ----------------\n")  # Debug info
    
    # Create retriever with the filter
    retriever = vector_store.as_retriever(
        search_kwargs={'k': k},  # Note: moved k into search_kwargs
        filter=filter_dict
    )
    
    # Get results
    results = retriever.invoke(question)
    
    filtered_results = []
    for doc in results:
        doc_content = json.loads(doc.page_content)
        print(f"[INFO] -------- doc------ \n{doc} \n ----------")
        # Modified matching logic to handle lists
        matches_all_criteria = True
        for key, value in filter_dict.items():
            if key in ['year_ad', 'year_bs']:
                if isinstance(value, dict) and '$in' in value:
                    # Check if document's year is in the list of years
                    if doc_content.get(key) not in value['$in']:
                        matches_all_criteria = False
                        break
            else:
                # Regular matching for other fields
                if str(doc_content.get(key, '')).lower() != str(value).lower():
                    matches_all_criteria = False
                    break
        
        if matches_all_criteria:
            filtered_results.append(doc)
    
    return filtered_results

import json
# Usage example:
def display_filtered_questions(question: str, k: int = 5):
    """
    Display filtered questions with their content.
    
    Args:
        question: Natural language question from user
        k: Maximum number of results to retrieve
    """
    results = get_filtered_questions(question, k)
    
    if not results:
        print("No documents found matching the specified criteria.")
        return
    
    print(f"\nFound {len(results)} matching documents:")
    for i, doc in enumerate(results, 1):
        # Parse the JSON string into a dictionary
        doc_content = json.loads(doc.page_content)
        # Now we can access the "question" field
        print(f"\n{i}. {doc_content['question']}")

In [100]:
# Example usage
results = display_filtered_questions("List questions asked from year 2079")

[INFO]
 ---------------
dict_items([('id', None), ('subject', 'computer programming'), ('year_ad', None), ('year_bs', [2079]), ('type', None), ('format', None), ('marks', None), ('topic', None), ('unit', None), ('tags', None), ('question_number', None), ('source', None), ('semester', None)])
---------------

[INFO] -> FILTER DICTIONARY 
{'subject': 'computer programming', 'year_bs': {'$in': [2079]}}
 ----------------

[INFO] -------- doc------ 
page_content='{"id": "CT401_1a", "subject": "computer Programming", "year_bs": 2075, "year_ad": 2019, "question": "What is computer Programming and computer software? Explain about types of programming languages and software.", "type": "theory", "format": "short", "marks": 4, "topic": "programming_fundamentals", "unit": 1, "tags": ["computer Programming", "computer software", "programming languages"], "question_number": "1a", "source": "regular", "semester": "first"}' metadata={'source': '/Users/nirajanpaudel17/Documents/Projects/CBOT/formatted_

In [None]:
# # let's define a tool for retrieving data using similarity search
# from langchain.tools import tool

# @tool
# def c_past_questions(
#     id: Optional[str] = None,
#     # XXX defaults to computer programming for now, change later...
#     subject: Literal["computer programming", "mathematics", "digital logic"] = "computer programming",
#     year_ad: Optional[int] = None,
#     year_bs: Optional[int] = None,
#     question_text: Optional[str] = None,
#     type: Optional[Literal["theory", "programming"]] = None,
#     format: Optional[Literal["short", "long"]] = None,
#     marks: Optional[int] = None,
#     topic: Optional[Literal["programming_fundamentals", "algorithm_and_flowchart", "introduction_c_programming", 
#                           "data_and_expressions", "input_output", "control_structures", "arrays_strings_pointers", 
#                           "functions", "structures", "file_handling", "oop_overview"]] = None,
#     unit: Optional[int] = None,
#     tags: Optional[List[str]] = None,
#     question_number: Optional[str] = None,
#     source: Optional[Literal["regular", "back"]] = None,
#     semester: Optional[Literal["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth"]] = None
# ) -> List[dict]:
#     """
#     Search for computer programming questions with the given criteria.
    
#     Args:
#         id: ID of a particular question (e.g., 'subject_code+question_number')
#         subject: Subject the question belongs to (defaults to "computer programming")
#         year_ad: Year in AD that the question appeared
#         year_bs: Year in BS that the question appeared
#         question_text: Keywords or full text to search within the question itself
#         type: Type of the question (theory or programming)
#         format: Format of the question (short or long)
#         marks: Marks allocated to the question
#         topic: Topic that the question is from
#         unit: Unit the question is from
#         tags: List of tags associated with the question
#         question_number: Question number (e.g., '1a', '2b', 4, 5)
#         source: Source exam type (regular or back paper)
#         semester: Semester the question is for
    
#     Returns:
#         List[dict]: List of matching questions with their metadata
#     """
#     pass