# Creating a new tree in the main branch when a question goes out of context - FINAL WORKING CODE

In [None]:
import json
import os
import uuid
import re

# Common English function words that carry no topical meaning; tokenize()
# drops them so they cannot inflate the overlap between two texts.
STOPWORDS = {
    "a", "an", "and", "are", "as", "at",
    "be", "been", "but",
    "for", "from",
    "had", "has", "have",
    "if", "in", "is", "it",
    "of", "on", "or",
    "that", "the", "this", "to",
    "was", "were", "what", "with",
    "your",
}

def find_node_by_id(nodes, node_id):
    """
    Searches a forest of nodes depth-first for the node with the given id.

    Nodes are visited in pre-order (a node before its children, siblings left
    to right), so the first match in that order is returned. Entries that are
    not dictionaries, dictionaries without an 'id' key, and 'children' values
    that are not lists/tuples are skipped instead of raising, because the tree
    is normally loaded from a JSON file whose content is not validated.

    Parameters:
        nodes (list): A list of node dictionaries (None is treated as empty).
        node_id (str): The unique identifier to search for.

    Returns:
        dict or None: The node dictionary if found; otherwise, None.
    """
    # Explicit stack instead of recursion; items are pushed reversed so that
    # pop() yields them in their original left-to-right order.
    pending = list(nodes or ())
    pending.reverse()

    while pending:
        node = pending.pop()
        if not isinstance(node, dict):
            continue  # Malformed entry: nothing to match or descend into.

        # Require the key to exist so a missing id never matches node_id=None.
        if 'id' in node and node['id'] == node_id:
            return node

        children = node.get('children')
        if isinstance(children, (list, tuple)):
            pending.extend(reversed(children))

    return None

def tokenize(text):
    """
    Reduces a text to the set of distinct, meaningful words it contains.

    The text is lower-cased, split into runs of word characters (which drops
    punctuation and whitespace), and every word listed in STOPWORDS is
    discarded.

    Parameters:
        text (str): The input text.

    Returns:
        set: The distinct lower-case words of the text, minus stopwords.
    """
    words = re.findall(r'\w+', text.lower())
    return set(words).difference(STOPWORDS)

def is_in_context(new_question, parent_node, threshold=0.2):
    """
    Decides whether a new question stays on the topic of a parent node.

    The parent's question and answer are combined into one text. Both that
    text and the new question are tokenized (stopwords removed), and the share
    of the new question's words that also occur in the parent text is compared
    against the threshold.

    Parameters:
        new_question (str): The new question text.
        parent_node (dict): The parent node; its 'question' and 'answer'
                            entries are read, each defaulting to ''.
        threshold (float): Minimum fraction of the new question's words that
                           must also appear in the parent text.

    Returns:
        bool: True if the overlap fraction reaches the threshold; False
              otherwise, including when the new question has no words left
              after stopword removal.
    """
    combined_parent_text = " ".join(
        (parent_node.get('question', ''), parent_node.get('answer', ''))
    )
    question_words = tokenize(new_question)
    parent_words = tokenize(combined_parent_text)

    # An empty word set would make the fraction below undefined.
    if len(question_words) == 0:
        return False

    shared_words = question_words & parent_words
    return len(shared_words) / len(question_words) >= threshold

def _load_tree(file_path):
    """Returns the list of root nodes stored in file_path, or [] if the file is absent or unusable."""
    try:
        # Explicit UTF-8 so reading does not depend on the platform's locale.
        with open(file_path, 'r', encoding='utf-8') as f:
            tree = json.load(f)
    except FileNotFoundError:
        # No file yet: start a fresh forest (no separate exists() check, so no race).
        return []
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Not valid JSON, or not even valid UTF-8 text: treat as invalid.
        return []
    return tree if isinstance(tree, list) else []


def update_json_tree(file_path, question, answer, parent_id=None):
    """
    Dynamically updates a JSON file by adding a new node to a tree of question-answer pairs.

    The file holds a list of root nodes (a forest). The new node is attached
    under the requested parent only if that parent exists and the question is
    in context with it (see is_in_context); in every other case the node is
    appended as a new root, i.e. it starts a new tree. A message describing
    the decision is printed whenever a parent_id was given.

    Each node in the tree contains:
      - id: A unique identifier for the node.
      - question: The question text.
      - answer: The corresponding answer.
      - children: A list of child nodes (subtree).

    If the file is missing, is not valid UTF-8 encoded JSON, or does not
    contain a list at the top level, its content is discarded and the tree
    starts empty. The file is rewritten in full on every call.

    Parameters:
        file_path (str): Path to the JSON file.
        question (str): The question text to be added.
        answer (str): The corresponding answer.
        parent_id (str, optional): The unique identifier of the parent node. If None,
                                   the new node is added as a root-level entry. If provided but the new question
                                   is out of context with the parent's content, a new tree is started.

    Returns:
        str: The unique identifier of the newly added node.
    """
    tree = _load_tree(file_path)

    # Create a new node with a unique identifier and an empty children list.
    new_id = str(uuid.uuid4())
    new_node = {
        "id": new_id,
        "question": question,
        "answer": answer,
        "children": []
    }

    if parent_id is None:
        tree.append(new_node)
    else:
        parent_node = find_node_by_id(tree, parent_id)
        if parent_node is None:
            print(f"Warning: Parent with id {parent_id} not found. Adding as a new tree.")
            tree.append(new_node)
        elif is_in_context(question, parent_node):
            children = parent_node.get("children")
            if not isinstance(children, list):
                # Missing or null 'children' in the stored data: start a fresh list.
                children = parent_node["children"] = []
            children.append(new_node)
            print(f"Adding node under parent id {parent_id}.")
        else:
            print(f"Question out of context with parent id {parent_id}. Creating a new tree.")
            tree.append(new_node)

    # Write the updated tree back to the JSON file with pretty printing.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(tree, f, indent=4)

    return new_id

# Demo run: walks through attaching children and starting a new tree.
if __name__ == "__main__":
    file_path = "qa_tree.json"

    # Case 1: no parent given, so the question becomes a root-level node.
    root_id = update_json_tree(
        file_path,
        question="What is your favorite color?",
        answer="Blue",
    )
    print(f"Added root node with id: {root_id}")

    # Case 2: follow-up that shares the word "color" with the root question.
    child1_id = update_json_tree(
        file_path,
        question="Why do you like that color?",
        answer="It is calming and reminiscent of the sky.",
        parent_id=root_id,
    )
    print(f"Added child node with id: {child1_id} under parent id: {root_id}")

    # Case 3: shares no non-stopword with the root ("favourite" != "favorite"),
    # so the context check rejects the parent and a new tree is started.
    new_tree_id = update_json_tree(
        file_path,
        question="What is your favourite sport?",
        answer="I enjoy playing soccer.",
        parent_id=root_id,
    )
    print(f"Added node with id: {new_tree_id} as a new tree due to context change from parent id: {root_id}")

    # Case 4: in-context follow-up attached one level deeper, under the first child.
    grandchild_id = update_json_tree(
        file_path,
        question="Could you elaborate on how the color affects your mood?",
        answer="It creates a soothing ambiance.",
        parent_id=child1_id,
    )
    print(f"Added grandchild node with id: {grandchild_id} under parent id: {child1_id}")


Added root node with id: 74b4e45d-ba62-41f7-a6f3-d93bcd922754
Adding node under parent id 74b4e45d-ba62-41f7-a6f3-d93bcd922754.
Added child node with id: 686cf5b3-77da-44bb-8cd8-7b7ff10d0b51 under parent id: 74b4e45d-ba62-41f7-a6f3-d93bcd922754
Question out of context with parent id 74b4e45d-ba62-41f7-a6f3-d93bcd922754. Creating a new tree.
Added node with id: 942e2c31-25ed-4e82-be61-5ac596c9c87e as a new tree due to context change from parent id: 74b4e45d-ba62-41f7-a6f3-d93bcd922754
Adding node under parent id 686cf5b3-77da-44bb-8cd8-7b7ff10d0b51.
Added grandchild node with id: 59eb16b6-3d04-47bf-a7a5-f01d7b5cc686 under parent id: 686cf5b3-77da-44bb-8cd8-7b7ff10d0b51


In [1]:
import json
import os
import uuid
import re

# Common English function words that carry no topical meaning; tokenize()
# drops them so they cannot inflate the overlap between two texts.
STOPWORDS = {
    "a", "an", "and", "are", "as", "at",
    "be", "been", "but",
    "for", "from",
    "had", "has", "have",
    "if", "in", "is", "it",
    "of", "on", "or",
    "that", "the", "this", "to",
    "was", "were", "what", "with",
    "your",
}

def find_node_by_id(nodes, node_id):
    """
    Searches a forest of nodes depth-first for the node with the given id.

    Nodes are visited in pre-order (a node before its children, siblings left
    to right) and the first match is returned, or None if there is none.
    Entries that are not dictionaries, dictionaries without an 'id' key, and
    'children' values that are not lists/tuples are skipped instead of
    raising, because the tree is normally loaded from an unvalidated JSON file.
    """
    # Explicit stack instead of recursion; items are pushed reversed so that
    # pop() yields them in their original left-to-right order.
    pending = list(nodes or ())
    pending.reverse()

    while pending:
        node = pending.pop()
        if not isinstance(node, dict):
            continue  # Malformed entry: nothing to match or descend into.

        # Require the key to exist so a missing id never matches node_id=None.
        if 'id' in node and node['id'] == node_id:
            return node

        children = node.get('children')
        if isinstance(children, (list, tuple)):
            pending.extend(reversed(children))

    return None

def tokenize(text):
    """
    Reduces a text to the set of its distinct lower-case words, with
    punctuation dropped and every word listed in STOPWORDS removed.
    """
    words = re.findall(r'\w+', text.lower())
    return set(words).difference(STOPWORDS)

def is_in_context(new_question, parent_node, threshold=0.2):
    """
    Decides whether a new question stays on the topic of a parent node.

    Returns True when the fraction of the new question's non-stopword words
    that also occur in the parent's question or answer is at least threshold.
    A question with no words left after stopword removal is never in context.
    """
    combined_parent_text = " ".join(
        (parent_node.get('question', ''), parent_node.get('answer', ''))
    )
    question_words = tokenize(new_question)
    parent_words = tokenize(combined_parent_text)

    # An empty word set would make the fraction below undefined.
    if len(question_words) == 0:
        return False

    shared_words = question_words & parent_words
    return len(shared_words) / len(question_words) >= threshold

def _load_tree(file_path):
    """Returns the list of root nodes stored in file_path, or [] if the file is absent or unusable."""
    try:
        # Explicit UTF-8 so reading does not depend on the platform's locale.
        with open(file_path, 'r', encoding='utf-8') as f:
            tree = json.load(f)
    except FileNotFoundError:
        # No file yet: start a fresh forest (no separate exists() check, so no race).
        return []
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Not valid JSON, or not even valid UTF-8 text: treat as invalid.
        return []
    return tree if isinstance(tree, list) else []


def update_qa_json(file_path, question, answer, parent_id=None):
    """
    Updates the qa_json file with a new question-answer pair.

    The file holds a list of root nodes, each a dict with 'id', 'question',
    'answer' and 'children'. The new node is attached under the node
    identified by parent_id only if that node exists and the question is in
    context with it (see is_in_context); otherwise, or when parent_id is None,
    it is appended as a new root. A missing file, a file that is not valid
    UTF-8 encoded JSON, or one whose top level is not a list is treated as an
    empty tree. The file is rewritten in full on every call.

    Returns the unique identifier (a UUID4 string) of the newly added node.
    """
    tree = _load_tree(file_path)

    new_id = str(uuid.uuid4())
    new_node = {
        "id": new_id,
        "question": question,
        "answer": answer,
        "children": []
    }

    parent_node = None
    if parent_id is not None:
        parent_node = find_node_by_id(tree, parent_id)

    if parent_node and is_in_context(question, parent_node):
        children = parent_node.get("children")
        if not isinstance(children, list):
            # Missing or null 'children' in the stored data: start a fresh list.
            children = parent_node["children"] = []
        children.append(new_node)
    else:
        # No parent requested, parent not found, or question out of context.
        tree.append(new_node)

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(tree, f, indent=4)

    return new_id

# Sample session: builds a small forest of question-answer pairs in qa2_tree.json.
if __name__ == "__main__":
    file_path = "qa2_tree.json"

    # First question has no parent, so it becomes a root.
    root_id = update_qa_json(
        file_path,
        question="What is the capital of France?",
        answer="Paris",
    )

    # Follow-up requested under the root.
    child_id = update_qa_json(
        file_path,
        question="What is the population of Paris?",
        answer="Approximately 2.1 million",
        parent_id=root_id,
    )

    # Follow-up requested under the population question.
    update_qa_json(
        file_path,
        question="What is the weather like in Paris?",
        answer="Variable, check a weather app",
        parent_id=child_id,
    )

    # No parent given: each of these starts a new root.
    update_qa_json(file_path, question="What is the capital of Germany?", answer="Berlin")
    update_qa_json(file_path, question="What is the capital of Spain?", answer="Madrid")

    # Another follow-up requested under the population question.
    update_qa_json(
        file_path,
        question="what is the best food in paris?",
        answer="French food is generally considered excellent.",
        parent_id=child_id,
    )