# Creating a new tree in main branch when question goes out of context - FINAL WORKING CODE

In [None]:
import json
import os
import uuid
import re

# Common English function words. tokenize() discards these so that they do not
# count towards the overlap between a new question and an existing node.
STOPWORDS = set(
    "what is your a an the and or but if "
    "of for to in with on at from as it "
    "are this that was were be been has have had".split()
)

def find_node_by_id(nodes, node_id):
    """
    Locate the first node (depth-first, parents before children) whose 'id'
    equals node_id.

    Parameters:
        nodes (list): Root-level node dictionaries; each may carry a
            'children' list of further node dictionaries.
        node_id (str): The identifier to look for.

    Returns:
        dict or None: The matching node dictionary, or None when no node in
        the forest has that id.
    """
    # One iterator per nesting level; the iterator on top of the stack is the
    # sibling list currently being walked.
    pending = [iter(nodes)]
    while pending:
        for candidate in pending[-1]:
            if candidate['id'] == node_id:
                return candidate
            # Descend into this node's children before its later siblings.
            pending.append(iter(candidate.get('children', [])))
            break
        else:
            # This level is exhausted; resume with the parent's siblings.
            pending.pop()
    return None

def tokenize(text):
    """
    Reduce text to its set of distinct content words.

    The text is lower-cased, split into runs of word characters (so
    punctuation is dropped), and every word listed in STOPWORDS is discarded.

    Parameters:
        text (str): The input text.

    Returns:
        set: The remaining lower-case words, without duplicates.
    """
    kept = set()
    for word in re.findall(r'\w+', text.lower()):
        if word not in STOPWORDS:
            kept.add(word)
    return kept

def is_in_context(new_question, parent_node, threshold=0.2):
    """
    Decide whether a new question is lexically related to a candidate parent.

    The heuristic tokenizes the new question and the parent's combined
    question and answer text (stopwords removed), then measures which fraction
    of the question's tokens also occur in the parent.

    Parameters:
        new_question (str): The new question text.
        parent_node (dict): Node whose 'question' and 'answer' values supply
            the reference text; a missing key counts as an empty string.
        threshold (float): Minimum fraction of the question's tokens that
            must also appear in the parent text.

    Returns:
        bool: True when the shared fraction is at least threshold; False
        otherwise, including when the question has no tokens left after
        stopword removal.
    """
    parent_text = " ".join(
        (parent_node.get('question', ''), parent_node.get('answer', ''))
    )
    question_words = tokenize(new_question)
    parent_words = tokenize(parent_text)

    # Nothing to compare (and nothing to divide by) without question tokens.
    if len(question_words) == 0:
        return False

    shared_words = question_words & parent_words
    return len(shared_words) / len(question_words) >= threshold

def update_json_tree(file_path, question, answer, parent_id=None):
    """
    Append a question-answer node to the JSON forest stored at file_path.

    The file holds a list of root nodes. Each node is a dictionary with:
      - id: A unique identifier for the node (a UUID4 string).
      - question: The question text.
      - answer: The corresponding answer.
      - children: A list of child nodes (subtree).

    Placement rules:
      - parent_id is None: the node becomes a new root.
      - parent_id matches a node and the question is in context with it
        (see is_in_context): the node is appended to that node's children.
      - parent_id matches a node but the question is out of context, or
        parent_id matches nothing: the node becomes a new root and a message
        is printed.

    Parameters:
        file_path (str): Path to the JSON file. A missing file, a file that
            is not valid JSON, or a file whose top-level value is not a list
            is treated as an empty forest and overwritten.
        question (str): The question text to be added.
        answer (str): The corresponding answer.
        parent_id (str, optional): Identifier of the intended parent node.

    Returns:
        str: The unique identifier of the newly added node.
    """
    # JSON text is UTF-8, so read it as such instead of relying on the
    # platform's locale encoding. Opening directly (rather than checking
    # os.path.exists first) avoids a race between the check and the open.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        loaded = None
    tree = loaded if isinstance(loaded, list) else []

    # Create a new node with a unique identifier and an empty children list.
    new_id = str(uuid.uuid4())
    new_node = {
        "id": new_id,
        "question": question,
        "answer": answer,
        "children": []
    }

    if parent_id is None:
        tree.append(new_node)
    else:
        parent_node = find_node_by_id(tree, parent_id)
        if parent_node is None:
            print(f"Warning: Parent with id {parent_id} not found. Adding as a new tree.")
            tree.append(new_node)
        elif is_in_context(question, parent_node):
            # setdefault tolerates parent nodes stored without a 'children' key.
            parent_node.setdefault("children", []).append(new_node)
            print(f"Adding node under parent id {parent_id}.")
        else:
            print(f"Question out of context with parent id {parent_id}. Creating a new tree.")
            tree.append(new_node)

    # Write the updated tree back to the JSON file with pretty printing.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(tree, f, indent=4)

    return new_id

# Test Cases to Demonstrate Subtree and New Tree Functionality
# Every call below re-reads and rewrites qa_tree.json, so running this script
# again appends to whatever an earlier run left in the file.
if __name__ == "__main__":
    file_path = "qa_tree.json"

    # Test Case 1: Add a root-level question.
    root_id = update_json_tree(file_path,
                               "What is your favorite color?",
                               "Blue")
    print(f"Added root node with id: {root_id}")

    # Test Case 2: Add a child question under the root question (in context).
    # Only "color" is shared with the parent: 1 of the question's 5 tokens,
    # i.e. a ratio of 0.2, which just meets the default threshold.
    child1_id = update_json_tree(file_path,
                                 "Why do you like that color?",
                                 "It is calming and reminiscent of the sky.",
                                 parent_id=root_id)
    print(f"Added child node with id: {child1_id} under parent id: {root_id}")

    # Test Case 3: Add a new question that is out of context with the current tree.
    # NOTE(review): the check fails because neither "favourite" nor "sport"
    # occurs in the parent; "favourite" only misses because the parent spells
    # it "favorite". With matching spelling the ratio would be 1/2 and this
    # node would be attached as a child.
    new_tree_id = update_json_tree(file_path,
                                   "What is your favourite sport?",
                                   "I enjoy playing soccer.",
                                   parent_id=root_id)  # Context check should fail, creating a new tree.
    print(f"Added node with id: {new_tree_id} as a new tree due to context change from parent id: {root_id}")

    # Test Case 4: Add another child in context under the first child question.
    # Shares "you" and "color" with that child's question/answer text
    # (2 of 7 tokens).
    grandchild_id = update_json_tree(file_path,
                                     "Could you elaborate on how the color affects your mood?",
                                     "It creates a soothing ambiance.",
                                     parent_id=child1_id)
    print(f"Added grandchild node with id: {grandchild_id} under parent id: {child1_id}")


Added root node with id: 74b4e45d-ba62-41f7-a6f3-d93bcd922754
Adding node under parent id 74b4e45d-ba62-41f7-a6f3-d93bcd922754.
Added child node with id: 686cf5b3-77da-44bb-8cd8-7b7ff10d0b51 under parent id: 74b4e45d-ba62-41f7-a6f3-d93bcd922754
Question out of context with parent id 74b4e45d-ba62-41f7-a6f3-d93bcd922754. Creating a new tree.
Added node with id: 942e2c31-25ed-4e82-be61-5ac596c9c87e as a new tree due to context change from parent id: 74b4e45d-ba62-41f7-a6f3-d93bcd922754
Adding node under parent id 686cf5b3-77da-44bb-8cd8-7b7ff10d0b51.
Added grandchild node with id: 59eb16b6-3d04-47bf-a7a5-f01d7b5cc686 under parent id: 686cf5b3-77da-44bb-8cd8-7b7ff10d0b51


```
Q: what is SD?
A: Software Development involves creating different types: web based, android, desktop, etc.
  Q: what is a web based software?
  A: Web-based software runs in a browser and can be accessed from any device with internet connection.
    Q: how can i build one?
    A: You'll need to learn HTML, CSS, JavaScript, and a backend language like Python, Node.js, or PHP.
  Q: what is an desktop based software?
  A: Desktop software runs locally on your computer and doesn't require internet to function.
    Q: what language I should learn?
    A: For desktop applications, Java or C# are good choices.
      Q: what is C#?
      A: C# (pronounced C-sharp) is a programming language developed by Microsoft primarily for Windows applications.
```

# New code - use this

In [2]:
import json
import os
import uuid
import re

# Stopword set consulted by tokenize(): frequent English function words that
# are removed before token overlap is measured.
STOPWORDS = {
    word
    for group in (
        "what is your a an the and or but if",
        "of for to in with on at from as it",
        "are this that was were be been has have had",
    )
    for word in group.split()
}

def find_node_by_id(nodes, node_id):
    """
    Return the first node, in depth-first parent-before-child order, whose
    'id' equals node_id, or None when the forest contains no such node.
    """
    # Explicit stack; entries are pushed in reverse so that popping yields
    # nodes in their original left-to-right order.
    stack = list(nodes)[::-1]
    while stack:
        node = stack.pop()
        if node['id'] == node_id:
            return node
        stack.extend(list(node.get('children', []))[::-1])
    return None

def tokenize(text):
    """
    Return the set of distinct lower-case words in text, ignoring punctuation
    and leaving out every word listed in STOPWORDS.
    """
    words = re.findall(r'\w+', text.lower())
    return {word for word in words if word not in STOPWORDS}

def is_in_context(new_question, parent_node, threshold=0.2):
    """
    Report whether at least `threshold` of the new question's non-stopword
    tokens also occur in the parent's combined question and answer text.
    A question with no tokens left after stopword removal is never in context.
    """
    parent_text = parent_node.get('question', '') + " " + parent_node.get('answer', '')
    question_tokens = tokenize(new_question)
    reference_tokens = tokenize(parent_text)

    if question_tokens:
        shared = [token for token in question_tokens if token in reference_tokens]
        return len(shared) / len(question_tokens) >= threshold

    # No tokens to compare; also avoids dividing by zero.
    return False

def get_last_leaf_node(tree_list):
    """
    Follow the last root in tree_list down through each node's last child
    until a node without children is reached, and return that node.
    Returns None when tree_list is empty.
    """
    if not tree_list:
        return None

    node = tree_list[-1]
    children = node.get('children')
    while children:
        node = children[-1]
        children = node.get('children')
    return node

def update_json_tree(file_path, question, answer, parent_id=None):
    """
    Add a question-answer node to the JSON forest at file_path, choosing its
    position automatically when no parent is given.

    Placement rules:
      - parent_id is None: the last leaf of the most recent tree (see
        get_last_leaf_node) is tried as the parent. If the question is in
        context with it (see is_in_context), that leaf's id is used as
        parent_id; otherwise the node becomes a new root.
      - parent_id is set (passed in or auto-detected): the node is attached
        under that parent when the parent exists and the question is in
        context with it; otherwise the node becomes a new root.

    A progress message is printed for each decision. Note that an
    auto-detected parent is looked up and context-checked a second time, so
    two messages are printed in that case.

    Parameters:
        file_path (str): Path to the JSON file. A missing file, invalid JSON,
            or a top-level value that is not a list is treated as an empty
            forest and overwritten.
        question (str): The question text to be added.
        answer (str): The corresponding answer.
        parent_id (str, optional): Identifier of the intended parent node.

    Returns:
        str: The unique identifier (UUID4 string) of the newly added node.
    """
    # JSON text is UTF-8, so read it as such instead of relying on the
    # platform's locale encoding. Opening directly (rather than checking
    # os.path.exists first) avoids a race between the check and the open.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        loaded = None
    tree = loaded if isinstance(loaded, list) else []

    # Create a new node with a unique identifier and an empty children list.
    new_id = str(uuid.uuid4())
    new_node = {
        "id": new_id,
        "question": question,
        "answer": answer,
        "children": []
    }

    # If no parent_id is provided, auto-detect based on the last leaf node.
    if parent_id is None:
        default_parent = get_last_leaf_node(tree)
        if default_parent is not None and is_in_context(question, default_parent):
            parent_id = default_parent['id']
            print(f"Auto-detected context: Adding node under parent id {parent_id}.")
        else:
            print("Auto-detected context: Creating a new tree (root-level node).")

    if parent_id is None:
        # No valid parent detected; add as a new root-level node.
        tree.append(new_node)
    else:
        parent_node = find_node_by_id(tree, parent_id)
        if parent_node is None:
            print(f"Warning: Parent with id {parent_id} not found; adding as a new tree.")
            tree.append(new_node)
        elif is_in_context(question, parent_node):
            # Explicitly supplied parents are context-checked as well.
            parent_node.setdefault("children", []).append(new_node)
            print(f"Adding node under parent id {parent_id}.")
        else:
            print(f"Context mismatch with parent id {parent_id}; creating a new tree.")
            tree.append(new_node)

    # Write the updated tree back to the JSON file with pretty printing.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(tree, f, indent=4)

    return new_id

# Test Cases to Demonstrate Automatic Context Detection


In [3]:
file_path = "qa_tree_7.json"

# Test Case 1: Add a root-level question.
# No parent_id is passed, so placement is auto-detected. The recorded run
# created a new root, which is the outcome when the file holds no nodes yet.
id1 = update_json_tree(file_path,
                        "what is SD?",
                        "Software Development involves creating different types: web based, android, desktop, etc.")
print(f"Added node with id: {id1}")


Auto-detected context: Creating a new tree (root-level node).
Added node with id: ced86b49-80dd-4369-aa2d-c73ee4b4f7ad


In [4]:
# Test Case 2: Add a question that is in context with the previous one.
# All three content tokens ("web", "based", "software") occur in the previous
# node's answer, so the node is attached under it.
id2 = update_json_tree(file_path,
                        "what is a web based software?",
                        "Web-based software runs in a browser and can be accessed from any device with internet connection.")
print(f"Added node with id: {id2}")

Auto-detected context: Adding node under parent id ced86b49-80dd-4369-aa2d-c73ee4b4f7ad.
Adding node under parent id ced86b49-80dd-4369-aa2d-c73ee4b4f7ad.
Added node with id: 3fe97d9f-f725-4676-affb-4a270991829a


In [5]:
# Test Case 3: Add a follow-up question with very little lexical overlap.
# NOTE: despite the wording of the print below, the recorded run attached this
# node under Test Case 2's node rather than starting a new tree. The only
# shared token is "can" (1 of 5 question tokens = 0.2), which just meets the
# default threshold.
id3 = update_json_tree(file_path,
                        "how can i build one?",
                        "You'll need to learn HTML, CSS, JavaScript, and a backend language like Python, Node.js, or PHP.")
print(f"Added node with id: {id3} as a new tree due to context change.")


Auto-detected context: Adding node under parent id 3fe97d9f-f725-4676-affb-4a270991829a.
Adding node under parent id 3fe97d9f-f725-4676-affb-4a270991829a.
Added node with id: 9736f773-3e38-43a1-8fbe-972ab076155f as a new tree due to context change.


In [6]:
# Test Case 4: Add a sibling-topic question (desktop software).
# Auto-detection only compares against the last leaf, which is now the
# Test Case 3 node; it shares no tokens with this question, so the recorded
# run created a new root instead of placing it under the "SD" node.
id4 = update_json_tree(file_path,
                        "what is an desktop based software?",
                        "Desktop software runs locally on your computer and doesn't require internet to function.")
print(f"Added node with id: {id4}")

Auto-detected context: Creating a new tree (root-level node).
Added node with id: b63e5544-2c36-4c99-a67c-116d2d15e7ad


In [7]:
# Test Case 5: Add a follow-up to the desktop question.
# The last leaf is the Test Case 4 root; none of this question's tokens
# ("language", "i", "should", "learn") occur in it, so the recorded run
# created another new root.
update_json_tree(file_path,
                        "what language I should learn?",
                        "For desktop applications, Java or C# are good choices.")

Auto-detected context: Creating a new tree (root-level node).


'9c6fe5eb-8329-42b2-9334-e3ba1cb02f12'

In [8]:
# Test Case 6: Ask about a term from the previous answer.
# '#' is not a word character, so after stopword removal the question reduces
# to the single token "c", which also appears in the previous node's answer
# ("C#"); the recorded run attached the node under that node.
update_json_tree(file_path,
                        "what is C#?",
                        "C# (pronounced C-sharp) is a programming language developed by Microsoft primarily for Windows applications.")

Auto-detected context: Adding node under parent id 9c6fe5eb-8329-42b2-9334-e3ba1cb02f12.
Adding node under parent id 9c6fe5eb-8329-42b2-9334-e3ba1cb02f12.


'6fab660a-841a-488c-ab6e-8599a3b91216'

In [None]:
import json
import os
import uuid
import re

# -----------------------------------------
# Configuration
# -----------------------------------------
# Default minimum overlap ratio for update_json_tree; raise it for stricter
# matching, lower it for looser matching.
THRESHOLD = 0.2

# Common English stopwords removed by tokenize() before overlap is measured
# (listed alphabetically).
STOPWORDS = {
    "a", "an", "and", "are", "as", "at",
    "be", "been", "but", "for", "from", "had",
    "has", "have", "if", "in", "is", "it",
    "of", "on", "or", "that", "the", "this",
    "to", "was", "were", "what", "with", "your",
}

def tokenize(text):
    """Return the distinct lower-case words of text that are not in STOPWORDS."""
    lowered = text.lower()
    return {
        match.group(0)
        for match in re.finditer(r'\w+', lowered)
        if match.group(0) not in STOPWORDS
    }

def context_overlap_ratio(question, node):
    """
    Return the fraction of the question's non-stopword tokens that also occur
    in the node's combined 'question' and 'answer' text.

    Missing node keys count as empty strings. The result is 0.0 when the
    question has no tokens left after stopword removal.
    """
    node_text = " ".join((node.get('question', ''), node.get('answer', '')))
    asked = tokenize(question)
    known = tokenize(node_text)
    return len(asked & known) / len(asked) if asked else 0.0

def find_deepest_context_node(node, question, threshold):
    """
    Search the subtree rooted at node for the best place to attach question.

    A subtree is explored only while its own root is in context (overlap
    ratio >= threshold); children of an out-of-context node are never
    examined. Among the in-context candidates a descendant replaces its
    ancestor only when its ratio is strictly higher, so ties stay with the
    shallower node and, between siblings, with the earlier one.

    Returns:
        tuple: (best_node, best_ratio), or (None, 0.0) when node itself is
        below the threshold.
    """
    own_ratio = context_overlap_ratio(question, node)
    if own_ratio < threshold:
        # This node is out of context, so its whole subtree is skipped.
        return None, 0.0

    winner, winner_ratio = node, own_ratio
    for child in node.get('children', []):
        match, match_ratio = find_deepest_context_node(child, question, threshold)
        improves = (
            match is not None
            and threshold <= match_ratio
            and winner_ratio < match_ratio
        )
        if improves:
            winner, winner_ratio = match, match_ratio

    return winner, winner_ratio

def find_best_placement_in_forest(forest, question, threshold):
    """
    Run find_deepest_context_node on every root in forest and return the
    (node, ratio) pair with the highest ratio.

    The first root to reach the highest ratio wins ties. (None, 0.0) is
    returned when the forest is empty or no root produces a ratio above 0.0.
    """
    winner = (None, 0.0)
    for tree_root in forest:
        candidate = find_deepest_context_node(tree_root, question, threshold)
        if candidate[1] > winner[1]:
            winner = candidate
    return winner

def update_json_tree(file_path, question, answer, threshold=THRESHOLD):
    """
    Add a question-answer node to the JSON forest at file_path, attaching it
    under the best-matching existing node or starting a new tree.

    The whole forest is searched with find_best_placement_in_forest. When
    that yields a node whose overlap ratio is at least threshold, the new
    node is appended to that node's children; otherwise it becomes a new
    root. A message describing the decision is printed.

    Parameters:
        file_path (str): Path to the JSON file. A missing file, invalid JSON,
            or a top-level value that is not a list is treated as an empty
            forest and overwritten.
        question (str): The question text to be added.
        answer (str): The corresponding answer.
        threshold (float): Minimum overlap ratio for attaching under an
            existing node; defaults to THRESHOLD.

    Returns:
        str: The unique identifier (UUID4 string) of the newly added node.
    """
    # JSON text is UTF-8, so read it as such instead of relying on the
    # platform's locale encoding. Opening directly (rather than checking
    # os.path.exists first) avoids a race between the check and the open.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        loaded = None
    tree = loaded if isinstance(loaded, list) else []

    # Create a new node with a unique identifier and an empty children list.
    new_id = str(uuid.uuid4())
    new_node = {
        "id": new_id,
        "question": question,
        "answer": answer,
        "children": []
    }

    # Find the best placement among all roots
    best_node, best_ratio = find_best_placement_in_forest(tree, question, threshold)

    if best_node is not None and best_ratio >= threshold:
        # Attach under the best matching node
        best_node.setdefault('children', []).append(new_node)
        print(f"Placed '{question}' under parent '{best_node['question']}' (overlap ratio={best_ratio:.2f}).")
    else:
        # No suitable parent found => new root node
        tree.append(new_node)
        if best_node is None:
            print(f"No suitable parent found for '{question}'; created new root.")
        else:
            # Defensive branch: kept in case the search ever returns a node
            # whose ratio is below the threshold.
            print(f"Overlap ratio below threshold ({best_ratio:.2f}); created new root for '{question}'.")

    # Write the updated tree back to the JSON file with pretty printing.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(tree, f, indent=4)

    return new_id



In [12]:
# -------------------------------------------------------------------
# Example Usage / Test Cases
# -------------------------------------------------------------------
file_path = "qa7_tree.json"

# Optional: clear or re-initialize the file for a fresh test
# with open(file_path, 'w') as f:
#     json.dump([], f)

# Q1: "What is software development?"
# In the recorded run no parent was found, so this became a root node.
update_json_tree(
    file_path,
    "What is software development?",
    "Software development is the process of conceiving, specifying, designing, programming, etc."
)


No suitable parent found for 'What is software development?'; created new root.


'95a67f68-3e78-4cd4-af81-3d1295247034'

In [13]:
# Q2: shares only "software" with Q1 (1 of 3 question tokens, ratio 0.33),
# which is above the default threshold, so it is placed under Q1.
update_json_tree(
    file_path,
    "What are the different kinds of software?",
    "There are many types, including system software, application software, and more."
)

Placed 'What are the different kinds of software?' under parent 'What is software development?' (overlap ratio=0.33).


'b5d4a652-7355-44f1-b88a-2920f44f08e6'

In [14]:
# Q3: overlaps Q1 and Q2 equally (only "software", ratio 0.33). A child must
# score strictly higher than its parent to win, so the node stays under Q1.
update_json_tree(
    file_path,
    "What is web based software?",
    "Web-based software is hosted on a remote server and accessed through a browser."
)

Placed 'What is web based software?' under parent 'What is software development?' (overlap ratio=0.33).


'a501020c-3dcc-45b5-b7b8-8c44269072dc'

In [15]:
# Q4: "software" is 1 of 2 question tokens (ratio 0.50); the recorded run
# placed it under Q1.
q4_id = update_json_tree(
    file_path,
    "What is desktop software?",
    "Desktop software is installed locally on a computer."
)

Placed 'What is desktop software?' under parent 'What is software development?' (overlap ratio=0.50).


In [16]:
# Q5: none of its tokens occur in Q1, and children of an out-of-context root
# are not searched, so the recorded run created a new root.
update_json_tree(
    file_path,
    "What language should I learn?",
    "You could learn Python, Java, or any other language depending on your goals."
)


No suitable parent found for 'What language should I learn?'; created new root.


'0aaeaa03-b3d9-4183-a0cd-ed6db1109226'