In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import time
import json
import uuid
import pickle
from dotenv import load_dotenv
from typing import List, Dict, Any, Optional, Tuple, Callable

import numpy as np
import pymongo

from web2comp.llm.gemini import (
    create_model,
    create_embedding_model
)
from web2comp.llm_processing import extract_response_from_tag

# Initialization

In [3]:
load_dotenv()

True

In [4]:
# Application identifier; selects which ./new_models/<APP_NAME>.json file is loaded below.
APP_NAME = 'DASHBOARD'

In [5]:
# Load the application model JSON. The context manager guarantees the file
# handle is closed instead of leaving it to the garbage collector.
with open(f'./new_models/{APP_NAME}.json') as model_file:
    model = json.load(model_file)

# Database Class

In [6]:
class InMemoryWarningVectorDB:
    """
    Minimal in-memory store for feature documents and component-feature
    documents, with cosine-similarity search over feature embeddings.

    Attributes:
        features: Maps a feature ``_id`` (str) to its stored document.
        component_features: Flat list of component-feature documents.
        _all_feature_ids / _all_embeddings_matrix: The search index. Row ``i``
            of the matrix is the embedding of ``_all_feature_ids[i]``.
        _embedding_dim: Length shared by every stored embedding; ``-1`` until
            the first embedding has been validated.
    """

    def __init__(self):
        """
        Minimal in-memory database for storing features and component features,
        with vector similarity search capability.
        """
        self.features: Dict[str, Dict[str, Any]] = {}  # _id -> feature_doc
        self.component_features: List[Dict[str, Any]] = []

        self._all_feature_ids: List[str] = []
        self._all_embeddings_matrix: np.ndarray = np.array([], dtype=np.float32)
        self._embedding_dim: int = -1

    # --- Internal Embedding Management ---
    def _validate_and_set_embedding_dim(self, embedding: List[float]) -> None:
        """
        Check ``embedding`` against the database-wide dimension.

        The first embedding seen fixes the dimension; every later one must
        have the same length.

        Raises:
            ValueError: If the embedding is empty while no dimension is set,
                or if its length differs from the established dimension.
        """
        current_len = len(embedding)
        if self._embedding_dim == -1:
            if current_len == 0:
                raise ValueError("Embedding dimension cannot be 0.")
            self._embedding_dim = current_len
            # Give the still-empty 1-D placeholder its proper (0, dim) shape.
            if self._all_embeddings_matrix.ndim == 1 and self._all_embeddings_matrix.shape[0] == 0:
                self._all_embeddings_matrix = np.empty((0, self._embedding_dim), dtype=np.float32)
        elif current_len != self._embedding_dim:
            raise ValueError(
                f"All embeddings must have the same dimension. "
                f"Expected {self._embedding_dim}, got {current_len}."
            )

    def _empty_index_matrix(self) -> np.ndarray:
        """Return an empty matrix: shape (0, dim) when dim is known, else 1-D."""
        if self._embedding_dim != -1:
            return np.empty((0, self._embedding_dim), dtype=np.float32)
        return np.array([], dtype=np.float32)

    def _rebuild_embeddings_index(self) -> None:
        """
        Recompute ``_all_feature_ids`` and ``_all_embeddings_matrix`` from
        ``self.features``.

        Raises:
            ValueError: If a stored embedding does not match the established
                dimension.
        """
        if not self.features:
            self._all_feature_ids = []
            self._all_embeddings_matrix = self._empty_index_matrix()
            return

        # Dimension may still be unknown (e.g. state restored from a file):
        # take it from the first feature carrying a non-empty list embedding.
        if self._embedding_dim == -1:
            for feature_data in self.features.values():
                embedding = feature_data.get("embedding")
                if isinstance(embedding, list) and len(embedding) > 0:
                    self._validate_and_set_embedding_dim(embedding)
                    break

        if self._embedding_dim == -1:
            # Features exist but none has a usable embedding: nothing to index.
            # Ids are cleared together with the matrix so rows and ids stay aligned.
            self._all_feature_ids = []
            self._all_embeddings_matrix = np.array([], dtype=np.float32)
            return

        indexed_ids: List[str] = []
        indexed_embeddings = []
        for _id, feature_data in self.features.items():
            embedding = feature_data.get("embedding")
            if embedding is None:
                continue
            if len(embedding) != self._embedding_dim:
                raise ValueError(
                    f"Inconsistent embedding dimension for feature {_id}. "
                    f"Expected {self._embedding_dim}, got {len(embedding)}."
                )
            indexed_embeddings.append(embedding)
            indexed_ids.append(_id)

        self._all_feature_ids = indexed_ids
        if indexed_embeddings:
            self._all_embeddings_matrix = np.array(indexed_embeddings, dtype=np.float32)
        else:
            self._all_embeddings_matrix = self._empty_index_matrix()

    def _cosine_similarity_matrix(self, query_vector: np.ndarray, target_matrix: np.ndarray) -> np.ndarray:
        """
        Cosine similarity between a query and every row of ``target_matrix``.

        Zero-norm vectors are mapped to the zero vector, so their similarity
        is 0 rather than NaN. Returns a flat array with one value per target
        row, or an empty array when ``target_matrix`` has no rows.
        """
        if query_vector.ndim == 1:
            query_vector_2d = query_vector.reshape(1, -1)
        else:
            query_vector_2d = query_vector

        if target_matrix.shape[0] == 0:
            return np.array([])

        query_norm = np.linalg.norm(query_vector_2d, axis=1, keepdims=True)
        query_unit_vector = np.divide(query_vector_2d, query_norm,
                                      out=np.zeros_like(query_vector_2d), where=query_norm != 0)

        target_norms = np.linalg.norm(target_matrix, axis=1, keepdims=True)
        target_unit_matrix = np.divide(target_matrix, target_norms,
                                       out=np.zeros_like(target_matrix), where=target_norms != 0)

        similarities = np.dot(target_unit_matrix, query_unit_vector.T).flatten()
        return similarities

    # --- Feature Management Methods ---
    def add_feature(self, feature_doc: Dict[str, Any]) -> str:
        """
        Store a new feature document and re-index the embeddings.

        The document must carry an ``"embedding"`` list. When ``"_id"`` is
        absent a UUID hex string is generated. The assigned id is also written
        back into the caller's ``feature_doc``; a shallow copy is stored.

        Returns:
            The id under which the feature was stored.

        Raises:
            ValueError: If the embedding is missing or not a list, its
                dimension is inconsistent, or the id already exists.
        """
        _id = str(feature_doc.get("_id", uuid.uuid4().hex))  # Use hex for shorter UUID string

        embedding_val = feature_doc.get("embedding")
        if embedding_val is None or not isinstance(embedding_val, list):
            raise ValueError("Feature document must contain a valid 'embedding' list.")

        # Reject duplicates before touching any state so a failed add has no side effects.
        if _id in self.features:
            raise ValueError(f"Feature with ID {_id} already exists. Use update_feature instead.")

        self._validate_and_set_embedding_dim(embedding_val)

        feature_doc["_id"] = _id
        self.features[_id] = feature_doc.copy()  # Store a copy
        self._rebuild_embeddings_index()
        return _id

    def get_feature(self, feature_id: str) -> Optional[Dict[str, Any]]:
        """Return a shallow copy of the feature with this id, or None if absent."""
        feature = self.features.get(str(feature_id))
        return feature.copy() if feature else None  # Return a copy

    def update_feature(self, feature_id: str, updates: Dict[str, Any]) -> bool:
        """
        Apply ``updates`` to an existing feature.

        The index is rebuilt only when the embedding actually changes.

        Returns:
            True if at least one field changed; False if the feature does not
            exist or nothing differed.

        Raises:
            ValueError: If ``updates["embedding"]`` is not a list or has an
                inconsistent dimension.
        """
        feature_id_str = str(feature_id)
        if feature_id_str not in self.features:
            return False

        original_feature = self.features[feature_id_str]
        embedding_changed = False
        modified = False

        if "embedding" in updates:
            new_embedding = updates["embedding"]
            if new_embedding is None or not isinstance(new_embedding, list):
                raise ValueError("Updated 'embedding' must be a valid list.")
            self._validate_and_set_embedding_dim(new_embedding)
            if original_feature.get("embedding") != new_embedding:
                embedding_changed = True

        for key, value in updates.items():
            if original_feature.get(key) != value:
                original_feature[key] = value
                modified = True

        if modified and embedding_changed:
            self._rebuild_embeddings_index()

        return modified

    def delete_feature(self, feature_id: str) -> bool:
        """Remove a feature and re-index; return whether it existed."""
        feature_id_str = str(feature_id)
        if feature_id_str in self.features:
            del self.features[feature_id_str]
            self._rebuild_embeddings_index()
            return True
        return False

    def list_features(self, app_filter: Optional[str] = None) -> List[Dict[str, Any]]:
        """Return shallow copies of all features, optionally only those whose ``"app"`` equals ``app_filter``."""
        target_features = self.features.values()
        if app_filter:
            return [feat.copy() for feat in target_features if feat.get("app") == app_filter]
        return [feat.copy() for feat in target_features]

    def vector_search_features(self, query_embedding: List[float], top_k: int) -> List[Tuple[str, float]]:
        """
        Return up to ``top_k`` ``(feature_id, cosine_similarity)`` pairs,
        most similar first.

        Searching is read-only: the query never establishes the database's
        embedding dimension, so probing an empty database cannot block later
        inserts of a different dimension. A non-positive ``top_k`` or an empty
        index yields ``[]``.

        Raises:
            ValueError: If the query is empty or its length contradicts the
                established embedding dimension.
        """
        query_len = len(query_embedding)
        if self._embedding_dim != -1 and query_len != self._embedding_dim:
            raise ValueError(
                f"All embeddings must have the same dimension. "
                f"Expected {self._embedding_dim}, got {query_len}."
            )
        if query_len == 0:
            raise ValueError("Embedding dimension cannot be 0.")

        matrix = self._all_embeddings_matrix
        if top_k <= 0 or matrix.ndim != 2 or matrix.shape[0] == 0 or matrix.shape[1] != query_len:
            return []

        query_vec_np = np.array(query_embedding, dtype=np.float32)
        similarities = self._cosine_similarity_matrix(query_vec_np, matrix)

        id_count = len(self._all_feature_ids)
        actual_top_k = min(top_k, id_count, similarities.size)
        if actual_top_k == 0:
            return []

        if actual_top_k < similarities.size:
            # Select the k best in O(n), then order only those k.
            partitioned_indices = np.argpartition(similarities, -actual_top_k)[-actual_top_k:]
            candidate_indices = partitioned_indices[np.argsort(similarities[partitioned_indices])[::-1]]
        else:
            candidate_indices = np.argsort(similarities)[::-1]

        return [
            (self._all_feature_ids[idx], float(similarities[idx]))
            for idx in candidate_indices
            if idx < id_count
        ]

    # --- Component Feature Management Methods ---
    @staticmethod
    def _matches_filter(doc: Dict[str, Any], criteria: Dict[str, Any]) -> bool:
        """
        Return True when ``doc`` satisfies every entry of ``criteria``.

        A criterion value is matched by equality, unless it is a dict holding
        ``"$in"``, in which case the document value must be a member of that
        collection. Empty criteria match every document.
        """
        for key, expected in criteria.items():
            actual = doc.get(key)
            if isinstance(expected, dict) and '$in' in expected:
                if actual not in expected['$in']:
                    return False
            elif actual != expected:
                return False
        return True

    def add_component_feature(self, component_feature_doc: Dict[str, Any]) -> str:
        """Append a shallow copy of the document (id auto-generated if absent) and return its id."""
        _id = str(component_feature_doc.get("_id", uuid.uuid4().hex))
        doc_copy = component_feature_doc.copy()
        doc_copy["_id"] = _id
        self.component_features.append(doc_copy)
        return _id

    def list_component_features(self, filters: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """Return shallow copies of the component features matching ``filters`` (all of them when no filters are given)."""
        if not filters:
            return [doc.copy() for doc in self.component_features]
        return [
            doc.copy()
            for doc in self.component_features
            if self._matches_filter(doc, filters)
        ]

    def update_component_features(self, filter_criteria: Dict[str, Any], updates: Dict[str, Any]) -> int:
        """Apply ``updates`` in place to every matching component feature; return how many matched."""
        updated_count = 0
        for doc in self.component_features:  # Operates on original docs in list
            if self._matches_filter(doc, filter_criteria):
                doc.update(updates)
                updated_count += 1
        return updated_count

    def delete_component_features(self, filter_criteria: Dict[str, Any]) -> int:
        """Remove every matching component feature; return how many were removed."""
        kept = [
            doc for doc in self.component_features
            if not self._matches_filter(doc, filter_criteria)
        ]
        deleted_count = len(self.component_features) - len(kept)
        if deleted_count > 0:
            self.component_features = kept
        return deleted_count

    # --- Utility ---
    def clear_all_data(self):
        """Drop all documents and reset the index and embedding dimension."""
        self.features.clear()
        self.component_features.clear()
        self._all_feature_ids = []
        self._all_embeddings_matrix = np.array([], dtype=np.float32)
        self._embedding_dim = -1

In [7]:
def save_db_to_file(db_instance: InMemoryWarningVectorDB, filepath: str) -> None:
    """
    Pickle the persistent state of an InMemoryWarningVectorDB to ``filepath``.

    The snapshot holds the documents (``features``, ``component_features``)
    and the search index (``_all_feature_ids``, ``_all_embeddings_matrix``,
    ``_embedding_dim``). Any new state attribute added to the database class
    must be added to the attribute list below to be persisted.

    Args:
        db_instance: The database whose state is written.
        filepath: Destination file path.

    Raises:
        Exception: Any error raised while opening the file or pickling is
            reported on stdout and then re-raised.

    Warning:
        The pickle module is not secure. Only use with trusted files.
    """
    persisted_attributes = (
        "features",
        "component_features",
        "_all_feature_ids",
        "_all_embeddings_matrix",
        "_embedding_dim",
    )
    snapshot = {name: getattr(db_instance, name) for name in persisted_attributes}

    try:
        with open(filepath, 'wb') as out_file:
            pickle.dump(snapshot, out_file, protocol=pickle.HIGHEST_PROTOCOL)
        print(f"Database state saved successfully to {filepath}")
    except Exception as e:
        print(f"Error saving database to {filepath}: {e}")
        raise  # Let the caller decide how to handle the failure

def load_db_from_file(filepath: str, db_class_constructor: type) -> Optional[InMemoryWarningVectorDB]:
    """
    Rebuild a database instance from a pickle file.

    Args:
        filepath: Path of the pickle file to read.
        db_class_constructor: Zero-argument callable producing a fresh
            database object (e.g. the InMemoryWarningVectorDB class itself).

    Returns:
        A new instance populated with the loaded state, or None when the file
        is missing, cannot be unpickled, or any other error occurs (the reason
        is printed).

    Warning:
        The pickle module is not secure. Only unpickle data from trusted sources.
    """
    try:
        with open(filepath, 'rb') as in_file:
            payload = pickle.load(in_file)

        restored = db_class_constructor()

        # Missing keys fall back to the same defaults a fresh instance has.
        restored.features = payload.get("features", {})
        restored.component_features = payload.get("component_features", [])
        restored._all_feature_ids = payload.get("_all_feature_ids", [])

        matrix = payload.get("_all_embeddings_matrix")
        if matrix is None:
            # Absent (or explicitly None): start from an empty index.
            matrix = np.array([], dtype=np.float32)
        elif not isinstance(matrix, np.ndarray):
            # Saved as something array-like rather than an ndarray.
            print("Warning: Loaded embeddings matrix was not a NumPy ndarray. Attempting conversion.")
            matrix = np.array(matrix, dtype=np.float32)
        restored._all_embeddings_matrix = matrix

        dim = payload.get("_embedding_dim", -1)
        restored._embedding_dim = dim

        # Warn (without repairing) when a populated matrix disagrees with the stored dimension.
        populated_2d = dim != -1 and matrix.ndim == 2 and matrix.shape[0] > 0
        if populated_2d and matrix.shape[1] != dim:
            print(f"Warning: Loaded embedding dimension ({dim}) "
                  f"may be inconsistent with the loaded embeddings matrix shape "
                  f"({matrix.shape}). "
                  f"Consider data integrity or call _rebuild_embeddings_index() if issues arise.")

        # An empty matrix with a known dimension is normalised to shape (0, dim).
        if matrix.shape[0] == 0 and dim != -1 and \
           (matrix.ndim != 2 or matrix.shape[1] != dim):
            restored._all_embeddings_matrix = np.empty((0, dim), dtype=np.float32)

        print(f"Database state loaded successfully from {filepath}")
        return restored

    except FileNotFoundError:
        print(f"Error: Database file not found at {filepath}")
        return None
    except pickle.UnpicklingError:
        print(f"Error: Could not unpickle data from {filepath}. "
              "File may be corrupted, not a pickle file, or from an incompatible version.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while loading database from {filepath}: {e}")
        return None

# Inference Functions

## Feature Extraction

In [8]:
FEATURE_EXTRACTION_PROMPT = """
You are an expert QA engineer. Your primary objective is to infer potential **User Operation Chains** that a given UI component (the `current component`) is likely to be part of. Each chain should correspond to a **single, specific user goal**.

A **User Operation** is defined as:
* **Entity Operations:** Actions corresponding to Create, Read, Update, or Delete (CRUD) operations on data entities (e.g., creating a new user, reading a product description, updating profile information, deleting an item from a cart). These actions should represent a committed change or retrieval of data.
* **Configuration Operations:** Actions that modify system configurations or user settings (e.g., toggling dark mode, adjusting notification preferences, signing in/out which changes an authentication flag). These actions should result in a persistent change to settings.

**Important Exclusions:**
1.  **Navigation:** Simple navigation actions, such as "Navigate to home page" or "Go to settings page," are **NOT** considered User Operations for this task unless they are an integral part of achieving a broader Entity or Configuration operation (e.g., navigating to a specific form to *then* create a new entity).
2.  **Cancellations/Dismissals:** Actions that primarily serve to cancel, abort, dismiss, or revert an ongoing process without a definitive data change or configuration commitment (e.g., "cancel form submission," "discard edits," "close modal without action") are **NOT** the target User Operations. Focus on operations that progress towards a completed goal.

You will infer these operation chains by analyzing the `current component` and, when available, the UI component interacted with in the immediately preceding state (the `previous component`).

## Input Information:
* **Current Webpage Context:** A description of the overall purpose or main function of the current webpage.
* **Current Component's Hierarchical Context:** Descriptions of all ancestor elements of the `current component`, providing its placement within the UI structure.
* **Current Component's Context:** A specific description detailing the `current component` being analyzed.
* **Current Component's Code:** The React code snippet for the `current component`.
* **(Optional) Previous Component's Full Context:** If provided, this includes the Hierarchical Context, Context, and Code for the component interacted with in the *previous UI state*.

## Your Goal:
Analyze the provided information for the `current component` (and `previous component`, if available). Your primary goal is to identify and describe potential **User Operation Chains**. Each User Operation Chain represents a sequence of operations a user might perform to achieve **one specific, distinct overall user goal**.

For example, if a component is a "Manage Items" button leading to a page where items can be viewed, created, or deleted:
* A potential goal is "View list of items." Chain: `Read (Item List)`
* Another distinct goal is "Create a new item." Chain: `Create (New Item)`
* Another distinct goal is "Delete an existing item." Chain: `Delete (Specific Item)`

The User Operations you predict do not need to occur *immediately* after interacting with the given components, but the components should clearly indicate the possibility of these operations forming part of a user's workflow towards a specific, constructive outcome.

**Focus:** You are predicting the sequence of underlying system operations (Entity or Configuration) and the **single, specific, user-centric purpose** that the `current component` (potentially in sequence with the `previous component`) helps achieve. Formulate a concise, natural language label for this **overall user goal**.
* **Granularity:** Each predicted goal must be singular. Avoid bundling multiple distinct operations (e.g., do not use "Manage X (view, edit, delete)"). Instead, if the component(s) suggest multiple distinct goals like "View X," "Edit X," and "Delete X," identify each as a separate potential User Goal with its own chain and likelihood.
* **Exclusions:** Avoid identifying goals that are solely navigational or represent a cancellation/dismissal of an action. The goal should reflect a meaningful, progressive interaction with data or system settings, leading to a committed state.

## Prediction Categorization by Likelihood:
Categorize each inferred User Operation Chain (and its single, specific User Goal) based on its probability of existence:
* **Category 1: Extremely Probable:** Given the component(s) and context, this specific User Goal and its chain (leading to a constructive outcome) are almost certainly supported.
* **Category 2: Probable:** The specific User Goal and its chain (leading to a constructive outcome) are plausible, but existence isn't guaranteed without further observation or broader application context.
* **Category 3: Extremely Improbable:** It's highly unlikely this specific constructive User Goal and its chain are relevant to the given component(s), though a remote theoretical possibility might exist.

Generate as many distinct User Operation Chains (and their corresponding single User Goals) as the provided information reasonably implies, keeping the exclusion and granularity criteria in mind. Categories can be empty.

## Output Format:

Your response must be structured into two distinct sections, enclosed in the specified tags:

<Reasoning>
Provide a concise, step-by-step thought process for your inferences.
-   For each potential **single, specific User Goal** (and its implied operation chain) you considered:
    1.  **User Goal:** State the overall user goal (e.g., "View product reviews," "Create a new user profile," "Delete an item from cart"). Ensure this is a single, specific goal and not a bundled set of operations, a simple navigation task, or a cancellation/dismissal.
    2.  **Implied Operation Chain:** Outline the sequence of User Operations (e.g., `Read (Product Details) -> Read (Product Reviews)` or `Create (UserProfile)`). Clearly specify the Entity Operation (CRUD on which entity?) or Configuration Operation. Ensure operations are constructive and not merely abortive.
    3.  **Role of Current Component:** Explain which specific User Operation in this chain the `current component` directly facilitates towards achieving this single goal.
    4.  **Role of Previous Component (if provided and relevant):** If the `previous component`'s context was available and used in your inference, describe how its facilitated User Operation leads to or sets up the `current component`'s operation within this chain for this specific goal. Explain how analyzing both components *together* strengthens the prediction of this chain and goal. If not relevant or not provided, state so.
    5.  **Likelihood & Justification:** Assign a likelihood (Extremely Probable, Probable, Extremely Improbable) to this specific User Goal and its chain. Justify your categorization based on the provided component(s), their context, and common user interaction patterns, focusing on constructive outcomes.
-   Avoid re-stating the questions from this prompt in your reasoning. Provide direct answers.

</Reasoning>

<Response>
This section must contain a single JSON array of objects, parsable by `json.loads` in Python. Do not include any explanatory text or formatting outside this JSON structure.
-   Each object in the array represents **one** inferred User Goal and its associated chain.
-   Each object must contain two keys:
    * `"category"`: An integer (`1`, `2`, or `3`) corresponding to the likelihood categories (1: Extremely Probable, 2: Probable, 3: Extremely Improbable).
    * `"feature"`: A concise natural language label describing the **single, specific overall user goal** (e.g., "Add item to shopping cart," "Update notification preferences," "View order history," "Create new pet entry," "Edit pet details," "Delete specific pet"). This label should *not* describe a purely navigational task, a cancellation/dismissal of a process, or a bundled set of multiple operations (e.g., avoid "Manage pets").
-   The JSON array should be sorted by `"category"` in ascending order.
</Response>

Example for <Response>:
```json
[
  {
    "category": 1,
    "feature": "Search for products"
  },
  {
    "category": 1,
    "feature": "View product details after search"
  },
  {
    "category": 2,
    "feature": "Add searched product to wishlist"
  },
  {
    "category": 2,
    "feature": "Create a new pet profile"
  },
  {
    "category": 2,
    "feature": "Edit an existing pet profile"
  }
]
```
""".strip()


# Model handle built from FEATURE_EXTRACTION_PROMPT; extract_component_features
# calls it as feature_extraction_model(prompt=...) and reads `.text` from the result.
# NOTE(review): presumably create_model uses the prompt as the system instruction — confirm.
feature_extraction_model = create_model(FEATURE_EXTRACTION_PROMPT)


def extract_component_features(state, component, prev_component=None):
    """
    Ask the feature-extraction model which user goals a component supports.

    Args:
        state: Mapping with a ``'context'`` entry describing the page.
        component: Mapping with ``'hcontexts'``, ``'context'`` and ``'code'``
            entries for the component under analysis.
        prev_component: Optional mapping with the same entries for the
            component interacted with in the previous UI state.

    Returns:
        The JSON value parsed from the ``<Response>`` section of the model
        output (the prompt asks for a list of ``{"category", "feature"}``
        objects).

    Raises:
        Exception: Whatever tag extraction or ``json.loads`` raises when the
            model output cannot be parsed; the raw output is printed first so
            the failure can be diagnosed.
    """
    sections = [
        f"Page Context: {state['context']}",
        f"Hierarchical Context:\n{component['hcontexts']}",
        f"Component Context: {component['context']}",
        f"Component Code:\n{component['code']}",
    ]
    if prev_component is not None:
        # Joined with a newline so the previous component's section does not
        # run straight on from the last line of the current component's code.
        sections += [
            f"Prev Hierarchical Context:\n{prev_component['hcontexts']}",
            f"Prev Component Context: {prev_component['context']}",
            f"Prev Component Code: {prev_component['code']}",
        ]
    prompt = '\n'.join(sections)

    res = feature_extraction_model(
        prompt=prompt
    ).text

    try:
        # Strip markdown code fences (the prompt's own example is fenced),
        # as map_similar_feature_to_exact_match does before parsing.
        unfenced = res.replace('```json', '').replace('```', '')
        features = json.loads(extract_response_from_tag(unfenced, 'Response'))
    except Exception:
        # Show the raw model output, then surface the real parsing error
        # instead of failing later on an unbound `features`.
        print(res)
        raise

    return features

## Feature to Database

In [9]:
def category_to_score(category):
    """
    Convert a likelihood category into a log-probability score.

    Categories 1-4 map to probabilities 0.9, 0.5, 0.1 and 0.01; the natural
    log of that probability is returned. Any other category raises KeyError.
    """
    probability_by_category = dict(zip((1, 2, 3, 4), (0.9, 0.5, 0.1, 0.01)))
    probability = probability_by_category[category]
    return np.log(probability)

In [10]:
def query_similar_features(
    db: InMemoryWarningVectorDB,
    app_name: str,
    query_embedding: List[float],
    vector_search_limit: int = 10,
    result_limit: int = 5
) -> List[Dict[str, Any]]:
    """
    Find stored features of one app that are closest to ``query_embedding``.

    The vector search returns up to ``vector_search_limit`` candidates across
    all apps; candidates belonging to other apps are then dropped, and
    collection stops once ``result_limit`` documents have been gathered.

    Returns:
        Copies of the matching feature documents, most similar first, each
        with an added ``'search_similarity_score'`` entry.
    """
    matches: List[Dict[str, Any]] = []
    ranked_candidates = db.vector_search_features(query_embedding, top_k=vector_search_limit)

    for candidate_id, score in ranked_candidates:
        candidate_doc = db.get_feature(candidate_id)  # Already a copy, safe to annotate
        if not candidate_doc or candidate_doc.get("app") != app_name:
            continue
        candidate_doc['search_similarity_score'] = score
        matches.append(candidate_doc)
        if len(matches) >= result_limit:
            break

    return matches

In [11]:
def get_exact_match_indices(text, similar_features):
    """Return the positions in ``similar_features`` whose ``'text'`` equals ``text`` exactly."""
    return [
        position
        for position, candidate in enumerate(similar_features)
        if candidate['text'] == text
    ]


SIMILARITY_PROMPT = """
Given a description of a software feature and a list of other software feature descriptions, your task is to determine if the initial feature matches any features in the list.

Output format:
Your analysis is enclosed in two tags:
<Reasoning>
- For each item in the list, argue why the base feature and the feature in the list are or aren't describing the same action being performed in the app.
    - Are they exactly or semantically equivalent?
    - If they are different, how are they different?
- Avoid repeating the questions in your responses every time.
- Your analyses should be short and concise.
</Reasoning>
<Response>
- A JSON object containing the following keys:
    - match: true If any feature in the list matches the base feature, false if not.
    - match_index: An array of indices of matched features in the list. Only include this key if there is a matching feature.
    - combined_text: If the features match, a concise description of that feature. Only include this key if there is a matching feature. You can omit some of the redundant words to keep this sentence simple.
- Parsable by `json.loads`.
</Response>
""".strip()


# Model handle built from SIMILARITY_PROMPT; map_similar_feature_to_exact_match
# calls it as similarity_model(prompt=...) and reads `.text` from the result.
similarity_model = create_model(SIMILARITY_PROMPT)


def map_similar_feature_to_exact_match(feature_info):
    """
    Decide which of the retrieved similar features describe the same feature.

    Exact text matches are found locally; when any candidates exist the
    similarity model is also asked, and both sets of indices are merged.

    Args:
        feature_info: ``(category, text, embedding, similar_features)`` where
            ``similar_features`` is a list of documents with ``'_id'`` and
            ``'text'`` entries.

    Returns:
        A dict with ``'match'`` (bool), ``'category'``, ``'text'`` and
        ``'embedding'``. When ``'match'`` is True it also holds
        ``'match_index'`` (sorted indices into ``similar_features``),
        ``'match_id'`` (the corresponding ``'_id'`` values) and
        ``'combined_text'``.

    Raises:
        ValueError: If the model's ``<Response>`` is not a JSON object.
    """
    category, text, embedding, similar_features = feature_info

    exact_match_indices = get_exact_match_indices(text, similar_features)

    match = {'match': False}
    model_indices = []

    if len(similar_features) != 0:
        candidate_texts = '\n'.join(candidate['text'] for candidate in similar_features)
        res = similarity_model(
            prompt=f'Base feature:\n{text}\nThe list of features:\n{candidate_texts}'
        ).text.replace('```json', '').replace('```', '')

        match = json.loads(extract_response_from_tag(res, 'Response'))
        if not isinstance(match, dict):
            raise ValueError(f"Similarity response must be a JSON object, got {type(match).__name__}.")

        if match.get('match'):
            reported = match.get('match_index', [])
            # The model sometimes answers with a single integer instead of an array.
            if isinstance(reported, int) and not isinstance(reported, bool):
                reported = [reported]
            elif not isinstance(reported, (list, tuple)):
                reported = []
            # Keep only indices that actually address a candidate.
            model_indices = [
                index for index in reported
                if isinstance(index, int) and not isinstance(index, bool)
                and 0 <= index < len(similar_features)
            ]

    # Sorted so the first (primary) match is deterministic.
    matched_indices = sorted(set(model_indices) | set(exact_match_indices))

    if matched_indices:
        # Use the plain text when the match rests only on exact matches
        # or the model omitted a combined description.
        if not model_indices or not match.get('combined_text'):
            match['combined_text'] = text
        match['match'] = True
        match['match_index'] = matched_indices
        match['match_id'] = [similar_features[index]['_id'] for index in matched_indices]
    else:
        # A "match" without any usable target cannot be acted upon.
        match['match'] = False
        match.pop('match_index', None)

    match['category'] = category
    match['text'] = text
    match['embedding'] = embedding

    return match

In [12]:
# Embedding model handle from create_embedding_model().
# NOTE(review): its call sites are outside this section; presumably used to embed feature text — confirm.
google_embedding = create_embedding_model()

In [13]:
def no_match_insert(
    db: InMemoryWarningVectorDB, 
    app_name: str,
    text: str,
    embedding: List[float], # Assumed to be pre-calculated
    score: float,         # Assumed to be pre-calculated
    final: bool = False
) -> str:
    """
    Insert a brand-new feature for which no existing match was found.

    Args:
        db: Database receiving the feature.
        app_name: Application the feature belongs to; stored under ``"app"``.
        text: Feature description.
        embedding: Pre-computed embedding of ``text``.
        score: Pre-computed score stored with the feature.
        final: Stored under ``"final"``.

    Returns:
        The id assigned to the inserted feature.

    Raises:
        ValueError: Propagated from ``db.add_feature`` (e.g. invalid embedding).
    """
    feature_doc = {
        # Use the arguments rather than the notebook-level APP_NAME / a fixed
        # False, so the function honours what the caller asked for.
        "app": app_name,
        "text": text,
        "embedding": embedding,
        "score": score,
        "final": final
        # _id will be auto-generated by db.add_feature if not provided
    }
    inserted_id = db.add_feature(feature_doc)
    return inserted_id


def match_update(
    db: InMemoryWarningVectorDB,
    app_name: str,
    match_ids: List[str], # List of feature IDs, first is primary
    combined_text: str,
    new_embedding: List[float] # Assumed to be pre-calculated
) -> Optional[str]:
    """Merge a group of matching features into the first one of the group.

    The first id in ``match_ids`` is the survivor: its ``text`` and ``embedding``
    are overwritten with the combined values. Every other id that exists and
    belongs to ``app_name`` is deleted, and component features that pointed at a
    deleted id are re-pointed to the survivor.

    Args:
        db: Store exposing ``get_feature``, ``update_feature``, ``delete_feature``
            and ``update_component_features``.
        app_name: Application whose features may be touched.
        match_ids: Feature ids to merge; the first one is kept.
        combined_text: Text to store on the surviving feature.
        new_embedding: Pre-computed embedding of ``combined_text``.

    Returns:
        The surviving feature id as a string, or ``None`` when ``match_ids`` is
        empty or the first id is unknown / owned by a different app.
    """
    if not match_ids:
        return None

    survivor_id = str(match_ids[0])
    survivor = db.get_feature(survivor_id)
    if not (survivor and survivor.get("app") == app_name):
        return None

    # Only text and embedding change; `app` and the id stay as they are.
    db.update_feature(survivor_id, {"text": combined_text, "embedding": new_embedding})

    duplicate_ids = [str(raw_id) for raw_id in match_ids[1:]]
    removed_ids = []
    for duplicate_id in duplicate_ids:
        # Defensive: never delete the survivor even if its id is repeated.
        if duplicate_id == survivor_id:
            continue
        duplicate = db.get_feature(duplicate_id)
        if not duplicate or duplicate.get("app") != app_name:
            continue
        if db.delete_feature(duplicate_id):
            removed_ids.append(duplicate_id)

    # Redirect references from the removed features to the survivor.
    if removed_ids:
        db.update_component_features(
            {"app": app_name, "feature_pointer": {"$in": removed_ids}},
            {"feature_pointer": survivor_id},
        )

    return survivor_id


def update_databases_with_match(
    db: InMemoryWarningVectorDB,
    app_name: str,
    match: Dict[str, Any]
):
    """Apply one match decision to the database.

    When ``match['match']`` is falsy the feature is new and gets inserted with a
    score derived from its category. Otherwise the matched features are merged
    and the merged text is re-embedded.

    Args:
        db: Store handed through to ``no_match_insert`` / ``match_update``.
        app_name: Application the feature belongs to.
        match: Decision dict. Reads ``match``, and then either ``text``,
            ``embedding`` and ``category`` (no match) or ``match_id`` and
            ``combined_text`` (match).

    Returns:
        The id of the inserted feature, or the result of ``match_update``
        (surviving feature id, possibly ``None``).
    """
    if not match['match']:
        return no_match_insert(
            db=db,
            app_name=app_name,
            text=match['text'],
            embedding=match['embedding'],
            score=category_to_score(match['category']),
            final=False
        )

    matched_ids = match['match_id']
    merged_text = match['combined_text']
    return match_update(
        db=db,
        app_name=app_name,
        match_ids=matched_ids,
        combined_text=merged_text,
        # The merged wording differs from the stored one, so embed it afresh.
        new_embedding=google_embedding(merged_text)
    )

In [14]:
def insert_features(
    db: InMemoryWarningVectorDB,
    app_name: str,
    features: List[Dict]
) -> List[str]:
    """Embed, de-duplicate and store a batch of extracted features.

    All descriptions are embedded in one call. Features are then handled strictly
    one at a time: look up similar stored features, decide on a match, write the
    result. Because each write happens before the next lookup, later items of the
    batch can match features inserted by earlier items of the same batch.

    Args:
        db: Store passed on to the lookup and write helpers.
        app_name: Application the features belong to.
        features: Dicts providing a ``category`` and a ``feature`` (description).

    Returns:
        One id per processed feature, as returned by
        ``update_databases_with_match``.
    """
    categories = [item['category'] for item in features]
    descriptions = [item['feature'] for item in features]
    # NOTE(review): assumed to return one embedding per description as a list.
    embeddings = google_embedding(descriptions, default_as_list=True)

    inserted_ids = []
    for category, description, embedding in zip(categories, descriptions, embeddings):
        neighbours = query_similar_features(
            db=db, app_name=app_name, query_embedding=embedding
        )
        # The matcher takes a single (category, text, embedding, candidates) tuple.
        decision = map_similar_feature_to_exact_match(
            (category, description, embedding, neighbours)
        )
        inserted_ids.append(
            update_databases_with_match(db=db, app_name=app_name, match=decision)
        )

    return inserted_ids

In [15]:
def insert_action_features(
    db: InMemoryWarningVectorDB,
    # Common data for all documents in this batch
    app_name: str, 
    state_url: str,
    state_id: str,
    prev_state_id: str,
    component_id: str,
    prev_component_id: str,
    component_depth: int,
    component_type: str,
    # Each tuple: (dict_with_category_info, feature_id_to_point_to)
    per_item_data: List[Tuple[Dict[str, Any], str]], 
) -> List[str]:
    """Record one component-feature document per (category info, feature id) pair.

    Every document shares the app/state/component fields passed in and adds a
    ``rank_score`` derived from the item's category, a stringified
    ``feature_pointer`` and ``final=False``.

    Args:
        db: Store exposing ``add_component_feature(doc)``.
        app_name: Application name stored under ``app``.
        state_url: Stored under ``url``.
        state_id: Stored under ``state``.
        prev_state_id: Stored under ``prev_state``.
        component_id: Stored under ``component``.
        prev_component_id: Stored under ``prev_component``.
        component_depth: Stored under ``depth``.
        component_type: Stored under ``type`` (callers pass "SINGLE" or "DOUBLE").
        per_item_data: Pairs of (dict with a ``category`` key, feature id).

    Returns:
        The values returned by ``db.add_component_feature``, in input order.
    """
    shared_fields = {
        "app": app_name,
        "url": state_url,
        "state": state_id,
        "prev_state": prev_state_id,
        "component": component_id,
        "prev_component": prev_component_id,
        "depth": component_depth,
        "type": component_type,
    }

    return [
        db.add_component_feature({
            **shared_fields,
            "rank_score": category_to_score(category_info['category']),
            "feature_pointer": str(pointed_feature_id),
            "final": False,  # Newly recorded links always start as non-final
        })
        for category_info, pointed_feature_id in per_item_data
    ]

## Mark Features

In [16]:
def update_feature_score(
    db: InMemoryWarningVectorDB,
    app_name: str,
    prev_state: Dict[str, Any],
    prev_comp: Dict[str, Any],
    curr_state: Dict[str, Any],
    curr_comp: Dict[str, Any]
) -> None:
    """Shift feature scores by the rank change between a previous and current action.

    For every "DOUBLE" component feature of the current (state, component), the
    rank score is compared with the "SINGLE" component feature of the previous
    (state, component) that points at the same feature. If there is no such
    previous entry, ``category_to_score(4)`` is used as the baseline. The
    difference is added to the feature's stored ``score``; features of another
    app or already flagged ``final`` are left untouched.

    Args:
        db: Store exposing ``list_component_features``, ``get_feature`` and
            ``update_feature``.
        app_name: Application whose records are considered.
        prev_state: Previous state; only ``id`` is read.
        prev_comp: Previous component; only ``id`` is read.
        curr_state: Current state; only ``id`` is read.
        curr_comp: Current component; only ``id`` is read.
    """
    double_links = db.list_component_features(filters={
        'app': app_name,
        'state': curr_state['id'],
        'component': curr_comp['id'],
        'type': 'DOUBLE'
    })
    single_links = db.list_component_features(filters={
        'app': app_name,
        'state': prev_state['id'],
        'component': prev_comp['id'],
        'type': 'SINGLE'
    })

    # feature id -> score difference; a later link to the same feature overrides
    # an earlier one.
    score_deltas: Dict[str, float] = {}
    for double_link in double_links:
        pointer = double_link.get('feature_pointer')
        if not pointer:
            continue

        # First previous link that refers to the same feature, if any.
        earlier_link = next(
            (link for link in single_links if link.get('feature_pointer') == pointer),
            None
        )

        baseline = category_to_score(4)  # Fallback when nothing to compare against
        if earlier_link is not None:
            baseline = earlier_link.get('rank_score', 0.0)

        score_deltas[str(pointer)] = double_link.get('rank_score', 0.0) - baseline

    for feature_id, delta in score_deltas.items():
        feature = db.get_feature(feature_id)
        if not feature:
            continue
        if feature.get('app') != app_name or feature.get('final', False):
            continue
        db.update_feature(feature_id, {'score': feature.get('score', 0.0) + delta})

In [17]:
# System prompt for the "finality" check: the model is asked, for each listed goal,
# whether interacting with the current component is the last UI step needed, and to
# answer with a JSON array of booleans inside a <Response> tag (preceded by a
# <Reasoning> tag). `mark_final_features` parses that <Response> section.
FINALITY_PROMPT = """
You are an expert QA engineer. Your task is to analyze a given UI component (the `current component`) and determine if interacting with it represents the **final step** taken by the user before a specific **User Operation Goal** is achieved. A User Operation Goal refers to the successful completion of an Entity Operation or a Configuration Operation.

**Key Definitions:**
* **User Operation Goal:** The user's aim to complete an Entity Operation or Configuration Operation.
    * **Entity Operations:** Actions corresponding to Create, Read, Update, or Delete (CRUD) operations on data entities (e.g., submitting a form to create an account, saving changes to a profile, confirming a deletion).
    * **Configuration Operations:** Actions that modify system configurations or user settings (e.g., applying a new theme, saving notification preferences).
* **Final Step:** Interacting with the `current component` is the last action a user needs to perform on the UI to trigger the backend processing of the User Operation or to see the User Operation Goal realized. No further UI interactions are required for that specific goal after this component.

## Input Information:
* **Current Webpage Context:** Description of the purpose of the current webpage.
* **Current Component's Hierarchical Context:** Contextual descriptions for all ancestor elements of the `current component`.
* **Current Component's Context:** A specific description of the `current component` being analyzed.
* **Current Component's Code:** The React code snippet for the `current component`.
* **Target User Operation Goals:** A list of specific User Operation Goals (e.g., "Submit user registration form," "Save notification settings," "View account summary"). You will evaluate the `current component` against each goal in this list.

## Your Task:
For each User Operation Goal provided in the **Target User Operation Goals** list, determine if interacting with the `current component` is the **final step** that immediately leads to the achievement of that goal.

## Output Format:

Your analysis must be enclosed in two distinct tags:

<Reasoning>
Detail your thought process concisely for each User Operation Goal from the input list.
-   For each User Operation Goal:
    1.  Clearly state the User Operation Goal you are evaluating (you can refer to it by its description or index).
    2.  Argue whether interacting with the `current component` would or would not conclude the actions required for this specific User Operation Goal.
    3.  Consider if any further component interactions by the user are necessary after the `current component` to achieve this goal.
    4.  If the `current component` is *not* final for the goal, briefly state why (e.g., "requires subsequent 'Confirm' button click," "only updates a field, needs form submission").
-   Avoid repeating the full description of the User Operation Goal if referencing by name/index is clear.
-   Your analyses should be short and concise.
</Reasoning>

<Response>
This section must contain a single JSON array of boolean values.
-   The array must directly correspond to the order of the **Target User Operation Goals** provided in the input.
-   The boolean value at index `i` in the array should be `true` if interacting with the `current component` is the final step to achieve the User Operation Goal `i` from the input list. Otherwise, the value should be `false`.
-   The output must be parsable by `json.loads` in Python (use `true` and `false` for boolean values). Do not include any explanatory text outside the JSON array in this section.
</Response>
""".strip()


# Model handle created from the prompt above; it is called with `prompt=...` and its
# result's `.text` attribute is read in `mark_final_features`.
finality_model = create_model(FINALITY_PROMPT)


def mark_final_features(
    db: InMemoryWarningVectorDB,
    app_name: str,
    curr_state: Dict[str, Any],
    curr_comp: Dict[str, Any]
) -> None:
    """Ask the LLM which features are completed by this component and flag them.

    Collects the features referenced (via ``feature_pointer``) by the component
    features of the given state/component, asks ``finality_model`` whether
    interacting with the component is the final step for each of them, and sets
    ``final=True`` on every confirmed feature and on the component features of
    this state/component that point to it.

    Args:
        db: Store exposing ``list_component_features``, ``get_feature``,
            ``update_feature`` and ``update_component_features``.
        app_name: Application whose records are considered.
        curr_state: Current state; reads ``id`` and ``context``.
        curr_comp: Current component; reads ``id``, ``hcontexts``, ``context``
            and ``code``.

    Returns:
        None. Returns early, changing nothing, when there is nothing to evaluate
        or the LLM answer cannot be interpreted (a message is printed then).
    """
    current_comp_features = db.list_component_features(filters={
        'app': app_name,
        'state': curr_state['id'],
        'component': curr_comp['id'],
    })
    if not current_comp_features:
        return

    # Component features reference their feature through 'feature_pointer' (the
    # key written by insert_action_features). De-duplicate while keeping
    # first-seen order so the prompt is deterministic.
    feature_ids = list(dict.fromkeys(
        str(cf['feature_pointer'])
        for cf in current_comp_features
        if cf.get('feature_pointer')
    ))
    if not feature_ids:
        return

    retrieved_features: List[Dict[str, Any]] = []
    for feature_id in feature_ids:
        feature = db.get_feature(feature_id)
        if feature and feature.get('app') == app_name:
            retrieved_features.append(feature)
    if not retrieved_features:
        return

    # Build the prompt from the arguments only (no reliance on notebook globals).
    hierarchical_context = '\n'.join(curr_comp['hcontexts'])
    feature_lines = '\n'.join(feat.get('text', '') for feat in retrieved_features)
    prompt = (
        f"App Context: {curr_state['context']}\n"
        f"Hierarchical's Context:\n{hierarchical_context}\n"
        f"Component Context: {curr_comp['context']}\n"
        f"Component Code: {curr_comp['code']}\n"
        f"Features:\n{feature_lines}"
    )
    llm_response = finality_model(prompt=prompt).text

    try:
        response_body = extract_response_from_tag(llm_response, 'Response')
        # The prompt requests a JSON array, so parse it as JSON; never eval()
        # model output. Code fences are stripped like in the similarity parsing.
        finality_determinations = json.loads(
            response_body.replace('```json', '').replace('```', '')
        )
    except Exception as e:
        # Best effort: the failure modes of the tag extractor are not visible
        # here, so any parsing problem just skips the marking for this component.
        print(f"Error processing LLM response for finality: {e}")
        return

    if not isinstance(finality_determinations, list) or \
       not all(isinstance(flag, bool) for flag in finality_determinations):
        print("Warning: Finality response is not a JSON array of booleans. Invalid format.")
        return

    if len(finality_determinations) != len(retrieved_features):
        print(f"Warning: Mismatch between finality determinations ({len(finality_determinations)}) and "
              f"retrieved features ({len(retrieved_features)}). Or invalid format.")
        return

    for feature_to_mark, is_final in zip(retrieved_features, finality_determinations):
        if not is_final:
            continue

        feature_id_to_mark = str(feature_to_mark['_id'])

        # Mark the feature itself as final
        db.update_feature(feature_id_to_mark, {'final': True})

        # Mark only the component features of this state/component that point
        # to the feature.
        db.update_component_features(
            filter_criteria={
                'app': app_name,
                'feature_pointer': feature_id_to_mark,
                'component': curr_comp['id'],
                'state': curr_state['id']
            },
            updates={'final': True}
        )

# Feature Extraction Loop

In [18]:
# Empty in-memory store; it is filled by the extraction loop in the cells below.
in_memory_db = InMemoryWarningVectorDB()

In [19]:
# Work queue of state ids for the traversal below, seeded with the first key of
# model['nodes'] (presumably the entry state of the crawled app - TODO confirm).
queue = [
    list(model['nodes'].keys())[0]
]

In [20]:
# Show the seed state id before starting the traversal.
queue

['3a9a0cc95faadb210453ae06e176340513c3b2fd4ac65ae6d0f81afdae9340dc']

In [21]:
# Breadth-first walk over the state graph, starting from the ids already in `queue`.
# For every component of a state: extract its features, store them, compare them
# with the predecessor action ("DOUBLE" features), then mark final features.
#
# States that were already queued are remembered so a state reachable through
# several edges (or through a cycle) is processed only once.
visited_states = set(queue)

while len(queue) > 0:
    # Peek instead of pop: the state is dropped only after it was fully processed,
    # so re-running the cell after an error resumes with the same state.
    state_id = queue[0]
    print(f"Visiting state {state_id}")

    state = model['nodes'][state_id]

    # The predecessor depends on the state only, so resolve it once per state
    # rather than once per component.
    prev_state = model['nodes'][state['prev_state']] if state['depth'] > 0 else None
    prev_component = None
    if prev_state is not None:
        try:
            # Action ids in the model use the lower-cased '//html[1]/body[1]' root.
            prev_action_id = state['prev_action']['id'].replace('//BODY', '//html[1]/body[1]').lower()
            prev_component = next(
                (
                    candidate for candidate in prev_state['components']
                    if prev_action_id in [action['id'] for action in candidate['actions']]
                ),
                None
            )
        except (KeyError, TypeError, AttributeError):
            # Malformed or missing predecessor data: continue without it. Only
            # data errors are caught so that Ctrl-C / kernel interrupt still works.
            prev_component = None

    for component in state['components']:
        print(f"Extracting component scenarios: {component['id']}")

        # Skip components attributed to a different state
        # (presumably handled when that state is visited - TODO confirm).
        if component['original_state'] != '' and component['original_state'] != state_id:
            continue

        features = extract_component_features(state, component)
        print(features)

        if len(features) != 0:
            feature_ids = insert_features(db=in_memory_db, app_name=APP_NAME, features=features)
            insert_action_features(
                db=in_memory_db,
                app_name=APP_NAME,
                state_url=state['url'],
                state_id=state_id,
                prev_state_id=state['prev_state'],
                component_id=component['id'],
                prev_component_id=prev_component['id'] if prev_component is not None else None,
                component_depth=state['depth'],
                component_type="SINGLE",
                per_item_data=list(zip(features, feature_ids))
            )

        if state['depth'] > 0 and prev_component is not None:
            print('Extracting double action scenarios')

            double_features = extract_component_features(state, component, prev_component)
            print(double_features)

            if len(double_features) != 0:
                double_feature_ids = insert_features(db=in_memory_db, app_name=APP_NAME, features=double_features)
                insert_action_features(
                    db=in_memory_db,
                    app_name=APP_NAME,
                    state_url=state['url'],
                    state_id=state_id,
                    prev_state_id=state['prev_state'],
                    component_id=component['id'],
                    prev_component_id=prev_component['id'],
                    component_depth=state['depth'],
                    component_type="DOUBLE",
                    per_item_data=list(zip(double_features, double_feature_ids))
                )

            print('Updating action scores')

            update_feature_score(
                db=in_memory_db,
                app_name=APP_NAME,
                prev_state={ 'id': state['prev_state'], **prev_state },
                prev_comp=prev_component,
                curr_state={ 'id': state_id, **state },
                curr_comp=component
            )

            print('Action scores updated')

        print('Marking final functionalities')

        mark_final_features(
            db=in_memory_db,
            app_name=APP_NAME,
            curr_state={ 'id': state_id, **state },
            curr_comp=component
        )

        print('Final actions marked')

    if state_id in model['edges']:
        for target_state in model['edges'][state_id].values():
            # Enqueue each state at most once to avoid duplicate work and cycles.
            if target_state not in visited_states:
                visited_states.add(target_state)
                queue.append(target_state)

    # The state is done: drop it from the head of the queue.
    queue.pop(0)

Visiting state 3a9a0cc95faadb210453ae06e176340513c3b2fd4ac65ae6d0f81afdae9340dc
Extracting component scenarios: //html[1]/body[1]/div[1]/main[1]/div[1]/div[1]
[{'category': 1, 'feature': 'Log into the application'}, {'category': 1, 'feature': 'Reset user password'}]
Marking final functionalities
Final actions marked
Visiting state f3da9e519a1a495ba0b9e0d8501cc460495b087624abd9f91180ebf647b3126b
Extracting component scenarios: //html[1]/body[1]/div[1]/main[1]/div[1]/div[1]/div[1]/aside[1]/div[1]
[{'category': 1, 'feature': 'Read store configuration details (including avatar)'}, {'category': 2, 'feature': 'Update store avatar image'}]
Marking final functionalities
Final actions marked
Extracting component scenarios: //html[1]/body[1]/div[1]/main[1]/div[1]/div[1]/div[1]/aside[1]/div[2]
[{'category': 1, 'feature': 'View product details'}, {'category': 1, 'feature': 'View order details'}, {'category': 1, 'feature': 'View customer details'}, {'category': 1, 'feature': 'View discount details'

In [22]:
# Print every stored feature text as a numbered list, highest score first
# (ties keep the order returned by the database).
features = in_memory_db.list_features()

print('\n'.join(
    f'{rank}. {text}'
    for rank, (text, _score) in enumerate(
        sorted(
            [(doc['text'], doc['score']) for doc in features],
            key=lambda pair: pair[1],
            reverse=True
        ),
        start=1
    )
))

1. Log into the application
2. Reset user password
3. View product or menu item details
4. View order details
5. View customer details
6. Access and view list of sales discounts and their details
7. View list of available content or page types
8. View list of installed applications
9. View overall dashboard information
10. View dashboard information filtered by channel
11. View recent store activity
12. Access and view list of products
13. View daily sales total
14. View the number of items currently out of stock
15. View list of sales orders
16. Search content list or products or discounts
17. View list of product types/categories
18. View list of gift cards and information about gift card products
19. Configure/View columns displayed in the product table/list in preferred format
20. Return to or navigate to the previous page or state
21. Select/Filter specific product types from a list
22. Filter the list of orders
23. View list of draft orders
24. Filter the list of discounts
25. Se

In [23]:
# Persist the populated database to '<APP_NAME>.db' (a relative path, so it lands
# in the current working directory).
save_db_to_file(in_memory_db, f'{APP_NAME}.db')

Database state saved successfully to DASHBOARD.db
