## About
This notebook experiments with mapping a query into the tool space via the vector span of the tool embeddings. We then build a tree by using GPT to decompose the query into subqueries. At each level, every node is compared against the list of tools to decide, based on a similarity threshold, whether the query should be broken down further. If there exist tools whose similarity score is higher than the threshold, those tools can be used to solve the main query and are added to the final list of tools that is returned.

This subset of tools can then be used instead of the entire set in the main pipeline.

In [None]:
!pip install openai sentence_transformers

In [None]:
import json
from openai import OpenAI

# Module-level OpenAI client; interact_with_llm sends its chat requests through it.
# NOTE(review): replace the placeholder below with a real key before running,
# and avoid committing a copy of the notebook that contains one.
client = OpenAI(
    api_key = "<ENTER-YOUR-OPENAI-API-KEY>"
)


#Tools

In [None]:
# Catalogue of the available tools, keyed by tool name. Every entry has a
# human-readable "description" and an "arguments" list; each argument carries a
# "name", "description" and "type", and some also provide an "example".
# The whole dict is stringified and embedded in the decomposition prompt, so
# any edit to the text here changes what the LLM is told about the tools.
tools_dict = {
    "works_list": {
        "description": "Returns a list of work items matching the request",
        "arguments": [
            {
                "name": "applies_to_part",
                "description": "Filters for work belonging to any of the provided parts",
                "type": "array of strings",
                "example": ["FEAT-123", "ENH-123", "PROD-123", "CAPL-123"]
            },
            {
                "name": "created_by",
                "description": "Filters for work created by any of these users",
                "type": "array of strings",
                "example": ["DEVU-123"]
            },
            {
                "name": "issue.priority",
                "description": "Filters for issues with any of the provided priorities. Allowed values: p0, p1, p2, p3",
                "type": "array of strings",
                "example": ["p0"]
            },
            {
                "name": "issue.rev_orgs",
                "description": "Filters for issues with any of the provided Rev organizations",
                "type": "array of strings",
                "example": ["REV-123"]
            },
            {
                "name": "limit",
                "description": "The maximum number of works to return. The default is '50'",
                "type": "integer (int32)"
            },
            {
                "name": "owned_by",
                "description": "Filters for work owned by any of these users",
                "type": "array of strings",
                "example": ["DEVU-123"]
            },
            {
                "name": "stage.name",
                "description": "Filters for records in the provided stage(s) by name",
                "type": "array of strings",
                "example": ["triage"]
            },
            {
                "name": "ticket.needs_response",
                "description": "Filters for tickets that need a response",
                "type": "boolean"
            },
            {
                "name": "ticket.rev_org",
                "description": "Filters for tickets associated with any of the provided Rev organizations",
                "type": "array of strings",
                "example": ["REV-123"]
            },
            {
                "name": "ticket.severity",
                "description": "Filters for tickets with any of the provided severities. Allowed values: blocker, high, low, medium",
                "type": "array of strings",
                "example": ["high"]
            },
            {
                "name": "ticket.source_channel",
                "description": "Filters for tickets with any of the provided source channels",
                "type": "array of strings",
                "example": ["slack"]
            },
            {
                "name": "type",
                "description": "Filters for work of the provided types. Allowed values: issue, ticket, task",
                "type": "array of strings",
                "example": ["issue"]
            }
        ]
    },
    "summarize_objects": {
        "description": "Summarizes a list of objects. The logic of how to summarize a particular object type is an internal implementation detail.",
        "arguments": [
            {
                "name": "objects",
                "description": "List of objects to summarize",
                "type": "array of objects"
            }
        ]
    },
    "prioritize_objects": {
        "description": "Returns a list of objects sorted by priority. The logic of what constitutes priority for a given object is an internal implementation detail.",
        "arguments": [
            {
                "name": "objects",
                "description": "A list of objects to be prioritized",
                "type": "array of objects"
            }
        ]
    },
    "add_work_items_to_sprint": {
        "description": "Adds the given work items to the sprint",
        "arguments": [
            {
                "name": "work_ids",
                "description": "A list of work item IDs to be added to the sprint.",
                "type": "array of strings"
            },
            {
                "name": "sprint_id",
                "description": "The ID of the sprint to which the work items should be added",
                "type": "str"
            }
        ]
    },
    "get_sprint_id": {
        "description": "Returns the ID of the current sprint",
        "arguments": []
    },
    "get_similar_work_items": {
        "description": "Returns a list of work items that are similar to the given work item",
        "arguments": [
            {
                "name": "work_id",
                "description": "The ID of the work item for which you want to find similar items",
                "type": "string"
            }
        ]
    },
    "search_object_by_name": {
        "description": "Given a search string, returns the id of a matching object in the system of record. If multiple matches are found, it returns the one where the confidence is highest.",
        "arguments": [
            {
                "name": "query",
                "description": "The search string, could be for example customer’s name, part name, user name.",
                "type": "string"
            }
        ]
    },
    "create_actionable_tasks_from_text": {
        "description": "Given a text, extracts actionable insights, and creates tasks for them, which are kind of a work item.",
        "arguments": [
            {
                "name": "text",
                "description": "The text from which the actionable insights need to be created.",
                "type": "string"
            }
        ]
    },
    "who_am_i": {
        "description": "Returns the ID of the current user",
        "arguments": []
    }
}

# JSON Schema for a tool-call plan: an array of objects, one per tool call.
# Each call names the tool, lists its arguments as name/value string pairs, and
# describes what the call returns (label, description, type).
#
# The "required" list must only name keys declared under "properties";
# previously it listed "tool_returned_*" names that no property declares, so a
# validator would demand keys the schema never describes.
# NOTE(review): if a downstream consumer really emits "tool_returned_*" keys,
# rename the three properties instead -- confirm against that consumer.
json_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "tool_name": {"type": "string"},
            "arguments": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "argument_name": {"type": "string"},
                        "argument_value": {"type": "string"},
                    },
                    "required": ["argument_name", "argument_value"],
                },
            },
            "return_label": {"type": "string"},
            "return_description": {"type": "string"},
            "return_type": {"type": "string"},
        },
        "required": [
            "tool_name",
            "arguments",
            "return_label",
            "return_description",
            "return_type",
        ],
    },
}

#Code

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load a pre-trained sentence embedding model; every encode() call in this
# notebook goes through this single shared instance.
model = SentenceTransformer('all-mpnet-base-v2')

# Ordered list of the tool callables. make_embed_matrix embeds each entry's
# __doc__, and generate_matrix maps score index i back to tools[i], so the
# order here fixes the column order of the embedding matrix.
# NOTE(review): these callables are not defined in the cells shown here --
# presumably an earlier cell defines them with docstrings; confirm.
tools = [works_list, summarize_objects, prioritize_objects, add_work_items_to_sprint,
         get_sprint_id, get_similar_work_items, search_object_by_name,
         create_actionable_tasks_from_text, who_am_i]

def compute_similarity(query, description):
    """Return the cosine similarity between two pieces of text.

    Both strings are encoded as tensors with the module-level ``model`` (the
    query first, then the description) and compared with
    ``util.pytorch_cos_sim``; the single resulting value is unwrapped with
    ``.item()``.
    """
    encoded = [
        model.encode(text, convert_to_tensor=True)
        for text in (query, description)
    ]
    return util.pytorch_cos_sim(encoded[0], encoded[1]).item()

def make_embed_matrix(query):
    """Embed the query and every tool docstring, and find a basis of their span.

    The query and each ``tool.__doc__`` from the module-level ``tools`` list are
    encoded with ``model`` and stacked as the columns of a matrix ``A``:
    column 0 is the query and column ``i + 1`` is ``tools[i]``.

    Args:
        query: Text of the top-level user query.

    Returns:
        A tuple ``(projection_A, span_vectors)`` where ``span_vectors`` holds
        the left singular vectors of ``A`` for its non-negligible singular
        values (an orthonormal basis of the column space), and
        ``projection_A`` is ``A`` projected onto that basis, with the same
        column layout as ``A``.
    """
    # assumes model.encode returns a 1-D numpy vector per text -- TODO confirm
    texts = [query] + [tool.__doc__ for tool in tools]
    A = np.vstack([model.encode(text).astype(np.float32) for text in texts]).T

    # Reduced SVD: only the leading min(A.shape) left singular vectors can
    # belong to the span, so the full square U is never needed.
    U, Sigma, _ = np.linalg.svd(A, full_matrices=False)

    # Numerical rank from the singular values already in hand, using the same
    # default tolerance as np.linalg.matrix_rank (avoids a second SVD).
    tol = Sigma.max() * max(A.shape) * np.finfo(Sigma.dtype).eps
    rank_A = int(np.count_nonzero(Sigma > tol))

    # Columns of U for the non-negligible singular values span col(A).
    span_vectors = U[:, :rank_A]

    # Project every column of A (query and tools) onto that span.
    projection_A = span_vectors @ span_vectors.T @ A
    return projection_A, span_vectors

def compute_embed_similarity(query_embedding, tool_embedding):
    """Return the cosine similarity of two already-computed embeddings.

    The comparison is delegated to ``util.pytorch_cos_sim`` and the single
    resulting value is unwrapped with ``.item()``.
    """
    return util.pytorch_cos_sim(query_embedding, tool_embedding).item()

def interact_with_llm(query, prompt):
    """Send ``prompt`` (system) and ``query`` (user) to the chat model.

    Uses the module-level ``client`` and returns the content of the first
    choice, converted with ``str``.
    """
    conversation = [
        {'role': 'system', 'content': prompt},
        {'role': 'user', 'content': query},
    ]
    completion = client.chat.completions.create(
        model='gpt-4-1106-preview',  # Use gpt-3.5-16k if needed
        messages=conversation,
        temperature=0.05,
    )
    first_choice = completion.choices[0]
    return str(first_choice.message.content)


def decompose_query(query, prompt):
    """Ask the LLM to split ``query`` into subqueries and return them as a list.

    Args:
        query: The query to decompose.
        prompt: System prompt instructing the model to answer with a list.

    Returns:
        The list parsed from the model's reply.

    Raises:
        ValueError: If the reply contains no bracketed list, or the parsed
            literal is not a list.
        SyntaxError: If the bracketed text is not a valid Python literal.
    """
    import ast  # local import keeps this cell self-contained

    reply = interact_with_llm(query, prompt)

    # The model may wrap its answer in prose or a Markdown code fence; keep
    # only the outermost [...] span before parsing.
    start = reply.find('[')
    end = reply.rfind(']')
    if start == -1 or end < start:
        raise ValueError(f"No list found in LLM reply: {reply!r}")

    # literal_eval accepts literals only, so the reply cannot execute code the
    # way it could when it was passed to eval().
    subqueries_list = ast.literal_eval(reply[start:end + 1])
    if not isinstance(subqueries_list, list):
        raise ValueError(f"Expected a list of subqueries, got: {subqueries_list!r}")

    print(f"The main query '{query}' needs to be decomposed")
    print("Here is the subquery list:")
    print(subqueries_list)
    return subqueries_list

def generate_matrix(query, projection_A, span_vectors):
    """Score ``query`` against every tool inside the shared embedding span.

    The query is encoded with ``model``, projected onto ``span_vectors``, and
    compared by cosine similarity with each tool column of ``projection_A``.
    Each score is printed next to its tool.

    Args:
        query: Text of the (sub)query to score.
        projection_A: Matrix whose first column is skipped and whose remaining
            columns are matched, in order, with the module-level ``tools``.
        span_vectors: Basis matrix used to project the query embedding.

    Returns:
        A tuple ``(scores, return_tool)`` of two parallel lists: the similarity
        score for each tool, and the tool it belongs to.
    """
    # Row 0 of the transpose is the column make_embed_matrix built from the
    # original query; the remaining rows are the tools.
    tools_embed = projection_A.T[1:]
    query_embed = np.expand_dims(model.encode(query).astype(np.float32), axis=0)

    query_embed_span = span_vectors @ span_vectors.T @ query_embed.T

    scores = []
    return_tool = []
    for i, tool_embed in enumerate(tools_embed):
        # Compute each similarity once and reuse it for both the result and
        # the log line.
        score = compute_embed_similarity(query_embed_span.T, tool_embed)
        scores.append(score)
        return_tool.append(tools[i])
        print(score, tools[i])
    return scores, return_tool


# System prompt for query decomposition. The stringified tool catalogue is
# spliced in between <tools> tags. The few-shot outputs must be well-formed
# lists because the model's reply is parsed as one, and the tool names must
# match the keys of tools_dict.
prompt ='''
Given an original query, your task is to break it down into two subqueries. Each subquery may or may not be further broken down.
The union of these subqueries must return the original query. Think logically while decomposing the query. The tools that maybe used to
solve this query are given as''' + '''<tools>''' + str(tools_dict) + '''</tools>'''+ '''

Instructions:
Break down the original query into two subqueries. Return the two subqueries as a LIST.
To reference the value of the ith tool in the chain, use $$PREV[i-1] as the argument value (i = 0, 1, .. j-1; j = the current tool’s index in the array).
If the query contains pronouns like "my," "mine," "me," etc., the 'who_am_i' function should be called first.

Example:
Query: "Summarize issues similar to don:core:dvrv-us-1:devo/0:issue/1"
Output: ["Search for issues similar to don:core:dvrv-us-1:devo/0:issue/1", "Summarize the issues found"]

Query: "Search for issues similar to don:core:dvrv-us-1:devo/0:issue/1"
Output: ["Search object by name don:core:dvrv-us-1:devo/0:issue/1", "Get the work items similar to don:core:dvrv-us-1:devo/0:issue/1"]

Query: "Get the work items similar to don:core:dvrv-us-1:devo/0:issue/1"
Output: ["Get the work id of don:core:dvrv-us-1:devo/0:issue/1", "Get similar items using work_id ie $$PREV[0]"]
'''


In [None]:
def process_query(query, projection_A, span_vectors, threshold=0.7, visited_queries=None, chosen_tools=None):
    """Recursively pick the tools whose similarity to ``query`` meets a threshold.

    The query is scored against every tool. If no score reaches ``threshold``,
    the query is decomposed by the LLM and each subquery is processed in turn;
    otherwise every tool scoring at least ``threshold`` is selected.

    Args:
        query: The (sub)query to process.
        projection_A: Projected embedding matrix from ``make_embed_matrix``.
        span_vectors: Span basis from ``make_embed_matrix``.
        threshold: Minimum similarity for a tool to be selected.
        visited_queries: Set of queries already processed in this run, used to
            stop exact repeats. A fresh set is created when omitted.
        chosen_tools: List that accumulates selected tools. A fresh list is
            created when omitted.

    Returns:
        The list of selected tools, without duplicates, in selection order.
    """
    # Fresh containers per top-level call: mutable defaults would be shared by
    # every call, so a second run would see the first run's visited queries
    # and tools.
    if visited_queries is None:
        visited_queries = set()
    if chosen_tools is None:
        chosen_tools = []

    if query in visited_queries:
        print(f"Query '{query}' has already been visited. Avoiding infinite loop.")
        return chosen_tools

    visited_queries.add(query)

    scores, return_tool = generate_matrix(query, projection_A, span_vectors)
    if all(value < threshold for value in scores):
        subqueries_list = decompose_query(query, prompt)
        for subquery in subqueries_list:
            chosen_tools = process_query(subquery, projection_A, span_vectors, threshold, visited_queries, chosen_tools)
    else:
        for tool, score in zip(return_tool, scores):
            # ">=" is the exact complement of the "<" test above, so a score
            # equal to the threshold is selected rather than silently dropped.
            # Skip tools already selected via another subquery.
            if score >= threshold and tool not in chosen_tools:
                chosen_tools.append(tool)

    return chosen_tools

# Example usage
# The span is built once from the top-level query plus the tool docstrings and
# is then reused for every subquery produced during decomposition.
user_query = "Summarize issues similar to don:core:dvrv-us-1:devo/0:issue/1"
projection_A, span_vectors = make_embed_matrix(user_query)

# Uses the default threshold of 0.7; each scored query prints one line per tool.
chosen_tools = process_query(user_query, projection_A, span_vectors)
print("Chosen tools:", chosen_tools)


0.17574678361415863 <function works_list at 0x7eadf63445e0>
0.12623077630996704 <function summarize_objects at 0x7eadf6344550>
0.133187398314476 <function prioritize_objects at 0x7eadf6344670>
0.14656589925289154 <function add_work_items_to_sprint at 0x7eadf63444c0>
0.16182181239128113 <function get_sprint_id at 0x7eadf63443a0>
0.08232847601175308 <function get_similar_work_items at 0x7eadf6344430>
0.04749486595392227 <function search_object_by_name at 0x7eadf6344280>
0.16205738484859467 <function create_actionable_tasks_from_text at 0x7eadf6347c70>
0.1790088415145874 <function who_am_i at 0x7eadf6347eb0>
The main query 'Summarize issues similar to don:core:dvrv-us-1:devo/0:issue/1' needs to be decomposed
Here is the subquery list:
['Search object by name don:core:dvrv-us-1:devo/0:issue/1', 'Summarize objects with the list of similar issues found using $$PREV[0]']
0.4884686768054962 <function works_list at 0x7eadf63445e0>
0.24693933129310608 <function summarize_objects at 0x7eadf634455