In [1]:
import json
import sys
# Add the parent directory to the module search path; presumably this is what
# lets the `fns` package below resolve when the notebook runs from its own
# folder — TODO confirm the project layout.
sys.path.append("../")

from fns.azure_fns import write_file, read_file

# RESET KV STORE 🔥
# Uncomment to overwrite benchmarks/kv.json with an empty JSON object.
# write_file("benchmarks/kv.json", json.dumps({}, indent=2))

# Uncomment to print the current contents of benchmarks/kv.json.
# print(read_file("benchmarks/kv.json"))
# NOTE(review): `results` is not defined in this cell; it is assigned by the
# get_community query cell further down, so the line below only works after
# that cell has been run.
# print(json.dumps(results, indent=4, sort_keys=True))

## Azure CLI Commands

https://learn.microsoft.com/en-us/cli/azure/group?view=azure-cli-latest

## Cosmos Notebooks

[Cosmos RAGs]

- Gremlin
  - [gremlin starter]
  - [other gremlin examples]

## Ideas

### Recommender

keep track of nodes visited by a user and find mutual connections between those nodes (online updates)

### Beginner

- If the user is a beginner, bump the `important` link (many inbound links) - more likely to be relevant
- If the user asks a simple question and then asks another question, we infer the user is a beginner and bump the `important` link

### Content Analysis

- if there are few or no mutual connections between the nodes, ask for disambiguation
- community detection
- topic modeling
- centrality
- next-best action
- recommendations

### User Profile Generation

#### Individual

- `niche` number: query complexity proxy: use inbound links as a metric to determine how niche the answer is - thus how complex the query is
  - if the user asks x (e.g., 3) questions with an average/cumulative inbound link count of y (e.g., 5), we determine the user is a beginner
  - how to leverage the profile:

#### Cohort

- use graph traversal activities to cluster users based on their activity
- recommendations
- social networks
- communities
- user groups/personas
- customer journey mapping

Mean inbound edges for a sample of 500 = 4.31

## Pros and Cons

### Graph Databases
- pros:
  - Designed for surfacing/discovering relationships (direct and indirect) between entities
  - Analysis can be done on the entire graph relatively quickly
  - Summary statistics can be generated with ease
- cons:
  1. Requires modeling the data as a graph, which the raw data may not lend itself to without transformation
  2. Subgraph traversal relevance depends on picking a relevant entry point (i.e., starting from an irrelevant entity only surfaces entities related to it, which produces irrelevant results)
- remediation techniques:
  1. Use LLM to generate graph entities from plain text (narrative) data
  2. Use a hybrid approach (e.g., graph + vector) to leverage the strengths of both

### Vector Databases
- pros:
  - Designed for easy and fast retrieval of similar pieces of text
  - Relationships are not directly modeled, but rather inferred by their proximity in the vector space (easy)
- cons:
  1. Result quality highly dependent on the quality of the query
  2. May return results that are similar to the query, but from widely different contexts
- remediation techniques:
  1. Provide suggestions to the user to refine their query
  2. Use methods of disambiguation to determine the context of the query

## Opportunities
- hash table of titles/headings to vectors to find entry-point for traversal
- inbound -> outbound customer journey assistance
- follow-up questions (related articles from articles/chunks)
- profiling users based on their visitation patterns (customer journey/cohorts/personas)

[Cosmos RAGs]: https://github.com/microsoft/AzureDataRetrievalAugmentedGenerationSamples/blob/main/Python/CosmosDB-MongoDB-vCore/CosmosDB-MongoDB-vCore_AzureOpenAI_Tutorial.ipynb
[gremlin starter]: https://github.com/Azure-Samples/azure-cosmos-db-graph-python-getting-started/blob/master/connect.py
[other gremlin examples]: https://github.com/search?q=org%3AAzure-Samples%20gremlin&type=code


In [2]:
import nest_asyncio
from fns.rag_fns import get_community
from fns.azure_fns import graph_get_most_connected
# Patch asyncio so an event loop can be re-entered from inside the notebook's
# already-running loop. Presumably required because the `fns` helpers drive
# their own event loop — TODO confirm.
nest_asyncio.apply()

# results = get_community("How do you calculate tax in the state of PA?")
# results = get_community("How do I add a taxpayer?")

# results = get_community("Where can I find information on how to integrate the Exemption Certificate Wizard?") 
# No match

# What is the purpose of the Buyer Input Tax event?
# -> Financial events in O Series	Asset/Goods Movement event
# results = get_community("What is the purpose of the Buyer Input Tax event?") 
# Match in 3rd kb result > 2nd idx sibling

# What is the maximum number of characters that can be used for the taxpayer's code
# -> Provide general taxpayer information in O Series	Field descriptions for the Taxpayers > General tab Part 1
# results = get_community("What is the maximum number of characters that can be used for the taxpayer's code in O Series?") 
# works if you add 'in O Series'  and grab the 3rd kb result > 1st idx sibling

# What are the different tabs available in the customer setup process and what information is entered
# -> Set up your customers in O Series	Navigate to the Customer feature
# results = get_community("What are the different tabs available in the customer setup process in O Series?") 
# ✅ works if you add 'in O Series' to the question

# What is the jurisdiction and imposition type for the tax that is paid by the buyer
# -> MATCHED!	Test for Brazil ISS - Intercity service provider perspective
# results = get_community("What is the jurisdiction and imposition type for the tax that is paid by the buyer?")
# match in 2nd kb result > 0th idx sibling

# What does the Includes Taxable Amount check box indicate?	
# -> Specify details for a post calculation evaluation rule	Field descriptions for Post Calculation Evaluation rule > Details tab Part 2
# results = get_community("What does the 'Includes Taxable Amount' check box indicate Taxability Review and Data Export Methods")
# works if you add 'for a post calculation' to the question and grab the 3rd kb result > 3rd idx sibling

# What does the Is Service column indicate in the report?
# -> Transaction Detail extract in O Series Custom Reports - VRA	Notes: Part 12
# results = get_community("What does the Is Service column indicate in the report?")
# ✅ gets the right result, but the product isn't labeled O Series Cloud (mislabelled)

# What are the default values for the O Series configuration parameters in ITI?
# -> O Series configuration parameters for ITI	O Series configuration parameters for ITI Part 2
# results = get_community("What are the default values for the O Series configuration parameters in ITI?")
# FIXME: `important`, `mutuals` and `siblings` are all empty because they are all chunks from the same page
# gets the right page, but not the right chunk ("Part 2")

# What prerequisites and setup information do I need to be aware of before extracting data from
# -> Extracting data from SAP for ITI	Extracting data from SAP for ITI
# results = get_community("What prerequisites and setup information do I need to be aware of before extracting data from SAP for ITI?")
# gets correct chunk, but same problem as above, all chunks are from the same page

# What are the required user authentication parameters for O Series On-Premise and On
# -> Command line utility: getFile	Command line utility: getFile
# results = get_community("What are the required user authentication parameters for file downloads?")
# gets the correct answer if you add "for file downloads" to the question


# Active benchmark query: this is the only get_community call left uncommented
# in this cell, so it is the one that populates `results`.
# Question: What are the line-item level tabs in the O Series Transaction Tester?
# -> Set up line-item details in the O Series Transaction Tester	Set up line-item details in the O Series Transaction Tester
#    (the "->" line is presumably the benchmark's expected page and chunk heading — TODO confirm)
results = get_community("What are the line-item level tabs in the O Series Transaction Tester?")
# ✅ works now 🤔

# What data elements were added to the O Series 9.0 schema to support
# -> XML schema changes for Communications Tax in O Series 9.0	New communications tax data elements
# results = get_community("What data elements were added to the O Series 9.0 schema to support communications tax?")
# ✅ looks like it works now :: content seems to change a lot


# What is the purpose of the batch.client.userName parameter in the Batch Client
# -> Process a BCI database using runBatchClientInterface in Client Utilities	Process a BCI database using runBatchClientInterface in Client Utilities
# results = get_community("What is the purpose of the 'batch.client.userName' parameter in the Batch Client?")
# ✅ actually gets a good result, not the one specified in the benchmark though




# Pretty-print the value returned by the uncommented get_community call above.
# NOTE(review): `json` is not imported in this cell; this relies on the
# `import json` in the first cell having been executed.
print(json.dumps(results, indent=4))

# Optional follow-up (disabled): query graph_get_most_connected for three
# hard-coded node ids with the "both" argument and print the result.
# connections = graph_get_most_connected(["COSMyEnterprise:en:Taxpayers", 'COSMyEnterprise:en:169626', 'COSMyEnterprise:en:169591'], "both")
# print(json.dumps(connections, indent=4))

graph_get_connected Query: g.V('COSTools:en:185493').out('parent').has('heading', 'Set up line item details in the O Series Transaction Tester').tree()
graph_get_connected Query: g.V('COSTools:en:185493').out('parent').tree()
node_id: COSTools:en:185493
url: https://community.vertexinc.com/s/document-item?bundleId=COSTools&topicId=185493.html&_LANG=enus
heading: Set up line item details in the O Series Transaction Tester
graph_get_connected Query: g.V('COSTools:en:185493').out('related').tree()
graph_get_connected Query: g.V('COSTools:en:185493:0').out('related').tree()
graph_get_connected Query: g.V('COSTools:en:185341').out('parent').has('heading', 'Line Type codes in O Series').tree()
graph_get_connected Query: g.V('COSTools:en:185341').out('parent').tree()
node_id: COSTools:en:185341
url: https://community.vertexinc.com/s/document-item?bundleId=COSTools&topicId=185341.html&_LANG=enus
heading: Line Type codes in O Series
graph_get_connected Query: g.V('COSTools:en:185341').out('rela

In [3]:
from fns.graph_construction import download_af_files
import json
import nest_asyncio
# Allow nested event-loop use inside the notebook's running loop; presumably
# needed by download_af_files — TODO confirm.
nest_asyncio.apply()


# Product names for a reduced test download. In this cell they are referenced
# only by the commented-out `targeted_products=` argument of download_af_files.
# One name per line; none of the names contains whitespace, so a plain
# whitespace split yields the list in the order written.
test_targets = """
    IrisProductGuide
    GettingStartedWithProductDocumentation
    OSeries9ReleaseNotesCloud
    WhatsNewinVertexCloudfor2022
    COSOverview
    Vertexe-InvoicingReleaseNotes
    OSInstallationGuide
    ITIOverview
    AccessConnectorDocumentation
    AccountandBillingforVertexCloud
    AddOnFeaturesinVertexCloud
    AddressCleansing
    AdobeCommerce-ReleaseNotes
""".split()

# download_af_files("test") # , targeted_products=test_targets)

#### Fixing the `CERTIFICATE_VERIFY_FAILED` error in websocket-client

This can happen on macOS when connecting to a websocket server. To fix this, run
the `Install Certificates.command` script that comes with Python.

```sh
open "/Applications/Python 3.<your version>/Install Certificates.command"
```


In [4]:
import json
import nest_asyncio
from fns.azure_fns import upsert_node, exec_gremlin, upsert_edges, prune
# Allow nested event-loop use inside the notebook's running loop; presumably
# needed by the azure_fns graph helpers — TODO confirm.
nest_asyncio.apply()

# Sample "person" vertices for a small family graph. Each entry has the keys
# the commented-out `upsert_node(**node)` call below would unpack: node_id,
# node_type and properties (which holds a single `order` value).
nodes = [
    {"node_id": person, "node_type": "person", "properties": {"order": order}}
    for person, order in (
        ("tu", 1),
        ("kim", 1),
        ("logan", 2),
        ("william", 3),
        ("amelia", 3),
        ("sung", 3),
        ("kyo", 3),
        ("catherine", 3),
    )
]


# Sample relationships between the person vertices. Each entry has the keyword
# arguments the commented-out `upsert_edges(**edge)` call below would unpack.
# Only the final entry supplies `rel_from`; presumably upsert_edges treats it
# as optional — TODO confirm against fns.azure_fns.
edges = [
    dict(from_id="tu", rel_to="younger", to_id="kim"),
    dict(from_id="kim", rel_to="parent", to_id="logan"),
    dict(from_id="logan", rel_to="parent", to_id="william"),
    dict(from_id="logan", rel_to="parent", to_id="amelia"),
    dict(from_id="sung", to_id="kim", rel_to="parent"),
    dict(from_id="kyo", to_id="kim", rel_to="parent"),
    dict(from_id="kim", to_id="catherine", rel_to="parent"),
    dict(from_id="kyo", to_id="tu", rel_to="parent"),
    dict(from_id="tu", to_id="sung", rel_from="parent", rel_to="child"),
]

# nodes_out = [upsert_node(**node) for node in nodes]
# edges_out = [upsert_edges(**edge) for edge in edges]
# pruned = prune("logan")

# print(json.dumps(nodes_out, indent=2))
# print(json.dumps(edges_out, indent=2))
# print(json.dumps(pruned, indent=2))