# Data Storage and Access Patterns (including Astra Vector Search)



# Table Schema

```
CREATE TABLE IF NOT EXISTS workspan.customer_opportunities (
    customer_id text,
    partner_id text,
    opportunity_id text,
    customer_name text static,
    next_step text,
    cadence text,
    llm_output text,
    opportunity map<text, text>,
    llm_output_embedding vector<float, 1536>,
    sentiment text,
    PRIMARY KEY ((customer_id, partner_id), opportunity_id)
) WITH CLUSTERING ORDER BY (opportunity_id DESC)

```



* customer_id: Unique identifier for each customer
* partner_id: AWS / Azure / GCP
* opportunity_id: Unique identifier for each opportunity
* opportunity: Dynamic collection of data field names and corresponding values specific to the opportunity. Other fields can be stored in separate table columns.
* llm_ouput: LLM output summarizing the 'next steps' , 'challenges' and 'open items'  
* llm_ouput_embedding: Text embeddings corresponding to llm_output
* sentiment: Positive, Neutral and Negative sentiment derived from 'next step' and 'cadence' data fields. This can be determined either by using a sentiment analysis library such as  Python Natural Language Toolkit (NLTK) or from LLM.


# Imports

In [None]:
!pip install openai cassandra-driver llama-index

In [3]:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from cassandra.query import dict_factory
from cassandra.query import SimpleStatement
import openai
from llama_index import ListIndex
from llama_index.readers.schema.base import Document
from IPython.display import Markdown, display

# Keys & Environment Variables

In [4]:
# keys and tokens here
openai_api_key = "<openai_key>"
openai.api_key = openai_api_key
cass_user = '<astra_client_id>'
cass_pw = '<astra_secret>'
scb_path = 'secure-connect-vector-search-demo.zip'

# Select a model to compute embeddings

In [5]:
model_id = "text-embedding-ada-002"

# Connect to the Cluster

In [None]:
cloud_config= {
  'secure_connect_bundle': scb_path
}
auth_provider = PlainTextAuthProvider(cass_user, cass_pw)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session = cluster.connect()
session.set_keyspace('workspan')

# Drop / Create Schema

In [None]:

# # Create Table
session.execute(f"""CREATE TABLE IF NOT EXISTS workspan.customer_opportunities (
    customer_id text,
    partner_id text,
    opportunity_id text,
    customer_name text static,
    next_step text,
    cadence text,
    llm_output text,
    opportunity map<text, text>,
    llm_output_embedding vector<float, 1536>,
    sentiment text,
    PRIMARY KEY ((customer_id, partner_id), opportunity_id)
) WITH CLUSTERING ORDER BY (opportunity_id DESC)""")



In [None]:
session.execute(f"""CREATE CUSTOM INDEX ON workspan.customer_opportunities(opportunity) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'""")

In [None]:
session.execute(f"""CREATE CUSTOM INDEX ON workspan.customer_opportunities (llm_output_embedding) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'""")

In [None]:
session.execute(f"""CREATE CUSTOM INDEX ON workspan.customer_opportunities (sentiment) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'""")

# Create the following SAI with analyser from CQL


```
CREATE CUSTOM INDEX ON workspan.customer_opportunities(llm_output) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex' WITH OPTIONS = {'index_analyzer': '{
"tokenizer" : {"name" : "standard"},
"filters" : [{"name" : "porterstem"}] }'};
```



# Calculate the sentiment

Sentiment can be calculated either by using a sentiment analysis library such as Python Natural Language Toolkit (NLTK) or from LLM.

In [8]:
def indentify_sentiment(next_step_and_cadence):
    import nltk
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    nltk.download('vader_lexicon')
    sid = SentimentIntensityAnalyzer()

    sentiment_scores = sid.polarity_scores(next_step_and_cadence)
    if sentiment_scores['compound'] >= 0.05:
        sentiment = 'positive'
    elif sentiment_scores['compound'] <= -0.05:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'

    print(f"Sentiment: {sentiment}")
    return sentiment


# Generate LLM ouput

In [31]:
def extract_llm_information(next_step_and_cadence):

    llm_input = f"""
                Given the following information, please identify the challenges, next steps, and open items. If there is a mention of any meetings that need to be scheduled, please list it under 'schedule a meeting:'. If there's no indication of a meeting, then avoid the listing:

                {next_step_and_cadence}

                End of information.
                """

    response = openai.ChatCompletion.create(
        #model="gpt-3.5-turbo",
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant. Use the instructions provided to process the information."},
            {"role": "user", "content": llm_input}
        ]
    )

    return response.choices[0].message['content']

# Insert records with the calculated sentiment and LLM embeddings

In [32]:
query = SimpleStatement(
            f"""
            INSERT INTO workspan.customer_opportunities
            (customer_id, partner_id, opportunity_id, customer_name, llm_output,  opportunity , llm_output_embedding , sentiment )
            VALUES (%s, %s, %s, %s, %s , %s , %s , %s )
            """
        )

# record #1

next_step = f"""
Action Items:
From Michael, confirmed deprioritize. From Anjaney, account executive interest to schedule meeting - Anjaney to schedule call with Nirav/Amy on R&D.
"""

cadence = f"""
Next Step:
08/16/2023 : Review partner information updates and update opportunity details. 8/17(LR) - connecting with Partner to offer co-sell support

Next Step History:
null;08/16/2023 : Review partner information updates and update opportunity details.;08/16/2023 : Review partner information updates and update opportunity details. 8/17(LR) - connecting with Partner to offer co-sell support
"""

next_step_and_cadence = next_step + cadence

sentiment = indentify_sentiment(next_step_and_cadence)
llm_output = extract_llm_information(next_step_and_cadence)
print(f"{llm_output}/n")
embedding_llm_output = openai.Embedding.create(input= llm_output, model=model_id)['data'][0]['embedding']
session.execute(query, ('CUS100' , 'AWS', 'WS-7202838a', 'Teradyne, Inc.', llm_output,  {'Customer State' : 'Ile-de-France', 'Customer Country' : 'France', 'Deal Size' : '30000', 'Description' : 'Pain Point: Persistent phishing attacks and email compromises leading to data leaks and compromised accounts. Description: The customer is struggling with the recurring threat of phishing attacks that infiltrate their systems, compromising sensitive data and risking the confidentiality of critical information.' }
                        , embedding_llm_output , sentiment))

# record #2

next_step = f"""
Action Items:
From Autumn, send recording of last call and our discussed inputs from demo 8/28. Ramesh will provide to Caroline by early next week (of 9/11).
"""

cadence = f"""
REVIEW TECH & Economic Proposal
"""

next_step_and_cadence = next_step + cadence

sentiment = indentify_sentiment(next_step_and_cadence)
llm_output = extract_llm_information(next_step_and_cadence)
print(f"{llm_output}/n")
embedding_llm_output = openai.Embedding.create(input= llm_output, model=model_id)['data'][0]['embedding']
session.execute(query, ('CUS100' , 'AWS', 'WS-8a038b8a', 'Teradyne, Inc.', llm_output,  {'Customer State' : 'Ile-de-France', 'Customer Country' : 'France', 'Deal Size' : '30000', 'Description' : 'Pain Point: Persistent phishing attacks and email compromises leading to data leaks and compromised accounts. Description: The customer is struggling with the recurring threat of phishing attacks that infiltrate their systems, compromising sensitive data and risking the confidentiality of critical information.' }
                        , embedding_llm_output , sentiment))

# record #3

next_step = f"""
Action Items:
Joint sync set for 9/7. Enablement session to follow + in person account mapping. Caroline / Michael to begin coordinating. EAI presence
"""

cadence = f"""
07/05/2023: Contact Federico Gandolfo,federico.hernan.gandolfo@abc.com,+54.911.3204.4871 to discuss Deal support
"""

next_step_and_cadence = next_step + cadence

sentiment = indentify_sentiment(next_step_and_cadence)
llm_output = extract_llm_information(next_step_and_cadence)
print(f"{llm_output}/n")
embedding_llm_output = openai.Embedding.create(input= llm_output, model=model_id)['data'][0]['embedding']
session.execute(query, ('CUS100' , 'AWS', 'WS-8a3b0348', 'Teradyne, Inc.', llm_output,  {'Customer State' : 'Ile-de-France', 'Customer Country' : 'France', 'Deal Size' : '30000', 'Description' : 'Pain Point: Persistent phishing attacks and email compromises leading to data leaks and compromised accounts. Description: The customer is struggling with the recurring threat of phishing attacks that infiltrate their systems, compromising sensitive data and risking the confidentiality of critical information.' }
                        , embedding_llm_output , sentiment))

# record #4

next_step = f"""
Action Items:
From Caroline, user community engaged to respond to questions. @Dataiku - How can we get initial data from user community/pull together PoV for client? Action (Asan/Ken (sp?)): In-person outreach to Deloitte users and follow-up to 5 responses received.
"""

cadence = f"""
null;06/20/2023: Contact Federico Gandolfo,federico.hernan.gandolfo@abc.com,+54.911.3204.4871 to discuss Deal support;07/05/2023: Contact Federico Gandolfo,federico.hernan.gandolfo@abc.com,+54.911.3204.4871 to discuss Deal support
"""

next_step_and_cadence = next_step + cadence

sentiment = indentify_sentiment(next_step_and_cadence)
llm_output = extract_llm_information(next_step_and_cadence)
print(f"{llm_output}/n")
embedding_llm_output = openai.Embedding.create(input= llm_output, model=model_id)['data'][0]['embedding']
session.execute(query, ('CUS100' , 'AWS', 'WS-8a7128a3', 'Teradyne, Inc.', llm_output,  {'Customer State' : 'Ile-de-France', 'Customer Country' : 'France', 'Deal Size' : '30000', 'Description' : 'Pain Point: Persistent phishing attacks and email compromises leading to data leaks and compromised accounts. Description: The customer is struggling with the recurring threat of phishing attacks that infiltrate their systems, compromising sensitive data and risking the confidentiality of critical information.' }
                        , embedding_llm_output , sentiment))

# record #4

next_step = f"""
Propsal did not go thru. No budget Left. Negative.
"""

cadence = f"""
No further follow up required.
"""

next_step_and_cadence = next_step + cadence

sentiment = indentify_sentiment(next_step_and_cadence)
llm_output = extract_llm_information(next_step_and_cadence)
print(f"{llm_output}/n")
embedding_llm_output = openai.Embedding.create(input= llm_output, model=model_id)['data'][0]['embedding']
session.execute(query, ('CUS100' , 'AWS', 'WS-8a7128a4', 'Teradyne, Inc.', llm_output,  {'Customer State' : 'Ile-de-France', 'Customer Country' : 'France', 'Deal Size' : '30000', 'Description' : 'Pain Point: Persistent phishing attacks and email compromises leading to data leaks and compromised accounts. Description: The customer is struggling with the recurring threat of phishing attacks that infiltrate their systems, compromising sensitive data and risking the confidentiality of critical information.' }
                        , embedding_llm_output , sentiment))



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Sentiment: positive
Challenges:
1. Identified from the available information, a challenge is the deprioritization confirmed by Michael. However, further context is needed to ascertain the exact nature of this challenge.

Next Steps:
1. On 08/16/2023, review partner information updates and update opportunity details.
2. On 08/17/2023, LR is to connect with the Partner to offer co-sell support.

Open Items:
1. Scheduling of meeting as mentioned by Anjaney needs to be confirmed.

Schedule a Meeting:
1. Anjaney to schedule a call with Nirav/Amy on R&D./n
Sentiment: neutral


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Challenges:
1. Ensuring that the recording of the last call and our discussed inputs from demo 8/28 are sent in a timely manner.
2. Reviewing the Technical and Economic proposal may require a significant amount of time and expertise.

Next Steps:
1. Autumn must send the recording of the last call and their discussed inputs from demo 8/28.
2. Ramesh needs to provide the aforementioned resources to Caroline by early next week (of 9/11).
3. Reviewing the Technical and Economic proposal.

Open Items:
1. Waiting for Autumn to complete the task of sending the recording and discussed inputs.
2. Expecting Ramesh to provide resources to Caroline by early next week.
3. Review of the Technical and Economic proposal is still pending.

Schedule a Meeting:
No meetings to be scheduled as per the information provided./n
Sentiment: positive


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Challenges:
1. Coordinating with all parties involved in the joint sync and enablement session.
2. Engaging with Federico Gandolfo for deal support.

Next Steps:
1. Conduct the joint sync on 9/7 followed by an enablement session and in-person account mapping.
2. Contact and coordinate with Federico Gandolfo to discuss deal support on 07/05/2023.

Open Items:
1. Begin coordination between Caroline and Michael for the joint sync and enablement session.
2. Establishing EAI presence.

Schedule a Meeting: 
1. A joint sync is scheduled for 9/7.
2. A meeting with Federico Gandolfo needs to be arranged on 07/05/2023./n
Sentiment: positive


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Challenges:
1. Getting initial data from the user community.
2. Pulling together a PoV for a client.
3. Reaching out to Deloitte users for questions and feedback, especially those that haven't responded.

Next Steps:
1. Asan and Ken will initiate an in-person outreach to Deloitte users and follow up on the 5 responses received. 
2. Caroline will continue to engage the user community to respond to questions. 
3. Federico Gandolfo needs to be contacted to discuss Deal support. 

Open Items: 
1. What is the strategy to get initial data from the user community?
2. What would be the ideal PoV for the client?

Schedule a Meeting:
1. Federico Gandolfo needs to be contacted on 06/20/2023 to discuss Deal support.
2. A second meeting with Federico Gandolfo needs to be scheduled on 07/05/2023 for further discussion on Deal support./n
Sentiment: negative


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Challenges: 
1. The proposal did not get approval. 
2. There is no budget left.

Next Steps:
1. No further follow up required.

Open Items:
None

Schedule a Meeting: 
None/n


<cassandra.cluster.ResultSet at 0x7910a4651ed0>

# Opportunity Specific Queries

What is the customer sentiment on this opportunity?

In [11]:
cqlSelect = f'''SELECT * FROM workspan.customer_opportunities WHERE customer_id = 'CUS100' and partner_id = 'AWS' and opportunity_id = 'WS-7202838a'  ;'''
rows = session.execute(cqlSelect)
for row_i, row in enumerate(rows):
    print(f'\nRow {row_i}:')
    print(f'    customer_id:      {row.customer_id}')
    print(f'    partner_id:      {row.partner_id}')
    print(f'    opportunity_id:      {row.opportunity_id}')
    print(f'    customer_name:      {row.customer_name}')
    print(f'    sentiment:      {row.sentiment}')

print('\n...')


Row 0:
    customer_id:      CUS100
    partner_id:      AWS
    opportunity_id:      WS-7202838a
    customer_name:      Teradyne, Inc.
    sentiment:      positive

...


What are the next steps for this opportunity? Result can be further parsed to display only the next steps. Same query returns open items and challenges as well.  

In [34]:
cqlSelect = f'''SELECT * FROM workspan.customer_opportunities WHERE customer_id = 'CUS100' and partner_id = 'AWS' and opportunity_id = 'WS-7202838a';  '''
rows = session.execute(cqlSelect)
for row_i, row in enumerate(rows):
    print(f'\nRow {row_i}:')
    print(f'    customer_id:      {row.customer_id}')
    print(f'    partner_id:      {row.partner_id}')
    print(f'    opportunity_id:      {row.opportunity_id}')
    print(f'    customer_name:      {row.customer_name}')
    print(f'    llm_output:    \n  {row.llm_output}')

print('\n...')


Row 0:
    customer_id:      CUS100
    partner_id:      AWS
    opportunity_id:      WS-7202838a
    customer_name:      Teradyne, Inc.
    llm_output:    
  Challenges:
1. Identified from the available information, a challenge is the deprioritization confirmed by Michael. However, further context is needed to ascertain the exact nature of this challenge.

Next Steps:
1. On 08/16/2023, review partner information updates and update opportunity details.
2. On 08/17/2023, LR is to connect with the Partner to offer co-sell support.

Open Items:
1. Scheduling of meeting as mentioned by Anjaney needs to be confirmed.

Schedule a Meeting:
1. Anjaney to schedule a call with Nirav/Amy on R&D.

...


# Customer Specific Queries (across multiple opportunities)

Identify the wins

In [35]:
cqlSelect = f'''SELECT * FROM workspan.customer_opportunities WHERE customer_id = 'CUS100' and partner_id = 'AWS' and sentiment = 'positive'  ;'''
rows = session.execute(cqlSelect)
for row_i, row in enumerate(rows):
    print(f'\nRow {row_i}:')
    print(f'    customer_id:      {row.customer_id}')
    print(f'    partner_id:      {row.partner_id}')
    print(f'    opportunity_id:      {row.opportunity_id}')
    print(f'    customer_name:      {row.customer_name}')
    print(f'    sentiment:      {row.sentiment}')

print('\n...')


Row 0:
    customer_id:      CUS100
    partner_id:      AWS
    opportunity_id:      WS-8a7128a3
    customer_name:      Teradyne, Inc.
    sentiment:      positive

Row 1:
    customer_id:      CUS100
    partner_id:      AWS
    opportunity_id:      WS-8a3b0348
    customer_name:      Teradyne, Inc.
    sentiment:      positive

Row 2:
    customer_id:      CUS100
    partner_id:      AWS
    opportunity_id:      WS-7202838a
    customer_name:      Teradyne, Inc.
    sentiment:      positive

...


Identify opportunities with next step to schedule a meeting

In [36]:
vectorsearchon = 'next action to set up a meeting'
embedding = openai.Embedding.create(input= vectorsearchon, model=model_id)['data'][0]['embedding']

cqlSelect = f'''SELECT * FROM workspan.customer_opportunities WHERE customer_id = 'CUS100' and partner_id = 'AWS' and llm_output : 'schedule a meeting' ORDER BY llm_output_embedding ANN OF {embedding} LIMIT 10;  '''
rows = session.execute(cqlSelect)
for row_i, row in enumerate(rows):
    print(f'\nRow {row_i}:')
    print(f'    customer_id:      {row.customer_id}')
    print(f'    partner_id:      {row.partner_id}')
    print(f'    opportunity_id:      {row.opportunity_id}')
    print(f'    customer_name:      {row.customer_name}')
    print(f'    sentiment:      {row.sentiment}')
    print(f'    llm_output:    \n  {row.llm_output}')

print('\n...')



Row 0:
    customer_id:      CUS100
    partner_id:      AWS
    opportunity_id:      WS-8a038b8a
    customer_name:      Teradyne, Inc.
    sentiment:      neutral
    llm_output:    
  Challenges:
1. Ensuring that the recording of the last call and our discussed inputs from demo 8/28 are sent in a timely manner.
2. Reviewing the Technical and Economic proposal may require a significant amount of time and expertise.

Next Steps:
1. Autumn must send the recording of the last call and their discussed inputs from demo 8/28.
2. Ramesh needs to provide the aforementioned resources to Caroline by early next week (of 9/11).
3. Reviewing the Technical and Economic proposal.

Open Items:
1. Waiting for Autumn to complete the task of sending the recording and discussed inputs.
2. Expecting Ramesh to provide resources to Caroline by early next week.
3. Review of the Technical and Economic proposal is still pending.

Schedule a Meeting:
No meetings to be scheduled as per the information provided

I want to know more about the customer and the challenges so that it is possible to plan accordingly (Implement query using agent framework such as LangChain, LlamaIndex..)

In [37]:
vectorsearchon = 'find opportunity with listed challenges'
embedding = openai.Embedding.create(input= vectorsearchon, model=model_id)['data'][0]['embedding']

cqlSelect = f'''SELECT llm_output FROM workspan.customer_opportunities WHERE customer_id = 'CUS100' and partner_id = 'AWS' ORDER BY llm_output_embedding ANN OF {embedding} LIMIT 10;  '''
rows = session.execute(cqlSelect)
print(rows)
documents = []
for item in rows:
    documents.append(Document(text=str(item)))
    print(str(item))

index = ListIndex.from_documents(documents)

# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What are the Challenges?")

# visualize in console or web
print(response)
display(Markdown(f"<b>{response}</b>"))

<cassandra.cluster.ResultSet object at 0x7910a4482650>
Row(llm_output='Challenges:\n1. Identified from the available information, a challenge is the deprioritization confirmed by Michael. However, further context is needed to ascertain the exact nature of this challenge.\n\nNext Steps:\n1. On 08/16/2023, review partner information updates and update opportunity details.\n2. On 08/17/2023, LR is to connect with the Partner to offer co-sell support.\n\nOpen Items:\n1. Scheduling of meeting as mentioned by Anjaney needs to be confirmed.\n\nSchedule a Meeting:\n1. Anjaney to schedule a call with Nirav/Amy on R&D.')
Row(llm_output="Challenges:\n1. Getting initial data from the user community.\n2. Pulling together a PoV for a client.\n3. Reaching out to Deloitte users for questions and feedback, especially those that haven't responded.\n\nNext Steps:\n1. Asan and Ken will initiate an in-person outreach to Deloitte users and follow up on the 5 responses received. \n2. Caroline will continue t

<b>The challenges mentioned in the context are:
1. Identified deprioritization, but further context is needed to ascertain the exact nature of this challenge.
2. Getting initial data from the user community.
3. Pulling together a PoV for a client.
4. Reaching out to Deloitte users for questions and feedback, especially those that haven't responded.
5. The proposal did not get approval.
6. There is no budget left.
7. Ensuring that the recording of the last call and discussed inputs from demo are sent in a timely manner.
8. Reviewing the Technical and Economic proposal may require a significant amount of time and expertise.
9. Coordinating with all parties involved in the joint sync and enablement session.
10. Engaging with Federico Gandolfo for deal support.</b>