# Data Storage and Access Patterns (including Astra Vector Search)



# Table Schema

```
CREATE TABLE IF NOT EXISTS workspan.customer_opportunities (
    customer_id text,
    partner_id text,
    opportunity_id text,
    customer_name text static,
    next_step text,
    cadence text,
    llm_output text,
    opportunity map<text, text>,
    llm_output_embedding vector<float, 1536>,
    sentiment text,
    PRIMARY KEY ((customer_id, partner_id), opportunity_id)
) WITH CLUSTERING ORDER BY (opportunity_id DESC)

```



* customer_id: Unique identifier for each customer
* partner_id: AWS / Azure / GCP
* opportunity_id: Unique identifier for each opportunity
* opportunity: Dynamic collection of data field names and corresponding values specific to the opportunity. Other fields can be stored in separate table columns.
* llm_ouput: LLM output summarizing the 'next steps' , 'challenges' and 'open items'  
* llm_ouput_embedding: Text embeddings corresponding to llm_output
* sentiment: Positive, Neutral and Negative sentiment derived from 'next step' and 'cadence' data fields. This can be determined either by using a sentiment analysis library such as  Python Natural Language Toolkit (NLTK) or from LLM.


# Imports

In [None]:
!pip install openai cassandra-driver llama-index

In [2]:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from cassandra.query import dict_factory
from cassandra.query import SimpleStatement
import openai
from llama_index import ListIndex
from llama_index.readers.schema.base import Document
from IPython.display import Markdown, display

# Keys & Environment Variables

In [3]:
# keys and tokens here
openai_api_key = "<openai_key>"
openai.api_key = openai_api_key
cass_user = '<user>'
cass_pw = '<pwd>'
scb_path = 'secure-connect-vector-search-demo.zip'

# Select a model to compute embeddings

In [4]:
model_id = "text-embedding-ada-002"

# Connect to the Cluster

In [None]:
cloud_config= {
  'secure_connect_bundle': scb_path
}
auth_provider = PlainTextAuthProvider(cass_user, cass_pw)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session = cluster.connect()
session.set_keyspace('workspan')

# Drop / Create Schema

In [None]:

# # Create Table
session.execute(f"""CREATE TABLE IF NOT EXISTS workspan.customer_opportunities (
    customer_id text,
    partner_id text,
    opportunity_id text,
    customer_name text static,
    next_step text,
    cadence text,
    llm_output text,
    opportunity map<text, text>,
    llm_output_embedding vector<float, 1536>,
    sentiment text,
    PRIMARY KEY ((customer_id, partner_id), opportunity_id)
) WITH CLUSTERING ORDER BY (opportunity_id DESC)""")



In [None]:
session.execute(f"""CREATE CUSTOM INDEX ON workspan.customer_opportunities(opportunity) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'""")

In [None]:
session.execute(f"""CREATE CUSTOM INDEX ON workspan.customer_opportunities (llm_output_embedding) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'""")

In [None]:
session.execute(f"""CREATE CUSTOM INDEX ON workspan.customer_opportunities (sentiment) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'""")

# Create the following SAI with analyser from CQL


```
CREATE CUSTOM INDEX ON workspan.customer_opportunities(llm_output) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex' WITH OPTIONS = {'index_analyzer': '{
"tokenizer" : {"name" : "standard"},
"filters" : [{"name" : "porterstem"}] }'};
```



# Calculate the sentiment

Sentiment can be calculated either by using a sentiment analysis library such as Python Natural Language Toolkit (NLTK) or from LLM.

In [10]:
def indentify_sentiment(next_step_and_cadence):
    import nltk
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    nltk.download('vader_lexicon')
    sid = SentimentIntensityAnalyzer()

    sentiment_scores = sid.polarity_scores(next_step_and_cadence)
    if sentiment_scores['compound'] >= 0.05:
        sentiment = 'positive'
    elif sentiment_scores['compound'] <= -0.05:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'

    print(f"Sentiment: {sentiment}")
    return sentiment


# Generate LLM ouput

In [23]:
def extract_llm_information(next_step_and_cadence):
    llm_input = f"""
                Given the following information, please list out the challenges, next steps to schedule a meeting, and open items mentioned. Do not list 'next steps' if there is no follow-up meeting that needs to be scheduled or if it's mentioned that no further follow-up is required:

                {next_step_and_cadence}

                End of information.
                """

    response = openai.ChatCompletion.create(
        #model="gpt-3.5-turbo",
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a helpful assistant. Use the instructions provided to process the information."},
            {"role": "user", "content": llm_input}
        ]
    )

    return response.choices[0].message['content']

# Insert records with the calculated sentiment and LLM embeddings

In [24]:
query = SimpleStatement(
            f"""
            INSERT INTO workspan.customer_opportunities
            (customer_id, partner_id, opportunity_id, customer_name, llm_output,  opportunity , llm_output_embedding , sentiment )
            VALUES (%s, %s, %s, %s, %s , %s , %s , %s )
            """
        )

# record #1

next_step = f"""
Action Items:
From Michael, confirmed deprioritize. From Anjaney, account executive interest to schedule meeting - Anjaney to schedule call with Nirav/Amy on R&D.
"""

cadence = f"""
Next Step:
08/16/2023 : Review partner information updates and update opportunity details. 8/17(LR) - connecting with Partner to offer co-sell support

Next Step History:
null;08/16/2023 : Review partner information updates and update opportunity details.;08/16/2023 : Review partner information updates and update opportunity details. 8/17(LR) - connecting with Partner to offer co-sell support
"""

next_step_and_cadence = next_step + cadence

sentiment = indentify_sentiment(next_step_and_cadence)
llm_output = extract_llm_information(next_step_and_cadence)
print(f"{llm_output}/n")
embedding_llm_output = openai.Embedding.create(input= llm_output, model=model_id)['data'][0]['embedding']
session.execute(query, ('CUS100' , 'AWS', 'WS-7202838a', 'Teradyne, Inc.', llm_output,  {'Customer State' : 'Ile-de-France', 'Customer Country' : 'France', 'Deal Size' : '30000', 'Description' : 'Pain Point: Persistent phishing attacks and email compromises leading to data leaks and compromised accounts. Description: The customer is struggling with the recurring threat of phishing attacks that infiltrate their systems, compromising sensitive data and risking the confidentiality of critical information.' }
                        , embedding_llm_output , sentiment))

# record #2

next_step = f"""
Action Items:
From Autumn, send recording of last call and our discussed inputs from demo 8/28. Ramesh will provide to Caroline by early next week (of 9/11).
"""

cadence = f"""
REVIEW TECH & Economic Proposal
"""

next_step_and_cadence = next_step + cadence

sentiment = indentify_sentiment(next_step_and_cadence)
llm_output = extract_llm_information(next_step_and_cadence)
print(f"{llm_output}/n")
embedding_llm_output = openai.Embedding.create(input= llm_output, model=model_id)['data'][0]['embedding']
session.execute(query, ('CUS100' , 'AWS', 'WS-8a038b8a', 'Teradyne, Inc.', llm_output,  {'Customer State' : 'Ile-de-France', 'Customer Country' : 'France', 'Deal Size' : '30000', 'Description' : 'Pain Point: Persistent phishing attacks and email compromises leading to data leaks and compromised accounts. Description: The customer is struggling with the recurring threat of phishing attacks that infiltrate their systems, compromising sensitive data and risking the confidentiality of critical information.' }
                        , embedding_llm_output , sentiment))

# record #3

next_step = f"""
Action Items:
Joint sync set for 9/7. Enablement session to follow + in person account mapping. Caroline / Michael to begin coordinating. EAI presence
"""

cadence = f"""
07/05/2023: Contact Federico Gandolfo,federico.hernan.gandolfo@abc.com,+54.911.3204.4871 to discuss Deal support
"""

next_step_and_cadence = next_step + cadence

sentiment = indentify_sentiment(next_step_and_cadence)
llm_output = extract_llm_information(next_step_and_cadence)
print(f"{llm_output}/n")
embedding_llm_output = openai.Embedding.create(input= llm_output, model=model_id)['data'][0]['embedding']
session.execute(query, ('CUS100' , 'AWS', 'WS-8a3b0348', 'Teradyne, Inc.', llm_output,  {'Customer State' : 'Ile-de-France', 'Customer Country' : 'France', 'Deal Size' : '30000', 'Description' : 'Pain Point: Persistent phishing attacks and email compromises leading to data leaks and compromised accounts. Description: The customer is struggling with the recurring threat of phishing attacks that infiltrate their systems, compromising sensitive data and risking the confidentiality of critical information.' }
                        , embedding_llm_output , sentiment))

# record #4

next_step = f"""
Action Items:
From Caroline, user community engaged to respond to questions. @Dataiku - How can we get initial data from user community/pull together PoV for client? Action (Asan/Ken (sp?)): In-person outreach to Deloitte users and follow-up to 5 responses received.
"""

cadence = f"""
null;06/20/2023: Contact Federico Gandolfo,federico.hernan.gandolfo@abc.com,+54.911.3204.4871 to discuss Deal support;07/05/2023: Contact Federico Gandolfo,federico.hernan.gandolfo@abc.com,+54.911.3204.4871 to discuss Deal support
"""

next_step_and_cadence = next_step + cadence

sentiment = indentify_sentiment(next_step_and_cadence)
llm_output = extract_llm_information(next_step_and_cadence)
print(f"{llm_output}/n")
embedding_llm_output = openai.Embedding.create(input= llm_output, model=model_id)['data'][0]['embedding']
session.execute(query, ('CUS100' , 'AWS', 'WS-8a7128a3', 'Teradyne, Inc.', llm_output,  {'Customer State' : 'Ile-de-France', 'Customer Country' : 'France', 'Deal Size' : '30000', 'Description' : 'Pain Point: Persistent phishing attacks and email compromises leading to data leaks and compromised accounts. Description: The customer is struggling with the recurring threat of phishing attacks that infiltrate their systems, compromising sensitive data and risking the confidentiality of critical information.' }
                        , embedding_llm_output , sentiment))

# record #4

next_step = f"""
Propsal did not go thru. No budget Left. Negative.
"""

cadence = f"""
No further follow up required.
"""

next_step_and_cadence = next_step + cadence

sentiment = indentify_sentiment(next_step_and_cadence)
llm_output = extract_llm_information(next_step_and_cadence)
print(f"{llm_output}/n")
embedding_llm_output = openai.Embedding.create(input= llm_output, model=model_id)['data'][0]['embedding']
session.execute(query, ('CUS100' , 'AWS', 'WS-8a7128a4', 'Teradyne, Inc.', llm_output,  {'Customer State' : 'Ile-de-France', 'Customer Country' : 'France', 'Deal Size' : '30000', 'Description' : 'Pain Point: Persistent phishing attacks and email compromises leading to data leaks and compromised accounts. Description: The customer is struggling with the recurring threat of phishing attacks that infiltrate their systems, compromising sensitive data and risking the confidentiality of critical information.' }
                        , embedding_llm_output , sentiment))



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Sentiment: positive
Challenges:
1. Michael: Need to deprioritize certain action items.

Next Steps to Schedule a Meeting:
1. Anjaney: Schedule call with Nirav/Amy on R&D.
2. Next step on 08/16/2023: Review partner information updates and update opportunity details.
3. LR on 08/17/2023: Connect with Partner to offer co-sell support.

Open Items:
1. Review and update of partner information.
2. Update of opportunity details.
3. Connection with Partner for co-sell support./n
Sentiment: neutral


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Challenges:
1. Availability and coordination between Autumn, Ramesh, and Caroline.
2. The need for Autumn to send the recording of the last call and the discussed inputs from demo 8/28. 
3. Reviewing the Tech & Economic Proposal.

Next Steps:
1. Autumn needs to send the recording and inputs to Ramesh. 
2. Ramesh needs to pass on the information to Caroline by the next week.

Open Items:
1. Autumn's recording and discussed inputs from the demo. 
2. Review of the Tech & Economic Proposal./n


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Sentiment: positive
Challenges:
1. Coordinating with Caroline and Michael for the joint sync set for 9/7.
2. Organizing the enablement session that is to follow after the joint sync.
3. Establishing EAI presence.
4. Reaching out to Federico Gandolfo for deal support discussions.

Next steps to schedule a meeting:
1. Caroline and Michael to start organizing for the joint sync set for 9/7.
2. Arrangement for the enablement session to follow after the sync.
3. Contact Federico Gandolfo to discuss deal support.

Open Items:
1. Establishing EAI presence. 
2. Contacting Federico Gandolfo for deal support./n


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Sentiment: positive
Challenges:
1. Getting initial data from the user community.
2. Pulling together a Point of View (PoV) for the client.
3. Outreach to Deloitte users which is in-person.
4. Follow up on the received 5 responses.

Next Steps To Schedule A Meeting:
1. On 06/20/2023, a meeting should be scheduled to contact Federico Gandolfo to discuss Deal support.
2. On 07/05/2023, another meeting should be scheduled to contact Federico Gandolfo to discuss Deal support.

Open Items:
1. Engage the user community to respond to questions.
2. Address the data gathering concern raised by @Dataiku.
3. Plan and execute an in-person outreach to Deloitte users.
4. Follow-up on the 5 responses received from Deloitte users.
5. Discuss Deal support with Federico Gandolfo./n


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Sentiment: negative
Challenges:
1. Proposal did not go through.
2. There is no remaining budget.

Open items:
None

Next steps:
None mentioned (No further follow up required)./n


<cassandra.cluster.ResultSet at 0x7e3d25aa7ee0>

# Opportunity Specific Queries

What is the customer sentiment on this opportunity?

In [13]:
cqlSelect = f'''SELECT * FROM workspan.customer_opportunities WHERE customer_id = 'CUS100' and partner_id = 'AWS' and opportunity_id = 'WS-7202838a'  ;'''
rows = session.execute(cqlSelect)
for row_i, row in enumerate(rows):
    print(f'\nRow {row_i}:')
    print(f'    customer_id:      {row.customer_id}')
    print(f'    partner_id:      {row.partner_id}')
    print(f'    opportunity_id:      {row.opportunity_id}')
    print(f'    customer_name:      {row.customer_name}')
    print(f'    sentiment:      {row.sentiment}')

print('\n...')


Row 0:
    customer_id:      CUS100
    partner_id:      AWS
    opportunity_id:      WS-7202838a
    customer_name:      Teradyne, Inc.
    sentiment:      positive

...


What are the next steps for this opportunity? Result can be further parsed to display only the next steps. Same query returns open items and challenges as well.  

In [14]:
cqlSelect = f'''SELECT * FROM workspan.customer_opportunities WHERE customer_id = 'CUS100' and partner_id = 'AWS' and opportunity_id = 'WS-7202838a';  '''
rows = session.execute(cqlSelect)
for row_i, row in enumerate(rows):
    print(f'\nRow {row_i}:')
    print(f'    customer_id:      {row.customer_id}')
    print(f'    partner_id:      {row.partner_id}')
    print(f'    opportunity_id:      {row.opportunity_id}')
    print(f'    customer_name:      {row.customer_name}')
    print(f'    llm_output:    \n  {row.llm_output}')

print('\n...')


Row 0:
    customer_id:      CUS100
    partner_id:      AWS
    opportunity_id:      WS-7202838a
    customer_name:      Teradyne, Inc.
    llm_output:    
  Challenges:
- It is not clear what needs to be deprioritized based on Michael's message.
- The level of interest from Anjaney regarding scheduling a meeting is not specified.
- The details of the R&D call with Nirav/Amy are not provided.
- The partner information updates and opportunity details need to be reviewed and updated, but it is not specified what these updates entail.
- The purpose or outcome of connecting with the partner to offer co-sell support is not mentioned.

Next Steps:
- Review partner information updates and update opportunity details (scheduled for 08/16/2023).
- Connect with Partner to offer co-sell support (scheduled for 8/17, but not clear who will be responsible for this).

Open Items:
- Confirmation on what needs to be deprioritized.
- Level of interest in scheduling a meeting with Anjaney.
- Purpose and

# Customer Specific Queries (across multiple opportunities)

Identify the wins

In [15]:
cqlSelect = f'''SELECT * FROM workspan.customer_opportunities WHERE customer_id = 'CUS100' and partner_id = 'AWS' and sentiment = 'positive'  ;'''
rows = session.execute(cqlSelect)
for row_i, row in enumerate(rows):
    print(f'\nRow {row_i}:')
    print(f'    customer_id:      {row.customer_id}')
    print(f'    partner_id:      {row.partner_id}')
    print(f'    opportunity_id:      {row.opportunity_id}')
    print(f'    customer_name:      {row.customer_name}')
    print(f'    sentiment:      {row.sentiment}')

print('\n...')


Row 0:
    customer_id:      CUS100
    partner_id:      AWS
    opportunity_id:      WS-8a7128a3
    customer_name:      Teradyne, Inc.
    sentiment:      positive

Row 1:
    customer_id:      CUS100
    partner_id:      AWS
    opportunity_id:      WS-8a3b0348
    customer_name:      Teradyne, Inc.
    sentiment:      positive

Row 2:
    customer_id:      CUS100
    partner_id:      AWS
    opportunity_id:      WS-7202838a
    customer_name:      Teradyne, Inc.
    sentiment:      positive

...


Identify opportunities with next step to schedule a meeting

In [28]:
vectorsearchon = 'next action to set up a meeting'
embedding = openai.Embedding.create(input= vectorsearchon, model=model_id)['data'][0]['embedding']

cqlSelect = f'''SELECT * FROM workspan.customer_opportunities WHERE customer_id = 'CUS100' and partner_id = 'AWS' and llm_output : 'schedule a meeting' ORDER BY llm_output_embedding ANN OF {embedding} LIMIT 10;  '''
rows = session.execute(cqlSelect)
for row_i, row in enumerate(rows):
    print(f'\nRow {row_i}:')
    print(f'    customer_id:      {row.customer_id}')
    print(f'    partner_id:      {row.partner_id}')
    print(f'    opportunity_id:      {row.opportunity_id}')
    print(f'    customer_name:      {row.customer_name}')
    print(f'    sentiment:      {row.sentiment}')
    print(f'    llm_output:    \n  {row.llm_output}')

print('\n...')



Row 0:
    customer_id:      CUS100
    partner_id:      AWS
    opportunity_id:      WS-8a3b0348
    customer_name:      Teradyne, Inc.
    sentiment:      positive
    llm_output:    
  Challenges:
1. Coordinating with Caroline and Michael for the joint sync set for 9/7.
2. Organizing the enablement session that is to follow after the joint sync.
3. Establishing EAI presence.
4. Reaching out to Federico Gandolfo for deal support discussions.

Next steps to schedule a meeting:
1. Caroline and Michael to start organizing for the joint sync set for 9/7.
2. Arrangement for the enablement session to follow after the sync.
3. Contact Federico Gandolfo to discuss deal support.

Open Items:
1. Establishing EAI presence. 
2. Contacting Federico Gandolfo for deal support.

Row 1:
    customer_id:      CUS100
    partner_id:      AWS
    opportunity_id:      WS-8a7128a3
    customer_name:      Teradyne, Inc.
    sentiment:      positive
    llm_output:    
  Challenges:
1. Getting initial data

I want to know more about the customer and the challenges so that it is possible to plan accordingly (Implement query using agent framework such as LangChain, LlamaIndex..)

In [29]:
vectorsearchon = 'find opportunity with listed challenges'
embedding = openai.Embedding.create(input= vectorsearchon, model=model_id)['data'][0]['embedding']

cqlSelect = f'''SELECT llm_output FROM workspan.customer_opportunities WHERE customer_id = 'CUS100' and partner_id = 'AWS' ORDER BY llm_output_embedding ANN OF {embedding} LIMIT 10;  '''
rows = session.execute(cqlSelect)
print(rows)
documents = []
for item in rows:
    documents.append(Document(text=str(item)))
    print(str(item))

index = ListIndex.from_documents(documents)

# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What are the Challenges?")

# visualize in console or web
print(response)
display(Markdown(f"<b>{response}</b>"))

<cassandra.cluster.ResultSet object at 0x7e3d25aa72b0>
Row(llm_output='Challenges:\n1. Michael: Need to deprioritize certain action items.\n\nNext Steps to Schedule a Meeting:\n1. Anjaney: Schedule call with Nirav/Amy on R&D.\n2. Next step on 08/16/2023: Review partner information updates and update opportunity details.\n3. LR on 08/17/2023: Connect with Partner to offer co-sell support.\n\nOpen Items:\n1. Review and update of partner information.\n2. Update of opportunity details.\n3. Connection with Partner for co-sell support.')
Row(llm_output='Challenges:\n1. Proposal did not go through.\n2. There is no remaining budget.\n\nOpen items:\nNone\n\nNext steps:\nNone mentioned (No further follow up required).')
Row(llm_output='Challenges:\n1. Getting initial data from the user community.\n2. Pulling together a Point of View (PoV) for the client.\n3. Outreach to Deloitte users which is in-person.\n4. Follow up on the received 5 responses.\n\nNext Steps To Schedule A Meeting:\n1. On 06/20

<b>The challenges mentioned in the context are:
1. Need to deprioritize certain action items.
2. Proposal did not go through.
3. There is no remaining budget.
4. Getting initial data from the user community.
5. Pulling together a Point of View (PoV) for the client.
6. Outreach to Deloitte users which is in-person.
7. Follow up on the received 5 responses.
8. Availability and coordination between Autumn, Ramesh, and Caroline.
9. The need for Autumn to send the recording of the last call and the discussed inputs from demo 8/28.
10. Reviewing the Tech & Economic Proposal.
11. Coordinating with Caroline and Michael for the joint sync set for 9/7.
12. Organizing the enablement session that is to follow after the joint sync.
13. Establishing EAI presence.
14. Reaching out to Federico Gandolfo for deal support discussions.</b>