In [2]:
import re
import openai
import numpy as np
import pandas as pd
import pickle
import tiktoken


from typing import Set
from transformers import GPT2TokenizerFast

import numpy as np
from nltk.tokenize import sent_tokenize

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
# Embedding model used by default when a text is turned into a vector.
EMBEDDING_MODEL = "text-embedding-ada-002"
# Completion model that generates the final answer.
COMPLETIONS_MODEL = "text-davinci-003"

def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """
    Request an embedding for `text` from the OpenAI Embeddings API.

    Returns the "embedding" field of the first item in the response's "data" list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float]]:
    """
    Embed the `content` of every row of `df` through the OpenAI Embeddings API.

    Returns a dictionary keyed by the row's index label whose value is that row's
    embedding vector. One API request is issued per row.
    """
    embeddings_by_index = {}
    for row_label, row in df.iterrows():
        embeddings_by_index[row_label] = get_embedding(row.content)
    return embeddings_by_index

def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
    """
    Load pre-computed document embeddings from a CSV file.

    fname is the path to a CSV whose header row names the columns
        "title", "heading", "0", "1", ... one numbered column per embedding dimension.

    Returns a dictionary mapping each row's (title, heading) pair to its embedding,
    read from column "0" up to the highest-numbered column, in order.
    """
    table = pd.read_csv(fname, header=0)

    # Every column other than the two key columns is expected to be named with an integer.
    key_columns = ("title", "heading")
    dimension_numbers = [int(name) for name in table.columns if name not in key_columns]
    vector_columns = [str(i) for i in range(max(dimension_numbers) + 1)]

    embeddings = {}
    for _, row in table.iterrows():
        embeddings[(row.title, row.heading)] = [row[column] for column in vector_columns]
    return embeddings

In [4]:
# Load the embeddings that were computed earlier and saved to disk.
EMBEDDINGS_CSV = "embedded_SolarAire_data_2.csv"
document_embeddings = load_embeddings(EMBEDDINGS_CSV)

In [5]:
# Load the document sections and key each one by its (title, heading) pair.
df = pd.read_csv('SolarAire-testing-data.csv').set_index(["title", "heading"])
print(f"{len(df)} rows in the data.")
# Show two randomly chosen rows as a sanity check.
df.sample(2)

27 rows in the data.


Unnamed: 0_level_0,Unnamed: 1_level_0,content,tokens
title,heading,Unnamed: 2_level_1,Unnamed: 3_level_1
SolarAire,SLAs,Product SLAs Definition\nProduct & Service Cat...,
SolarAire Workforce Productivity Package (WPP),Productivity & Collaboration,Productivity & Collaboration Business Values\n...,


In [6]:
import os

# Read the API key from the environment so that no credential is stored in the notebook.
# Any key that has ever been saved in plain text here must be revoked and replaced.
# Raises KeyError when OPENAI_API_KEY is not set, instead of failing later at the first request.
openai.api_key = os.environ["OPENAI_API_KEY"]
# Uncomment to recompute the embeddings through the API instead of loading them from the CSV.
#document_embeddings = compute_doc_embeddings(df)

In [7]:
# Peek at the first stored embedding: its key, its first five values and its length.
example_entry = list(document_embeddings.items())[0]
section_key, embedding = example_entry
print(f"{section_key} : {embedding[:5]}... ({len(embedding)} entries)")

('SolarAire', 'Mission Statement') : [-0.004124082, -0.028244535, -0.024418907, 0.006091161, -0.026820099]... (1536 entries)


In [8]:
# Peek at the second stored embedding: its key, its first five values and its length.
example_entry = list(document_embeddings.items())[1]
section_key, embedding = example_entry
print(f"{section_key} : {embedding[:5]}... ({len(embedding)} entries)")

('SolarAire', 'Solutions Overview') : [0.006891754, -0.009139794, 0.001467098, -0.010307433, -0.03811603]... (1536 entries)


In [27]:
# Lay the embeddings out the way load_embeddings() reads them back: one row per section,
# "title" and "heading" columns, then one column per embedding dimension named "0", "1", ...
# pd.DataFrame(document_embeddings) would instead turn every (title, heading) key into a
# column, producing a CSV that load_embeddings() cannot parse.
embedding_dim = len(next(iter(document_embeddings.values()), []))
df2 = pd.DataFrame(
    [[title, heading, *vector] for (title, heading), vector in document_embeddings.items()],
    columns=["title", "heading", *map(str, range(embedding_dim))],
)

In [29]:
# Persist the table; index=False leaves the pandas row index out of the file.
# Note that this is the same path the notebook loads `document_embeddings` from.
df2.to_csv(path_or_buf='embedded_SolarAire_data_2.csv', index=False)

In [9]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Score how similar two vectors are by taking their dot product.

    For vectors of length 1 the dot product equals the cosine similarity; the original
    author notes that OpenAI embeddings are normalized this way.
    """
    left = np.array(x)
    right = np.array(y)
    return left.dot(right)

def order_document_sections_by_query_similarity(
    query: str,
    contexts: dict[tuple[str, str], list[float]],
) -> list[tuple[float, tuple[str, str]]]:
    """
    Rank the pre-computed document embeddings by their similarity to `query`.

    query: the text to embed and compare against every document section.
    contexts: maps each section key to that section's embedding vector.

    Returns a list of (similarity, section key) pairs sorted by similarity in descending
    order. Pairs with equal similarity are ordered by their key, also descending, because
    the pairs are sorted as whole tuples.
    """
    # One API request: the query is embedded with the same default model as get_embedding().
    query_embedding = get_embedding(query)

    document_similarities = sorted(
        [
            (vector_similarity(query_embedding, doc_embedding), doc_index)
            for doc_index, doc_embedding in contexts.items()
        ],
        reverse=True,
    )

    return document_similarities

In [10]:
# Show the five sections ranked most similar to a sample question.
ranked_sections = order_document_sections_by_query_similarity(
    "what is SolarAire Design Principles?", document_embeddings
)
ranked_sections[:5]

[(0.8349916266863447, ('SolarAire', 'Product Architecture')),
 (0.8207055265123668, ('SolarAire', 'Solutions Overview')),
 (0.8151686337207547, ('SolarAire', 'Design Principles')),
 (0.8137956960709938,
  ('SolarAire OneDevSecOps', '  Roles & Responsibilities')),
 (0.8054451231899692, ('SolarAire', 'Solutions Overview Diagram'))]

In [11]:
# Token budget for the context part of the prompt. Every selected section costs its own
# token count plus the separator's, so a budget of 10 could never admit a real section.
MAX_SECTION_LEN = 500
# Text placed in front of every context section in the prompt.
SEPARATOR = "\n* "
ENCODING = "gpt2"  # encoding for text-davinci-003

encoding = tiktoken.get_encoding(ENCODING)
# Token cost of one separator, charged against the budget for each selected section.
separator_len = len(encoding.encode(SEPARATOR))

f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [12]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Fetch relevant 
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)
    
    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []
     
    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.        
        document_section = df.loc[section_index]
        
        chosen_sections_len += document_section.tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break
            
        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))
            
    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))
    
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    
    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

In [13]:
# Build the prompt for a sample question and print it for inspection.
sample_question = "What is the SolarAire OneDigital?"
prompt = construct_prompt(sample_question, document_embeddings, df)

print("===\n", prompt)

Selected 27 document sections:
('SolarAire OneDigital', 'Overview')
('SolarAire', 'OnePlatform')
('SolarAire', 'Product Architecture')
('SolarAire', 'Solutions Overview')
('SolarAire', 'Product Roadmap - 2023')
('SolarAire', 'Mission Statement')
('SolarAire', 'Solutions Overview Diagram')
('SolarAire', 'Product Roadmap')
('SolarAire OneDevSecOps', '  Roles & Responsibilities')
('SolarAire', 'OneAdvisory')
('SolarAire Workforce Productivity Package (WPP)', 'Virtual Device')
('SolarAire Workforce Productivity Package (WPP)', 'Physical Device')
('SolarAire Workforce Productivity Package (WPP)', 'Overview  Diagram')
('SolarAire OneDevSecOps', 'Features')
('SolarAire Workplace Integration Service (WIS)', 'Overview')
('SolarAire OneDevSecOps', 'Toolsets')
('SolarAire', 'Design Principles')
('SolarAire Workforce Productivity Package (WPP)', 'Productivity & Collaboration')
('SolarAire', 'SLAs')
('SolarAire OneDevSecOps', 'Overview')
('SolarAire Workforce Productivity Package (WPP)', 'User Iden

In [14]:
# Keyword arguments forwarded to the completion request.
COMPLETIONS_API_PARAMS = dict(
    # Temperature 0.0 was chosen by the author for the most predictable, factual answer.
    temperature=0.0,
    max_tokens=300,
    model=COMPLETIONS_MODEL,
)

In [15]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[tuple[str, str], list[float]],
    show_prompt: bool = False
) -> str:
    """
    Answer `query` with the completions model.

    Queries that mention "SolarAire" (in any letter case) get a prompt built by
    construct_prompt() from the most relevant document sections; every other query is
    sent with a generic instruction and no document context.

    query: the user's question.
    df: dataframe of document sections, passed through to construct_prompt().
    document_embeddings: maps each section key to that section's embedding vector.
    show_prompt: when True, print the prompt before sending it.

    Returns the text of the first completion choice with surrounding spaces and
    newlines stripped.
    """
    # Case-insensitive so that "solaraire" is routed to the document context as well.
    if "solaraire" in query.lower():
        prompt = construct_prompt(query, document_embeddings, df)
    else:
        # The quotes are escaped: writing "" inside a Python string ends the literal and
        # concatenates the pieces, so the quotes would never reach the model.
        prompt = "Answer the question as truthfully as possible, and if you're unsure of the answer, say \"Sorry, I don't know\". \n Q: " + query + "\n A:"

    if show_prompt:
        print(prompt)

    response = openai.Completion.create(
        prompt=prompt,
        **COMPLETIONS_API_PARAMS
    )

    return response["choices"][0]["text"].strip(" \n")

In [21]:
# Ask a sample question end to end; the cell displays the model's answer.
principles_question = "What is SolarAire principles?"
answer_query_with_context(principles_question, df, document_embeddings)

Selected 27 document sections:
('SolarAire', 'Product Architecture')
('SolarAire', 'Solutions Overview')
('SolarAire', 'OnePlatform')
('SolarAire OneDevSecOps', '  Roles & Responsibilities')
('SolarAire', 'Solutions Overview Diagram')
('SolarAire', 'Product Roadmap')
('SolarAire OneDigital', 'Overview')
('SolarAire', 'Mission Statement')
('SolarAire', 'Design Principles')
('SolarAire', 'Product Roadmap - 2023')
('SolarAire OneDevSecOps', 'Features')
('SolarAire Workforce Productivity Package (WPP)', 'Virtual Device')
('SolarAire Workforce Productivity Package (WPP)', 'Overview  Diagram')
('SolarAire OneDevSecOps', 'Toolsets')
('SolarAire', 'OneAdvisory')
('SolarAire Workforce Productivity Package (WPP)', 'Physical Device')
('SolarAire OneDevSecOps', 'Overview')
('SolarAire OneDigital', 'Security')
('SolarAire Workplace Integration Service (WIS)', 'Overview')
('SolarAire Workforce Productivity Package (WPP)', 'User Identity Management')
('SolarAire', 'Product Launch Lifecycle')
('SolarA

'SolarAire\'s design principles include Diversity - bias to value-added over system solidarity; Flexibility - conscious of various customers\' needs; Simplicity - focusing on simplifying customers\' digital experience; Secure by design - always design defensively with the zero-trust assumption; Start with the end in mind - focusing on delivering values to the customers/users; Composable services and products - breaking off any monolithic views/processes to discrete value-added composable units; Purposeful resiliency - balancing among system performance, availability and cost with the customer experience center and front; Don\'t repeat "yourself" - always design with the reusability in mind: everything as code infrastructure as code configuration as code policy as code documentation as code operation as code.'