In [2]:
import numpy as np
import openai
import pandas as pd
import pickle
import tiktoken

# Model name passed as `model=` to the openai.Completion.create calls below.
COMPLETIONS_MODEL = "text-davinci-003"
# Default model name used by get_embedding() when it calls openai.Embedding.create.
EMBEDDING_MODEL = "text-embedding-ada-002"

In [3]:
# Baseline: send the bare question to the completion model, with no supporting context.
prompt = "Who won the 2020 Summer Olympics men's high jump?"

baseline_response = openai.Completion.create(
    model=COMPLETIONS_MODEL,
    prompt=prompt,
    max_tokens=300,
    temperature=0,
)
# Cell output: text of the first choice, with surrounding spaces/newlines trimmed.
baseline_response["choices"][0]["text"].strip(" \n")

"Marcelo Chierighini of Brazil won the gold medal in the men's high jump at the 2020 Summer Olympics."

In [4]:
# Same question, but the prompt now tells the model to admit when it is unsure.
prompt = """Answer the question as truthfully as possible, and if you're unsure of the answer, say "Sorry, I don't know".

Q: Who won the 2020 Summer Olympics men's high jump?
A:"""

guarded_response = openai.Completion.create(
    model=COMPLETIONS_MODEL,
    prompt=prompt,
    max_tokens=300,
    temperature=0,
)
# Cell output: text of the first choice, with surrounding spaces/newlines trimmed.
guarded_response["choices"][0]["text"].strip(" \n")

"Sorry, I don't know."

In [6]:
# Read the sections table and key every row by its (title, heading) pair in one step.
df = pd.read_csv('testfile.csv').set_index(["title", "heading"])
print(f"{len(df)} rows in the data.")
# Cell output: three randomly sampled rows.
df.sample(3)

3 rows in the data.


Unnamed: 0_level_0,Unnamed: 1_level_0,content,tokens,context
title,heading,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Consulting Services FAQ,Who from Consulting should I ask in which situation?,Please follow the workflow as described in the...,31,"Follow the workflow in the handbook, create a ..."
Consulting Services FAQ,Why should I sell it even when it blocks me selling the license?,Because it will lead to satisfied customers an...,29,Selling the license leads to satisfied custome...
Consulting Services FAQ,Who is responsible for estimating the effort?,There are a few situations where the effort of...,56,Estimating effort requires understanding merch...


In [7]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """
    Embed a single piece of text with the OpenAI Embeddings API.

    text: the string to embed.
    model: name of the embedding model to request; defaults to EMBEDDING_MODEL.

    Returns the "embedding" field of the first item in the response's "data" list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float]]:
    """
    Embed the `content` field of every row of `df` with get_embedding.

    get_embedding is called once per row, so this issues one Embeddings API request per row.

    Returns a dictionary whose keys are the row index values of `df` and whose values are
    the embedding vectors of the corresponding rows.
    """
    embeddings_by_index = {}
    for row_index, row in df.iterrows():
        embeddings_by_index[row_index] = get_embedding(row.content)
    return embeddings_by_index

In [8]:
def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
    """
    Load previously saved document embeddings from a CSV file.

    fname is the path to a CSV whose header row has exactly these columns:
        "title", "heading", "0", "1", ... one numbered column per embedding dimension.

    Returns a dictionary mapping each row's (title, heading) pair to its embedding,
    read from column "0" up to the highest numbered column.
    """
    table = pd.read_csv(fname, header=0)

    key_columns = ("title", "heading")
    # Every column that is not part of the key must be an integer dimension label.
    highest_dim = max(int(name) for name in table.columns if name not in key_columns)
    dim_columns = [str(dim) for dim in range(highest_dim + 1)]

    embeddings = {}
    for _, row in table.iterrows():
        embeddings[(row.title, row.heading)] = [row[column] for column in dim_columns]
    return embeddings

In [9]:
# Embed every row of df; compute_doc_embeddings calls the Embeddings API once per row.
document_embeddings = compute_doc_embeddings(df)

In [10]:
# Inspect the first (key, vector) pair: print the key, the first five components
# of the vector, and the vector's total length.
example_entry = list(document_embeddings.items())[0]
print(f"{example_entry[0]} : {example_entry[1][:5]}... ({len(example_entry[1])} entries)")

('Consulting Services FAQ', 'Why should I sell it even when it blocks me selling the license?') : [0.012360747903585434, -0.027870817109942436, 0.010982075706124306, -0.027681587263941765, -0.01754428818821907]... (1536 entries)


In [11]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Compute the similarity of two vectors as their dot product.

    OpenAI Embeddings are normalized to length 1, so for them the dot product
    is the same value as the cosine similarity.
    """
    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)

def order_document_sections_by_query_similarity(
    query: str, contexts: dict[tuple[str, str], list[float]]
) -> list[tuple[float, tuple[str, str]]]:
    """
    Rank the pre-calculated document embeddings by their similarity to a query.

    query: the text to embed (via get_embedding) and compare against every section.
    contexts: mapping from a section's index key to its embedding vector.

    Returns a list of (similarity, section index) pairs sorted by similarity in
    descending order. Sections with exactly equal similarity keep the iteration
    order of `contexts`.
    """
    query_embedding = get_embedding(query)

    scored_sections = [
        (vector_similarity(query_embedding, doc_embedding), doc_index)
        for doc_index, doc_embedding in contexts.items()
    ]
    # Sort on the score alone. Sorting the whole tuples would fall back to comparing
    # the index keys on a tie, which raises TypeError when the keys are not mutually
    # orderable (for example a NaN heading next to a string heading).
    scored_sections.sort(key=lambda scored: scored[0], reverse=True)

    return scored_sections

In [13]:
# Rank the embedded sections against a sample question and show at most the five best matches.
order_document_sections_by_query_similarity("Who is responsible for estimating the effort?", document_embeddings)[:5]


[(0.7850540591201378,
  ('Consulting Services FAQ',
   'Who from Consulting should I ask in which situation?')),
 (0.7765192600102002,
  ('Consulting Services FAQ',
   'Who is responsible for estimating the effort?')),
 (0.7292348021447077,
  ('Consulting Services FAQ',
   'Why should I sell it even when it blocks me selling the license?'))]

In [14]:
MAX_SECTION_LEN = 500  # token budget that construct_prompt compares the selected sections against
SEPARATOR = "\n* "  # prefix placed in front of every context section in the prompt

# Look the tokenizer up from the completion model instead of hard-coding "gpt2":
# tiktoken maps text-davinci-003 to the p50k_base encoding, not to gpt2.
encoding = tiktoken.encoding_for_model(COMPLETIONS_MODEL)
ENCODING = encoding.name  # name kept available for any code that reads ENCODING
separator_len = len(encoding.encode(SEPARATOR))

f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [23]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Build the completion prompt for a question from the most relevant document sections.

    question: the user's question; it is embedded to rank the sections and is appended
        to the end of the prompt.
    context_embeddings: mapping from section index to embedding vector, passed to
        order_document_sections_by_query_similarity.
    df: dataframe indexed by the same section indexes, read for its `tokens` and
        `content` fields.

    Sections are added in ranked order. Each one costs its `tokens` value plus
    separator_len, and selection stops at the first section that pushes the running
    total above MAX_SECTION_LEN. Newlines inside a section are replaced with spaces.

    Prints the number and indexes of the selected sections, then returns the header,
    the selected sections, and the question followed by an open "A:" line.
    """
    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    context_parts = []
    selected_indexes = []
    tokens_used = 0

    for _, section_index in ranked_sections:
        row = df.loc[section_index]

        # The cost is added before the check, so the section that overflows the
        # budget ends the loop and is not included.
        tokens_used += row.tokens + separator_len
        if tokens_used > MAX_SECTION_LEN:
            break

        flattened_content = row.content.replace("\n", " ")
        context_parts.append(SEPARATOR + flattened_content)
        selected_indexes.append(str(section_index))

    # Useful diagnostic information
    print(f"Selected {len(context_parts)} document sections:")
    print("\n".join(selected_indexes))

    header = """Answer the question in german as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

    return "".join([header, *context_parts, "\n\n Q: ", question, "\n A:"])

In [24]:
prompt = construct_prompt(
    "Who is responsible for estimating the effort?",
    document_embeddings,
    df
)

print("===\n", prompt)

Selected 3 document sections:
('Consulting Services FAQ', 'Who from Consulting should I ask in which situation?')
('Consulting Services FAQ', 'Who is responsible for estimating the effort?')
('Consulting Services FAQ', 'Why should I sell it even when it blocks me selling the license?')
===
 Answer the question in german as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* Please follow the workflow as described in the handbook. Simply create a ticket or send an eMail / Slack message to the whole team and we will decide who is available and able to solve the issue.
* There are a few situations where the effort of a consulting task is pretty clear in the beginning. But in most cases we need to understand the merchant’s / IP’s needs before we can make an offer. It is very important that we get a good briefing for a first feedback but then need to have a first short call with the merchant / partner 

In [25]:
# Keyword arguments shared by the completion request in answer_query_with_context.
# We use temperature of 0.0 because it gives the most predictable, factual answer.
COMPLETIONS_API_PARAMS = dict(
    temperature=0.0,
    max_tokens=300,
    model=COMPLETIONS_MODEL,
)

In [26]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[tuple[str, str], list[float]],
    show_prompt: bool = False
) -> str:
    """
    Answer a question with the completion model, using document sections as context.

    query: the question to answer.
    df: dataframe of document sections, passed through to construct_prompt.
    document_embeddings: mapping from section index to embedding vector, passed
        through to construct_prompt.
    show_prompt: when True, print the full prompt before sending it.

    Returns the text of the first completion choice with surrounding spaces and
    newlines stripped.
    """
    prompt = construct_prompt(query, document_embeddings, df)

    if show_prompt:
        print(prompt)

    response = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)

    return response["choices"][0]["text"].strip(" \n")

In [27]:
# End-to-end run: build the context prompt for this question and return the model's answer.
answer_query_with_context("Why should the sales people sell Consulting Services?", df, document_embeddings)


Selected 3 document sections:
('Consulting Services FAQ', 'Why should I sell it even when it blocks me selling the license?')
('Consulting Services FAQ', 'Who is responsible for estimating the effort?')
('Consulting Services FAQ', 'Who from Consulting should I ask in which situation?')


'Damit die Kunden zufrieden sind und mit den neuen Plänen ist es sehr wichtig, dass wir wiederkehrende Einnahmen haben und Partner ihre Verträge erneuern. Außerdem ermöglicht es uns als Software-Anbieter, als vertrauenswürdiger Berater in das Projekt einzusteigen und dabei zu helfen, das Beste aus Shopware herauszuholen, mit direkter Unterstützung und Einweisung für den Händler.'