### Internal language model POCs and OpenAI API demo

Prepared for Ventera brownbag

In [None]:
import os
import pickle

import numpy as np
import openai
import pandas as pd
import tiktoken

# Model used by the completion cells below.
COMPLETIONS_MODEL = "text-davinci-003"
# Default model used by get_embedding.
EMBEDDING_MODEL = "text-embedding-ada-002"

# Prefer a key supplied through the OPENAI_API_KEY environment variable so a real
# secret never has to be typed into (and saved with) the notebook. Fall back to the
# placeholder so the cell behaves as before when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "sk-yourkey") # TODO: Convert to txtField prompt before distributing


### General example

In [None]:
# Ask the question cold, with no supporting context in the prompt.
prompt = "What's the main idea behind MVP?"

completion = openai.Completion.create(
    model=COMPLETIONS_MODEL,
    prompt=prompt,
    max_tokens=300,
    temperature=0,
)
# Last expression of the cell: the first choice's text, trimmed of spaces and newlines.
completion["choices"][0]["text"].strip(" \n")

In [None]:
# Same kind of request, but the prompt tells the model how to respond when it is unsure.
prompt = """Answer the question as truthfully as possible, and if you're unsure of the answer, say "Sorry, I don't know".

Q: How many sprints in a PI?
A:"""

completion = openai.Completion.create(
    model=COMPLETIONS_MODEL,
    prompt=prompt,
    max_tokens=300,
    temperature=0,
)
first_choice = completion["choices"][0]
# Last expression of the cell: the answer text, trimmed of spaces and newlines.
first_choice["text"].strip(" \n")

#### Prompt Engineering to increase domain 'truthiness'

In [None]:
prompt = """Answer the question as truthfully as possible using the provided text, and if the answer is not contained within the text below, say "I don't know"

Context:
A Minimum Viable Product (MVP) is a version of a working product that allows the team to learn from and 
interact with their customer with the least amount of effort. MVP attends to the core customer needs 
first and as soon as possible. It helps to validate needs, reduce risk, and help the programs course correct 
quickly, as needed. Rooted in concepts that emerged from the book “The Lean Startup” by Eric Ries, the core 
idea is to facilitate a better understanding of the customers needs and interests without committing or 
using a large number of resources or fully developing a product.

Q: What's the main idea behind MVP?
A:"""

# Gather the request settings in one place, then send the context-grounded prompt.
completion_params = {
    "model": COMPLETIONS_MODEL,
    "prompt": prompt,
    "max_tokens": 300,
    "temperature": 0,
    "top_p": 1,
    "frequency_penalty": 0,
    "presence_penalty": 0,
}
# Last expression of the cell: the first choice's text, trimmed of spaces and newlines.
openai.Completion.create(**completion_params)["choices"][0]["text"].strip(" \n")

#### Preprocess data from vstart.dev

In [None]:
# Load the vstart content and key the frame by the text of each entry.
df = pd.read_csv('vstart.csv').set_index(["vscontent"])
print(f"{len(df)} rows in the data.")
# Last expression of the cell: five randomly sampled rows as a quick look at the data.
df.sample(n=5)

In [None]:
def get_embedding(text: str, model: str = EMBEDDING_MODEL) -> list[float]:
    """
    Request an embedding for the given text from the OpenAI Embeddings API.

    text is the input to embed; model names the embedding model to use.

    Returns the "embedding" field of the first item in the response's "data" list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

def compute_doc_embeddings(df: pd.DataFrame) -> dict[str, list[float]]:
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.

    df must have a "vscontent" column holding the text to embed.

    Return a dictionary that maps each row's vscontent value to its embedding vector.
    Rows that share the same vscontent value collapse to a single entry.
    """
    # Walk the column directly: iterrows() would build a Series for every row only
    # to read this one field back out of it.
    return {text: get_embedding(text) for text in df['vscontent']}

def load_embeddings(fname: str) -> dict[str, list[float]]:
    """
    Read the document embeddings from a CSV.

    fname is the path to a CSV with exactly these named columns:
        "vscontent", "0", "1", ... up to the length of the embedding vectors.

    Returns a dictionary that maps each vscontent value to its embedding vector,
    with the components ordered by their numeric column name.

    Raises ValueError if a column other than "vscontent" is not named with an integer.
    """
    df = pd.read_csv(fname, header=0)
    # Pick the dimension columns by name instead of by tuple position. Position 0 is
    # taken by "vscontent", so positions are shifted by one relative to the column
    # names and a positional range easily ends up dropping the last component.
    dim_columns = sorted((c for c in df.columns if c != "vscontent"), key=int)
    vectors = df[dim_columns].to_numpy().tolist()
    return dict(zip(df["vscontent"], vectors))

# Reload the CSV without set_index so that "vscontent" is a regular column again;
# compute_doc_embeddings reads it by column name.
df = pd.read_csv("vstart.csv")




In [None]:
# Embed the vscontent text of every row in df; each embedding is a separate
# call to the OpenAI Embeddings API.
document_embeddings = compute_doc_embeddings(df)

##### Let's save our embeddings to a file

In [None]:
def save_embeddings(embeddings: dict[str, list[float]], fname: str):
    """
    Write the document embeddings to a CSV.

    embeddings maps each vscontent value to its embedding vector; fname is the
    destination path.

    The file has one row per entry: a "vscontent" column followed by one column
    per vector component, named by component position starting at 0.
    """
    table = (
        pd.DataFrame.from_dict(embeddings, orient='index')
        .reset_index()
        .rename(columns={'index': 'vscontent'})
    )
    table.to_csv(fname, index=False)


output_folder = "output"
output_fname = f"{output_folder}/embeddings.csv"
# to_csv does not create missing directories, so make sure the target folder
# exists before writing; exist_ok keeps re-running this cell harmless.
os.makedirs(output_folder, exist_ok=True)
save_embeddings(document_embeddings, output_fname)

##### Let's view what an embedding looks like

In [None]:
# Take the first stored entry: its text (the dict key) and its embedding (the value).
texts = list(document_embeddings)
vectors = list(document_embeddings.values())
first_text, first_embedding = texts[0], vectors[0]
# Show only the first 50 characters of the text and the first vector component.
truncated_text = f"{first_text[:50]}..."
print("Example:", truncated_text, first_embedding[0])


##### We can use this for search results as well

...by finding the document embeddings most similar to the question embedding. We're storing these locally, but for a larger dataset we would consider using a vector DB to index our results.

In [None]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Score how similar two vectors are by taking their dot product.

    OpenAI Embeddings are normalized to length 1, so for them the dot product
    is the same as the cosine similarity.
    """
    left = np.array(x)
    right = np.array(y)
    return left.dot(right)

def order_document_sections_by_query_similarity(query: str, contexts: dict[str, list[float]]) -> list[tuple[float, str]]:
    """
    Rank the pre-calculated document embeddings by their similarity to a query.

    query is the text to embed with get_embedding; contexts maps each document key
    (the vscontent text in this notebook) to its embedding vector.

    Returns a list of (similarity, document key) pairs, most similar first. Pairs
    with equal similarity are ordered by document key, also descending.
    """
    query_embedding = get_embedding(query)

    scored = [
        (vector_similarity(query_embedding, doc_embedding), doc_key)
        for doc_key, doc_embedding in contexts.items()
    ]
    # Tuples compare element by element, so this orders by similarity first and
    # only falls back to the document key on ties.
    scored.sort(reverse=True)
    return scored

##### Let's try a semantic search query

In [None]:
# Show the five highest-scoring (similarity, vscontent) pairs for the query.
order_document_sections_by_query_similarity("how many sprints in a PI", document_embeddings)[:5]