In [14]:
import ast
from openai import OpenAI # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
import os # for getting API token from env variable OPENAI_API_KEY
from scipy import spatial  # for calculating vector similarities for search

# Model identifiers shared by the cells below.
EMBEDDING_MODEL = "text-embedding-ada-002"  # used to embed queries for similarity search
GPT_MODEL = "gpt-3.5-turbo"  # chat model; also the default for token counting

# The API key comes from the environment; this is None when OPENAI_API_KEY is unset.
OPEN_AI_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=OPEN_AI_KEY)
client  # bare expression so the cell displays the client object

<openai.OpenAI at 0x1369fbd90>

In [19]:
# Location of the CSV that holds the source texts and their embeddings.
# It can be overridden with the EMBEDDINGS_CSV_PATH environment variable so the
# notebook is not tied to a single machine; the original path stays the default.
DATA_CSV_PATH = os.environ.get(
    "EMBEDDINGS_CSV_PATH",
    "/Users/nardoarevalo/Desktop/pandas_learning/notebooks/openai_embeddings/data.csv",
)

df = pd.read_csv(DATA_CSV_PATH)
# The CSV stores each embedding as text; parse every cell back into a Python
# literal (presumably a list of floats - confirm against the file that wrote it).
df['embeddings'] = df['embeddings'].apply(ast.literal_eval)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        41 non-null     object
 1   embeddings  41 non-null     object
dtypes: object(2)
memory usage: 784.0+ bytes


In [20]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string.

    Args:
        text: The string to tokenize.
        model: Model name used to pick the tokenizer. Defaults to GPT_MODEL.

    Returns:
        The length of the token sequence produced by encoding ``text``.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it cannot map to an encoding
        # (e.g. newly released or custom-named models). Fall back to cl100k_base
        # so counting still works instead of aborting the whole request.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

def get_query_embeddings(query):
    """Embed ``query`` with EMBEDDING_MODEL and return the embedding.

    Sends ``query`` to the embeddings endpoint of the module-level ``client``
    and returns the ``embedding`` attribute of the first item in the
    response's ``data``.
    """
    response = client.embeddings.create(model=EMBEDDING_MODEL, input=query)
    # Only the first entry of the response data is used.
    return response.data[0].embedding

def strings_ranked_by_relatedness(
        query_embedding,
        df: pd.DataFrame,
        relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
        top_n: int = 100
) -> tuple[tuple[str, ...], tuple[float, ...]]:
    """Rank the texts in ``df`` by relatedness to ``query_embedding``.

    Args:
        query_embedding: Embedding vector of the query.
        df: DataFrame with a "text" column and an "embeddings" column.
        relatedness_fn: Callable taking (query_embedding, row_embedding) and
            returning a score where larger means more related. Defaults to
            one minus the cosine distance.
        top_n: Maximum number of results to return.

    Returns:
        A pair ``(strings, relatednesses)`` of equal-length tuples, sorted from
        most related to least and truncated to ``top_n`` entries. Both tuples
        are empty when ``df`` has no rows.
    """
    # Walk the two columns in lockstep; this avoids building a Series per row
    # the way DataFrame.iterrows does.
    scored = [
        (text, relatedness_fn(query_embedding, embedding))
        for text, embedding in zip(df["text"], df["embeddings"])
    ]
    if not scored:
        # zip(*[]) yields nothing, so the two-name unpacking below would raise
        # ValueError on an empty frame.
        return (), ()
    scored.sort(key=lambda pair: pair[1], reverse=True)
    strings, relatednesses = zip(*scored)
    return strings[:top_n], relatednesses[:top_n]

In [21]:
# Sanity check: print the relatedness scores of the 10 best-matching texts.
query_embedding = get_query_embeddings("Silver 73: Enhanced Plan")
strings, relatednesses = strings_ranked_by_relatedness(query_embedding, df, top_n=10)
# Only the scores are printed, so iterate them directly instead of zipping in
# the matched strings and leaving that loop variable unused.
for relatedness in relatednesses:
    print(f"{relatedness=:.3f}")

relatedness=0.866
relatedness=0.851
relatedness=0.830
relatedness=0.818
relatedness=0.803
relatedness=0.780
relatedness=0.779
relatedness=0.773
relatedness=0.766
relatedness=0.762


In [22]:
def query_message(
        query: str,
        df: pd.DataFrame,
        model: str,
        token_budget: int,
        top_n: int = 4,
) -> str:
    """Return a message for GPT, with relevant source texts pulled from a dataframe.

    Args:
        query: The user's question; it is embedded to rank the texts and is
            appended to the end of the message.
        df: DataFrame passed to ``strings_ranked_by_relatedness``.
        model: Model name used when counting tokens.
        token_budget: Maximum token count allowed for the assembled message.
        top_n: Number of top-ranked texts to consider. Defaults to 4.

    Returns:
        The introduction, followed by as many ranked texts as fit within
        ``token_budget``, followed by the question.
    """
    query_embedding = get_query_embeddings(query)
    # The relatedness scores are not needed here, only the ranked texts.
    strings, _ = strings_ranked_by_relatedness(query_embedding, df, top_n=top_n)
    introduction = 'Use the below articles on Covered California to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer."'
    question = f"\n\nQuestion: {query}"
    message = introduction
    for article_text in strings:
        next_article = f'\n\nCovered California section:\n"""\n{article_text}\n"""'
        # Stop at the first text that would push the full message over budget;
        # lower-ranked texts are not tried after that.
        if num_tokens(message + next_article + question, model=model) > token_budget:
            break
        message += next_article
    return message + question


def ask(
        query: str,
        df: pd.DataFrame = df,
        model: str = GPT_MODEL,
        token_budget: int = 4096 - 500,
        print_message: bool = False,
) -> str:
    """Answers a query using GPT and a dataframe of relevant texts and embeddings.

    Args:
        query: The question to answer.
        df: DataFrame of texts and embeddings. The default is bound when this
            cell runs, so it refers to the global ``df`` as it existed at
            definition time; re-run this cell after reloading ``df``.
        model: Chat model used for both token counting and the completion.
        token_budget: Token limit handed to ``query_message``.
        print_message: When True, print the assembled prompt before sending it.

    Returns:
        The content of the first choice in the chat completion response.
    """
    message = query_message(query, df, model=model, token_budget=token_budget)
    print("tokens")
    # Count with the same model the request uses; the count previously fell
    # back to the num_tokens default regardless of ``model``.
    display(num_tokens(message, model=model))
    if print_message:
        print(message)
    # NOTE(review): "Insurnace" in the system prompt looks like a typo for
    # "Insurance"; left untouched because it is runtime text sent to the model.
    messages = [
        {"role": "system", "content": "You answer questions about Covered California Health Insurnace."},
        {"role": "user", "content": message},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0
    )
    response_message = response.choices[0].message.content
    return response_message

In [23]:
# Example question; the bare call on the last line makes the cell display the answer.
query = "is silver 73 a good plan for me if im unhealthy "
ask(query=query)

tokens


2346

'The Silver 73 plan may not be the best fit for you if you are unhealthy. The plan is more suitable for individuals who are relatively healthy and want typical services such as office visits, basic labs, and prescriptions (generic) covered right away for a minimum set fee (copay) without having to pay a deductible. If you are unhealthy and anticipate needing more medical services, you may want to consider a plan with lower out-of-pocket costs and potentially more comprehensive coverage, such as the Silver 87 plan.'