In [None]:
# imports
import os
import ast  # for converting embeddings saved as strings back to arrays
import openai  # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
from scipy import spatial  # for calculating vector similarities for search


# models
EMBEDDING_MODEL = "text-embedding-ada-002"  # model used to embed both documents and queries
GPT_MODEL = "gpt-3.5-turbo"  # chat model used to answer questions
# Never hardcode credentials in a notebook: read the key from the
# OPENAI_API_KEY environment variable (None if it is not set).
openai.api_key = os.environ.get("OPENAI_API_KEY")

# 1. Prepare search data

In [None]:
from openai.embeddings_utils import get_embedding
import glob

In [None]:
# Collect the source documents. Sub-directories are skipped (they cannot be
# opened as text), and so is the embeddings CSV that this notebook itself
# writes into data/ -- it is a cache, not a source document.
EMBEDDINGS_CSV_NAME = "code_search_openai-python.csv"
files = sorted(  # sorted so the row order is reproducible across runs
    path
    for path in glob.glob('data/*')
    if os.path.isfile(path) and os.path.basename(path) != EMBEDDINGS_CSV_NAME
)

# Read each file into one record and build the DataFrame in a single step
# (growing a DataFrame with pd.concat inside a loop is quadratic).
records = []
for file_path in files:
    with open(file_path, 'r') as f:
        records.append({'data': f.read()})
df = pd.DataFrame(records, columns=['data'])

# Preview the first rows of the resulting DataFrame
print(df.head())

# The commands below call the OpenAI API (which consumes tokens) and cache the
# result; uncomment them to (re)compute the embeddings.

#df['code_embedding'] = df['data'].apply(lambda x: get_embedding(x, engine=EMBEDDING_MODEL))
# df.to_csv("data/code_search_openai-python.csv", index=False)

In [None]:
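# Reload the previously saved DataFrame (texts plus embeddings) instead of recomputing it.
# NOTE: after a CSV round trip the embeddings come back as strings; convert them
# (e.g. with ast.literal_eval) before using them for similarity search.
# df = pd.read_csv("data/code_search_openai-python.csv")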

# 2. Search

Now we'll define a search function that:

- Takes a user query and a dataframe with text & embedding columns
- Embeds the user query with the OpenAI API
- Uses distance between query embedding and text embeddings to rank the texts

Returns two tuples:
- The top N texts, ranked by relevance
- Their corresponding relevance scores

In [None]:
def strings_ranked_by_relatedness(
    query,
    df,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n=100
):
    """Rank the texts in `df` by their relatedness to `query`.

    Args:
        query: Text to embed with EMBEDDING_MODEL through the OpenAI API.
        df: DataFrame with a "data" column (the text) and a "code_embedding"
            column holding each text's embedding, either as a sequence of
            floats or as its string representation (as found after the
            DataFrame has been saved to and reloaded from CSV).
        relatedness_fn: Callable taking (query_embedding, row_embedding) and
            returning a score where higher means more related. Defaults to
            1 minus the cosine distance.
        top_n: Maximum number of results to return.

    Returns:
        A pair of tuples `(strings, relatednesses)`, sorted from most to
        least related and truncated to `top_n` items. Both tuples are empty
        when `df` has no rows.
    """
    # Nothing to rank: return before spending an API call, and before the
    # zip(*...) unpacking below, which raises ValueError on an empty list.
    if len(df) == 0:
        return (), ()

    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]

    strings_and_relatednesses = []
    for _, row in df.iterrows():
        embedding = row["code_embedding"]
        if isinstance(embedding, str):
            # Embeddings reloaded from CSV are stored as text; parse them
            # back into a list of floats before computing the distance.
            embedding = ast.literal_eval(embedding)
        strings_and_relatednesses.append(
            (row["data"], relatedness_fn(query_embedding, embedding))
        )

    # Highest relatedness first.
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]


In [None]:
# Example: show the five texts most related to a sample query, each preceded
# by its relatedness score.
strings, relatednesses = strings_ranked_by_relatedness(
    "password change",
    df,
    top_n=5,
)
for relatedness, string in zip(relatednesses, strings):
    # The `=` specifier prints the variable name together with its value.
    print(f"{relatedness=:.3f}")
    display(string)

# 3. Ask

With the search function above, we can now automatically retrieve relevant knowledge and insert it into messages to GPT.

Below, we define a function `ask` that:

- Takes a user query
- Searches for text relevant to the query
- Stuffs that text into a message for GPT
- Sends the message to GPT
- Returns GPT's answer

In [None]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` is split into by the tiktoken encoding for `model`."""
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)

In [None]:
def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Return a message for GPT, with relevant source texts pulled from a dataframe.

    Args:
        query: The user's question.
        df: DataFrame passed on to `strings_ranked_by_relatedness` to find
            the texts most related to `query`.
        model: Model name used to count tokens.
        token_budget: Maximum number of tokens the message may reach while
            article sections are being added.

    Returns:
        The introduction, followed by as many article sections (most related
        first) as fit within `token_budget`, followed by the question. The
        introduction and question are always included, even if they alone
        exceed the budget.
    """
    strings, _ = strings_ranked_by_relatedness(query, df)
    introduction = "Below are some articles that may help answer your question. If you cannot find the information you're looking for, please let me know and I'll do my best to assist you further. You can also ask follow-up questions related to the article sections provided."
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        # Each section is set off by a blank line and a heading. (The original
        # literal contained the invalid escape "\ ", which left a stray
        # backslash in the prompt instead of the intended second newline.)
        next_article = f'\n\nArticle section:\n"""\n{string}\n"""'
        if (
            num_tokens(message + next_article + question, model=model)
            > token_budget
        ):
            # Sections are ordered by relevance, so stop at the first one
            # that does not fit.
            break
        message += next_article
    return message + question


In [None]:
def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer `query` with GPT, grounding it in related texts found in `df`.

    Args:
        query: The user's question.
        df: DataFrame of texts and embeddings to search. The default is the
            module-level `df` as it exists when this function is defined.
        model: Chat model used both for token counting and for the answer.
        token_budget: Token limit handed to `query_message` when it
            assembles the prompt.
        print_message: When True, print the assembled prompt before sending.

    Returns:
        The content of the first choice in the chat completion response.
    """
    user_content = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(user_content)

    system_turn = {
        "role": "system",
        "content": "In this section, you will find information on how to manage users and set permissions on our platform. If you can't find the answer you're looking for, feel free to ask us a question.",
    }
    user_turn = {"role": "user", "content": user_content}

    completion = openai.ChatCompletion.create(
        model=model,
        messages=[system_turn, user_turn],
        temperature=0
    )
    return completion["choices"][0]["message"]["content"]

In [None]:
# Example question; `ask` runs with its default dataframe, model and token budget.
ask("how much data is allowed to save")

In [None]:
# Example question about admin details, again using the defaults of `ask`.
ask("how to access admin detail")

In [None]:
# Example question about account creation and permissions, using the defaults of `ask`.
ask("how to make a normal account and what permission will i have ?")

In [None]:
# Sample query text, stored in a variable so it can be embedded on its own.
query = "how to make a normal account and what permission will i have ?"

In [None]:
# Request an embedding for the sample query directly from the embeddings endpoint.
query_embedding_response = openai.Embedding.create(input=query, model=EMBEDDING_MODEL)

In [None]:
# Take the embedding of the first item in the response's "data" list.
query_embedding = query_embedding_response["data"][0]["embedding"]

In [None]:
# Save the dataframe as a UTF-8 CSV, without writing the index.
# NOTE(review): this file lives inside data/, the same directory the loading
# cell scans with glob('data/*'); make sure it is not read back as a source
# document on the next run.
df.to_csv("data/code_search_openai-python.csv", index=False, encoding="utf-8")

In [None]:
# Inspect the `code_embedding` value of the first row (the column must already exist in `df`).
df.code_embedding.iloc[0]