In [15]:
import os
import pandas as pd
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity
import tiktoken
# Name of the OpenAI embedding model (same value the get_embedding calls in
# this notebook pass as `engine`).
EMBEDDING_MODEL = "text-embedding-ada-002"
# Chat model name; used as the default `model` of num_tokens() and ask().
GPT_MODEL = "gpt-3.5-turbo"

In [2]:
# Embed every article file once and collect text + embedding in one DataFrame.
embeddings = []
# Sorted so the row order of `df` is reproducible across runs and machines.
files = sorted(os.listdir("articles"))
for file in files:
    path = os.path.join("articles", file)
    # Skip sub-directories (e.g. Jupyter's `.ipynb_checkpoints`); open() would
    # raise on them.
    if not os.path.isfile(path):
        continue
    # Explicit encoding so the text does not depend on the platform default.
    # NOTE(review): assumes the article files are UTF-8 encoded -- confirm.
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    # The file is already closed here; the API call does not need the handle.
    embedding = get_embedding(text, engine=EMBEDDING_MODEL)
    embeddings.append({"text": text, "embedding": embedding})
df = pd.DataFrame(embeddings)



In [13]:
# Rank the texts in `df` by embedding similarity to a query string.
def search_reviews(df, product_description, n=3, pprint=True):
    """Return the ``n`` texts in ``df`` most similar to ``product_description``.

    Args:
        df: DataFrame with a ``text`` column and an ``embedding`` column.
        product_description: Query string; it is embedded with
            ``EMBEDDING_MODEL`` and compared against every row's embedding.
        n: Maximum number of results to return.
        pprint: Unused; kept so existing calls that pass it keep working.

    Returns:
        A tuple ``(strings, relatednesses)``: the matching texts and their
        cosine similarities, both ordered from most to least similar.

    The caller's ``df`` is left untouched: the similarity scores live in a
    temporary copy instead of being written into a ``similarity`` column.
    """
    product_embedding = get_embedding(
        product_description,
        engine=EMBEDDING_MODEL
    )
    similarity = df["embedding"].apply(
        lambda x: cosine_similarity(x, product_embedding)
    )

    # assign() returns a new frame, so repeated searches never leak state
    # into the shared DataFrame.
    results = (
        df.assign(similarity=similarity)
        .sort_values("similarity", ascending=False)
        .head(n)
    )
    strings = results["text"].tolist()
    relatednesses = results["similarity"].tolist()
    return strings, relatednesses


# Quick check of the search with a German query; `results` holds the
# (texts, similarity scores) tuple returned by search_reviews.
results = search_reviews(df, "mahlgrad einstellen", n=3)

In [26]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string.

    Args:
        text: The string to tokenize.
        model: Model name used to pick the tiktoken encoding.

    Returns:
        The length of the token sequence tiktoken produces for ``text``.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken cannot map this model name to an encoding; fall back to
        # cl100k_base instead of failing the whole request.
        encoding = tiktoken.get_encoding("cl100k_base")
    # disallowed_special=() makes text such as "<|endoftext|>" count as plain
    # text instead of raising ValueError.
    return len(encoding.encode(text, disallowed_special=()))


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the user prompt for GPT: instructions, article sections, question.

    The three best matches for ``query`` are fetched with ``search_reviews``
    and appended one by one after the instructions. Appending stops at the
    first section that would push the whole prompt (instructions + sections +
    question), counted with ``num_tokens`` for ``model``, past
    ``token_budget``. The question is always appended last.
    """
    articles, _ = search_reviews(df, query, n=3)
    header = 'Use the below articles on the Tchibo coffe machine to answer the subsequent question. If the answer cannot be found in the articles, write "I could not find an answer." First, translate the subsequent question and all the articles into english. Afterwards, return your answer in german. If the question explicitly mentions coffe, please add a little advertisement for the delicious "Tchibo Barista Caffè Crema" coffee beans.'
    footer = f"\n\nQuestion: {query}"

    prompt = header
    for article in articles:
        section = f'\n\nTchibo coffee machine article section:\n"""\n{article}\n"""'
        candidate = prompt + section
        if num_tokens(candidate + footer, model=model) > token_budget:
            # This section does not fit; drop it and every later one.
            break
        prompt = candidate
    return prompt + footer


def ask(
    query: str,
    df: "pd.DataFrame | None" = None,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer a query using GPT and a dataframe of relevant texts and embeddings.

    Args:
        query: The user's question.
        df: Frame handed to ``query_message``. When omitted, the notebook-level
            ``df`` is looked up at call time, so re-running the embedding cell
            is picked up without redefining this function.
        model: Chat model name; also passed on for token counting.
        token_budget: Token limit for the prompt built by ``query_message``.
            Defaults to ``4096 - 500`` -- presumably the context window minus
            headroom for the reply; confirm for other models.
        print_message: If true, print the prompt before sending it.

    Returns:
        The content of the first choice in the chat completion response.

    Raises:
        NameError: If ``df`` is omitted and no notebook-level ``df`` exists.
    """
    if df is None:
        # A `df=df` default would be bound once, at definition time, and go
        # stale as soon as the notebook rebuilds `df`; resolve it lazily.
        try:
            df = globals()["df"]
        except KeyError:
            raise NameError(
                "ask() needs a DataFrame: pass `df` or define a notebook-level `df`"
            ) from None
    message = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": "You answer questions about the Tchibo coffee machine."},
        {"role": "user", "content": message},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )
    response_message = response["choices"][0]["message"]["content"]
    return response_message

In [27]:
# Example question (German); the cell output is the answer string ask() returns.
ask('welche kaffebohnen kann ich verwenden?')

'In den Artikeln wird keine spezifische Marke von Kaffeebohnen erwähnt, aber ich empfehle die Verwendung von Tchibo Barista Caffè Crema Bohnen für ein köstliches Kaffeeerlebnis mit der Tchibo Kaffeemaschine.'