In [47]:
import os
from dotenv import load_dotenv
import numpy as np
import openai
import pandas as pd
import pickle
import tiktoken
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
    wait_random
)  # for exponential backoff

# Load settings via python-dotenv, then hand the API key to the OpenAI client.
# os.getenv returns None when OPENAI_API_KEY is not set, so a missing key only
# surfaces later, on the first API call.
load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')

# Model names used by the completion and embedding calls in the cells below.
COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

In [48]:
# Baseline: send the bare question to the completion model, with no supporting context.
prompt = "What is the medicine for burning eyes?"

# Take the text of the first returned choice and trim surrounding spaces/newlines.
openai.Completion.create(
    prompt=prompt,
    temperature=0,
    max_tokens=300,
    model=COMPLETIONS_MODEL
)["choices"][0]["text"].strip(" \n")

'The most common treatment for burning eyes is artificial tears. Artificial tears are available over-the-counter in the form of eye drops or gels. They help to lubricate the eyes and reduce the burning sensation. Other treatments may include antihistamines, decongestants, and steroid eye drops.'

In [49]:
# Same question as the previous cell, but the prompt now tells the model to admit
# when it is unsure instead of answering anyway.
prompt = """Answer the question as truthfully as possible, and if you're unsure of the answer, say "Sorry, I don't know".

Q: What is the medicine for burning eyes?
A:"""

# Take the text of the first returned choice and trim surrounding spaces/newlines.
openai.Completion.create(
    prompt=prompt,
    temperature=0,
    max_tokens=300,
    model=COMPLETIONS_MODEL
)["choices"][0]["text"].strip(" \n")

"Sorry, I don't know."

In [79]:
# Load the pre-processed symptoms dataset from a local CSV.
# After indexing, each row is one (title, heading) pair with a `content` text column
# and a pre-computed `tokens` count (see the sample output below).

df = pd.read_csv('clarke_symptoms_cleaned.csv')
# df = pd.read_csv('clarke_remedy_info_cleaned_v2.csv')

# Index by (title, heading) so rows can be looked up with the same keys as the embeddings.
df = df.set_index(["title", "heading"])
print(f"{len(df)} rows in the data.")
df.sample(5)

14958 rows in the data.


Unnamed: 0_level_0,Unnamed: 1_level_0,content,tokens
title,heading,Unnamed: 2_level_1,Unnamed: 3_level_1
Zincum Valerianicum,Stool and Anus,"At 2 p.m. evacuation of fÃ¦cal matter, followe...",76
Phytolacca,Female Sexual Organs,Menses: too frequent and copious; mammÃ¦ painf...,388
Equisetum,Male Sexual Organs,Violent erections; in the afternoon. Soreness ...,21
Wiesbaden,Stool and Anus,HÃ¦morrhage from rectum. HÃ¦morrhoidal flow (c...,138
Mercurius Sulphuricus,Generalities,Most pains feel as if a dull stick pressed on ...,41


In [80]:
# Alternative policy with much longer pauses between attempts:
# @retry(wait=wait_random(min=60, max=120), stop=stop_after_attempt(10))
@retry(stop=stop_after_attempt(10), wait=wait_random_exponential(min=1, max=10))
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """
    Request an embedding for `text` from the OpenAI Embeddings API.

    Only the first item of the response's "data" list is used. The call is wrapped in a
    tenacity retry: randomized exponential wait (min=1, max=10), at most 10 attempts.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float]]:
    """
    Embed the `content` of every row in the dataframe, one get_embedding call per row.

    Return a dictionary keyed by each row's index label whose value is that row's embedding vector.
    """
    embeddings_by_index = {}
    for row_label, row in df.iterrows():
        embeddings_by_index[row_label] = get_embedding(row.content)
    return embeddings_by_index

In [81]:
def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
    """
    Read the document embeddings and their keys from a CSV.

    fname is the path (or URL) of a CSV with exactly these named columns:
        "title", "heading", "0", "1", ... up to the length of the embedding vectors.

    Returns a dictionary mapping (title, heading) to the embedding vector, with the
    vector components ordered by their numeric column name regardless of the order
    of the columns in the file. If a (title, heading) pair occurs more than once,
    the last row wins.

    Raises ValueError if the file has no embedding columns (or a column name other
    than "title"/"heading" is not an integer), and KeyError if a dimension column
    between "0" and the largest dimension is missing.
    """
    df = pd.read_csv(fname, header=0)

    dim_indexes = [int(c) for c in df.columns if c not in ("title", "heading")]
    if not dim_indexes:
        raise ValueError(f"{fname!r} has no embedding columns besides 'title' and 'heading'")

    # Select the dimension columns once, in numeric order, and convert the whole block
    # in one go instead of doing a label lookup per cell for every row.
    dim_columns = [str(i) for i in range(max(dim_indexes) + 1)]
    vectors = df[dim_columns].to_numpy(dtype=float).tolist()

    return {
        (title, heading): vector
        for title, heading, vector in zip(df["title"], df["heading"], vectors)
    }

Again, we have hosted the embeddings for you so you don't have to re-calculate them from scratch.

In [65]:
# Load pre-computed embeddings from a CSV. The commented-out lines are alternative
# embedding files; exactly one load_embeddings call should be active.
# document_embeddings = load_embeddings("https://cdn.openai.com/API/examples/data/olympics_sections_document_embeddings.csv")
document_embeddings = load_embeddings("clarke_symptoms_cleaned_embeddings_v2.csv")
# document_embeddings = load_embeddings("clarke_remedy_info_cleaned_v2_embeddings_v2.csv")
# document_embeddings = load_embeddings("clarke_symptoms_cleaned_v2_nose_embeddings_v2.csv")

# ===== OR, uncomment the line below to recalculate the embeddings from scratch. ========
# Note: compute_doc_embeddings makes one Embeddings API call per row of df.

# document_embeddings = compute_doc_embeddings(df)

In [66]:
# Optional export of freshly computed embeddings (left disabled).
# NOTE(review): pd.DataFrame(document_embeddings.items()) yields two columns (the key tuple
# and the whole vector as one cell), which is not the "title", "heading", "0", "1", ...
# layout that load_embeddings reads -- the file would need reshaping before it can be loaded.
# pd.DataFrame(document_embeddings.items()).to_csv('clarke_remedy_info_cleaned_v2_embeddings.csv', index=False)
# pd.DataFrame(document_embeddings.items()).to_csv('clarke_symptoms_cleaned_v2_nose_embeddings.csv', index=False)

In [82]:
# Peek at the first stored embedding: its key, the first five components, and its length.
example_entry = list(document_embeddings.items())[0]
example_key, example_vector = example_entry
print(f"{example_key} : {example_vector[:5]}... ({len(example_vector)} entries)")

('Abies Canadensis', 'Mind') : [-0.000476399, 0.004719749, 0.01499975, -0.018661214, -0.015802821]... (1536 entries)


In [83]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Compute the dot product of two vectors.

    OpenAI embeddings are normalized to length 1, so for them the dot product
    is the same as the cosine similarity.
    """
    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)

def order_document_sections_by_query_similarity(
    query: str,
    contexts: dict[tuple[str, str], list[float]],
) -> list[tuple[float, tuple[str, str]]]:
    """
    Find the query embedding for the supplied query, and compare it against all of the
    pre-calculated document embeddings to find the most relevant sections.

    query is the free-text question; contexts maps each section key (title, heading)
    to its embedding vector.

    Return a list of (similarity, section key) tuples sorted by similarity in descending
    order. Sections with equal similarity are ordered by their key, also descending,
    because whole tuples are compared.
    """
    # Convert the query embedding to an array once, so vector_similarity does not have
    # to rebuild it from a Python list for every document.
    query_embedding = np.array(get_embedding(query))

    document_similarities = sorted(
        (
            (vector_similarity(query_embedding, doc_embedding), doc_index)
            for doc_index, doc_embedding in contexts.items()
        ),
        reverse=True,
    )

    return document_similarities

In [85]:
# Show the five best-matching (similarity, (title, heading)) entries for a sample query.
order_document_sections_by_query_similarity("What are the best medicines for frequent colds with sneezing that increases in the morning?", document_embeddings)[:5]

[(0.8591674604630242, ('Asterias Rubens', 'Nose')),
 (0.8566668020506362, ('Scorpio', 'Nose')),
 (0.8521343794579934, ('NymphÃ¦a Odorata', 'Respiratory Organs')),
 (0.8512689754025645, ('Lilium Tigrinum', 'Nose')),
 (0.8480207687236702, ('Stachys Betonica', 'Nose'))]

In [95]:
MAX_SECTION_LEN = 2000  # token budget for the context sections placed in a prompt
SEPARATOR = "\n* "  # prefix put in front of every context section
# tiktoken maps text-davinci-003 to "p50k_base"; the "gpt2" encoding is the one for GPT-2.
ENCODING = "p50k_base"  # encoding for text-davinci-003

encoding = tiktoken.get_encoding(ENCODING)
# Token cost of one separator, added to each section's own token count when budgeting.
separator_len = len(encoding.encode(SEPARATOR))

f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [109]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Build the completion prompt for a question from the most relevant document sections.

    question is the user's query. context_embeddings maps each section key
    (title, heading) to its embedding vector. df is indexed by the same
    (title, heading) keys and provides each section's `content` text and
    `tokens` count.

    Sections are taken in descending order of similarity to the question, and each
    costs its `tokens` count plus the separator's token length. Collection stops at
    the first section that would push the running total above MAX_SECTION_LEN;
    lower-ranked sections are not considered after that, even if they are shorter.

    Returns the instruction header, the selected sections (each formatted as
    "Medicine: <title>, Area: <heading>, Symptoms: <content>" with newlines in
    the content replaced by spaces), and the question followed by an answer cue.
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    chosen_sections = []
    chosen_sections_len = 0

    for _, section_index in most_relevant_document_sections:
        document_section = df.loc[section_index]

        # Charge the section against the budget before adding it; stop on the first overflow.
        chosen_sections_len += document_section.tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break

        chosen_sections.append(
            SEPARATOR
            + 'Medicine: ' + section_index[0]
            + ', Area: ' + section_index[1]
            + ', Symptoms: ' + document_section.content.replace("\n", " ")
        )

    header = """Answer the question as truthfully as possible using only from the provided context. Give the best 5 medicines along with their symptoms.\n\nContext:\n"""

    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

In [110]:
# Build the full prompt for a sample question and print it to inspect the selected context.
prompt = construct_prompt(
    "What are the best medicines for frequent colds with sneezing that increases in the morning?",
    document_embeddings,
    df
)

print("===\n", prompt)

===
 Answer the question as truthfully as possible using only from the provided context. Give the best 5 medicines along with their symptoms.

Context:

* Medicine: Asterias Rubens, Area: Nose, Symptoms: Epistaxis; sneezing and coryza in the morning on waking.
* Medicine: Scorpio, Area: Nose, Symptoms: Frequent sneezing.
* Medicine: NymphÃ¦a Odorata, Area: Respiratory Organs, Symptoms: Some cough in morning.
* Medicine: Lilium Tigrinum, Area: Nose, Symptoms: Sneezing and fluent coryza all day. Running from nose renewed after a glass of cold water; feeling of heat in face and over whole body as in fever; after a while prickling as in perspiration with only moist skin. Snuff has a greater effect than usual in evening.
* Medicine: Stachys Betonica, Area: Nose, Symptoms: Frequent sneezing; on going indoors from open air; increased nasal mucus.
* Medicine: Sulphurosum Acidum, Area: Nose, Symptoms: Sneezing and coryza.
* Medicine: Mimosa, Area: Nose, Symptoms: Sneezing and coryza.
* Medicine

In [111]:
# Keyword arguments shared by the completion requests that answer questions.
# Temperature 0.0 is used because it gives the most predictable, factual answer.
COMPLETIONS_API_PARAMS = dict(
    temperature=0.0,
    max_tokens=300,
    model=COMPLETIONS_MODEL,
)

In [113]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[tuple[str, str], list[float]],
    show_prompt: bool = False
) -> str:
    """
    Answer a question with the completion model, grounded in the most relevant sections.

    query is the user's question. df holds the section texts indexed by (title, heading).
    document_embeddings maps the same keys to their embedding vectors. When show_prompt
    is true, the constructed prompt is printed before the request is sent.

    Returns the text of the first completion choice with surrounding spaces and
    newlines removed.
    """
    prompt = construct_prompt(query, document_embeddings, df)

    if show_prompt:
        print(prompt)

    response = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)

    return response["choices"][0]["text"].strip(" \n")

In [115]:
# Sample query: cold with sneezing on waking. (A stray trailing character after the call made this cell a syntax error.)
answer_query_with_context("Medicine for frequent cold with sneezing upon waking up?", df, document_embeddings)

'The best 5 medicines for frequent cold with sneezing upon waking up are: \n1. Asterias Rubens: Nose, Symptoms: Epistaxis; sneezing and coryza in the morning on waking.\n2. Baryta Muriatica: Nose, Symptoms: Sneezing in sleep. Coryza.\n3. Plectranthus: Nose, Symptoms: Excessive catarrh, frequent inclination to sneeze; great stoppage with at times discharge of tenacious yellow mucus. Dryness, mornings.\n4. Lilium Tigrinum: Nose, Symptoms: Sneezing and fluent coryza all day. Running from nose renewed after a glass of cold water; feeling of heat in face and over whole body as in fever; after a while prickling as in perspiration with only moist skin. Snuff has a greater effect than usual in evening.\n5. Cornus Alternifolia: Nose, Symptoms: Sneezing; head partially stopped up towards night.'

In [117]:
# Sample query: headache at the top of the head that eases with firm pressure.
answer_query_with_context("Medicine for headache at top of head. Headache feels better after pressing hard.", df, document_embeddings)