In [None]:
def set_jupyter_widescreen():
    """Inject a CSS override that widens the notebook, menubar and toolbar containers."""
    from IPython.display import HTML, display

    # Kept as one literal so the emitted markup is exactly what the browser receives.
    widescreen_css = """
    <style>
        div#notebook-container    {width: 95%; }
        div#menubar-container     {width: 65%; }
        div#maintoolbar-container {width: 99%; }
    </style>
    """
    display(HTML(data=widescreen_css))


# Apply the override as soon as the cell runs.
set_jupyter_widescreen()

In [None]:
# installing packages for text extraction
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install langchain
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install unstructured==0.5.6
# !/Volumes/develop/anaconda3/envs/llm/bin/pip show pdfminer.six
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install --upgrade langchain pdfminer.six
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install pypdf
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install pymupdf
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install faiss-cpu

In [None]:
import openai
import os
import pandas as pd
import numpy as np
from langchain.document_loaders import PyMuPDFLoader

# Show every row and column when a DataFrame is displayed (None = no limit).
pd.options.display.max_rows = None
pd.options.display.max_columns = None
# Disable pandas' chained-assignment check.
pd.set_option('mode.chained_assignment', None)

In [None]:
# from langchain.document_loaders import UnstructuredFileLoader
# loader = UnstructuredFileLoader('./sample_form_150522.pdf')
# documents = loader.load()
# documents_content = '\n'.join(doc.page_content for doc in documents)

#### UnstructuredFileLoader doesn't capture the ticked boxes in the PDF

In [None]:
# Load the sample PDF; `data` is the list of documents returned by the loader.
loader = PyMuPDFLoader("./sample_form_150522.pdf")
data = loader.load()
#print(data[19].page_content)
# Preview the last 50 characters of every loaded entry. Iterating over `data`
# itself (instead of a hard-coded range(82)) avoids an IndexError when the file
# yields fewer entries and does not silently skip entries beyond the 82nd.
for i, page in enumerate(data):
    print(i, '---', page.page_content[-50:])
# Concatenate all page texts into one newline-separated string for chunking.
doc_content = '\n'.join(doc.page_content for doc in data)

* Split the document into chunks

In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
# Chunks of at most 1000 units with a 200-unit overlap between neighbours; the
# unit is whatever `length_function` measures, here len(), i.e. characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

In [None]:
# Preview the first 2000 characters of the concatenated document text.
print(doc_content[:2000])

In [None]:
# Split the concatenated document text into chunks with `text_splitter`.
doc_chunks = text_splitter.split_text(doc_content)

In [None]:
# Print the first and last 300 characters of every chunk so the overlap
# between consecutive chunks can be inspected by eye.
for i, chunk in enumerate(doc_chunks):
    print(f'\nchunk: {i}\n')
    print(chunk[:300])
    print(50 * '-')
    print(chunk[-300:])
    print(100 * '*')

* Get OpenAI embeddings

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file (if any) into the environment.
found = load_dotenv(find_dotenv())
# Check for the key itself rather than for the .env file: the file may exist
# without the key, and the key may already be exported by the shell. The former
# `os.environ[...] = os.getenv(...)` line was a no-op when the key was present
# and raised TypeError (None is not a valid environ value) when it was not.
if not os.getenv('OPENAI_API_KEY'):
    print("couldn't find the key")


def get_doc_search(texts, embedding_model=None):
    """Build a FAISS vector store from `texts` using OpenAI embeddings.

    Args:
        texts: the strings to embed and index.
        embedding_model: optional model name; when given it is passed as both
            `model` and `deployment` to OpenAIEmbeddings, otherwise
            OpenAIEmbeddings is constructed with its own defaults.

    Returns:
        Whatever `FAISS.from_texts` returns for the texts and embeddings object.
    """
    embedding_kwargs = (
        {}
        if embedding_model is None
        else {"model": embedding_model, "deployment": embedding_model}
    )
    return FAISS.from_texts(texts, OpenAIEmbeddings(**embedding_kwargs))

In [None]:
# (Removed an incomplete `embeddings.` expression: a dangling attribute access is
# a SyntaxError, and `embeddings` only exists as a local inside get_doc_search.)

In [None]:
# Embed the chunks and index them; no embedding_model is passed, so
# get_doc_search falls back to the OpenAIEmbeddings defaults.
doc_search = get_doc_search(doc_chunks)

In [None]:
# Sample question, run as a similarity search against the vector store.
query = 'how many employees do you have?'
documents = doc_search.similarity_search(query)

In [None]:
# Print only the retrieved documents whose text contains the (case-sensitive)
# substring 'employee'.
for d in documents:
    if 'employee' in d.page_content:
        print(d.page_content)

In [None]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
# Completion model: temperature 0, answers capped at 250 tokens.
llm = OpenAI(
    temperature=0,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    max_tokens=250,
)
# "map_rerank" QA chain; intermediate steps are returned alongside the answer.
chain = load_qa_chain(
    llm,
    chain_type="map_rerank",
    return_intermediate_steps=True,
)

In [None]:
from langchain.callbacks import get_openai_callback

In [None]:
# Run the chain inside the OpenAI callback so token usage and cost are recorded.
with get_openai_callback() as cb:
    results = chain(
        {"input_documents": documents, "question": query},
        return_only_outputs=False,
    )
    # One print call, one line per metric (same output as four separate prints).
    print(
        f"Total Tokens: {cb.total_tokens}",
        f"Prompt Tokens: {cb.prompt_tokens}",
        f"Completion Tokens: {cb.completion_tokens}",
        f"Total Cost (USD): ${cb.total_cost}",
        sep="\n",
    )

In [None]:
# Invoke the chain again on the same documents and question, this time asking
# for the outputs only; `results` is overwritten with the new return value.
results = chain(
    {"input_documents": documents, "question": query},
    return_only_outputs=True,
)

In [None]:
# Display the chain output.
results

In [None]:
# Answer text of the first intermediate step, with surrounding whitespace removed.
results['intermediate_steps'][0]['answer'].strip()

In [None]:
# adapted from https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"):
    """Count the tokens a list of chat messages takes up for `model`.

    Args:
        messages: iterable of dicts; every value of every dict is tokenized.
        model: model name. Dated snapshots have fixed per-message overheads;
            other names containing "gpt-3.5-turbo" or "gpt-4" print a warning
            and are counted as the corresponding -0613 snapshot.

    Returns:
        Total token count, including 3 tokens for priming the reply.

    Raises:
        NotImplementedError: if `model` matches none of the known names.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        tokenizer = tiktoken.get_encoding("cl100k_base")

    # (tokens added per message, tokens added when a "name" key is present)
    fixed_overheads = {
        "gpt-3.5-turbo-0613": (3, 1),
        "gpt-3.5-turbo-16k-0613": (3, 1),
        "gpt-4-0314": (3, 1),
        "gpt-4-32k-0314": (3, 1),
        "gpt-4-0613": (3, 1),
        "gpt-4-32k-0613": (3, 1),
        # every message follows <|start|>{role/name}\n{content}<|end|>\n;
        # if there's a name, the role is omitted
        "gpt-3.5-turbo-0301": (4, -1),
    }

    if model not in fixed_overheads:
        # Undated aliases are counted as their -0613 snapshot.
        if "gpt-3.5-turbo" in model:
            print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
            return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
        if "gpt-4" in model:
            print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
            return num_tokens_from_messages(messages, model="gpt-4-0613")
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )

    per_message, per_name = fixed_overheads[model]
    total = 0
    for message in messages:
        total += per_message
        for field, text in message.items():
            total += len(tokenizer.encode(text))
            if field == "name":
                total += per_name
    # every reply is primed with <|start|>assistant<|message|>
    return total + 3

In [None]:
import tiktoken
# Model whose tokenizer is used for the token counts in the following cells.
# (A `model = 'text-davinci-003'` assignment that was overwritten on the very
# next line has been dropped; it never took effect.)
model = 'gpt-3.5-turbo'
encoding = tiktoken.encoding_for_model(model)
#num_tokens_from_messages([doc.page_content for doc in documents], model="gpt-3.5-turbo-0613")

In [None]:
# Total number of tokens across the text of all retrieved documents.
num_tokens = sum(len(encoding.encode(d.page_content)) for d in documents)

In [None]:
from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file (if any) into the environment.
found = load_dotenv(find_dotenv())
# Gate on the key itself, not on whether a .env file was located: a .env file
# without the key used to set openai.api_key to None without any message, and a
# key exported by the shell used to be reported as missing.
if os.getenv('OPENAI_API_KEY'):
    openai.api_key = os.getenv('OPENAI_API_KEY')
else:
    print("couldn't find the key")

In [None]:
# adapted from https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
# NOTE(review): num_tokens_from_messages is already defined in an earlier cell of
# this notebook; whichever of the two cells runs last is the definition in effect.
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"):
    """Return the number of tokens used by a list of messages.

    Args:
        messages: iterable of dicts; every value of every dict is tokenized.
        model: model name used to pick the tokenizer and per-message overheads.

    Returns:
        The summed token count, plus 3 tokens for priming the reply.

    Raises:
        NotImplementedError: if `model` is not one of the listed snapshots and
            contains neither "gpt-3.5-turbo" nor "gpt-4".
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown to tiktoken: fall back to the cl100k_base encoding.
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model in {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
        }:
        tokens_per_message = 3
        tokens_per_name = 1
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif "gpt-3.5-turbo" in model:
        # Undated alias: warn and count as the -0613 snapshot.
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        # Undated alias: warn and count as the -0613 snapshot.
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        # Every value in the message dict is tokenized, whatever its key.
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

In [None]:
from openai.embeddings_utils import get_embedding

# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base" 
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191
cost = 0.0001/1000 # dollar per tokens

# get the token numbers for the document chunks
# Look the tokenizer up by the *embedding* model: the previous lookup used the
# chat `model` variable while the printout and costs refer to embedding_model.
encoding = tiktoken.encoding_for_model(embedding_model)
print(f'encoding name for {embedding_model}: {encoding.name}')
print('--')
# save embeddings: one row per chunk with its token count, embedding and cost
df = pd.DataFrame({'text': doc_chunks})
df['tokens'] = df['text'].apply(lambda x: len(encoding.encode(x)))
# One get_embedding call per chunk.
df['embedding'] = df['text'].apply(lambda x: get_embedding(x, engine=embedding_model))
df['cost'] = cost*df['tokens']

# Report the longest chunk by column name instead of sorting the whole frame
# twice and reading the token count by column position.
print(f"maximum tokens:(chunk number:{df['tokens'].idxmax()}, token numbers: {df['tokens'].max()})")

print('--')

print(f'Total embedding cost: ${np.round(df.cost.sum(),4)}')
df.to_csv("./sample_form_150522_chunks_with_embeddings.csv")

In [None]:
from openai.embeddings_utils import  cosine_similarity

In [None]:
# Sample question and its embedding from the module-level `embedding_model`.
query = 'how many employees do you have?'
query_embedding = get_embedding(query, engine=embedding_model)

In [None]:
# TODO:  could be a class with embedding attributes and other functions such as similarity search etc.
def num_tokens(x, model='text-embedding-ada-002'):
    """Return how many tokens `x` encodes to with the tokenizer tiktoken picks for `model`."""
    return len(tiktoken.encoding_for_model(model).encode(x))

def calc_embedding_cost(x, cost=0.0001/1000):
    """Return the token count of `x` (per `num_tokens` defaults) times `cost` per token."""
    token_count = num_tokens(x)
    return token_count * cost


def retrieve_relevant_text(query, vectordb, top_n=3, verbose=True):
    """Return the `top_n` rows of `vectordb` most similar to `query`.

    Args:
        query: the question text; embedded with the module-level `embedding_model`.
        vectordb: DataFrame with a "text" column and an "embedding" column.
            It is NOT modified: scores are computed on a copy, so callers no
            longer need to pass `df.copy()` to protect their frame.
        top_n: number of best-scoring rows to keep.
        verbose: when true, print the embedding cost of the query.

    Returns:
        A list of `(text, similarity_score)` tuples, ordered by the rows'
        index (document order) rather than by score.
    """
    # get the cost
    cost = calc_embedding_cost(query)

    # get query's embedding
    query_embedding = get_embedding(query, engine=embedding_model)

    # Score every stored embedding against the query. `assign` returns a new
    # frame, so the caller's DataFrame does not gain a similarity_score column.
    scored = vectordb.assign(
        similarity_score=vectordb["embedding"].apply(
            lambda emb: cosine_similarity(emb, query_embedding)
        )
    )

    # Keep the best matches, then restore index order to maintain the semantic
    # connectedness of the document.
    top_matches = (
        scored.sort_values(by="similarity_score", ascending=False)
        .head(top_n)
        .sort_index()
    )

    if verbose:
        print(f'query embedding cost:${cost}')

    return list(zip(top_matches["text"], top_matches["similarity_score"]))

In [None]:
def prompt_tempelate(query, vectordb, model, max_tokens = 1000, verbose=True):
    """Assemble the GPT prompt: instructions, retrieved texts, then the question.

    The three most relevant texts are fetched with `retrieve_relevant_text` and
    appended one by one; as soon as adding the next text would push the prompt
    (measured with `num_tokens` for `model`) above `max_tokens`, a notice is
    printed and no further texts are added.

    Returns:
        The prompt string.
    """
    candidates = retrieve_relevant_text(query, vectordb, top_n=3, verbose=verbose)
    intro = """Answer the following Question based on the Context only. Only answer from the Context. If you don't know the answer, say 'I don't know'."""
    question = f"\n\nQuestion: {query}"

    pieces = [intro]
    for text, _score in candidates:
        block = f'\n\n{text}\n\n'
        # Budget check always includes the trailing question.
        if num_tokens(''.join(pieces) + block + question, model=model) > max_tokens:
            print('stopped using texts from the database...')
            print(f'-- GPT prompt tokens exceeds maximum token limit ({max_tokens})')
            break
        pieces.append(block)

    pieces.append(question)
    return ''.join(pieces)

In [None]:
def ask(
    query,
    vectordb,
    model="gpt-3.5-turbo",
    max_tokens=1000,
    verbose=False,
    temperature=0,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
):
    """Answers a query using GPT and a dataframe of relevant texts and embeddings.

    Args:
        query: the question to answer.
        vectordb: DataFrame of texts and embeddings, forwarded to `prompt_tempelate`.
        model: chat model name.
        max_tokens: used twice: as the prompt budget given to `prompt_tempelate`
            and as the completion `max_tokens` of the API call.
        verbose: when true, print the prompt and an estimated cost.
        temperature, top_p, frequency_penalty, presence_penalty: sampling
            settings forwarded to the API call. They used to be ignored in
            favour of hard-coded values equal to these defaults.

    Returns:
        The content string of the first choice in the API response.
    """
    message = prompt_tempelate(query, vectordb=vectordb, model=model, max_tokens=max_tokens, verbose=verbose)

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": message},
    ]

    if verbose:
        print(f"{''.join(20*['-'])}\n")
        print(message)
        print(f"{''.join(20*['-'])}")
        # Cost of the prompt messages only. Rounded to 6 decimals: rounding to
        # whole dollars (the previous behaviour) always printed 0.0.
        prompt_cost = num_tokens_from_messages(messages=messages, model=model)*0.003/1000
        print(f'Total GPT cost: ${np.round(prompt_cost, 6)}')

    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
    )
    response_message = response["choices"][0]["message"]["content"]
    return response_message

In [None]:
# Ask the sample question against a copy of the embeddings frame.
ask(query, df.copy(), max_tokens=1250, verbose=False)

In [None]:
# NOTE(review): `tt` is only assigned in a later cell of this notebook, so on a
# fresh kernel this cell raises NameError unless that cell has been run first.
# (The redundant `test_list = []` pre-assignment was dropped: it was overwritten
# immediately.)
test_list = [(row["text"], row['similarity_score']) for i, row in tt.iterrows()]
    
    
    

In [None]:
# Display the top-matching rows.
tt

In [None]:
# Display the (text, similarity_score) pairs.
test_list

In [None]:
# Function to get prompt messages
def get_prompt(row):
    """Build the two-message chat prompt (system + user) for one row.

    `row` must expose `question` and `context` attributes; both are
    interpolated into the user message after the answering instructions.

    Returns:
        A list of two message dicts: the system message, then the user message.
    """
    instruction = "Answer the following Question based on the Context only. Only answer from the Context. If you don't know the answer, say 'I don't know'."
    # Indentation and blank lines are part of the prompt text; keep them as-is.
    user_content = (
        f"{instruction}\n"
        f"    Question: {row.question}\n\n\n"
        f"    Context: {row.context}\n\n\n"
        "    Answer:\n"
    )
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {"role": "user", "content": user_content}
    return [system_message, user_message]

In [None]:
# Cosine similarity between every stored chunk embedding and the query embedding.
# Uses `query_embedding` (the name assigned earlier in the notebook); the
# previous `query_embeddings` was never defined and raised NameError.
test = df.embedding.apply(lambda x: cosine_similarity(x, query_embedding))

In [None]:
# Peek at the first few similarity scores.
pd.DataFrame(test).head()

In [None]:
# Join the scores onto the chunk frame by index, rename the merged score
# column to similarity_score, and keep the three highest-scoring rows.
tt = (
    df.merge(test, left_index=True, right_index=True)
    .rename(columns={'embedding_y': 'similarity_score'})
    .sort_values(by='similarity_score', ascending=False)
    .head(3)
)

In [None]:
# Show only the text and score of the top matches.
tt[['text', 'similarity_score']]

In [None]:
# Print the text of the highest-scoring chunk (first row of `tt`).
print(tt['text'].iloc[0])