# Lab 1 - Overview of embeddings-based retrieval

Welcome! Here are a few notes about the Chroma course notebooks.
 - A number of warnings pop up when running the notebooks. These are normal and can be ignored.
 - Some operations, such as calling an LLM or an operation using generated data, return unpredictable results, so your notebook outputs may differ from the video.
  
Enjoy the course!

In [None]:
!pip install pypdf langchain openai transformers chromadb sentence_transformers

# Basic RAG

In [None]:
def word_wrap(string, n_chars=72):
    """Wrap `string` onto multiple lines separated by newlines.

    Each line is cut at the last space found within the next `n_chars`
    characters (the space itself is consumed by the line break). If that
    window contains no space at all, the line is hard-broken after `n_chars`
    characters so that no character is lost.

    Args:
        string: The text to wrap.
        n_chars: Window width used to look for a break point; must be >= 1.

    Returns:
        The wrapped text as a single string.

    Raises:
        ValueError: If `n_chars` is smaller than 1.
    """
    if n_chars < 1:
        raise ValueError("n_chars must be a positive integer")

    lines = []
    start = 0
    # Iterative (not recursive) so very long texts cannot hit the recursion limit.
    while len(string) - start >= n_chars:
        window_end = start + n_chars
        break_at = string.rfind(' ', start, window_end)
        if break_at == -1:
            # No space in this window: hard-break without skipping a character.
            lines.append(string[start:window_end])
            start = window_end
        else:
            lines.append(string[start:break_at])
            start = break_at + 1  # skip the space that became the line break
    lines.append(string[start:])
    return '\n'.join(lines)

In [None]:
from pypdf import PdfReader

reader = PdfReader("/content/microsoft_annual_report_2022.pdf")

# Extract and trim the text of every page, keeping only pages that still
# contain something after stripping whitespace.
pdf_texts = [
    page_text
    for page_text in (page.extract_text().strip() for page in reader.pages)
    if page_text
]

In [None]:
# Preview the first 1000 characters of the first non-empty page
first_page_preview = pdf_texts[0][:1000]
print(word_wrap(first_page_preview))

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

In [None]:
# Split on progressively finer separators (paragraph, line, sentence, word,
# character) into chunks of up to 1000 characters with a 100-character overlap.
character_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", ". ", " ", ""],
)

# All pages are joined into one text before splitting
full_text = '\n\n'.join(pdf_texts)
character_split_texts = character_splitter.split_text(full_text)

In [None]:
#print(word_wrap(character_split_texts[10]))
#print(f"\nTotal chunks: {len(character_split_texts)}")

In [None]:
#token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)

#token_split_texts = []
#for text in character_split_texts:
#    token_split_texts += token_splitter.split_text(text)

#print(word_wrap(token_split_texts[10]))
#print(f"\nTotal chunks: {len(token_split_texts)}")

In [None]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# No arguments are passed, so the embedding function is built with the library's defaults.
embedding_function = SentenceTransformerEmbeddingFunction()

In [None]:
# Embed a single chunk (the callable is given a list of texts) and print the raw result.
print(embedding_function([character_split_texts[10]]))

In [None]:
chroma_client = chromadb.Client()

# get_or_create_collection (instead of create_collection) keeps this cell
# re-runnable in the same kernel, where the collection may already exist.
chroma_collection = chroma_client.get_or_create_collection(
    "microsoft_annual_report_2022",
    embedding_function=embedding_function,
)

# Chunk position doubles as the document id
ids = [str(i) for i in range(len(character_split_texts))]

# upsert (instead of add) overwrites ids that are already present on a re-run.
# NOTE: if the chunking changes and yields fewer chunks, higher-numbered ids
# from an earlier run stay in the collection until the kernel is restarted.
chroma_collection.upsert(ids=ids, documents=character_split_texts)
chroma_collection.count()

In [None]:
query = "What was the total revenue?"

# `documents` holds one result list per query text; a single query was sent,
# so the first list contains its three closest chunks.
results = chroma_collection.query(n_results=3, query_texts=[query])
retrieved_documents = results["documents"][0]

In [None]:
# Show each retrieved chunk under a separator line, followed by blank lines
for retrieved_doc in retrieved_documents:
    print(
        '===========================================================',
        word_wrap(retrieved_doc),
        '\n',
        sep='\n',
    )

In [None]:
import os
import openai
from openai import OpenAI
from getpass import getpass

In [None]:
# Use OPENAI_API_KEY from the environment when it is set so the notebook can
# run unattended; otherwise fall back to the interactive (hidden) prompt.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI Key: ")

In [None]:
# Client used for the chat-completion calls; it is given the key stored on `openai.api_key`.
openai_client = OpenAI(api_key = openai.api_key)

In [None]:
def rag(query, retrieved_documents, model="gpt-3.5-turbo"):
    """Answer `query` with a chat model, using the retrieved documents as context.

    Args:
        query: The user's question.
        retrieved_documents: Iterable of document strings. They are joined
            with blank lines and sent to the model as the "Information" part
            of the user message.
        model: Name of the chat-completions model to call.

    Returns:
        The message content of the first completion choice.
    """
    information = "\n\n".join(retrieved_documents)

    messages = [
        {
            "role": "system",
            # Adjacent string literals are concatenated verbatim, so the first
            # one ends with a space to keep the two sentences separated.
            "content": "You are a helpful expert financial research assistant. Your users are asking questions about information contained in an annual report. "
            "You will be shown the user's question, and the relevant information from the annual report. Answer the user's question using only this information."
        },
        {"role": "user", "content": f"Question: {query}. \n Information: {information}"}
    ]

    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
    )
    content = response.choices[0].message.content
    return content

In [None]:
# Generate an answer grounded in the retrieved chunks and print it wrapped
output = rag(query, retrieved_documents)
print(word_wrap(output))

# Query 2

**What were the most important factors that contributed to increases in revenue?**


In [None]:
original_query = "What were the most important factors that contributed to increases in revenue?"

# Retrieve the three closest chunks for the second question
results_q2 = chroma_collection.query(n_results=3, query_texts=[original_query])
retrieved_documents_q2 = results_q2["documents"][0]

In [None]:
# Answer the second question from its retrieved chunks and print it wrapped
output_q2 = rag(original_query, retrieved_documents_q2)
print(word_wrap(output_q2))

## Query Expansion

In [None]:
def augment_multiple_query(query, model="gpt-3.5-turbo"):
    """Ask the chat model for additional questions related to `query`.

    Args:
        query: The original user question.
        model: Name of the chat-completions model to call.

    Returns:
        A list of suggested questions, one per non-empty line of the model's
        reply, with surrounding whitespace removed.
    """
    messages = [
        {
            "role": "system",
            # Adjacent string literals are concatenated verbatim, so every
            # fragment except the last ends with a space.
            "content": "You are a helpful expert financial research assistant. Your users are asking questions about an annual report. "
            "Suggest two additional related questions to help them find the information they need, for the provided question. "
            "Suggest only short questions without compound sentences. Suggest a variety of questions that cover different aspects of the topic. "
            "Make sure they are complete questions, and that they are related to the original question. "
            "Output one question per line. Do not number the questions."
        },
        {"role": "user", "content": query}
    ]

    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
    )
    content = response.choices[0].message.content or ""
    # Skip blank lines so empty strings are never used as retrieval queries.
    return [line.strip() for line in content.split("\n") if line.strip()]

In [None]:
# Ask the LLM for questions related to the original query (returned as a list of strings).
augmented_queries = augment_multiple_query(original_query)

In [None]:
# A dedicated loop variable is used on purpose: iterating with `query` would
# rebind the notebook-level `query` to the last generated question.
for augmented_query in augmented_queries:
    print(augmented_query)

In [None]:
queries = [original_query] + augmented_queries
results = chroma_collection.query(query_texts=queries, n_results=2, include=['documents', 'embeddings'])

# One list of documents per query, in the same order as `queries`
retrieved_documents = results['documents']

# Deduplicate the retrieved documents while keeping first-seen order, so the
# context passed to the LLM is the same on every run (a set has no stable order
# for strings across kernel sessions).
unique_documents = list(dict.fromkeys(
    document for documents in retrieved_documents for document in documents
))

In [None]:
# Walk the queries and their result lists in lockstep and print each group
for asked_query, documents_for_query in zip(queries, retrieved_documents):
    print(f"Query: {asked_query}")
    print('')
    print("Results:")
    for retrieved_doc in documents_for_query:
        print(word_wrap(retrieved_doc))
        print('')
    print('-'*100)

In [None]:
# Answer the original question (not whatever `query` was last bound to),
# using the combined results of all expanded queries as context.
output = rag(query=original_query, retrieved_documents=unique_documents)

print(word_wrap(output))

## Document Retriever with Cross Encoder Re-ranking

In [None]:
query_3 = "What has been the investment in research and development?"
# query_texts is passed as a list, matching the other query calls in this notebook.
# Ten candidates are retrieved so the cross-encoder has something to re-rank.
results_3 = chroma_collection.query(query_texts=[query_3], n_results=10, include=['documents', 'embeddings'])

In [None]:
# Only one query was sent, so take the first list of documents.
retrieved_documents_3 = results_3['documents'][0]

In [None]:
# Show each candidate chunk under a separator line, followed by a blank line
for candidate_doc in results_3['documents'][0]:
    print(
        '===========================================================',
        word_wrap(candidate_doc),
        '',
        sep='\n',
    )

In [None]:
from sentence_transformers import CrossEncoder
# Load the cross-encoder by model name; it is used to score (query, document) pairs.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [None]:
# Pair the query with every candidate document, then score all pairs at once
pairs = []
for candidate in retrieved_documents_3:
    pairs.append([query_3, candidate])
scores = cross_encoder.predict(pairs)

In [None]:
# Display the cross-encoder scores, one per (query, document) pair.
scores

In [None]:
import pandas as pd

In [None]:
# One row per retrieved document, alongside its cross-encoder score
doc_with_scores = pd.DataFrame(dict(doc=retrieved_documents_3, score=scores))

In [None]:
# Rebuild the frame from its inputs instead of re-sorting `doc_with_scores`
# onto itself: calling reset_index() a second time on the already-reset frame
# raises because an "index" column exists, so the cell could not be re-run.
# The "index" column added here records each document's original retrieval position.
doc_with_scores = (
    pd.DataFrame({"doc": retrieved_documents_3, "score": scores})
    .sort_values("score", ascending=False)
    .reset_index()
)
doc_with_scores

In [None]:
# The frame is sorted by score (descending), so the first three rows are the top-ranked documents.
doc_with_scores['doc'][0:3]

In [None]:
# Feed only the three best-scoring documents to the model
top_documents = doc_with_scores['doc'].iloc[:3]
output = rag(query=query_3, retrieved_documents=top_documents)

In [None]:
# Print the model's answer wrapped to the default line width.
print(word_wrap(output))