In [48]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pdf2image import convert_from_path
import pytesseract
import re
import streamlit as st

# Load API Key

In [35]:
load_dotenv()

oa = OpenAI()
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index_name = 'beige-book-20240529'

# Import Beige Book PDF and OCR It

In [3]:
# https://stackoverflow.com/questions/18381713/how-to-install-poppler-on-windows
# https://stackoverflow.com/questions/53481088/poppler-in-path-for-pdf2image
# Convert PDF to image
pdf_file = 'BeigeBook_20240529.pdf'
pages = convert_from_path(pdf_file, poppler_path=r'C:\Program Files\poppler-24.02.0\library\bin') 

In [4]:
# Define OCR function
def extract_text_from_image(image):
    text = pytesseract.image_to_string(image)
    return text

In [8]:
# https://stackoverflow.com/questions/51677283/tesseractnotfounderror-tesseract-is-not-installed-or-its-not-in-your-path
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [9]:
# Create a list to store extracted text from all pages
extracted_text = []

for page in pages[2:]:
    text = extract_text_from_image(page)
    extracted_text.append(text)

In [15]:
def clean_extracted_text(texts):
    """
    Remove new lines and page numbers
    """

    clean_texts = []

    for text in texts:
        clean = re.sub(r'\d+\n\n', '', text)
        clean = re.sub(r'\n\n', '', clean)
        clean = re.sub(r'\n', '', clean)
        clean_texts.append(clean)
    
    return clean_texts

clean_texts = clean_extracted_text(extracted_text)

clean_texts

['About This PublicationWhat is the Beige Book?The Beige Book is a Federal Reserve System publication about current economic conditionsacross the 12 Federal Reserve Districts. It characterizes regional economic conditions and pros-pects based on a variety of mostly qualitative information, gathered directly from each District’ssources. Reports are published eight times per year.What is the purpose of the Beige Book?The Beige Book is intended to characterize the change in economic conditions since the lastreport. Outreach for the Beige Book is one of many ways the Federal Reserve System engageswith businesses and other organizations about economic developments in their communities.Because this information is collected from a wide range of contacts through a variety of formaland informal methods, the Beige Book can complement other forms of regional information gath-ering. The Beige Book is not a commentary on the views of Federal Reserve officials.How is the information collected?Each F

In [24]:
# open file
with open('beige_book_20240529.txt', 'w+') as f:
     
    # write elements of list
    for items in clean_texts:
        f.write('%s\n' %items)
     
    print("File written successfully")
 
 
# close the file
f.close()

File written successfully


# Create Beige Book Embeddings and Save in Pinecone

In [20]:
# Function to embed chunk and query
def embed_chunk(text):

    response = oa.embeddings.create(
        input=text,
        model="text-embedding-3-small"
    )

    print(response)

    embedding = response.data[0].embedding

    return embedding

In [21]:
# Create your Pinecone index
def create_index(index):

    try:

        pc.delete_index(index)

    except:

        print("Index does not exist, creating new index...")

    pc.create_index(
        name=index,
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(
            region='us-east-1',
            cloud='aws'
        )
    )

In [22]:
chunk_size = 1500
chunk_overlap = 150

# This function will split the text into chunk and upsert them
def upsert_chunks_from(text_file):

    index = pc.Index(index_name)

    with open(text_file, 'r', encoding='utf-8') as file:

        text = file.read()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_text(text)

    for i, chunk in enumerate(chunks):

        dict = {
            "id": str(i),
            "values": embed_chunk(chunk),
            "metadata": {
                "chunk": chunk,
            }
        }

        print(dict)

        index.upsert(vectors=[dict])


In [37]:
create_index(index_name)
upsert_chunks_from('BeigeBook_20240529.txt')

CreateEmbeddingResponse(data=[Embedding(embedding=[-0.008584226481616497, 0.002536866581067443, 0.03817024081945419, 0.008611412718892097, -0.006776305381208658, -0.01884043961763382, -0.0007421141490340233, 0.02854611910879612, 0.0002693190472200513, -0.02354375272989273, 0.02876361459493637, 0.010861119255423546, -0.07601424306631088, -0.013369099237024784, 0.03814305365085602, 0.030394820496439934, -0.0062019843608140945, 0.022401908412575722, -0.05309578403830528, 0.00751714501529932, -0.0018028233898803592, -0.00789096299558878, -0.05054022744297981, -0.010106685571372509, 0.000866578659042716, -0.02990545891225338, -0.02672460488975048, -0.03523406758904457, 0.007401600945740938, -0.012791380286216736, 0.010120279155671597, -0.015496465377509594, 0.042873553931713104, -0.014925542287528515, -0.002623524283990264, -0.000290983502054587, -0.0004774678382091224, -0.03637591376900673, 0.06595513224601746, 0.011690315790474415, 0.011221343651413918, 0.006772906985133886, -0.0137497140

# Query Embeddings

In [40]:
# Retrieve chunks from Pinecone
def retrieve_chunks(query, no_of_chunks=3):

    # The query is embedded before querying Pinecone
    embedding = embed_chunk(query)

    index = pc.Index(index_name)

    response = index.query(
        vector= embedding,
        top_k=no_of_chunks,
        include_metadata=True
    )

    retrieved_chunks = ""

    for match in response['matches']:

        retrieved_chunks += "________________________________\n"
        retrieved_chunks += "EXCERPT\n"
        retrieved_chunks += "-------\n"
        retrieved_chunks += match['metadata']['chunk'] + '\n'

    return retrieved_chunks

In [41]:
# Test if the results are relevant?
chunks = retrieve_chunks('What does the Federal Reserve Bank say about private equity?')
print(chunks)

CreateEmbeddingResponse(data=[Embedding(embedding=[-0.0015254856552928686, -0.032294124364852905, 0.044922277331352234, 0.045346543192863464, 0.0359877347946167, 0.06259170174598694, -0.03606260567903519, 0.09623350203037262, 0.021125948056578636, -0.0026516621001064777, 0.05021312087774277, -0.03701096400618553, -0.019054532051086426, -0.017943954095244408, -0.008684973232448101, 0.016022277995944023, 0.011916881427168846, -0.0018421253189444542, 0.011785858310759068, 0.03611252084374428, 0.006298477295786142, 0.0030618899036198854, -0.03646191582083702, 0.017407381907105446, -0.03007296845316887, -0.01622193306684494, 0.007967464625835419, -0.011436463333666325, 0.0010084115201607347, -0.01823095791041851, 0.07092728465795517, -0.02311001531779766, 0.027452502399683, -0.01427530124783516, -0.04789213836193085, 0.0378095842897892, -0.03147055208683014, 0.004776111338287592, 0.04507201910018921, 0.009290176443755627, 0.02409580908715725, -0.019042054191231728, -0.00972692109644413, -0.

# Chatbot

In [42]:
# Create the system message and include the relevant context
def inject_context_data(context):
    # Edit this system message
    system_message = f"""
        You are 'Federal Reservve Bot' and you will summarize what the Federal Reserve has said in their Beige Book in a way a teenager can understand based on
        texts that we will provide as context.
        
        Here are the relevant textss:
    
    {context}
    """
    print("+++++++++++++++++++++++++++++++++++++++++++++++")
    print("Your system message is: ")
    print("------------------------------------------------")
    print(system_message)
    print("+++++++++++++++++++++++++++++++++++++++++++++++")

    return system_message

In [43]:
def respond_to_question(question):
    context_data = retrieve_chunks(question)
    system_message = inject_context_data(context_data)

    # Call the OpenAI API with your systems message and question
    response = oa.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": question},
        ]
    )

    # Parse the response to an answer and return it
    return response.choices[0].message.content

In [44]:
answer = respond_to_question(
    'What does the Federal Reserve say about private equity?',
    )

print('Answer:', answer)

CreateEmbeddingResponse(data=[Embedding(embedding=[-0.004442309495061636, -0.028571128845214844, 0.043508317321538925, 0.044360436499118805, 0.03857102617621422, 0.06852058321237564, -0.03318260982632637, 0.09142761677503586, 0.019623855128884315, -0.004780651535838842, 0.05378389731049538, -0.033784106373786926, -0.022719061002135277, -0.00894100870937109, -0.009717943146824837, 0.013195350766181946, 0.00983698945492506, -0.004743058234453201, 0.009981098584830761, 0.039523396641016006, 0.005729889962822199, 0.001956433057785034, -0.035187602043151855, 0.020676475018262863, -0.028044819831848145, -0.026115015149116516, 0.007819467224180698, -0.011810652911663055, -0.002536000916734338, -0.01605246402323246, 0.07603930681943893, -0.022656403481960297, 0.029147563502192497, -0.01421037781983614, -0.046114806085824966, 0.036591093987226486, -0.02614007703959942, 0.0032800408080220222, 0.03954845666885376, 0.009367070160806179, 0.01857123337686062, -0.0164785236120224, -0.0120800742879509

In [45]:
answer = respond_to_question(
    'What do the Cleveland and St. Louis Federal Reserve Banks say about Finance?',
    )

print('Answer:', answer)

CreateEmbeddingResponse(data=[Embedding(embedding=[-0.017801659181714058, -0.030249634757637978, 0.055541180074214935, 0.01187436655163765, 0.028113435953855515, 0.010535945184528828, -0.024065207690000534, 0.018052201718091965, -0.014135836623609066, -0.04549312964081764, 0.02166527882218361, -0.05770375207066536, -0.06582657992839813, -0.004954795353114605, 0.008136018179357052, 0.02213999070227146, -0.008294254541397095, 0.0500820018351078, 0.03220122307538986, 0.011379876174032688, -3.4717359085334465e-05, -0.004321847576647997, -0.006988799665123224, -0.003903178730979562, -0.0423547625541687, -0.013179821893572807, 0.001326882978901267, 3.427952833590098e-05, -0.012540280818939209, -0.014241327531635761, 0.06144869327545166, -0.03156827762722969, 0.02054443396627903, 0.023524563759565353, -0.029194721952080727, 0.023801477625966072, -0.011346910148859024, 0.009692015126347542, 0.06593207269906998, -0.012329298071563244, -0.007094291038811207, 0.004710846580564976, 0.0019680724944