In [None]:
# brew install libmagic
# https://python.langchain.com/v0.2/docs/how_to/document_loader_directory/

In [None]:
from langchain_community.document_loaders import DirectoryLoader
# Point the directory loader at ../documents, restricted by the glob pattern
# to the single filing-tables text file, then load whatever matches.
loader = DirectoryLoader(
    "../documents",
    glob="sec_filing_tables.txt",
)
docs = loader.load()


In [None]:
# Sanity check: how many documents did the directory loader return?
len(docs)

In [None]:
# Inspect the first loaded document.
docs[0]

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Read the raw filing text that will be chunked below.
with open("../documents/sec_filing_tables.txt") as f:
    state_of_the_union = f.read()

text_splitter = RecursiveCharacterTextSplitter(
    # Small chunks with a generous overlap, purely for demonstration.
    chunk_size=500,
    chunk_overlap=100,
    length_function=len,
    # The separator list must be passed as `separators`. `is_separator_regex`
    # is a boolean flag; handing it the list silently discards these
    # separators and makes the splitter treat its defaults as regexes.
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ],
    # The separators above are literal strings (e.g. "." means a full stop),
    # so they must not be interpreted as regular expressions.
    is_separator_regex=False,
)
texts = text_splitter.create_documents([state_of_the_union])
# Preview at most the first two chunks; slicing avoids an IndexError when the
# file is short enough to yield a single chunk.
for chunk in texts[:2]:
    print(chunk)

In [None]:
# pip install langchain faiss-cpu tiktoken openai

In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
import tiktoken

# Step 1: Read the combined SEC filing text file into LangChain documents.
text_loader = TextLoader(
    "../documents/sec_filing_combined.txt",
)
documents = text_loader.load()

In [None]:

# Count the tokens in the loaded text so the embedding cost can be estimated.
# cl100k_base is used here; switch encodings if the target model needs another.
tokenizer = tiktoken.get_encoding("cl100k_base")

# Only the first loaded document's text is tokenized.
text_content = documents[0].page_content
tokens = tokenizer.encode(text_content)

# Report the total.
num_tokens = len(tokens)
print(f"Total number of tokens in the text file: {num_tokens}")


In [None]:

# Estimate the cost of embedding the text.
# text-embedding-ada-002 is billed at $0.0001 per 1K tokens ($0.10 per 1M
# tokens); the older $0.0004 per 1K rate was retired in 2023. Prices change,
# so confirm against the current OpenAI pricing page before relying on this.
# NOTE: this counts the raw text once. Chunks created with overlap are what
# actually get embedded, so the real token count will be somewhat higher.
cost_per_1k_tokens = 0.0001
cost = (num_tokens / 1000) * cost_per_1k_tokens
print(f"Estimated cost for generating embeddings: ${cost:.4f}")

In [None]:

# Split the loaded documents into overlapping chunks
# (chunk_size=1000 with chunk_overlap=200 shared between neighbours).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
split_documents = text_splitter.split_documents(documents)

In [None]:

# Embed the text of every chunk with the OpenAI embeddings client.
embeddings = OpenAIEmbeddings()
chunk_texts = [chunk.page_content for chunk in split_documents]
doc_embeddings = embeddings.embed_documents(chunk_texts)


In [None]:
# Check what kind of container embed_documents returned.
type(doc_embeddings)

In [None]:
# Look at the embedding produced for the first chunk.
# NOTE(review): this displays the whole vector; consider slicing (e.g. [:5]) to keep the output short.
doc_embeddings[0]

In [None]:

# Step 4: Store the embeddings in a local FAISS vector store.
# Reuse the vectors already held in `doc_embeddings` rather than calling
# FAISS.from_documents, which would send every chunk to the embeddings API a
# second time (doubling latency and cost). `embeddings` is still passed so the
# store can embed incoming queries.
vectorstore = FAISS.from_embeddings(
    text_embeddings=list(
        zip((doc.page_content for doc in split_documents), doc_embeddings)
    ),
    embedding=embeddings,
    metadatas=[doc.metadata for doc in split_documents],
)


In [None]:
# Build a similarity retriever that returns the 4 closest chunks per query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 4})

retrieved_docs = retriever.invoke("Tell me about Microsoft Cloud gross margin?")

# A bare expression is only echoed when it is the last line of a cell, so the
# count has to be printed explicitly to be visible alongside the documents.
print(len(retrieved_docs))
retrieved_docs

In [None]:
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import OpenAI

# Requires the vector store built in the earlier cells.
# Rebuild the top-4 similarity retriever on top of it.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

# Conversational retrieval chain with no memory object attached; the chat
# history is supplied explicitly on each call instead.
llm = OpenAI()
qa_chain = ConversationalRetrievalChain.from_llm(llm, retriever, memory=None)

# Ask a single question with an empty history and show the answer.
query = "Tell me about Microsoft Cloud gross margin?"
chain_inputs = {"question": query, "chat_history": []}
response = qa_chain.run(chain_inputs)

print(response)

### If you want more control over the process, or want to avoid relying on chains that may be unavailable in your LangChain version, you can retrieve the documents manually and then pass them to the LLM to answer:

In [53]:
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Requires the vector store built in the earlier cells.
# A wider retriever this time: the 10 most similar chunks per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 10},
)

# Prompt text: the retrieved chunks fill {documents}, the user query fills {question}.
prompt_template = """
You are a finance expert known to assess 10 K documents and provide data as an input to Sankey charts. 
Based on the following documents, provide factually correct answers from document. Be concise and precise.
{documents}

Question: {question}
Answer:
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["documents", "question"],
)

# Plain LLM chain: the prompt is formatted and sent to the model; retrieval is
# done by hand before the chain is invoked.
llm = OpenAI()
qa_chain = LLMChain(llm=llm, prompt=prompt)


In [54]:
# Fetch the chunks most similar to the question.
query = "in table format provide the revenue generated from each business like azure, linkedin etc?"
retrieved_docs = retriever.invoke(query)

# Join the chunk texts, one per line, into a single context string.
page_texts = [doc.page_content for doc in retrieved_docs]
combined_docs_content = "\n".join(page_texts)

# Feed the context and the question through the prompt-based chain.
chain_inputs = {"documents": combined_docs_content, "question": query}
response = qa_chain.run(chain_inputs)

print(response)


| Business Segment | Revenue (in millions) |
|------------------|-----------------------|
| Server products and cloud services | $97,726 |
| Office products and cloud services | $54,875 |
| Windows | $23,244 |
| Gaming | $21,503 |
| LinkedIn | $16,372 |
| Search and news advertising | $12,576 |
| Enterprise and partner services | $7,594 |
| Dynamics products and cloud services | $6,481 |
| Devices | $4,706 |
| Other | $45 |
| Total | $245,122 |


In [None]:

# Query the stored embeddings for similar documents.
# In notebook order `qa_chain` is the LLMChain built on the custom prompt,
# whose inputs are "documents" and "question". Passing "chat_history" instead
# of "documents" fails on the missing input key, so retrieve the context here
# and supply it explicitly.
query = "Tell me about Microsoft Cloud gross margin?"
retrieved_docs = retriever.invoke(query)
combined_docs_content = "\n".join(doc.page_content for doc in retrieved_docs)
response = qa_chain.run({"documents": combined_docs_content, "question": query})

print(response)


In [None]:

# Step 5: Build a conversational retrieval chain backed by a buffer memory
# stored under the "chat_history" key.
llm = OpenAI()
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)
qa_chain = ConversationalRetrievalChain.from_llm(llm, retriever, memory=memory)


In [None]:

# Step 6: Ask the chain a question and print its answer.
query = "Tell me about Microsoft Cloud gross margin"
chain_inputs = {"question": query, "chat_history": []}
response = qa_chain.run(chain_inputs)

print(response)


In [None]:

# Chat loop (optional)
# Keeps prompting until the user submits an empty line or types "exit"/"quit".
# An interrupted kernel or closed stdin also ends the loop cleanly, so the
# cell can terminate without a traceback.
while True:
    try:
        query = input("Ask a question: ").strip()
    except (EOFError, KeyboardInterrupt):
        break
    if query.lower() in {"", "exit", "quit"}:
        break
    response = qa_chain.run({"question": query, "chat_history": []})
    print(response)