In [20]:
# Load environment variables from the .env file
import os 
from dotenv import load_dotenv, find_dotenv

# find_dotenv() locates the .env file and load_dotenv() loads it into the
# process environment; the `True` shown as the cell output is its return value.
load_dotenv(find_dotenv())

True

In [28]:
# Import Libraries
import re 
from pypdf import PdfReader
import chromadb
from chromadb.utils import embedding_functions
import dspy
from dspy.retrieve.chromadb_rm import ChromadbRM



In [25]:
def load_documents(file_path):
    """Extract the text of every non-empty page of a PDF.

    Args:
        file_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        list[str]: The stripped text of each page, in page order. Pages that
        yield no text (or only whitespace) are skipped.
    """
    reader = PdfReader(file_path)
    page_texts = []
    for page in reader.pages:
        # Text extraction is the expensive step: run it once per page and
        # reuse the result for both the emptiness check and the output.
        text = (page.extract_text() or "").strip()
        # Filter *after* stripping so whitespace-only pages do not end up
        # as empty strings in the result.
        if text:
            page_texts.append(text)
    return page_texts

def clean_text(text):
    """Collapse whitespace and remove non-ASCII characters.

    Args:
        text: Raw text to normalize.

    Returns:
        str: ``text`` with every run of whitespace replaced by a single space
        and every character outside the ASCII range (0x00-0x7F) removed.
        Leading/trailing spaces are kept (as at most one space each).
    """
    # First pass: turn every whitespace run (including Unicode spaces such as
    # a non-breaking space) into one ASCII space, so they survive the ASCII
    # filter below as word separators instead of being deleted.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Dropping a non-ASCII token that sat between two spaces (e.g. a dash or
    # bullet) leaves a double space behind; collapse those again.
    return re.sub(r' {2,}', ' ', text)

# Split the text with chunk size of 1000
def chunk_text(text, size=1000):
    """Split text into word-aligned chunks of approximately ``size`` characters.

    Words are never broken up. Each word is budgeted as ``len(word) + 1``
    (the word plus one separator), and a chunk is closed as soon as adding
    the next word would push that budget past ``size``. A single word longer
    than ``size`` becomes a chunk of its own.

    Args:
        text: Text to split; words are delimited by any whitespace.
        size: Approximate maximum chunk length in characters.

    Returns:
        list[str]: The chunks in order, each a space-joined run of words.
        Empty or whitespace-only input yields an empty list; no chunk is
        ever an empty string.
    """
    chunks, current_chunk, current_length = [], [], 0
    for word in text.split():
        cost = len(word) + 1
        # Only flush when there is something buffered; otherwise an
        # over-long first word would emit a spurious empty chunk.
        if current_chunk and current_length + cost > size:
            chunks.append(' '.join(current_chunk))
            current_chunk, current_length = [], 0
        current_chunk.append(word)
        current_length += cost
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

def process_pdf(file_path, size=1000):
    """Load, clean, and chunk text from a PDF.

    Args:
        file_path: Location of the PDF to read.
        size: Approximate chunk length in characters, forwarded to
            ``chunk_text``. Defaults to 1000, the value previously implied
            by ``chunk_text``'s own default.

    Returns:
        list[str]: Cleaned text chunks covering the whole document.
    """
    pages = load_documents(file_path)
    # Pages are joined before cleaning so a chunk may span a page boundary;
    # the newline separator is collapsed to a space by clean_text.
    cleaned_text = clean_text('\n'.join(pages))
    return chunk_text(cleaned_text, size=size)

# Load the Tesla 10-K PDF and split it into text chunks for indexing.
# Note: despite its name, `token_split_texts` holds chunks sized by character
# count on word boundaries (see chunk_text), not token-based splits.
pdf_file = './data/tesla10K.pdf'
token_split_texts = process_pdf(pdf_file)

In [29]:
# Open (or create) the on-disk ChromaDB store and its "tesla" collection.
chroma_client = chromadb.PersistentClient(path="./teslasec")
collection = chroma_client.get_or_create_collection(
    name="tesla",
    embedding_function=embedding_functions.DefaultEmbeddingFunction()
)
# Index the chunks. The store persists across kernel restarts and the ids are
# purely positional ("0", "1", ...), so use upsert instead of add: re-running
# this cell then overwrites the existing entries rather than colliding with
# ids that are already present.
# NOTE: ids beyond the current chunk count (left by an earlier, longer run)
# are not removed; delete the collection before re-indexing a changed PDF.
collection.upsert(
    documents=token_split_texts,
    ids=[str(i) for i in range(len(token_split_texts))],
)

/home/yordanoff/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz:   0%|          | 0.00/79.3M [00:00<?, ?iB/s]

/home/yordanoff/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:39<00:00, 2.13MiB/s]


In [31]:
# DSPy language model and retriever model

# NOTE(review): the OpenAI client presumably reads its API key from the
# environment populated from .env in the first cell — confirm.
llm_model = dspy.OpenAI(model = "gpt-4o-mini")
retriever_model = ChromadbRM(
    collection_name='tesla',  # must match the collection name used when indexing
    persist_directory="./teslasec",  # same on-disk store the chunks were written to
    # Same embedding function as used for indexing, so queries and documents
    # are embedded the same way.
    embedding_function=embedding_functions.DefaultEmbeddingFunction(),
    k=5  # passages requested per query; presumably overridable via dspy.Retrieve(k=...) — verify
)
# Register the language model and retriever as DSPy's global defaults
dspy.settings.configure(lm=llm_model, rm=retriever_model)

In [32]:
# Smoke-test the retriever on its own with a one-word query before wiring it
# into the pipeline; the cell output lists the returned passages.
retriever_model("revenue")

[{'id': '355',
  'score': 0.9241970777511597,
  'long_text': 'manufacture, installation, sales and leasing of solar energy generation and energy storage products and related services and sales of solar energy systems incentives. Our CODM does not evaluate operating segments using asset or liability information. The following table presents revenues and gross profit by reportable segment (in millions): Year Ended December 31, 2023 2022 2021 Automotive segment Revenues $ 90,738 $ 77,553 $ 51,034 Gross profit $ 16,519 $ 20,565 $ 13,735 Energy generation and storage segment Revenues $ 6,035 $ 3,909 $ 2,789 Gross profit $ 1,141 $ 288 $ ( 129 ) The following table presents revenues by geographic area based on the sales location of our products (in millions): Year Ended December 31, 2023 2022 2021 United States $ 45,235 $ 40,553 $ 23,973 China 21,745 18,145 13,844 Other international 29,793 22,764 16,006 Total $ 96,773 $ 81,462 $ 53,823 The following table presents long-lived assets by geogra

## Let's Create The RAG Pipeline 

In [40]:
import chromadb
from chromadb.utils import embedding_functions
import dspy
from dspy.retrieve.chromadb_rm import ChromadbRM
from dspy.datasets import HotPotQA

# DSPy signature for the answering step: (context, question) -> answer.
# The class docstring and the `desc` strings below are prompt text, not plain
# documentation: the docstring appears as `instructions=` in the module repr,
# so editing any of them changes what the model is asked.
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")


# DSPy signature for turning a complex question into a search query:
# (context, question) -> query. As with other signatures, the docstring and
# `desc` strings are prompt text. Not referenced by the RAG module in this
# part of the notebook.
class GenerateSearchQuery(dspy.Signature):
    """Write a simple search query that will help answer a complex question."""
    # Stray extra spaces removed from the description so the prompt text
    # matches the one used by GenerateAnswer.
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    query = dspy.OutputField()


class RAG(dspy.Module):
    """Single-hop retrieve-then-answer pipeline.

    Fetches ``num_passages`` passages for a question and feeds them, as
    context, to a chain-of-thought predictor built on ``GenerateAnswer``.
    """

    def __init__(self, num_passages=3):
        super().__init__()
        # Answer generator first, retriever second; the two are independent.
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
        self.retrieve = dspy.Retrieve(k=num_passages)

    def forward(self, question):
        """Retrieve passages for ``question`` and answer from them.

        Returns:
            dspy.Prediction carrying ``context`` (the retrieved passages)
            and ``answer`` (the generated answer).
        """
        passages = self.retrieve(question).passages
        generated = self.generate_answer(question=question, context=passages)
        return dspy.Prediction(context=passages, answer=generated.answer)


# Example question to run through the pipeline.
# NOTE(review): "prior to last year" is ambiguous, and this whole string —
# including the "answer based on context only" instruction — is also what
# RAG.forward passes to the retriever as the search query.
my_question = "what is the increase in total revenue in 2023 prior to last year, answer based on context only?"

# Get the prediction. This contains `pred.context` and `pred.answer`.
rag_pipeline=RAG()
pred=rag_pipeline(my_question)

rag_pipeline  # display the module repr (its predictor and signature)

generate_answer = Predict(StringSignature(context, question -> rationale, answer
    instructions='Answer questions with short factoid answers.'
    context = Field(annotation=str required=True json_schema_extra={'desc': 'may contain relevant facts', '__dspy_field_type': 'input', 'prefix': 'Context:'})
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the answer}. We ...', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'often between 1 and 5 words', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
))

In [41]:
# Report the question, the model's answer, and a preview of each retrieved
# passage (first 200 characters followed by an ellipsis).
print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
truncated_contexts = [passage[:200] + '...' for passage in pred.context]
print(f"Retrieved Contexts (truncated): {truncated_contexts}")


Question: what is the increase in total revenue in 2023 prior to last year, answer based on context only?
Predicted Answer: $2.13 billion
Retrieved Contexts (truncated): ['used vehicle revenue driven by increases in volume, body shop and part sales revenue, non-warranty maintenance services revenue, paid Supercharging revenue and insurance services revenue, all of which...', 'billion for the release of valuation allowance on certain deferred tax assets. We continue to focus on further cost reductions and operational efficiencies while maximizing delivery volumes. We ended ...', 'December 31, 2022. This was driven by a $447 million increase in employee and labor costs primarily from increased headcount, including professional services and a $363 million increase in facilities ...']
