# FinanceGPT: Using RAG Based LLM

In [104]:
import os
from dotenv import load_dotenv

# Load the environment variables
load_dotenv()

MONGODB_CONNECTION_STRING = os.getenv("MONGODB_URI")
MONGODB_DATABASE = os.getenv("MONGODB_DATABASE")
MONGODB_COLLECTION = os.getenv("MONGODB_COLLECTION")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
REPLICATE_API_KEY = os.getenv("REPLICATE_API_TOKEN")


Python-dotenv could not parse statement starting at line 3


## Clean reddit personal finance corpus

In [13]:
def clean_text(text, remove_headers_until=4):
    """
    Clean the input text by removing header lines, normalizing whitespace,
    and converting to lowercase.
    
    Args:
    text (str): The input text to clean.
    remove_headers_until (int): Number of initial lines to remove as headers.
    
    Returns:
    str: The cleaned text.
    """
    # Split text into lines and remove header lines
    lines = text.splitlines()
    cleaned_lines = lines[remove_headers_until:]
    
    # Join lines back and normalize text
    cleaned_text = "\n".join(cleaned_lines).strip().lower()
    
    return cleaned_text

## Load Personal finance books

In [14]:
import os
import re
import fitz  # PyMuPDF

# Function to preprocess and clean text
def preprocess_text_mupdf(text):
    text = re.sub(r'\n\s*\n', '\n', text)  # Remove empty lines
    text = re.sub(r'[^A-Za-z0-9.,;:!?()\'\"\n]+', ' ', text)  # Remove special characters but keep punctuation
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    return text.strip()

raw_path = '/Users/mrinoyb2/git/FinanceGPT/data/pf_books/pdfs'
clean_text_dir = '/Users/mrinoyb2/git/FinanceGPT/data/pf_books/clean'
os.makedirs(clean_text_dir, exist_ok=True)

# Process each PDF file in the extracted directory
for root, dirs, files in os.walk(raw_path):
    for file in files:
        if file.endswith('.pdf'):
            pdf_path = os.path.join(root, file)
            pdf_document = fitz.open(pdf_path)

            cleaned_text_mupdf = ""
            for page_number in range(pdf_document.page_count):
                page = pdf_document.load_page(page_number)
                text = page.get_text()
                cleaned_text_mupdf += preprocess_text_mupdf(text)

            pdf_document.close()

            # Save the cleaned text to a corresponding file
            clean_text_path = os.path.join(clean_text_dir, os.path.splitext(file)[0] + '.txt')
            with open(clean_text_path, 'w') as file:
                file.write(cleaned_text_mupdf)

## Store chunks in MongoDB database

In [118]:
import os
import pymongo
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')  # Download the Punkt tokenizer models


# Connect to MongoDB
client = pymongo.MongoClient(MONGODB_CONNECTION_STRING)
db = client[MONGODB_DATABASE]
collection = db[MONGODB_COLLECTION]

# Function to chunk text by sentence
def chunk_by_sentence(text):
    """
    Improved function to chunk text by sentence using NLTK's sent_tokenize.
    
    Args:
    text (str): The input text to chunk.
    
    Returns:
    list: A list of sentences extracted from the input text.
    """
    sentences = sent_tokenize(text)
    return [sentence.strip() for sentence in sentences]

# Path to the directory containing cleaned text files
clean_text_dir = '/Users/mrinoyb2/git/FinanceGPT/data/All_texts'

# Initialize a counter for unique MongoDB document IDs
doc_id = 0

# Iterate through each cleaned text file
for file_name in os.listdir(clean_text_dir):
    if file_name.endswith('.txt'):
        file_path = os.path.join(clean_text_dir, file_name)

        # Read the cleaned text from the file
        with open(file_path, 'r') as file:
            cleaned_text = file.read()

        # Chunk the text
        chunks = chunk_by_sentence(cleaned_text)

        # Store chunks in MongoDB
        for chunk in chunks:
            # Create a document for each chunk
            document = {"_id": doc_id, "text": chunk}
            # Insert the document into the collection
            collection.insert_one(document)
            # Increment the doc_id for the next document
            doc_id += 1

print(f"Total chunks stored in MongoDB: {doc_id}")



[nltk_data] Downloading package punkt to /Users/mrinoyb2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Total chunks stored in MongoDB: 11597


## Implement RAG

### Create word embeddings

In [119]:
from sentence_transformers import SentenceTransformer
import pymongo

# Connect to MongoDB
client = pymongo.MongoClient(MONGODB_CONNECTION_STRING)
db = client[MONGODB_DATABASE]
chunks_collection = db[MONGODB_COLLECTION]

# Load the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to update documents with embeddings
def update_documents_with_embeddings():
    for document in chunks_collection.find():
        # Generate embedding
        embedding = model.encode(document['text'], convert_to_tensor=False)
        # Update document with embedding
        chunks_collection.update_one({'_id': document['_id']}, {'$set': {'embedding': embedding.tolist()}})

# Uncomment the following line to run the embedding update
update_documents_with_embeddings()

# Check the first document to see if the embedding was added
print(chunks_collection.find_one())


{'_id': 0, 'text': 'the-usd-inr-pair\n4.1 – the contract\nwe make an extremely critical assumption at this stage – we will assume you are familiar with how future and options contracts work.', 'embedding': [-0.07872910797595978, 0.021708624437451363, -0.032432589679956436, 0.007473672740161419, -0.009834710508584976, -0.029723696410655975, -0.018552307039499283, 0.07715348154306412, 0.042258694767951965, 0.005934880115091801, 0.018288377672433853, 0.001879948889836669, -0.049384526908397675, 0.06370308250188828, 0.021877873688936234, -0.0008213091059587896, -0.02738482505083084, -0.12179657071828842, -0.04125848412513733, 0.03758013993501663, -0.05928162485361099, -0.08154621720314026, -0.04822477325797081, -0.034306205809116364, 0.03087666630744934, -0.08323254436254501, 0.007064621429890394, -0.01629713550209999, 0.00443514296784997, -0.018304815515875816, -0.02091113105416298, 0.11329317837953568, -0.0031237865332514048, -0.06396238505840302, -0.05167407914996147, -0.012983483262360

### Semantic search retrieval

In [126]:
from sentence_transformers import SentenceTransformer
import pymongo
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

# Connect to MongoDB
client = pymongo.MongoClient(MONGODB_CONNECTION_STRING)
db = client[MONGODB_DATABASE]
chunks_collection = db[MONGODB_COLLECTION]

# Function to perform semantic search
def semantic_search(query, top_k=5):
    # Encode the query using the provided embedding model
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = embedding_model.encode(query, convert_to_tensor=False)
    
    # Retrieve all embeddings from MongoDB and calculate similarity
    similarities = []
    for document in chunks_collection.find():
        doc_embedding = np.array(document['embedding'])
        # Use scikit-learn's cosine_similarity function to calculate the cosine distance
        similarity = cosine_similarity([query_embedding], [doc_embedding])[0][0]
        # Store document ID and similarity score. Remove documents with same similarity scores.
        if document['text'] not in [x[2] for x in similarities] and similarity not in [x[1] for x in similarities]:
            similarities.append((document['_id'], similarity, document['text']))
    
    # Sort by similarity score in descending order and remove duplicates
    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
    
    # Return top_k most similar documents
    return similarities[:top_k]

# Example usage
query = "What are index funds?"
results = semantic_search(query)
for idx, (doc_id, similarity, text) in enumerate(results, start=1):
    print(f"Result {idx} (Score: {similarity:.3f}): {text}...")

Result 1 (Score: 0.667): |
index funds
|
exchange-traded funds
|end of the day nav
|real-time pricing....
Result 2 (Score: 0.647): with etfs, you can express tactical strategies better than index funds because you can’t buy and sell index funds immediately....
Result 3 (Score: 0.636): It is a mutual fund....
Result 4 (Score: 0.634): today, it’s a no-brainer to look at index funds in the large-cap space....
Result 5 (Score: 0.611): to recap, a mutual fund is a pooled investment vehicle that collects the money from various investors, invests and manages that money on their behalf....


## Setting up Gemini Pro

In [21]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [22]:
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.

genai.configure(api_key=GEMINI_API_KEY)

In [23]:
for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-pro
models/gemini-pro-vision


In [121]:
Age = 25
Annual_Income = 10000
Employment = "Full Time"
Debt = 15000
Assets = 10000
Credit_Score = 500

Financial_Goal = "Saving for a house"
Risk_Tolerance = "High"
Time_Horizon = "Short Term"
Life_Event = "Buying a car"

# Create dictionary to store user input
user_input = {
    "Age": Age,
    "Income": Annual_Income,
    "Employment": Employment,
    "Debt": Debt,
    "Assets": Assets,
    "Credit_Score": Credit_Score,
    "Financial_Goal": Financial_Goal,
    "Risk_Tolerance": Risk_Tolerance,
    "Time_Horizon": Time_Horizon,
    "Life_Event": Life_Event
}




## Connect LLM model

In [127]:
import replicate
from replicate.client import Client
# Function to generate an answer using RAG enabled Gemini pro
def generate_RAG_answer(question, user_input, max_context_length=20000):
    # Assume semantic_search is defined and returns relevant context as a single string
    context_results = semantic_search(question, top_k=5)
    # Combine the top 5 context results into a single string
    context = " ".join([result[2] for result in context_results])
    # Truncate context if it exceeds the maximum length
    if len(context) > max_context_length:
        context = context[:max_context_length]

    rag_prompt = f"""[INST]
    Persona: You are a highly knowledgeable and personable financial advisor with access to a vast database of financial resources. You prioritize clear, actionable advice tailored to the user's unique situation and goals.
    
    Context: {context}

    Example 1:
    Question: "I'm 30 years old making $60,000 a year with no debt and $20,000 in savings. How can I start investing for retirement?"
    User Context: Financial Situation: Age: 30, income: 60000, employment: "Full Time", debt: 0, assets: 20000. Financial Goals: Long-term (retirement). Risk Tolerance: Moderate. Life Events: N/A.
    Answer: "Given your stable income, no debt, and a moderate risk tolerance, you're in a great position to start investing for retirement. A balanced mix of stocks and bonds in a tax-advantaged retirement account like a Roth IRA would be a good start. Consider allocating 70% to a diversified stock fund and 30% to bonds. Adjust the allocation as you age or as your risk tolerance changes."

    Example 2:
    Question: "I'm 22, just started my first job earning $45,000, and have $10,000 in student loans. What's my best strategy for saving?"
    User Context: Financial Situation: Age: 22, income: 45000, employment: "Full Time", debt: 10000, assets: 5000. Financial Goals: Short-term (emergency fund), mid-term (debt repayment). Risk Tolerance: Low. Life Events: Starting first job.
    Answer: "Starting with your student loans and building an emergency fund are your first steps. Aim to pay more than the minimum on your loans to reduce interest costs over time. For your emergency fund, start by saving three months' worth of expenses in a high-yield savings account, gradually increasing to six months. Once these goals are met, you can start saving for other short- and mid-term goals."

    Your Question: "{question}"
    Your User Context: Financial Situation: Age: {user_input['Age']}, income: {user_input['Income']}, employment: "{user_input['Employment']}", 
    debt: {user_input['Debt']}, assets: {user_input['Assets']}, credit score: {user_input['Credit_Score']}, Financial Goals: {user_input['Financial_Goal']}, 
    Risk Tolerance: "{user_input['Risk_Tolerance']}", Time Horizon: "{user_input['Time_Horizon']}", Life Events: "{user_input['Life_Event']}".

    Response Style: 
    * Clarity: Explain complex concepts in simple terms.
    * Actionable: Provide specific recommendations and next steps. 
    * Personalization: Tailor the advice to the user's situation and goals.
    * Transparency: Acknowledge limitations and suggest further resources if needed.
    [/INST]"""

    client = Client(api_token=REPLICATE_API_KEY)

    # Generate the answer using LLama2 from Replicate
    # The mistralai/mixtral-8x7b-instruct-v0.1 model can stream output as it's running.
    for event in client.stream(
        "mistralai/mixtral-8x7b-instruct-v0.1",
        input={
            "top_k": 50,
            "top_p": 1,
            "prompt": rag_prompt,
            "temperature": 0.5,
            "max_new_tokens": 1024,
            "prompt_template": "<s>[INST] {prompt} [/INST] ",
            "presence_penalty": 0,
            "frequency_penalty": 0
        },
    ):
        print(str(event), end="")

# Function to generate an answer using Gemini pro
def generate_non_RAG_answer(question, user_input):
    # Assume semantic_search is defined and returns relevant context as a single string
    non_rag_prompt = f"""[INST]
    Persona: You are a highly knowledgeable and personable financial advisor with access to a vast database of financial resources. You prioritize clear, actionable advice tailored to the user's unique situation and goals.

    Example 1:
    Question: "I'm 30 years old making $60,000 a year with no debt and $20,000 in savings. How can I start investing for retirement?"
    User Context: Financial Situation: Age: 30, income: 60000, employment: "Full Time", debt: 0, assets: 20000. Financial Goals: Long-term (retirement). Risk Tolerance: Moderate. Life Events: N/A.
    Answer: "Given your stable income, no debt, and a moderate risk tolerance, you're in a great position to start investing for retirement. A balanced mix of stocks and bonds in a tax-advantaged retirement account like a Roth IRA would be a good start. Consider allocating 70% to a diversified stock fund and 30% to bonds. Adjust the allocation as you age or as your risk tolerance changes."

    Example 2:
    Question: "I'm 22, just started my first job earning $45,000, and have $10,000 in student loans. What's my best strategy for saving?"
    User Context: Financial Situation: Age: 22, income: 45000, employment: "Full Time", debt: 10000, assets: 5000. Financial Goals: Short-term (emergency fund), mid-term (debt repayment). Risk Tolerance: Low. Life Events: Starting first job.
    Answer: "Based on your situation, you should first pay off your student loans. If you set aside $500 per month, you can pay off your student loans in 20 months. After that, you should start building an emergency fund. You should aim to save at least 3 months of living expenses. Once you have your emergency fund, you can start investing in a low-cost index fund."

    Example 3:
    Question: "I'm 25, just started my first job earning $20,000, and have $10,000 in student loans. I would like to buy a house in the next 12 months, what's the best strategy for saving?"
    User Context: Financial Situation: Age: 25, income: 20,000, employment: "Full Time", debt: 10000, assets: 5000, credit_score = 500. Financial Goals: Short-term (emergency fund), mid-term (debt repayment). Risk Tolerance: Low. Life Events: Starting first job.
    Answer: "Based on your situation, you are not ready to buy a house yet. You should first pay off your student loans and focus on improving your credit score. In order to pay your student loans, set aside $500 per month, you can pay off your student loans in 20 months. After that, you should start building an emergency fund. You should aim to save at least 3 months of living expenses. Once you have your emergency fund, you can start investing in a low-cost index fund. Once you have paid off your student loans, improved your credit score, and saved for an emergency fund, you can start saving for a house."

    
    Your Question: "{question}"
    Your User Context: Financial Situation: Age: {user_input['Age']}, income: {user_input['Income']}, employment: "{user_input['Employment']}", 
    debt: {user_input['Debt']}, assets: {user_input['Assets']}, credit score: {user_input['Credit_Score']}, Financial Goals: {user_input['Financial_Goal']}, 
    Risk Tolerance: "{user_input['Risk_Tolerance']}", Time Horizon: "{user_input['Time_Horizon']}", Life Events: "{user_input['Life_Event']}".

    Response Style: 
    * Clarity: Explain complex concepts in simple terms.
    * Actionable: Provide specific recommendations and next steps. 
    * Personalization: Tailor the advice to the user's situation and goals.
    * Transparency: Acknowledge limitations and suggest further resources if needed.
    [/INST]"""
    model = genai.GenerativeModel('gemini-1.0-pro-latest')
    response = model.generate_content(non_rag_prompt)
    return response.text

In [128]:
# Example query
query = "What are index funds? Should I invest in them?"
generate_RAG_answer(query, user_input)

Context: once you have, invest in low-cost broad market index funds and move on with your life. today, it’s a no-brainer to look at index funds in the large-cap space. but if you are lazy like me and want to make as fewer choices as possible, then index funds are a better choice. |
index funds
|
exchange-traded funds
|end of the day nav
|real-time pricing. with etfs, you can express tactical strategies better than index funds because you can’t buy and sell index funds immediately.








Sure, I'd be happy to help explain index funds and provide some guidance based on your financial situation!

Index funds are a type of investment vehicle that aims to track the performance of a specific market index, such as the S&P 500. By investing in an index fund, you're essentially buying a diversified portfolio of stocks or bonds that mirrors the holdings of the underlying index. This can be a simple and cost-effective way to invest, as index funds typically have lower expense ratios than active

In [129]:
# Example query
query = "What are index funds?"
non_rag_answer = generate_non_RAG_answer(query, user_input)

print(non_rag_answer)

**What are index funds?**

An index fund is a type of mutual fund that tracks a specific market index, like the S&P 500 or the Nasdaq Composite. This means that the fund holds all of the stocks that are in the index, in the same proportions.

Index funds are popular because they are:

* **Low-cost:** Index funds typically have lower fees than actively managed mutual funds.
* **Diversified:** Index funds hold a large number of stocks, which helps to reduce risk.
* **Transparent:** Index funds are required to disclose their holdings, so you know exactly what you're investing in.

**Are index funds right for you?**

Index funds can be a good investment for investors of all ages and risk tolerances. However, they may be particularly suitable for:

* **Investors who are new to investing.** Index funds are a simple and easy way to get started with investing.
* **Investors who do not have a lot of time to research individual stocks.** Index funds allow you to invest in a diversified portfolio

## Evaluating RAG vs Non-RAG Approach

In order to quantify this, I will take the following steps:
1. Think of a query which has a well defined answer in the book.
2. Find the true answer to a query from the actual source (book pdf). 
3. Then pass the query through the RAG based LLM and the regular LLM.
4. Save both generated answers along with the true answer and find word embeddings for each. 
5. Finally, compare the word embeddings of the true answer with the RAG vs Non-RAG based LLM word embeddings using cosine similarity. 

Following these steps will help quantify the performace and accuracy of information in the two answers.

In [33]:
query = "Best practices for repaying loans"

# True Answer from the Huberman Lab Newsletter
true_answer = """"""

### Implementing functions to evaluate performance. 

In [None]:
import torch

def get_embedding(text):
    """
    Generate an embedding for a given text.

    Args:
    - text (str): The input text.
    
    Returns:
    - The sentence embedding.
    """
    # Load the sentence transformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    # Generate the sentence embeddings
    embeddings = model.encode(text, convert_to_tensor=False)
    return embeddings


def calculate_cosine_similarity(embedding1, embedding2):
    """
    Calculate the cosine similarity between two embeddings.

    Args:
    - embedding1 (torch.Tensor): The first embedding.
    - embedding2 (torch.Tensor): The second embedding.

    Returns:
    - The cosine similarity score.
    """
    # Calculate the cosine similarity
    similarity = 1 - cosine(embedding1, embedding2)
    return similarity


def calculate_similarity_scores(true_answer, rag_answer, non_rag_answer):
    """
    Calculate the cosine similarity scores between the true answer and both RAG-based and non-RAG-based answers.

    Args:
    - true_answer (str): The true answer text.
    - rag_answer (str): The RAG-based model's answer text.
    - non_rag_answer (str): The non-RAG-based model's answer text.

    Returns:
    - A dictionary with cosine similarity scores.
    """
    # Convert the answers to embeddings
    true_answer_embedding = get_embedding(true_answer)
    rag_answer_embedding = get_embedding(rag_answer)
    non_rag_answer_embedding = get_embedding(non_rag_answer)
    
    # Calculate cosine similarity scores
    rag_similarity = calculate_cosine_similarity(true_answer_embedding, rag_answer_embedding)
    non_rag_similarity = calculate_cosine_similarity(true_answer_embedding, non_rag_answer_embedding)

    
    # Return the scores
    return {
        "RAG Similarity Score": rag_similarity,
        "Non-RAG Similarity Score": non_rag_similarity
    }


# Calculate the similarity scores
similarity_scores = calculate_similarity_scores(true_answer, rag_answer, non_rag_answer)
print(similarity_scores)
