# Leveraging RAG for Context-Aware Analysis of Tesla’s 10-K Filing

In [None]:
# imports

# !pip install faiss-cpu
# !pip install -U sec-edgar-downloader
# !pip install python-dotenv
# !pip install tqdm
# !gcloud projects list
# !pip install sentence-transformers

from sec_edgar_downloader import Downloader
import os
import re
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import requests
from dotenv import load_dotenv
from google.colab import auth
auth.authenticate_user()
import vertexai
from vertexai.generative_models import GenerativeModel, Part
import textwrap
#from bs4 import BeautifulSoup
from IPython.display import Markdown

## Data Acquisition

In [None]:
# Create the SEC EDGAR downloader client. The two positional arguments are an
# organisation name and a contact e-mail address; presumably they identify the
# requester to EDGAR -- TODO confirm against the sec-edgar-downloader docs.
dl = Downloader('University of Illinois', 'laura.rosok@example.com')

# Request Tesla's (ticker "TSLA") 10-K filings, limited to a single filing.
# The trailing semicolon suppresses the echoed return value in the notebook.
dl.get("10-K","TSLA",limit=1);

## Text Preprocessing and RAG Model Implementation


In [None]:
# Define the directory where filings are stored
download_folder = 'sec-edgar-filings/TSLA/10-K/'

# Collect the text of every downloaded file, then join once at the end.
# os.walk yields directories and files in an arbitrary, filesystem-dependent
# order, so both lists are sorted: the concatenated text (and therefore every
# downstream chunk boundary) is the same on every run and every machine.
filing_parts = []
for root, dirs, files in os.walk(download_folder):
    dirs.sort()  # in-place sort controls the order in which os.walk descends
    for file in sorted(files):
        file_path = os.path.join(root, file)

        # Read the whole file as UTF-8 text
        with open(file_path, 'r', encoding='utf-8') as f:
            filing_parts.append(f.read())

# One string holding all files back to back (empty if nothing was found)
filing_content = ''.join(filing_parts)

In [None]:
def clean_filing_content(content):
    """Strip markup and selected SEC header fields from raw filing text.

    Processing order:
      1. Every ``<...>`` tag is replaced by a single space. Tags that span
         several lines are matched too, and the space keeps neighbouring
         values (e.g. adjacent table cells) from being glued together.
      2. Lines introduced by a fixed set of SEC header field names are dropped
         (from the field name up to the end of that line).
      3. HTML character references (``&amp;``, ``&#160;`` ...) are decoded.
      4. All whitespace runs are collapsed to one space and the result is
         stripped.

    Args:
        content: Raw filing text as a single string.

    Returns:
        The cleaned text as one whitespace-normalised string.
    """
    import html  # stdlib; imported here so the function is self-contained

    # `[^>]*` (unlike `.*?`) also matches newlines, so multi-line tags go too.
    content = re.sub(r'<[^>]*>', ' ', content)

    # Remove specific metadata fields (accession numbers, form type, etc.)
    metadata_fields = (
        'ACCESSION NUMBER',
        'CONFORMED SUBMISSION TYPE',
        'PUBLIC DOCUMENT COUNT',
        'FORM TYPE',
        'FILM NUMBER',
        'BUSINESS ADDRESS',
        'MAIL ADDRESS',
    )
    for field in metadata_fields:
        content = re.sub(field + r':.*?(\n|$)', '', content)

    # Decode entities only after tag stripping, so an escaped "&lt;" in the
    # text is never mistaken for the start of a tag.
    content = html.unescape(content)

    # Normalize whitespace (including non-breaking spaces) to single spaces
    content = re.sub(r'\s+', ' ', content)
    return content.strip()

In [None]:
def split_filing_into_chunks(content, chunk_size=1000, overlap=0):
    """Split ``content`` into consecutive fixed-size character windows.

    Args:
        content: Text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters each chunk shares with the previous one.
            Must satisfy ``0 <= overlap < chunk_size``. The default of 0 gives
            back-to-back, non-overlapping chunks.

    Returns:
        List of chunk strings in document order. The last chunk may be shorter
        than ``chunk_size``; an empty ``content`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    total = len(content)
    chunks = []
    for start in range(0, total, step):
        chunks.append(content[start:start + chunk_size])
        # Once a window reaches the end of the text, any further window would
        # only repeat characters already covered by this one.
        if start + chunk_size >= total:
            break

    return chunks

In [None]:
# Load the pre-trained Sentence-BERT encoder once at notebook level.
# embed_text() reads this global, so this cell must run before any embedding.
model = SentenceTransformer('all-mpnet-base-v2') # or 'paraphrase-MiniLM-L6-v2'

# Turn text (chunks or a query) into numerical vectors
def embed_text(texts):
    """Encode ``texts`` with the notebook-level ``model`` and return its output as-is."""
    return model.encode(texts)

# Create FAISS index to store and retrieve embeddings
def create_faiss_index(chunks):
    """Embed ``chunks`` and load the vectors into an exact L2 FAISS index.

    Args:
        chunks: Non-empty sequence of text chunks. Vector ``i`` in the index
            corresponds to ``chunks[i]``.

    Returns:
        A ``faiss.IndexFlatL2`` holding one vector per chunk.

    Raises:
        ValueError: If ``chunks`` is empty; without at least one embedding
            there is no vector dimension to size the index with.
    """
    if len(chunks) == 0:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    # FAISS expects a C-contiguous float32 matrix; convert once, up front.
    vectors = np.ascontiguousarray(embed_text(chunks), dtype=np.float32)

    index = faiss.IndexFlatL2(vectors.shape[1])  # dimension = embedding width
    index.add(vectors)

    return index

In [None]:
# Step 1: Clean the filing content
cleaned_content = clean_filing_content(filing_content)

# Step 2: Split into chunks
chunks = split_filing_into_chunks(cleaned_content)

# Step 3: Embed the chunks and build the FAISS index.
# create_faiss_index() encodes every chunk itself, so calling embed_text(chunks)
# separately as well would run the (expensive) encoder over the filing twice.
index = create_faiss_index(chunks)

# Step 4: Keep the chunk embeddings available as an array.
# A flat index stores the vectors verbatim, so they are read back from it
# instead of being recomputed.
embeddings = index.reconstruct_n(0, index.ntotal)

In [None]:
def retrieve_relevant_chunks(query, index, top_k=5):
    """Return the text of the chunks nearest to ``query`` as one string.

    The query is embedded with ``embed_text`` and searched against ``index``;
    the hits are looked up in the notebook-level ``chunks`` list, which must be
    the same list the index was built from.

    Args:
        query: Question text.
        index: FAISS index built over the chunk embeddings.
        top_k: Maximum number of chunks to return.

    Returns:
        The matching chunks, nearest first, joined with newlines. This is a
        single ``str``, not a list.
    """
    query_embedding = embed_text([query])  # Convert query to embedding
    _, indices = index.search(query_embedding.astype(np.float32), top_k)

    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors. Indexing chunks[-1] would silently return the last chunk, so
    # those placeholder ids are skipped.
    relevant_chunks = [chunks[i] for i in indices[0] if i >= 0]

    # Concatenate the hits so the model receives them as one context block
    return '\n'.join(relevant_chunks)

## Model Integration

In [None]:
# Optional one-off helper: create an .env file in the current directory.
# SECURITY: never commit a real key to the notebook, not even inside a comment.
# Only a placeholder belongs here; any key that was ever committed must be
# treated as leaked and revoked/rotated.
# with open('.env', 'w') as f:
#     f.write('API_KEY=<your-api-key>')

# Load variables from a local .env file into the process environment
load_dotenv()

API_KEY = os.getenv('API_KEY')

# Fail fast when the key is missing.
# NOTE(review): nothing visible in this notebook passes API_KEY to Vertex AI --
# vertexai.init() below receives only a project and a location. Confirm the
# key is still required before keeping this hard failure.
if not API_KEY:
    raise ValueError("API_KEY not found in .env file.")

# Google Cloud project and region used for the Vertex AI calls
PROJECT_ID = 'rag-tesla-10k-qa'
vertexai.init(project=PROJECT_ID, location="us-central1")

# Notebook-level model handle; generate_with_gemini() reads this global
gemini_model = GenerativeModel("gemini-1.5-flash-002")

def generate_with_gemini(query, context, max_context_length=2048):
    """Ask the notebook-level ``gemini_model`` to answer ``query`` from ``context``.

    Args:
        query: The user's question.
        context: Retrieved text to ground the answer.
        max_context_length: Maximum number of context characters placed in the
            prompt. Pass ``None`` to send the context untruncated. The default
            keeps the previous fixed limit of 2048 characters.

    Returns:
        The text of the first part of the first candidate, or the string
        "No response generated." when the response carries no candidate or
        the first candidate has no parts.
    """
    if max_context_length is not None:
        context = context[:max_context_length]

    prompt = f"Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"

    # Generate content using the Gemini model
    response = gemini_model.generate_content([Part.from_text(prompt)])

    candidates = response.candidates
    if not candidates:
        return "No response generated."

    # A candidate can come back without any parts; indexing parts[0] on it
    # would raise IndexError instead of yielding the fallback message.
    parts = candidates[0].content.parts
    if not parts:
        return "No response generated."

    return parts[0].text

## Output

In [None]:
# Sample questions about the filing. The loop at the end of this cell answers
# each one independently (retrieve context, generate, print).
queries = [
    "What are Tesla's major expenses?",
    "What is Tesla's approach to sustainability?",
    "How does Tesla manage its supply chain?"
]

# Render markdown-style bold spans for terminal output
def bold_response(response):
    """Swap each ``**text**`` span for ANSI bold-on / reset codes around ``text``."""
    bold_span = re.compile(r'\*\*(.*?)\*\*')
    return bold_span.sub(r'\033[1m\1\033[0m', response)

# Retrieve context and generate an answer for each query
for query in queries:
    # retrieve_relevant_chunks() returns ONE newline-joined string. It must be
    # used as-is: calling '\n'.join() on a string iterates over its characters
    # and would put a newline between every single character of the context.
    context = retrieve_relevant_chunks(query, index)

    # Get the answer from Gemini
    response = generate_with_gemini(query, context)

    # Turn **bold** markers into ANSI bold and trim surrounding whitespace
    formatted_response = bold_response(response).strip()

    # Wrap response text to a fixed width
    wrapped_response = textwrap.fill(formatted_response, width=80)

    # Print formatted response
    print(f"Answer to '\033[1m{query}\033[0m':\n")
    print(wrapped_response)
    print("\n" + "-" * 50 + "\n")

Answer to '[1mWhat are Tesla's major expenses?[0m':

Based on the provided text excerpt, Tesla's major expenses include:  *
[1mAcquisitions of property and equipment:[0m  Amounts shown are $2,148,
$2,251, and $1,088 (likely across different reporting periods). * [1mInterest
payments (net of capitalized amounts):[0m $152, $266, and $444. * [1mTaxes
(net of refunds):[0m $1,203, $561, and $115.   The excerpt is incomplete, and
other expenses undoubtedly exist, but these are the only ones explicitly
detailed in this snippet.

--------------------------------------------------

Answer to '[1mWhat is Tesla's approach to sustainability?[0m':

Tesla's approach to sustainability is multifaceted and encompasses their entire
energy and transportation ecosystem.  They aim to accelerate the world's
transition to sustainable energy by addressing both energy generation and
consumption.  This includes:  * [1mDesigning and manufacturing electric
vehicles:[0m  Their core business is producin