### **Imports**

In [86]:
import numpy as np
from langchain_community.embeddings import HuggingFaceEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
print("✅ Demo tools loaded.")

✅ Demo tools loaded.


### **Model**

In [87]:
# Initialize the open-source embedding model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)

print(f"✅ Model '{model_name}' is loaded and ready.")

✅ Model 'sentence-transformers/all-MiniLM-L6-v2' is loaded and ready.


## **Text-to-Vector**
*(Main component of RAG)*

Here is a plain English question being converted to numeric vector.

**"Tell me about an accident in Virginia involving a person over 50"**

In [88]:
question = "Tell me about an accident in Virginia involving a person under 25"
vector = embeddings.embed_query(question)

print("--- Your Question ---")
print(f"'{question}'")

print("\n--- Becomes a 'Vector' (a list of numbers) ---")
print(np.array(vector))
print(f"\nTotal length of the vector: {len(vector)}")

--- Your Question ---
'Tell me about an accident in Virginia involving a person under 25'

--- Becomes a 'Vector' (a list of numbers) ---
[ 5.15307765e-04  9.48634148e-02  5.71960062e-02  3.66800022e-03
  3.42094824e-02  6.10682145e-02  4.36363788e-03  6.15937933e-02
 -8.74364823e-02  9.87631083e-02  1.03256188e-01 -6.08022092e-03
  1.06568299e-02 -4.79812063e-02 -1.02137499e-01 -3.21813188e-02
  6.50791079e-02  5.76639958e-02 -9.72741991e-02  3.72935534e-02
 -2.20543128e-02  3.70991975e-02 -4.73101735e-02 -5.49411867e-03
 -4.12638858e-02  3.22457636e-03 -5.51801063e-02  5.02467863e-02
 -6.80102967e-03  1.85090005e-02  6.05112873e-02 -4.82903011e-02
 -8.57970770e-03 -2.54554674e-02 -4.89624143e-02 -8.27616081e-02
 -5.60990954e-03  8.82488564e-02  1.50658684e-02 -1.08381193e-02
 -4.80964817e-02 -5.81862964e-02 -5.51176351e-03 -3.48491184e-02
 -2.23627221e-02  2.71932073e-02 -2.90675890e-02 -8.60272348e-03
  6.27174452e-02  1.03139095e-02 -4.09108363e-02 -8.25642888e-03
 -1.83683112e-02 

#### **Now, let's create two other sentences to compare**

In [89]:
text_similar = "What is the number of accidents in Virginia invovling alcohol under 25?"
text_dissimilar = "What's the weather or driving conditions like in California?"

vector_similar = embeddings.embed_query(text_similar)
vector_dissimilar = embeddings.embed_query(text_dissimilar)


print("✅ Embedded two more sentences to compare.")

✅ Embedded two more sentences to compare.


#### Now, let's use **'cosine similarity'** to see how **"close"** they are.

#### A score of **1.0** is a perfect match.

In [90]:
# We need to reshape the vectors for the function
v_question = np.array(vector).reshape(1, -1)
v_similar = np.array(vector_similar).reshape(1, -1)

# Using cosine similarity to compare
v_dissimilar = np.array(vector_dissimilar).reshape(1, -1)
sim_similar = cosine_similarity(v_question, v_similar)[0][0]
sim_dissimilar = cosine_similarity(v_question, v_dissimilar)[0][0]


print("--- Similarity Results ---")
print(f"Similarity to 'drunk driving': {sim_similar:.4f}")
print(f"Similarity to 'California weather': {sim_dissimilar:.4f}")

--- Similarity Results ---
Similarity to 'drunk driving': 0.6403
Similarity to 'California weather': 0.2646


## **RAG Implementation**

### Imports

In [91]:
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = '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'
os.environ['OPENAI_API_KEY'] = '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'

import bs4
import sqlalchemy
from langchain_core.documents import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOllama 
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain import hub

print("✅ RAG tools loaded.")

✅ RAG tools loaded.


## **Connecting to FARS Database**
#### *(Fatality Analysis Reporting System)*


Here the local **SQL databse** is being connected. 


Running a **SQL query** to joins all three tables *(accident, person, vehicle)*.

In [92]:
# --- 1. CONNECT TO YOUR LOCAL MYSQL DATABASE ---
db_uri = "mysql+pymysql://root:NewStrongPass!123@localhost:3306/fars"
engine = sqlalchemy.create_engine(db_uri)

# --- 2. FETCH DATA & SERIALIZE ---
documents_to_index = []

# Helper maps for state names, etc.
print("Connecting to database...")
with engine.connect() as connection:
    # query JOINS your three tables to get rich data for each accident
    query = sqlalchemy.text("""
        SELECT 
            a.ST_CASE, a.YEAR, a.STATE, a.MONTH, a.PERSONS, a.VE_FORMS,
            p.AGE, p.SEX, p.PER_TYP,
            v.MAKE, v.MODEL
        FROM 
            accident_master a
        LEFT JOIN 
            person_master p ON a.ST_CASE = p.ST_CASE
        LEFT JOIN 
            vehicle_master v ON a.ST_CASE = v.ST_CASE
        LIMIT 5000; 
    """)
    
    result = connection.execute(query)
    
    for row in result:
        
        # 1. Create the text snippet (page_content)
        content_snippet = (
            f"Accident Case {row.ST_CASE} in {row.YEAR} involved "
            f"{row.PERSONS} persons and {row.VE_FORMS} vehicles. "
            f"Details include: Person (Age: {row.AGE}, Sex: {row.SEX}), "
            f"Vehicle (Make: {row.MAKE}, Model: {row.MODEL})."
        )
        # 2. Create the metadata (for 100% traceability)
        metadata = {
            "source_table": "accident_master",
            "ST_CASE": row.ST_CASE,
            "YEAR": row.YEAR,
        }
        
        doc = Document(page_content=content_snippet, metadata=metadata)
        documents_to_index.append(doc)

print(f"Created {len(documents_to_index)} Documents from MySQL.")

Connecting to database...
Created 5000 Documents from MySQL.


## **Indexing: Creating the Vector Store**

This cell is were vectorization happens. This is the **"R" (Retrieval)** component.

This is our "vectorizer"—a specialized tool that reads text and converts its semantic meaning into a list of numbers (a vector).

In [93]:
# 1. Initialize an open-source embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 2. Build and persist the vector store, runs model through 5000 documents 
vectorstore = Chroma.from_documents(
    documents=documents_to_index, 
    embedding=embeddings,
    persist_directory="./capstone_chroma_db" # folder where it will be saved
)


print("--- SUCCESSFULLY LOADED ---")
print(f"Vector store created at './capstone_chroma_db'")
print(f"Total documents indexed: {vectorstore._collection.count()}")

--- SUCCESSFULLY LOADED ---
Vector store created at './capstone_chroma_db'
Total documents indexed: 35000


## **Retrieval**

In [94]:
# Load the persisted vector store from disk
print("Loading vector store from disk...")
vectordb = Chroma(
    persist_directory="./capstone_chroma_db", 
    embedding_function=embeddings
)

# Creating the retriever
retriever = vectordb.as_retriever(search_kwargs={"k": 5}) # 'k=5' finds the top 5 snippets

print("Retriever created.")

# Testing the retriever
print("\n--- Retriever Test ---")
test_docs = retriever.invoke("Accidents in Virginia")
print(f"Found {len(test_docs)} relevant docs for 'Accidents in Virginia'")
print(f"Top result: {test_docs[0].page_content}")

Loading vector store from disk...
Retriever created.

--- Retriever Test ---
Found 5 relevant docs for 'Accidents in Virginia'
Top result: Accident Case 40208 in 75 involved 4 persons and 2 vehicles. Details include: Person (Age: 22, Sex: 2), Vehicle (Make: 12, Model: 0).


## **RAG Chain (Prompt, LLM, and Chain)**


In [95]:
# 1. Get the RAG prompt from the hub
prompt = hub.pull("rlm/rag-prompt")

# 2. Initialize an open-source LLM via Ollama
llm = ChatOllama(model="llama3")

# 3. Create the RAG chain
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("RAG chain created successfully.")

RAG chain created successfully.


## **(Ask a Question)**

In [None]:
question = "Tell me about an accident in Virginia involving a person under 25"
# question = "Tell me about an accident in Virginia that involved animals"

# .stream() gives you the answer as it's being generated
for chunk in rag_chain.stream(question):
    print(chunk, end="", flush=True)

I don't have enough information to answer this question. The provided context does not mention alcohol involvement in any accidents in Virginia.