In [1]:
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.document_loaders import BSHTMLLoader # BeautifulSoup HTML Loader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [2]:
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = os.environ.get('LANGCHAIN_API_KEY')

In [3]:
os.environ["GOOGLE_API_KEY"] = os.environ.get('GOOGLE_GEMNI_API_KEY')

In [21]:
HTML_FILE_PATH = 'Source_Material/Example_game.html'
EMBEDDING_MODEL_NAME = "models/text-embedding-004"
GENERATION_MODEL_NAME = "gemini-2.0-flash"
CHROMA_PERSIST_DIR = "chroma_db_html" # Directory to store Chroma data
CHROMA_COLLECTION_NAME = "html_docs"   # Name for your collection within Chroma

In [5]:
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)

In [5]:
print(f"1. Loading HTML from '{HTML_FILE_PATH}'...")
loader = BSHTMLLoader(HTML_FILE_PATH)
documents = loader.load()

1. Loading HTML from 'Source_Material/Example_game.html'...


In [39]:
print("\n2. Splitting documents into chunks...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100000,
    chunk_overlap=50000,
    separators=["\n\n", "\n", " ", ""] # Sensible defaults
)
chunks = text_splitter.split_documents(documents)


2. Splitting documents into chunks...


In [40]:
print("\n3. Initializing Gemini Embeddings model...")
embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL_NAME, task_type="retrieval_document")


3. Initializing Gemini Embeddings model...


In [41]:
print("\n4. Creating/Loading ChromaDB vector store...")
vector_store = None
# Try to load existing store if persist_directory exists
if os.path.exists(CHROMA_PERSIST_DIR):
    print(f"   Loading existing ChromaDB from '{CHROMA_PERSIST_DIR}'...")
    try:
        vector_store = Chroma(
            persist_directory=CHROMA_PERSIST_DIR,
            embedding_function=embeddings,
            collection_name=CHROMA_COLLECTION_NAME
        )
        print("   ChromaDB loaded successfully.")
        # A quick check to see if it has documents
        # This is a bit of a hacky way, a better check would be to see if collection has items
        try:
            if vector_store._collection.count() == 0 and chunks: # If loaded but empty, and we have chunks
                 print("   Existing ChromaDB is empty. Re-populating...")
                 vector_store = Chroma.from_documents(
                    documents=chunks,
                    embedding=embeddings,
                    persist_directory=CHROMA_PERSIST_DIR, # Specify directory to save data
                    collection_name=CHROMA_COLLECTION_NAME
                )
                 vector_store.persist() # Explicitly persist
                 print("   ChromaDB populated and persisted.")
            elif not chunks and vector_store._collection.count() == 0:
                 print("   ChromaDB is empty and no new chunks to add.")
            elif chunks and vector_store._collection.count() > 0:
                print(f"   ChromaDB loaded with {vector_store._collection.count()} documents. Not re-populating unless forced.")
                # Optionally, add logic here to decide if you want to re-index or add new documents
                # For simplicity, we'll assume if it exists and has content, we use it.
                # If you modify your HTML, you'd need to delete CHROMA_PERSIST_DIR or implement updating logic.

        except Exception as e: # If loading fails, or collection is empty and we want to rebuild
            print(f"   Error loading or checking existing ChromaDB, or it's empty: {e}. Rebuilding...")
            vector_store = Chroma.from_documents(
                documents=chunks,
                embedding=embeddings,
                persist_directory=CHROMA_PERSIST_DIR, # Specify directory to save data
                collection_name=CHROMA_COLLECTION_NAME
            )
            vector_store.persist() # Explicitly persist
            print("   New ChromaDB created and persisted.")

    except Exception as e:
        print(f"   Could not load ChromaDB from '{CHROMA_PERSIST_DIR}': {e}. Creating new one.")
        # Fall through to creating a new one if loading fails critically


4. Creating/Loading ChromaDB vector store...
   Loading existing ChromaDB from 'chroma_db_html'...
   ChromaDB loaded successfully.
   ChromaDB loaded with 69 documents. Not re-populating unless forced.


In [42]:
if vector_store is None and chunks: # If not loaded and we have chunks, create new
    print(f"   Creating new ChromaDB and persisting to '{CHROMA_PERSIST_DIR}'...")
    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=CHROMA_PERSIST_DIR, # Specify directory to save data
        collection_name=CHROMA_COLLECTION_NAME
    )
    vector_store.persist() # Explicitly persist
    print("   New ChromaDB created and persisted.")
elif vector_store is None and not chunks:
    print("🔴 No chunks to process and no existing ChromaDB found. Exiting.")
    exit()


# 5. Create a Retriever from the Vector Store (same as before)
print("\n5. Creating retriever from vector store...")
retriever = vector_store.as_retriever(search_kwargs={'k': 3})
print("   Retriever created.")


5. Creating retriever from vector store...
   Retriever created.


In [43]:
# 6. Initialize LLM for Generation (same as before)
print("\n6. Initializing Gemini LLM for generation...")
llm = ChatGoogleGenerativeAI(model=GENERATION_MODEL_NAME, temperature=0)


6. Initializing Gemini LLM for generation...


In [44]:
print("\n7. Setting up RAG chain with LCEL...")
prompt_template_question = """You are a sports data analyst assistant.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.

Context:
{context}

Question: {question}

Answer:"""


7. Setting up RAG chain with LCEL...


In [45]:
prompt = PromptTemplate.from_template(prompt_template_question)

In [46]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [47]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
print("   LCEL RAG chain created.")

   LCEL RAG chain created.


In [48]:
def ask_question(query, chain):
    print(f"\n❓ Querying: '{query}'")
    try:
        answer = chain.invoke(query)
        print(chain)
        print("🗣️ Answer:", answer)
    except Exception as e:
        print(f"🔴 Error during query: {e}")

In [49]:
user_query1 = "What were the keys to the Knicks and Pacers Game?"
ask_question(user_query1, rag_chain)


❓ Querying: 'What were the keys to the Knicks and Pacers Game?'
first={
  context: VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002E3AE09C130>, search_kwargs={'k': 3})
           | RunnableLambda(format_docs),
  question: RunnablePassthrough()
} middle=[PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are a sports data analyst assistant.\nUse the following pieces of retrieved context to answer the question.\nIf you don't know the answer, just say that you don't know.\n\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"), ChatGoogleGenerativeAI(model='models/gemini-2.0-flash', google_api_key=SecretStr('**********'), temperature=0.0, client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x000002E3AE0AACB0>, default_metadata=(), model_kwargs={})] last=StrOutputParser(

In [50]:
user_query1 = "Who made the first basket of the game?"
ask_question(user_query1, rag_chain)


❓ Querying: 'Who made the first basket of the game?'
first={
  context: VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002E3AE09C130>, search_kwargs={'k': 3})
           | RunnableLambda(format_docs),
  question: RunnablePassthrough()
} middle=[PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are a sports data analyst assistant.\nUse the following pieces of retrieved context to answer the question.\nIf you don't know the answer, just say that you don't know.\n\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"), ChatGoogleGenerativeAI(model='models/gemini-2.0-flash', google_api_key=SecretStr('**********'), temperature=0.0, client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x000002E3AE0AACB0>, default_metadata=(), model_kwargs={})] last=StrOutputParser()
🗣️ Answer

In [51]:
user_query1 = 'What can you tell me about game be as descriptive as possible?'
ask_question(user_query1, rag_chain)


❓ Querying: 'What can you tell me about game be as descriptive as possible?'
first={
  context: VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002E3AE09C130>, search_kwargs={'k': 3})
           | RunnableLambda(format_docs),
  question: RunnablePassthrough()
} middle=[PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are a sports data analyst assistant.\nUse the following pieces of retrieved context to answer the question.\nIf you don't know the answer, just say that you don't know.\n\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"), ChatGoogleGenerativeAI(model='models/gemini-2.0-flash', google_api_key=SecretStr('**********'), temperature=0.0, client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x000002E3AE0AACB0>, default_metadata=(), model_kwargs={})] last=Str

In [28]:
user_query1 = "Can you write python code to parse the html file, turn the play by play into tabular data and thencreate a Game Flow visual of the Kicks and Pacers Game"
ask_question(user_query1, rag_chain)


❓ Querying: 'Can you write python code to parse the html file, turn the play by play into tabular data and thencreate a Game Flow visual of the Kicks and Pacers Game'
🗣️ Answer: I am sorry, I am unable to provide code to parse the html.


In [37]:
chunks

[Document(metadata={'source': 'Source_Material/Example_game.html', 'title': '2025 NBA Eastern Conference Finals Game 4: Knicks vs Pacers, May 27, 2025 | Basketball-Reference.com'}, page_content="2025 NBA Eastern Conference Finals Game 4: Knicks vs Pacers, May 27, 2025 | Basketball-Reference.com\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n Sports\xa0Reference\u202f®\nBaseball\nFootball (college)\nBasketball (college)\nHockey\nFootball\nBlog\nStathead\u202f®\nImmaculate Grid\u202f®\nQuestions or Comments?\nWelcome \xa0·\xa0Your Account\nLogout\nAd-Free Login\nCreate Account\n\n\nMENU\n\nPlayers\nTeams\nSeasons\nLeaders\nScores\nWNBA\nDraft\nStathead\nNewsletter\nFull Site Menu Below\n\nYou are here: BBR Home Page  > Box Scores  > 2025 NBA Eastern Conference Finals Game 4: New York Knicks at Indiana Pacers Play-By-Play, May 27, 2025\nWelcome \xa0·\xa0Y