In [64]:
#Q3.1
%reset -f
import os
from haystack import Pipeline
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import HuggingFaceAPIGenerator
from haystack.components.converters import TextFileToDocument
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentSplitter
from haystack.utils import Secret
from elasticsearch import Elasticsearch, NotFoundError
import huggingface_hub


In [None]:
#debugging
import requests

# Probe the local Elasticsearch HTTP endpoint and report whether it answers.
# A timeout is mandatory here: without one, requests waits indefinitely when
# the container accepts the connection but never responds, hanging the cell.
try:
    res = requests.get("http://localhost:9200", timeout=5)
    if res.status_code == 200:
        print("Elasticsearch is running!")
        print(res.json())
    else:
        print(f"Elasticsearch responded with status code {res.status_code}")
# Only network/HTTP failures and an unparsable body are expected here; anything
# else (e.g. a typo in this cell) should surface instead of being swallowed.
except (requests.exceptions.RequestException, ValueError) as e:
    print("Could not connect to Elasticsearch.")
    print("Error:", e)


✅ Elasticsearch is running!
{'name': '2a8208dd71bc', 'cluster_name': 'docker-cluster', 'cluster_uuid': '_hkO__sjSjq07Fxmo5KzJw', 'version': {'number': '8.12.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '1665f706fd9354802c02146c1e6b5c0fbcddfbc9', 'build_date': '2024-01-11T10:05:27.953830042Z', 'build_snapshot': False, 'lucene_version': '9.9.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [None]:

# Document store handle for the "transcripts" index on the local Elasticsearch node.
document_store = ElasticsearchDocumentStore(
    index="transcripts",
    hosts="http://localhost:9200",
)

print("Connected to Elasticsearch!")

Connected to Elasticsearch!


In [None]:
import os
import zipfile

# Archive holding the course transcripts and the folder it is unpacked into.
zip_path = "./transcripts.zip"
extract_dir = "transcripts_unzipped"

# Unpack once; an existing target directory is treated as already extracted.
if not os.path.exists(extract_dir):
    with zipfile.ZipFile(zip_path, mode="r") as archive:
        archive.extractall(path=extract_dir)

print(f"Transcripts extracted to: {extract_dir}")


✅ Transcripts extracted to: transcripts_unzipped


In [None]:
from haystack.dataclasses import Document
from haystack.document_stores.types import DuplicatePolicy

# The archive unpacks into a nested "transcripts" folder.
transcript_folder = os.path.join(extract_dir, "transcripts")

# Build one Document per .txt transcript, keeping the file name as metadata.
# sorted() makes the ingestion order reproducible (os.listdir order is arbitrary).
docs = []
for filename in sorted(os.listdir(transcript_folder)):
    if filename.endswith(".txt"):
        filepath = os.path.join(transcript_folder, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            docs.append(Document(content=f.read(), meta={"name": filename}))

# Index the docs.
# Document ids are derived from the content, so re-running this cell would
# otherwise fail on duplicates under the store's default policy. SKIP keeps the
# cell idempotent: already-indexed transcripts are left untouched.
document_store.write_documents(docs, policy=DuplicatePolicy.SKIP)


In [None]:
#Q3.2
from haystack.components.generators import HuggingFaceAPIGenerator
from haystack.utils import Secret

# Text generator backed by the Hugging Face serverless inference API.
# The access token is resolved from the environment (HF_API_TOKEN, falling back
# to HF_TOKEN) so that no credential is ever written into the notebook.
generator = HuggingFaceAPIGenerator(
    api_type="serverless_inference_api",
    api_params={"model": "microsoft/Phi-3.5-mini-instruct"},
    token=Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"]),
)



In [35]:
# Jinja-style prompt template with two variables: `documents` (each one's
# `.content` is inserted in full, without truncation) and `query`.
# The <|system|>/<|user|>/<|assistant|>/<|end|> markers presumably follow the
# chat format of the Phi-3.5 instruct model used by the generator - TODO confirm.
# NOTE(review): a literal "?" is appended after the query, so a question that
# already ends with "?" is rendered with "??".
template = """
  <|system|>
  You are a helpful assistant.<|end|>
  <|user|>
  Given the following information, answer the question.

  Context: 
  {% for document in documents %}
      {{ document.content }}
  {% endfor %}

  Question: {{ query }}?<|end|>
  <|assistant|>"""

In [43]:

# BM25 keyword retriever over the Elasticsearch transcript index.
retriever = ElasticsearchBM25Retriever(document_store=document_store)

# Generation settings belong in `generation_kwargs`; `api_params` only selects
# the model/endpoint, so a `max_new_tokens` placed there never reaches the
# model and the 200-token cap is not applied.
# No token is passed: the component falls back to its environment-variable default.
generator = HuggingFaceAPIGenerator(
    api_type="serverless_inference_api",
    api_params={"model": "microsoft/Phi-3.5-mini-instruct"},
    generation_kwargs={"max_new_tokens": 200},
)

In [None]:
%%writefile app.py


from openai import OpenAI
import streamlit as st
import os
from haystack import Pipeline
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import HuggingFaceAPIGenerator
from haystack.components.converters import TextFileToDocument
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentSplitter
from haystack.utils import Secret
from haystack.dataclasses import Document
from elasticsearch import Elasticsearch, NotFoundError
import huggingface_hub
import zipfile
import os
# Handle to the Elasticsearch index that holds the course transcripts.
document_store = ElasticsearchDocumentStore(
    index="transcripts",
    hosts="http://localhost:9200",
)

# Bootstrap: when the index holds no documents yet, unpack the archive (if
# needed) and index every .txt transcript it contains.
if document_store.count_documents() == 0:
    print("📄 Index is empty. Ingesting documents...")

    zip_path = "./transcripts.zip"
    extract_dir = "transcripts_unzipped"

    # An existing extraction directory is reused as-is.
    if not os.path.exists(extract_dir):
        with zipfile.ZipFile(zip_path, mode="r") as archive:
            archive.extractall(path=extract_dir)

    transcript_folder = os.path.join(extract_dir, "transcripts")

    docs = []
    for entry in os.listdir(transcript_folder):
        if not entry.endswith(".txt"):
            continue
        with open(os.path.join(transcript_folder, entry), "r", encoding="utf-8") as handle:
            # One Document per transcript file; the file name is kept as metadata.
            docs.append(Document(content=handle.read(), meta={"name": entry}))

    document_store.write_documents(docs)
    
# retriever and generator setup
# BM25 retriever bound to the transcript document store created above.
retriever = ElasticsearchBM25Retriever(document_store=document_store)


# Jinja-style prompt template with two variables: `documents` and `query`.
# Each document's content is cut to its first 5000 characters before insertion.
# The <|system|>/<|user|>/<|assistant|>/<|end|> markers presumably follow the
# chat format of the Phi-3.5 instruct model used below - TODO confirm.
# NOTE(review): a literal "?" is appended after the query, so a question that
# already ends with "?" is rendered with "??".
template = """
<|system|>
You are a helpful assistant.<|end|>
<|user|>
Given the following information, answer the question.

Context: 
{% for document in documents %}
    {{ document.content[:5000] }}
{% endfor %}

Question: {{ query }}?<|end|>
<|assistant|>
"""

# Text generator backed by the Hugging Face serverless inference API.
# - The token is read from the environment (HF_API_TOKEN, falling back to
#   HF_TOKEN) instead of being pasted into the source file.
# - `max_new_tokens` is a generation setting and must go in `generation_kwargs`;
#   inside `api_params` it is not forwarded to the model, so the cap is ignored.
generator = HuggingFaceAPIGenerator(
    api_type="serverless_inference_api",
    api_params={"model": "microsoft/Phi-3.5-mini-instruct"},
    generation_kwargs={"max_new_tokens": 200},
    token=Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"]),
)

# RAG pipeline wiring: retriever -> prompt_builder -> llm.
rag_pipeline = Pipeline()

# Register the three stages under the names used by the connections below.
rag_pipeline.add_component(name="retriever", instance=retriever)
rag_pipeline.add_component(name="prompt_builder", instance=PromptBuilder(template=template))
rag_pipeline.add_component(name="llm", instance=generator)

# Retrieved documents feed the template; the rendered prompt feeds the generator.
rag_pipeline.connect(sender="retriever", receiver="prompt_builder.documents")
rag_pipeline.connect(sender="prompt_builder", receiver="llm")
# Sidebar: a password-type text input plus three link strings.
# NOTE(review): `openai_api_key` is not referenced anywhere else in this script
# (generation goes through the Hugging Face generator), and the links point at
# OpenAI and the streamlit/llm-examples repository. This looks like leftover
# template boilerplate - confirm and remove so users are not asked for an
# unused secret.
with st.sidebar:
    openai_api_key = st.text_input("OpenAI API Key", key="chatbot_api_key", type="password")
    # Bare string expressions; presumably rendered as Markdown by Streamlit - TODO confirm.
    "[Get an OpenAI API key](https://platform.openai.com/account/api-keys)"
    "[View the source code](https://github.com/streamlit/llm-examples/blob/main/Chatbot.py)"
    "[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/streamlit/llm-examples?quickstart=1)"

# Page header.
st.title("💬 Course Chatbot")
st.caption("🚀 Interactive Q&A with Elasticsearch, Haystack, and HuggingFace")

# Start the session with an empty conversation when none is stored yet.
if "messages" not in st.session_state:
    st.session_state["messages"] = []

# Replay the stored conversation so it is visible again on this run.
for past_message in st.session_state["messages"]:
    st.chat_message(past_message["role"]).write(past_message["content"])

# Handle a newly submitted question: record it, run retrieval + generation,
# then record and show the answer.
if prompt := st.chat_input("Ask a question about the course transcripts"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    st.chat_message("user").write(prompt)

    try:
        # Use the pipeline assembled above instead of re-creating a PromptBuilder
        # and calling each component by hand. Only the best match is requested
        # (top_k=1) rather than fetching the default result set and discarding
        # all but the first document; raise top_k to give the model more context.
        result = rag_pipeline.run(
            {
                "retriever": {"query": prompt, "top_k": 1},
                "prompt_builder": {"query": prompt},
            }
        )
    except Exception as exc:
        # Elasticsearch or the inference API may be unreachable; show the
        # failure in the chat UI instead of crashing the app with a traceback.
        st.error(f"Could not generate an answer: {exc}")
    else:
        replies = result["llm"]["replies"]
        # Guard against an empty reply list instead of raising IndexError.
        response = replies[0] if replies else "Sorry, the model returned no answer."

        st.session_state.messages.append({"role": "assistant", "content": response})
        st.chat_message("assistant").markdown(response)


Overwriting app.py


In [79]:
#q3.3
# Questions used to compare the RAG answers; the recorded replies follow below.
test_prompts = [
    "What is eda?",
    "What is arima?",
]

" EDA stands for Exploratory Data Analysis. It is a crucial step in the data science process, where data scientists and analysts explore and analyze data to discover patterns, relationships, and insights. The goal of EDA is to understand the data's underlying structure, identify outliers or anomalies, and formulate hypotheses for further investigation.
In the given context, the speaker is discussing the importance of data management in the field of data science, particularly in the context of the abundance of data generated by generative AI. EDA would be an essential part of this process, as it helps the students in CS 639: Data Management for Data Science to understand and make sense of the vast amounts of data they will encounter.
During EDA, various techniques and tools are employed, such as:
Summarizing data using descriptive statistics (mean, median, mode, standard deviation, etc.)
Visualizing data through charts, graphs, and plots (histograms, scatter plots, box plots, etc.)
Identifying correlations and relationships between variables
Detecting outliers and anomalies
Formulating hypotheses and questions for further analysis
By engaging in EDA, students in this course will develop a strong foundation in data analysis and management, enabling them to effectively handle and interpret the massive amounts of data generated in today's data-driven world.",
"Arima, short for AutoRegressive Integrated Moving Average, is a statistical modeling technique used in time series analysis and forecasting. It is not directly mentioned in the provided context, but I can explain it for you.

The Arima model consists of three components:

AutoRegressive (AR): This part of the model captures the relationship between an observation and a specified number of lagged observations. It helps to understand how past values influence the current value.

Integrated (I): This component involves differencing the data to make it stationary, meaning that the mean, variance, and autocorrelation structure do not change over time. This step is crucial for handling non-stationary data, which is common in many real-world time series.

Moving Average (MA): This component models the relationship between an observation and a residual error from a moving average model applied to lagged observations. It helps to capture the impact of random shocks or noise on the time series.

The combination of these three components allows the Arima model to capture complex patterns and relationships in time series data, making it a powerful tool for forecasting and understanding the underlying dynamics of the data.

In the context of data management for data science, understanding and applying techniques like Arima can help data scientists and analysts make more accurate predictions and gain deeper insights from the vast amounts of data generated by various sources, including generative AI.",
"

In [2]:
# Overall, the RAG pipeline gave more accurate and grounded answers, especially for fact-based questions. The fine-tuned model hallucinated course content.
# However, the fine-tuned model was slightly more fluent in writing style.