In [None]:
from llama_index.llms.groq import Groq
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.chat_engine import ContextChatEngine
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.base.llms.types import ChatMessage, MessageRole
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.prompts import PromptTemplate
from dotenv import load_dotenv
import streamlit as st
import os

In [None]:
# Read the .env file first so it can supply both the API credentials and the
# optional path overrides below.
load_dotenv()

# The defaults are the original development-machine locations. Set
# COMCHATBOT_DATA_DIR / COMCHATBOT_PERSIST_DIR (in the environment or in .env)
# to run the notebook from a different checkout without editing this cell.
DATA_DIR = (
    os.getenv("COMCHATBOT_DATA_DIR")
    or "/Users/nnaumova/Desktop/Data Science Course/Projects/comchatbot/data"
)  # data directory
PERSIST_DIR = (
    os.getenv("COMCHATBOT_PERSIST_DIR")
    or "/Users/nnaumova/Desktop/Data Science Course/Projects/comchatbot/vector_index"
)  # directory for the persisted index
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Credentials come from the environment only; either value is None when unset.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
HF_TOKEN = os.getenv("HF_TOKEN")

In [5]:
# Embedding model passed to the vector index; model files are stored under
# the given cache folder.
embeddings = HuggingFaceEmbedding(
    cache_folder="/Users/nnaumova/Desktop/Data Science Course/Projects/comchatbot/embedding_model",
    model_name=EMBEDDING_MODEL,
)

# Groq-hosted chat model that generates the answers.
llm = Groq(api_key=GROQ_API_KEY, model="llama-3.1-8b-instant")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
def get_or_create_index():
    """Return the vector index, loading it from disk or building it from DATA_DIR.

    A persisted index is reused only when PERSIST_DIR is a non-empty directory.
    If the directory is missing or empty (for example, it was created by hand
    before the first run), the index is built from the documents in DATA_DIR
    and then persisted to PERSIST_DIR.

    Returns:
        The index returned by ``load_index_from_storage`` or a newly built
        ``VectorStoreIndex``; both use the module-level ``embeddings`` model.
    """
    # os.path.exists() alone is also true for an empty directory, which would
    # send us down the load path with nothing to load.
    has_persisted_index = os.path.isdir(PERSIST_DIR) and bool(os.listdir(PERSIST_DIR))

    if has_persisted_index:
        storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
        return load_index_from_storage(storage_context, embed_model=embeddings)

    documents = SimpleDirectoryReader(DATA_DIR).load_data()
    text_splitter = SentenceSplitter(chunk_size=800, chunk_overlap=150)
    index = VectorStoreIndex.from_documents(
        documents,
        transformations=[text_splitter],
        embed_model=embeddings,
    )
    # Persist right away so later runs can skip re-embedding the documents.
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    return index


In [None]:
# Question-answering prompt. The {context_str} and {query_str} placeholders
# are filled in when the template is formatted.
input_template = "\n".join(
    (
        "Here is the context:",
        "{context_str}",
        "Answer the question based on the above context.",
        "Answer the question as a marxist would.",
        "Question: {query_str}",
        "Answer:",
    )
)

prompt = PromptTemplate(template=input_template)


In [None]:
# -------------------------------
# Streamlit UI
# -------------------------------
st.title("Marxist chatbot")

st.write("Get answers to your questions about the economy and society from the three volumes of Capital.")

# Streamlit re-executes this script on every interaction. Caching the loader
# keeps the index in memory across reruns instead of reloading it each time.
index = st.cache_resource(get_or_create_index)()

# Keep one chat engine per browser session. Re-creating it on every rerun would
# discard the conversation memory that "condense_question" mode relies on.
# The engine does its own retrieval (top 3 chunks) and answers with the
# Marxist prompt, so only the user's question needs to be sent to it.
if "chat_engine" not in st.session_state:
    st.session_state.chat_engine = index.as_chat_engine(
        chat_mode="condense_question",
        llm=llm,
        verbose=True,
        similarity_top_k=3,
        text_qa_template=prompt,
    )
chat_engine = st.session_state.chat_engine

# Transcript shown to the user; survives reruns within the same session.
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

user_input = st.text_input("Ask a question:")

# Ignore empty or whitespace-only submissions instead of querying the LLM.
if user_input and user_input.strip():
    # Send the bare question: the engine condenses it with the chat history,
    # retrieves context itself and fills the prompt template.
    response = chat_engine.chat(user_input.strip())
    answer_text = response.response if hasattr(response, "response") else str(response)

    st.session_state.chat_history.append(("You", user_input))
    st.session_state.chat_history.append(("Marxist-bot", answer_text))

# Render the whole conversation so far, oldest message first.
for speaker, message in st.session_state.chat_history:
    st.markdown(f"**{speaker}:** {message}")