# Embed text chunks

### Core imports and directory vars

In [1]:
import json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

import faiss
import numpy as np

from pathlib import Path

# Project root is the parent of the current working directory
# (NOTE(review): presumably the notebook is run from a subfolder of the
# project — confirm). Data artifacts are read from / written to `<root>/data`.
root_dir = Path.cwd().parent
data_dir = root_dir / 'data'

  from .autonotebook import tqdm as notebook_tqdm


### Load data (text chunks)

In [2]:
# Load chunks from jsonl file
def load_chunks(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path (str or path-like) of a UTF-8 encoded ``.jsonl``
            file holding one JSON document per line.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank / whitespace-only lines (e.g. a stray empty line at the end
        # of the file) are skipped; json.loads would raise on them.
        return [json.loads(line) for line in file if line.strip()]

# Load every chunk record, then pull out just the text of each one.
# `data` (the full records) is reused by later cells for chunk lookups.
data = load_chunks(data_dir / 'transcript_chunks.jsonl')
text = [item['text'] for item in tqdm(data)]  # tqdm only adds a progress bar

100%|██████████| 33186/33186 [00:00<00:00, 15495065.41it/s]


### Load model and embed text data

In [3]:
# Load model
# Pick the GPU when one is available and fall back to CPU otherwise, so the
# notebook does not crash on machines without CUDA.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("all-MiniLM-L6-v2")
model = model.to(device)

# Embed text data
# cached embeddings path
embeddings_path = data_dir / 'embeddings.npy'

embeddings = None
if embeddings_path.exists():
    print(f"Loading cached embeddings from {embeddings_path}")
    embeddings = np.load(embeddings_path)
    # Later cells look chunks up by row position, so a cache built from a
    # different number of chunks would silently return the wrong text.
    if embeddings.shape[0] != len(text):
        print(f"Cached embeddings have {embeddings.shape[0]} rows but there are "
              f"{len(text)} chunks; regenerating.")
        embeddings = None

if embeddings is None:
    print("Generating embeddings...")
    # normalize_embeddings=True yields unit-length vectors, which is what lets
    # an inner-product index behave as cosine similarity.
    embeddings = model.encode(text, batch_size=64, show_progress_bar=True,
                              device=device, normalize_embeddings=True)
    np.save(embeddings_path, embeddings)

Loading cached embeddings from c:\Users\manth\apps\PhilGPT\data\embeddings.npy


## `FAISS` Index

### Build and save `FAISS` index

In [None]:
# FAISS works on float32 matrices, so cast the embeddings up front.
embedding_matrix = np.array(embeddings, dtype='float32')

# Inner-product index; used here for cosine similarity.
n_dims = embedding_matrix.shape[1]  # dims = 384
index = faiss.IndexFlatIP(n_dims)
index.add(embedding_matrix)

# Persist the index to disk (faiss.write_index expects a string path,
# not a Path object).
save_path = data_dir / 'faiss_transcript_index.index'
faiss.write_index(index, str(save_path))

### Store metadata for retrieval
... and referencing?

Another motivation for doing this is **version control**: keeping the index and its metadata in sync gives me consistent retrieval.

In [8]:
# Write the chunk records to a JSON file next to the index.
# NOTE: despite its name, `metadata_dir` is the path of a file.
metadata_dir = data_dir / 'faiss_transcript_metadata.json'
with metadata_dir.open(mode='w', encoding='utf-8') as out_file:
    json.dump(data, out_file, ensure_ascii=False, indent=2)

#### a very quick test

In [None]:
# Read the saved index back from disk.
index = faiss.read_index(str(save_path))

# Encode the query the same way the chunks were encoded (normalized, float32).
query = ""
query_embedding = model.encode([query], normalize_embeddings=True).astype("float32")

# Top-10 search: D holds the similarity scores, I the matching row indices.
D, I = index.search(query_embedding, k=10)

# Keep only the first hit for each chunk_id (basic deduplication).
seen_chunks = set()
results = []
for idx in I[0]:
    chunk_id = data[idx]["chunk_id"]
    if chunk_id in seen_chunks:
        continue
    seen_chunks.add(chunk_id)
    results.append(idx)

# Print each retained chunk followed by a separator line.
for unique_idx in results:
    print(data[unique_idx]['text'], "-----", sep="\n")

sts; they still strongly believe in the world that Marxism’s trying to bring about, but they’re highly critical of Marx for a few different reasons. Many critiques, but two of the major ones are that Marx doesn’t talk enough about the concept of personal liberty within his system and he certainly doesn’t do enough to consider the individual.  Remember, the thinkers of the Frankfurt School are living right around the same time as Husserl and Heidegger and Sartre and all these other thinkers that,
-----
e living in…if some of Marx’s predictions ultimately turned out to be…WRONG…maybe…Simone Weil might say…that has something to do with the fact that he didn’t REALLY understand what it FELT like to BE a worker. He certainly was a great economist. He certainly understood what it felt like to READ about being a worker…but what Marx DIDN’T know is what it felt like to CLOCK IN every day at four am with every JOINT in your BODY aching, just praying a forklift falls on top of you so you can at 

## Ollama

At this stage, I have
- clean text chunks
- embeddings
- `faiss` index
- metadata
- tested retrieval with basic deduplication

So far: *retrieval → top-k results → console print.*

Next: *retrieval → top-k chunks → format into prompt → send to LLM via Ollama → generate final answer*

TODO: Consider using MMR (`max_marginal_relevance`) to retrieve diverse results from faiss index

In [4]:
# System message sent to the LLM: sets the assistant persona and instructs it
# to answer from the supplied context, or to say so when the context is
# insufficient.
SYSTEM_PROMPT = """
You are a knowledgeable assistant who is interested in philosophy and about teaching 
others about philosophy. When asked questions, you answer them in a concise and informative manner,
drawing from the provided context. If the context does not contain relevant information, you will say
"I don't know" or "I don't have enough information to answer that question."
"""

In [5]:
def retrieval(user_query, k=10):
    """Return the text of the chunks most similar to ``user_query``.

    The saved FAISS index is read from ``data_dir``, the query is embedded
    with the module-level ``model``, and the nearest chunks are looked up in
    the module-level ``data`` list by row position.

    Args:
        user_query: The question / search string to embed.
        k: Maximum number of nearest neighbours to request from the index.
            Defaults to 10.

    Returns:
        str: The ``text`` of each unique hit (deduplicated by ``chunk_id``,
        best match first), joined with newlines. Empty string if there are
        no hits.
    """
    # Load index (re-read on every call so a freshly rebuilt index is used)
    save_path = data_dir / 'faiss_transcript_index.index'
    index = faiss.read_index(str(save_path))

    # Embed query the same way the chunks were embedded
    query_embedding = model.encode([user_query], normalize_embeddings=True).astype("float32")

    # Search; only the indices are needed, the similarity scores are unused
    _, I = index.search(query_embedding, k=k)

    # Collect the text of each hit with basic deduplication
    seen_chunks = set()
    text_results = []
    for idx in I[0]:
        # FAISS pads the result with -1 when fewer than k neighbours exist;
        # data[-1] would silently return the last chunk, so skip those slots.
        if idx < 0:
            continue
        chunk = data[idx]
        chunk_id = chunk["chunk_id"]
        if chunk_id in seen_chunks:
            continue
        seen_chunks.add(chunk_id)
        text_results.append(chunk['text'])

    return "\n".join(text_results)

In [22]:
import ollama

user_query = input("Ask your PhilGPT your question: ")

# Retrieved transcript chunks used to ground the answer.
context = retrieval(user_query)

response = ollama.chat(
    model="gemma3:4b",
    messages=[
        {'role': "system", 'content': SYSTEM_PROMPT},
        # The context is supplied in the user turn, clearly labelled.
        # Sending it with the "assistant" role would present the transcript
        # text as something the model itself had already said.
        {'role': "user",
         'content': f"Context:\n{context}\n\nQuestion: {user_query}"},
    ])

print(f"Your question: {user_query}")
print("PhilGPT's response:")
print(response['message']['content'])

Your question: can you tell me a little bit more about the myth of sisyphus?
PhilGPT's response:
Okay, let’s delve a bit deeper into the myth of Sisyphus.

Essentially, Sisyphus was a king of Corinth, and he was a particularly crafty and deceitful man. He wasn’t known for his piety or adherence to the gods. Instead, he was famous for tricking death itself!

Here’s the core of the story:

*   **His Deception:** Sisyphus tricked the god of the underworld, Hades, and his wife Persephone into eating pomegranate seeds. Under the terms of a pact, anyone who eats pomegranate seeds in the underworld is bound there for eternity. Sisyphus tricked Persephone into eating them, condemning her to spend half the year with Hades.

*   **His Punishment:** As punishment for this deception, Sisyphus was condemned to eternally roll a massive boulder up a hill, only to have it roll back down every time he neared the top. This was a symbol of futile, repetitive labor – a task with no real purpose or outcome