In [1]:
import sys
sys.path.append("../src")        # points to project/src
from config import OPENAI_API_KEY, PINECONE_API_KEY

# Following Pinecone Quickstart
* [https://app.pinecone.io/](https://app.pinecone.io/)

In [2]:
import openai
from openai import OpenAI
import os

In [3]:
from pinecone import Pinecone
import os, time

pc = Pinecone(api_key=PINECONE_API_KEY)
client = OpenAI(api_key=OPENAI_API_KEY)

index_name = "developer-quickstart-py"
pc_index = pc.Index(index_name)   

if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed={
            "model": "llama-text-embed-v2",
            "field_map": {"text": "chunk_text"}
        }
    )
    # optional: wait until the index is ready
    while pc.describe_index(index_name).status.get("ready") is False:
        time.sleep(5)


In [4]:
# wipe everything in namespace ns1
# pc_index.delete(namespace="ns1", delete_all=True)


In [5]:
def pinecone_hosted_search(query:str, top_k:int = 5, namespace="ns1"):
    """Search a Pinecone index that embeds text server-side."""
    resp = pc_index.search(
        namespace = namespace,
        query     = {
            "top_k": top_k,
            "inputs": {"text": query}
        },
        fields = ["chunk_text", "category"]
    )

    # resp -> dict -> records list
    for_match = resp.to_dict().get("records", [])
    docs = [
        Document(
            page_content = rec["metadata"]["chunk_text"],
            metadata     = {
                "id":       rec["id"],
                "category": rec["metadata"].get("category", ""),
                "score":    rec["score"],
            }
        )
        for rec in for_match
    ]
    return docs


In [6]:
index = pc.Index(index_name)  

In [7]:
from pathlib import Path
from itertools import islice

# --- 1. Collect records -------------------------------------------------
records = []
pages_dir = Path("../data/talmud-pages")

for page_path in pages_dir.glob("*.txt"):
    page_id = page_path.stem                    # e.g. "Berakhot_02a"
    text = page_path.read_text(encoding="utf-8")

    # split on blank lines → paragraphs
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    for para_idx, para in enumerate(paragraphs):
        records.append({
            "_id": f"{page_id}-{para_idx}",     # unique id
            "chunk_text": para,
            "category": "talmud",
            # add any other fields you like here
        })

print(f"Prepared {len(records):,} paragraph records from {len(list(pages_dir.glob('*.txt')))} pages")


Prepared 2,297 paragraph records from 2297 pages


In [8]:
# --- 2. Upsert to Pinecone in batches -----------------------------------
BATCH = 50               # well below Pinecone’s 2 MB request limit

for start in range(0, len(records), BATCH):
    batch = records[start : start + BATCH]
    index.upsert_records("ns1", batch)

print("✅ Finished upserting all paragraphs.")


✅ Finished upserting all paragraphs.


In [9]:
resp = pc_index.search(
    namespace="ns1",
    query={
        "top_k": 5,
        "inputs": {"text": "Shema"}
    },
    fields=["chunk_text", "category"]   # return metadata fields
)

print(resp.to_dict())


{'result': {'hits': [{'_id': 'brachot15.html-0', '_score': 0.4486818015575409, 'fields': {'category': 'talmud', 'chunk_text': 'Brachot 15 - Proper Way Rabbi Yochanan taught: "One who wishes to accept on himself Heaven\'s sovereignty in a complete manner, should relieve himself and wash his hands, don tefillin, then say the Shema, and the Amidah (Standing) prayer - and this is complete acceptance." If one said the Shema, but did not make it audible to his ears - this not the perfect way, but it counts. Rabbi Yose disagrees and says that he has not fulfilled his obligation and will have to repeat the Shema. What is the argument? Rabbi Yose says that the word "Shema" itself, which means "Hear" teaches that one should hear his words. And the first teacher (usually it is Rabbi Meir), what does he say? He says, "Shema," or "Hear" means "Understand" and tells us that one can say the Shema in any language that he understands. And Rabbi Yose? He surelly agrees that one can say the Shema in any 

In [10]:
print(index.describe_index_stats())

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'ns1': {'vector_count': 2297}},
 'total_vector_count': 2297,
 'vector_type': 'dense'}


In [11]:
records = resp.to_dict().get("records", [])
records

[]

In [12]:
resp = pc_index.search(
    namespace="ns1",
    query={"top_k": 5, "inputs": {"text": "What is Shema?"}},
    fields=["chunk_text", "category"]
)
print(resp.to_dict())
resp.to_dict()["result"]["hits"]

{'result': {'hits': [{'_id': 'brachot2.html-0', '_score': 0.43021392822265625, 'fields': {'category': 'talmud', 'chunk_text': 'Brachot 2 - Saying the Shema Prayer Twice a day, evening and morning, one must say that he will listen to God and love God. This is the Shema prayer, "Listen, Israel, God is our Lord, God is One." Since the Torah continues, "when you lie down and when you get up," we understand that this paragraph needs to be said twice. However, the Shema is not said when one actually goes to sleep or wakes up, but rather in the general period when people lie down and get up. When is this? In the evening - when the Kohanim, who were impure, are returning from the mikveh to eat priestly portion (terumah), that is, at nightfall. That is when the time to say the Shema in the evening begins, but when does it end? Rabbi Eliezer said, "Until the end of the first watch, that is, the first part of the night." Rabbi Eliezer understands "when you lie down" as the time when people go to 

[{'_id': 'brachot2.html-0',
  '_score': 0.43021392822265625,
  'fields': {'category': 'talmud',
   'chunk_text': 'Brachot 2 - Saying the Shema Prayer Twice a day, evening and morning, one must say that he will listen to God and love God. This is the Shema prayer, "Listen, Israel, God is our Lord, God is One." Since the Torah continues, "when you lie down and when you get up," we understand that this paragraph needs to be said twice. However, the Shema is not said when one actually goes to sleep or wakes up, but rather in the general period when people lie down and get up. When is this? In the evening - when the Kohanim, who were impure, are returning from the mikveh to eat priestly portion (terumah), that is, at nightfall. That is when the time to say the Shema in the evening begins, but when does it end? Rabbi Eliezer said, "Until the end of the first watch, that is, the first part of the night." Rabbi Eliezer understands "when you lie down" as the time when people go to sleep. The Sa

In [13]:
query = "When to read Shema"
results = index.search(
    namespace="ns1",
    query={
        "top_k": 5,
        "inputs": {
            'text': query
        }
    }
)

print(results)

{'result': {'hits': [{'_id': 'brachot2.html-0',
                      '_score': 0.5441867113113403,
                      'fields': {'category': 'talmud',
                                 'chunk_text': 'Brachot 2 - Saying the Shema '
                                               'Prayer Twice a day, evening '
                                               'and morning, one must say that '
                                               'he will listen to God and love '
                                               'God. This is the Shema prayer, '
                                               '"Listen, Israel, God is our '
                                               'Lord, God is One." Since the '
                                               'Torah continues, "when you lie '
                                               'down and when you get up," we '
                                               'understand that this paragraph '
                                              

In [14]:
reranked_results = index.search(
    namespace="ns1",
    query={
        "top_k": 5,
        "inputs": {
            'text': query
        }
    },
    rerank={
        "model": "bge-reranker-v2-m3",
        "top_n": 5,
        "rank_fields": ["chunk_text"]
    },
    fields=["category", "chunk_text"]
)

print(reranked_results)

{'result': {'hits': [{'_id': 'brachot11.html-0',
                      '_score': 0.8480936288833618,
                      'fields': {'category': 'talmud',
                                 'chunk_text': 'Brachot 11 - Lying Down to Say '
                                               'the Shema Bait Shammai say '
                                               'that in the evening one must '
                                               'lie down in order to read the '
                                               'Shema, and in the morning he '
                                               'needs to be standing, since '
                                               'the Torah said, "When you lie '
                                               'down and when you arise." '
                                               'However, Beit Hillel say that '
                                               'one can say the Shema in any '
                                               'positio

In [15]:
# --- Helper: convert Pinecone hits → context ---------------------------------
def top_k_to_context(results, max_chars=2_000):
    """
    Take a list of LangChain Documents *or* (doc, score) tuples and concatenate
    their page_content into one context string, truncated to `max_chars`.
    """
    # unwrap (doc, score) tuples if needed
    docs = [r[0] if isinstance(r, tuple) else r for r in results]
    chunks = [d.page_content.strip() for d in docs]

    context = "\n\n---\n\n".join(chunks)
    return context[:max_chars]

# --- Main: ask ChatGPT with retrieved context --------------------------------
def ask_chatgpt_with_context(question, pinecone_hits, model="gpt-4o-mini", k_context=5):
    """
    * question: user question string
    * pinecone_hits: list from similarity_search / search
    * model: any chat-capable OpenAI model
    * k_context: how many top docs to include
    """
    context_block = top_k_to_context(pinecone_hits[:k_context])

    messages = [
        {
            "role": "system",
            "content": (
                "You are MosesAI, a helpful Talmudic assistant. "
                "Answer the user using ONLY the context provided. "
                "If the answer isn't in the context, reply 'I don’t know based on the documents I have.'"
            )
        },
        {
            "role": "user",
            "content": f"Context:\n{context_block}\n\n---\n\nQuestion: {question}"
        }
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.2
    )
    return response.choices[0].message.content

# --- Example usage -----------------------------------------------------------
from langchain_pinecone import PineconeVectorStore
query = "What is Shema?"
pinecone_hits = pinecone_hosted_search(query, top_k=5)

print(len(pinecone_hits))
summary = pc_index.describe_index_stats(namespace="ns1").to_dict()
print(summary)

resp = pc_index.search(
    namespace="ns1",
    query={
        "top_k": 5,
        "inputs": {"text": "Shema"}
    },
    fields=["chunk_text", "category"]   # return metadata fields
)

print(resp.to_dict())

# Peek at the raw records
for doc in pinecone_hits:
    print(doc.metadata["id"], "→", doc.metadata.get("score"))
    print(doc.page_content[:200], "...\n")


answer = ask_chatgpt_with_context(query, pinecone_hits)
print(answer)


0
{'namespaces': {'ns1': {'vector_count': 2297}}, 'index_fullness': 0.0, 'total_vector_count': 2297, 'dimension': 1024, 'metric': 'cosine', 'vector_type': 'dense'}
{'result': {'hits': [{'_id': 'brachot15.html-0', '_score': 0.4486818015575409, 'fields': {'category': 'talmud', 'chunk_text': 'Brachot 15 - Proper Way Rabbi Yochanan taught: "One who wishes to accept on himself Heaven\'s sovereignty in a complete manner, should relieve himself and wash his hands, don tefillin, then say the Shema, and the Amidah (Standing) prayer - and this is complete acceptance." If one said the Shema, but did not make it audible to his ears - this not the perfect way, but it counts. Rabbi Yose disagrees and says that he has not fulfilled his obligation and will have to repeat the Shema. What is the argument? Rabbi Yose says that the word "Shema" itself, which means "Hear" teaches that one should hear his words. And the first teacher (usually it is Rabbi Meir), what does he say? He says, "Shema," or "Hear" 

In [16]:
k_context = 5
context_block = top_k_to_context(pinecone_hits[:k_context])
print("CONTEXT SENT TO GPT:\n", context_block[:500], "\n--- END CONTEXT ---")


CONTEXT SENT TO GPT:
  
--- END CONTEXT ---


In [17]:
print("hits list length:", len(pinecone_hits))
for i, d in enumerate(pinecone_hits, 1):
    print(i, repr(getattr(d, "page_content", "")[:120]))


hits list length: 0


In [18]:
from langchain.schema import Document

def pinecone_hosted_search(query, top_k=5, namespace="ns1"):
    resp = pc_index.search(
        namespace=namespace,
        query={"top_k": top_k, "inputs": {"text": query}},
        fields=["chunk_text", "category"]
    )
    hits = resp.to_dict()["result"]["hits"]
    return [
        Document(
            page_content=h["fields"]["chunk_text"],
            metadata={
                "id": h["_id"],
                "score": h["_score"],
                "category": h["fields"].get("category", "")
            }
        )
        for h in hits
    ]


In [19]:
pinecone_hits = pinecone_hosted_search("Can I read Shema in a filthy place?", top_k=5)
print(len(pinecone_hits))
print(pinecone_hits[0].page_content[:200])

5
Brachot 25 - Reciting the Shema in a filthy alley Rav Huna said in the name of Rabbi Yochanan, "If one was walking in a filthy alley (where excrement or urine is found), and the time for Shema was pas


In [20]:
k_context = 5                           # or fewer if you prefer
context_block = top_k_to_context(pinecone_hits[:k_context])

print("CONTEXT PREVIEW:\n", context_block[:500], "\n--- END ---")


CONTEXT PREVIEW:
 Brachot 25 - Reciting the Shema in a filthy alley Rav Huna said in the name of Rabbi Yochanan, "If one was walking in a filthy alley (where excrement or urine is found), and the time for Shema was passing, he may place his hand over his mouth, and then say the Shema. Rav Chisda answered him, "My God! Even if Rabbi Yochanan himself told me this, I would not listen to him." Rather he should pause. If he did not, the verse from Ezekiel applies to him, "I (God) too gave you decrees that were not goo 
--- END ---


In [21]:
from openai import OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)     # run once per kernel

answer = ask_chatgpt_with_context("Can I read Shema in a filthy place?", pinecone_hits, k_context=5)
print("GPT answer:\n", answer)


GPT answer:
 Yes, you may read the Shema in a filthy alley by placing your hand over your mouth, according to Rabbi Yochanan. However, Rav Chisda disagrees and suggests that one should pause instead.
