In [14]:
import sys
sys.path.append("../src")        # points to project/src
from config import OPENAI_API_KEY, PINECONE_API_KEY

# Following Pinecone Quickstart
* [https://app.pinecone.io/](https://app.pinecone.io/)

In [17]:
import openai
from openai import OpenAI
import os

In [18]:
from pinecone import Pinecone
import os, time

pc = Pinecone(api_key=PINECONE_API_KEY)
client = OpenAI(api_key=OPENAI_API_KEY)

index_name = "developer-quickstart-py"
pc_index = pc.Index(index_name)   



if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed={
            "model": "llama-text-embed-v2",
            "field_map": {"text": "chunk_text"}
        }
    )
    # optional: wait until the index is ready
    while pc.describe_index(index_name).status.get("ready") is False:
        time.sleep(5)


In [19]:
def pinecone_hosted_search(query:str, top_k:int = 5, namespace="ns1"):
    """Search a Pinecone index that embeds text server-side."""
    resp = pc_index.search(
        namespace = namespace,
        query     = {
            "top_k": top_k,
            "inputs": {"text": query}
        },
        fields = ["chunk_text", "category"]
    )

    # resp -> dict -> records list
    for_match = resp.to_dict().get("records", [])
    docs = [
        Document(
            page_content = rec["metadata"]["chunk_text"],
            metadata     = {
                "id":       rec["id"],
                "category": rec["metadata"].get("category", ""),
                "score":    rec["score"],
            }
        )
        for rec in for_match
    ]
    return docs


In [20]:
index = pc.Index(index_name)  

records = [
    { "_id": "rec1", "chunk_text": "The Eiffel Tower was completed in 1889 and stands in Paris, France.", "category": "history" },
    { "_id": "rec2", "chunk_text": "Photosynthesis allows plants to convert sunlight into energy.", "category": "science" },
    { "_id": "rec3", "chunk_text": "Albert Einstein developed the theory of relativity.", "category": "science" },
    { "_id": "rec4", "chunk_text": "The mitochondrion is often called the powerhouse of the cell.", "category": "biology" },
    { "_id": "rec5", "chunk_text": "Shakespeare wrote many famous plays, including Hamlet and Macbeth.", "category": "literature" },
    { "_id": "rec6", "chunk_text": "Water boils at 100°C under standard atmospheric pressure.", "category": "physics" },
    { "_id": "rec7", "chunk_text": "The Great Wall of China was built to protect against invasions.", "category": "history" },
    { "_id": "rec8", "chunk_text": "Honey never spoils due to its low moisture content and acidity.", "category": "food science" },
    { "_id": "rec9", "chunk_text": "The speed of light in a vacuum is approximately 299,792 km/s.", "category": "physics" },
    { "_id": "rec10", "chunk_text": "Newton's laws describe the motion of objects.", "category": "physics" },
    { "_id": "rec11", "chunk_text": "Shema is read two times a day, morning and evening.", "category": "talmud" }
]

index.upsert_records("ns1", records)

In [21]:
from pathlib import Path
from itertools import islice

# --- 1. Collect records -------------------------------------------------
records = []
pages_dir = Path("../data/talmud-pages")

for page_path in pages_dir.glob("*.txt"):
    page_id = page_path.stem                    # e.g. "Berakhot_02a"
    text = page_path.read_text(encoding="utf-8")

    # split on blank lines → paragraphs
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    for para_idx, para in enumerate(paragraphs):
        records.append({
            "_id": f"{page_id}-{para_idx}",     # unique id
            "chunk_text": para,
            "category": "talmud",
            # add any other fields you like here
        })

print(f"Prepared {len(records):,} paragraph records from {len(list(pages_dir.glob('*.txt')))} pages")


Prepared 2,297 paragraph records from 2297 pages


In [22]:
# --- 2. Upsert to Pinecone in batches -----------------------------------
BATCH = 50               # well below Pinecone’s 2 MB request limit

for start in range(0, len(records), BATCH):
    batch = records[start : start + BATCH]
    index.upsert_records("ns1", batch)

print("✅ Finished upserting all paragraphs.")


✅ Finished upserting all paragraphs.


In [23]:
print(index.describe_index_stats())

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'ns1': {'vector_count': 2308}},
 'total_vector_count': 2308,
 'vector_type': 'dense'}


In [24]:
query = "When to read Shema"
results = index.search(
    namespace="ns1",
    query={
        "top_k": 5,
        "inputs": {
            'text': query
        }
    }
)

print(results)

{'result': {'hits': [{'_id': 'rec11',
                      '_score': 0.5928104519844055,
                      'fields': {'category': 'talmud',
                                 'chunk_text': 'Shema is read two times a day, '
                                               'morning and evening.'}},
                     {'_id': 'brachot2.html-0',
                      '_score': 0.5441867113113403,
                      'fields': {'category': 'talmud',
                                 'chunk_text': 'Brachot 2 - Saying the Shema '
                                               'Prayer Twice a day, evening '
                                               'and morning, one must say that '
                                               'he will listen to God and love '
                                               'God. This is the Shema prayer, '
                                               '"Listen, Israel, God is our '
                                               'Lord, God is One." S

In [25]:
reranked_results = index.search(
    namespace="ns1",
    query={
        "top_k": 5,
        "inputs": {
            'text': query
        }
    },
    rerank={
        "model": "bge-reranker-v2-m3",
        "top_n": 5,
        "rank_fields": ["chunk_text"]
    },
    fields=["category", "chunk_text"]
)

print(reranked_results)

{'result': {'hits': [{'_id': 'rec11',
                      '_score': 0.9874235987663269,
                      'fields': {'category': 'talmud',
                                 'chunk_text': 'Shema is read two times a day, '
                                               'morning and evening.'}},
                     {'_id': 'brachot11.html-0',
                      '_score': 0.8456876873970032,
                      'fields': {'category': 'talmud',
                                 'chunk_text': 'Brachot 11 - Lying Down to Say '
                                               'the Shema Bait Shammai say '
                                               'that in the evening one must '
                                               'lie down in order to read the '
                                               'Shema, and in the morning he '
                                               'needs to be standing, since '
                                               'the Torah said, "When y

In [27]:
# --- Helper: convert Pinecone hits → context ---------------------------------
def top_k_to_context(results, max_chars=2_000):
    """
    Take a list of LangChain Documents *or* (doc, score) tuples and concatenate
    their page_content into one context string, truncated to `max_chars`.
    """
    # unwrap (doc, score) tuples if needed
    docs = [r[0] if isinstance(r, tuple) else r for r in results]
    chunks = [d.page_content.strip() for d in docs]

    context = "\n\n---\n\n".join(chunks)
    return context[:max_chars]

# --- Main: ask ChatGPT with retrieved context --------------------------------
def ask_chatgpt_with_context(question, pinecone_hits, model="gpt-4o-mini", k_context=5):
    """
    * question: user question string
    * pinecone_hits: list from similarity_search / search
    * model: any chat-capable OpenAI model
    * k_context: how many top docs to include
    """
    context_block = top_k_to_context(pinecone_hits[:k_context])

    messages = [
        {
            "role": "system",
            "content": (
                "You are MosesAI, a helpful Talmudic assistant. "
                "Answer the user using ONLY the context provided. "
                "If the answer isn't in the context, reply 'I don’t know based on the documents I have.'"
            )
        },
        {
            "role": "user",
            "content": f"Context:\n{context_block}\n\n---\n\nQuestion: {question}"
        }
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.2
    )
    return response.choices[0].message.content

# --- Example usage -----------------------------------------------------------
from langchain_pinecone import PineconeVectorStore
query = "What is shema Shema?"
pinecone_hits = pinecone_hosted_search(query, top_k=5)

answer = ask_chatgpt_with_context(query, pinecone_hits)
print(answer)


I don’t know based on the documents I have.
