In [1]:
import pandas as pd

In [2]:
# Jupyter cell: robust redis connection helper
import os
import redis
import urllib.parse
import ssl
from typing import Optional

def connect_redis_from_url(url: str,
                           timeout: int = 5,
                           insecure_verify: bool = False,
                           max_connections: Optional[int] = None):
    """
    Connect to Redis from a URL (works for local redis:// and TLS rediss:// URLs).

    Args:
        url: full Redis URL (e.g. redis://user:pass@host:port/0 or rediss://...).
        timeout: socket connect timeout in seconds.
        insecure_verify: if True and the URL requests TLS, skip certificate and
            hostname verification (INSECURE - use only for testing).
        max_connections: optional ConnectionPool ``max_connections``.

    Returns:
        A ``redis.Redis`` client that has already answered a PING.

    Raises:
        ValueError: if ``url`` is empty.
        Exception: any connection error from redis-py is printed and re-raised.
    """
    if not url:
        raise ValueError("Please provide a Redis URL.")

    parsed = urllib.parse.urlparse(url)
    scheme = parsed.scheme.lower()
    is_tls = scheme == "rediss" or scheme.endswith("+ssl")

    # Base kwargs safe for both plain and TLS
    conn_kwargs = {
        "socket_connect_timeout": timeout,
        "decode_responses": True,
    }
    if max_connections:
        conn_kwargs["max_connections"] = max_connections

    # redis-py configures TLS through ssl_* keyword arguments on the connection.
    # It does not accept an SSLContext (or any value) under the name "ssl" when
    # going through from_url, so we express the verification policy with
    # ssl_cert_reqs / ssl_check_hostname instead.
    if is_tls:
        if insecure_verify:
            # disable verification (INSECURE - testing only)
            conn_kwargs["ssl_cert_reqs"] = "none"
            conn_kwargs["ssl_check_hostname"] = False
        else:
            conn_kwargs["ssl_cert_reqs"] = "required"
            conn_kwargs["ssl_check_hostname"] = True

    # Names whose appearance in a TypeError message means the installed redis-py
    # does not understand one of our TLS kwargs.
    ssl_kwarg_names = ("ssl_cert_reqs", "ssl_check_hostname", "ssl_certfile", "ssl_keyfile", "ssl_ca_certs")

    try:
        client = redis.from_url(url, **conn_kwargs)
        # quick sanity check:
        print("Attempting PING...")
        print("PING:", client.ping())
        info = client.info(section="server")
        print("Redis server:", info.get("redis_version"), "mode:", info.get("redis_mode"))
        return client
    except TypeError as te:
        msg = str(te)
        if any(name in msg for name in ssl_kwarg_names):
            print("TypeError from redis library about SSL kwargs detected. Retrying without SSL kwargs (fallback).")
            # Dropping the kwargs leaves redis-py's own TLS defaults in place for
            # rediss:// URLs; the URL scheme still selects the TLS connection class.
            for k in ("ssl",) + ssl_kwarg_names:
                conn_kwargs.pop(k, None)
            client = redis.from_url(url, **conn_kwargs)
            print("PING (fallback):", client.ping())
            info = client.info(section="server")
            print("Redis server (fallback):", info.get("redis_version"), "mode:", info.get("redis_mode"))
            return client
        # otherwise re-raise
        raise
    except Exception as e:
        # helpful debugging info for other connection errors
        print("Connection failed:", type(e).__name__, e)
        raise

# ---------- Example usage ----------
# The Redis URL embeds the password, so it is read from the REDIS_URL environment
# variable instead of being written into the notebook. Any password that was
# previously committed in this cell should be rotated.
redis_url = os.environ.get("REDIS_URL", "")
# connect_redis_from_url raises ValueError when the URL is empty (variable unset).
client = connect_redis_from_url(redis_url)   # for TLS use a rediss:// URL and set insecure_verify=True if needed


Attempting PING...
PING: True
Redis server: 8.2.1 mode: standalone


In [3]:
# Round-trip smoke test: write a key with a TTL and read it back.
# Relies on `client` created by the Redis connection cell.
k = "test:starter"
client.set(k, "hello-from-jupyter", ex=60)   # ex = time-to-live in seconds
v = client.get(k)
print("Got value:", v)


Got value: hello-from-jupyter


In [2]:
# Load the source CSV into `df`; later cells in this notebook read `df`.
df = pd.read_csv('text_files_data2.csv')
# Display the rows whose Class column equals "Class 11th".
df[df['Class']=="Class 11th"]

Unnamed: 0,Class,Subject,Chapter,File_Data,Board
189,Class 11th,Chemistry,Chapter 9 Hydrocarbons,Hydrocarbons 295\r\nUnit 9\r\nHydrocarbons\r\n...,CBSE
190,Class 11th,Chemistry,Chapter 6 Equilibrium,Unit 6\r\nEqUilibriUm\r\nChemical equilibria a...,CBSE
191,Class 11th,Chemistry,Chapter 4 Chemical Bonding and Molecular Struc...,Unit 4\r\nCHEMiCAL BOnDinG AnD\r\nMOLECULAR St...,CBSE
192,Class 11th,Chemistry,Chapter 7 Redox Reactions,redox reactions 235\r\nUNIT 7\r\nREDOX REACTIO...,CBSE
193,Class 11th,Chemistry,Mock Paper,Based on the provided sample paper structure a...,CBSE
...,...,...,...,...,...
249,Class 11th,Biology,Chapter 12 Respiration in Plants,RESPIRATION IN PLANTS 153\r\nC 12\r\nHAPTER\r\...,CBSE
250,Class 11th,Biology,Chapter 9 Biomolecules,104 BIOLOGY\r\nC 9\r\nHAPTER\r\nB\r\nIOMOLECUL...,CBSE
251,Class 11th,Biology,Chapter 1 The Living World,U 1\r\nNIT\r\nDIVERSITY IN THE LIVING WORLD\r\...,CBSE
252,Class 11th,Biology,Chapter 3 Plant Kingdom,PLANT KINGDOM 23\r\nC 3\r\nHAPTER\r\nP K\r\nLA...,CBSE


In [4]:
# Keep only the CBSE / Class 12th / Maths rows of `df`.
# (Add a further `& df["Chapter"].eq(...)` term to narrow down to a single chapter.)
document_content_df = df.loc[
    df["Board"].eq("CBSE")
    & df["Class"].eq("Class 12th")
    & df["Subject"].eq("Maths")
]
document_content_df

Unnamed: 0,Class,Subject,Chapter,File_Data,Board
146,Class 12th,Maths,Chapter 6 – Application of Derivatives,6\r\nChapter\r\nAPPLICATION OF\r\nDERIVATIVES\...,CBSE
147,Class 12th,Maths,Mock Paper,Here is a structured guideline to prepare a Ma...,CBSE
148,Class 12th,Maths,Chapter 8 – Application of Integrals,292 MATHEMATICS\r\n8\r\nChapter\r\nAPPLICATION...,CBSE
149,Class 12th,Maths,Chapter 10 – Vector Algebra,338 MATHEMATICS\r\n10\r\nChapter\r\nVECTOR ALG...,CBSE
150,Class 12th,Maths,Chapter 9 – Differential Equations,300 MATHEMATICS\r\n9\r\nChapter\r\nDIFFERENTIA...,CBSE
151,Class 12th,Maths,Chapter 3 – Matrices,34 MATHEMATICS\r\n3\r\nChapter\r\nMATRICES\r\n...,CBSE
152,Class 12th,Maths,Chapter 7 – Integrals,INTEGRALS 225\r\n7\r\nChapter\r\nINTEGRALS\r\n...,CBSE
153,Class 12th,Maths,Chapter 5 – Continuity and Differentiability,104 MATHEMATICS\r\n5\r\nChapter\r\nCONTINUITY ...,CBSE
154,Class 12th,Maths,Chapter 2 – Inverse Trigonometric Functions,18 MATHEMATICS\r\n2\r\nChapter\r\nINVERSE TRIG...,CBSE
155,Class 12th,Maths,Chapter 1 – Relations and Functions,1\r\nChapter\r\nRELATIONS AND FUNCTIONS\r\nv\r...,CBSE


In [8]:
# Print the raw text of the first selected document.
print(document_content_df['File_Data'].iloc[0])

6
Chapter
APPLICATION OF
DERIVATIVES
v
With the Calculus as a key, Mathematics can be successfully applied
to the explanation of the course of Nature.” — WHITEHEAD
v
6.1 Introduction
In Chapter 5, we have learnt how to find derivative of composite functions, inverse
trigonometric functions, implicit functions, exponential functions and logarithmic functions.
In this chapter, we will study applications of the derivative in various disciplines, e.g., in
engineering, science, social science, and many other fields. For instance, we will learn
how the derivative can be used (i) to determine rate of change of quantities, (ii) to find
the equations of tangent and normal to a curve at a point, (iii) to find turning points on
the graph of a function which in turn will help us to locate points at which largest or
smallest value (locally) of a function occurs. We will also use derivative to find intervals
on which a function is increasing or decreasing. Finally, we use the derivative to find
appr

In [1]:
import os
from pinecone import Pinecone

# The API key is read from the PINECONE_API_KEY environment variable; a missing
# variable raises KeyError here. Never paste the key into the notebook - any key
# that was previously committed in this cell should be rotated.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("papershapers2") # Your index name

print(index.describe_index_stats())

  from .autonotebook import tqdm as notebook_tqdm


{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 19953}},
 'total_vector_count': 19953,
 'vector_type': 'dense'}


In [2]:
# Uses the `index` object created by the Pinecone connection cell.

# Fetch two vectors by ID (no metadata filter) so their stored metadata can be inspected.
# Replace the IDs below with IDs that exist in your index.
fetch_response = index.fetch(ids=['a889f578-758d-56d1-bc76-37d685a6d4e8', '8d9331ea-1656-5e43-a77b-0733e2640ef1'])
# If you don't know any IDs, query with an arbitrary vector instead
# (the vector length must match the index dimension):
# query_response = index.query(vector=[0.1]*384, top_k=5, include_metadata=True)

# Then inspect the response:
# print(query_response)

# What to look for: a mismatch between the metadata keys/values stored in Pinecone
# and the filter the application sends. The application filters on:
# 'class': '12'
# 'subject': 'Maths'

# Check whether the stored subject is exactly 'Maths' (not 'Math' or 'mathematics').
# Check whether the stored class is exactly '12' (not '12th' or 'Class 12').

In [3]:
# Display the fetched vectors (IDs, values, metadata) from the previous cell.
fetch_response

FetchResponse(namespace='', vectors={'a889f578-758d-56d1-bc76-37d685a6d4e8': Vector(id='a889f578-758d-56d1-bc76-37d685a6d4e8', values=[-0.0179990865, 0.0083654318, -0.0081390189, -0.00365749211, 0.0451039597, -0.0276766028, 0.00731669832, -0.0124184834, 0.0215503797, 0.0101404935, 0.0546740294, 0.0456036106, 0.0587981381, -0.0333976932, -0.0421338342, 0.0587595962, -0.00735536311, 0.0190614834, -0.0370132811, -0.0152020017, -0.0473340042, 0.00256021833, 0.0254176855, -0.01092315, -0.00380834797, 0.0303797666, -0.00621743174, 0.109809525, 0.0114773782, -0.0944136456, 0.0229929537, 0.0132363662, 0.0115998806, -0.0166036338, -0.0466854125, -0.00186196319, -0.0261583179, -0.00476834783, -0.0123469736, -0.0382361226, -0.00459942594, -0.0110522974, 0.069062911, 0.00261717825, -0.0151245827, -0.0456420667, 0.0229021199, -0.0561087132, -0.000670899753, -0.0246797446, 0.0211833175, -0.00802863296, 0.0474648401, -0.0386447236, 0.0210931711, 0.0271478817, 0.0235121343, 0.0502581038, 0.00723273493

In [4]:
import os
import json
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
import numpy as np

# --- 1. SETUP: Configure your API Keys and Model ---
# The Pinecone API key is read from the PINECONE_API_KEY environment variable only;
# a missing variable raises KeyError here. Do not paste the key into the notebook -
# any key that was previously committed in this cell should be rotated.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
PINECONE_INDEX_NAME = "papershapers2"
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" # This MUST match your index dimension (384)

# Load the exact same embedding model used in your application
print(f"Loading embedding model: {EMBED_MODEL_NAME}...")
model = SentenceTransformer(EMBED_MODEL_NAME)
print("Model loaded successfully.")

# Connect to Pinecone
print(f"Connecting to Pinecone index: {PINECONE_INDEX_NAME}...")
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX_NAME)
print("Connected to Pinecone.")


# --- 2. DEFINE QUERY AND FILTERS ---
# Example objective text of the kind the RAG pipeline would generate for a Maths section.
query_text = "Generate Hard Long Answer questions for Class 12th Maths focusing on calculus and integration"

# The metadata filter under test: exact match on 'class' == '12' and 'subject' == 'Maths'.
# It is named "problematic" because it is the suspected cause of empty retrievals.
problematic_filter = {
    'class': {'$eq': '12'},
    'subject': {'$eq': 'Maths'}
}

print("\n--- 3. GENERATING QUERY EMBEDDING ---")
# Encode the query text with the SentenceTransformer loaded in the setup section.
# NOTE(review): the ingestion code in this notebook embeds snippets with CLS pooling
# through raw `transformers`; SentenceTransformer.encode presumably applies the model's
# own pooling configuration. Confirm both sides use the same pooling, otherwise
# similarity scores will be depressed.
query_embedding = model.encode(query_text).tolist()
print(f"Embedding generated with dimension: {len(query_embedding)}")

# --- 4. DIAGNOSTIC STEP 1: Query WITHOUT Filters ---
# Search by vector similarity only, then print each match's metadata so the
# stored keys and values can be compared against the filter used in step 2.
print("\n" + "="*50)
print("RUNNING QUERY 1: SEMANTIC SEARCH *WITHOUT ANY FILTERS*")
print("="*50)

results_no_filter = index.query(
    vector=query_embedding,
    top_k=5, # Get the top 5 closest results from the entire index
    include_metadata=True
)

print(f"Found {len(results_no_filter['matches'])} matches without filters.")
if results_no_filter['matches']:
    print("--- Top 5 Most Similar Snippets in Your Entire Index ---")
    for i, match in enumerate(results_no_filter['matches']):
        print(f"\n--- Result {i+1} ---")
        print(f"  Score: {match['score']:.4f}")
        print(f"  ID: {match['id']}")
        # Pretty-print the metadata (includes the stored snippet text under "text").
        print(f"  Metadata: {json.dumps(match['metadata'], indent=4)}")
else:
    print("WARNING: The query returned ZERO results even without filters. This could indicate an issue with the embedding model or the query text itself.")

# --- 5. DIAGNOSTIC STEP 2: Query WITH The Problematic Filters ---
# Repeat the same query, this time passing `problematic_filter`.
# NOTE(review): in the recorded output of step 1 the top match carries
# "class": "12" and "subject": "Maths", which is exactly what this filter asks for,
# so zero matches here would not by itself prove a metadata mismatch - verify
# against the actual result of this query.
print("\n" + "="*50)
print("RUNNING QUERY 2: SEARCH *WITH THE PROBLEMATIC FILTERS*")
print(f"Using filter: {problematic_filter}")
print("="*50)

results_with_filter = index.query(
    vector=query_embedding,
    top_k=5,
    filter=problematic_filter,
    include_metadata=True
)

print(f"Found {len(results_with_filter['matches'])} matches when applying the filter.")
if not results_with_filter['matches']:
    print("SUCCESSFUL DIAGNOSIS: As expected, the filter returned 0 results, confirming a metadata mismatch.")
else:
    print("UNEXPECTED: The filter returned results. Please double-check the logic in your main application.")

Loading embedding model: sentence-transformers/all-MiniLM-L6-v2...
Model loaded successfully.
Connecting to Pinecone index: papershapers2...
Connected to Pinecone.

--- 3. GENERATING QUERY EMBEDDING ---
Embedding generated with dimension: 384

RUNNING QUERY 1: SEMANTIC SEARCH *WITHOUT ANY FILTERS*
Found 5 matches without filters.
--- Top 5 Most Similar Snippets in Your Entire Index ---

--- Result 1 ---
  Score: 0.2428
  ID: 9020e3e8-25d7-5f65-b60c-e74dba900c12
  Metadata: {
    "board": "CBSE",
    "chapter": "Chapter 7 \u2013 Integrals",
    "class": "12",
    "row_index": 152.0,
    "snippet_index": 1.0,
    "subject": "Maths",
    "text": "Integral\nCalculus is motivated by the problem of defining and\ncalculating the area of the region bounded by the graph of\nthe functions. If a function f is differentiable in an interval I, i.e., its\nderivative f \u2032exists at each point of I, then a natural question\narises that given f \u2032at each point of I, can we determine\nthe functio

In [None]:
# Filter matching the metadata format seen in the diagnostic output
# ("class": "12", "subject": "Maths").
# NOTE(review): these values are identical to `problematic_filter` in the diagnostic
# cell, i.e. nothing was actually "corrected" - confirm whether the original filter
# was the real cause of the empty retrievals.
corrected_filter = {
    'class': {'$eq': '12'},
    'subject': {'$eq': 'Maths'}
}

In [None]:
# In merged_app.py inside process_section_sync()
# NOTE(review): this is a fragment meant to be pasted into merged_app.py. It is not
# runnable in this notebook: `class_label` and `specific_objective` are not defined here.

# ...
# 2) Query Pinecone with the new, specific objective and the CORRECTED filters
from full_paper.run_full_pipeline import retrieve_from_pinecone


# Keeps only the digit characters of class_label (e.g. "Class 12th" -> "12").
# NOTE(review): `class_digit` is computed but never used below.
class_digit = ''.join([c for c in class_label if c.isdigit()]) # This might not be what you need!

# INSTEAD, USE THE CORRECT VALUES YOU FOUND FROM THE NOTEBOOK
# NOTE(review): the two values below are placeholders. The diagnostic output in this
# notebook shows metadata stored as "class": "12" and "subject": "Maths", which does
# not match "Class 12th" / "Mathematics" - replace them before using this snippet.
correct_class_label = "Class 12th" # Or whatever you discovered
correct_subject_label = "Mathematics" # Or whatever you discovered

# Exact-match metadata filter passed to the retrieval helper.
filters = {'class': {'$eq': correct_class_label}, 'subject': {'$eq': correct_subject_label}}
candidates = retrieve_from_pinecone(specific_objective, filters, top_k=50)
# ...

In [28]:
# Write every "Mock Paper" row of `df` to instructions.csv (without the index column).
df.loc[df['Chapter'].eq("Mock Paper")].to_csv('instructions.csv', index=False)

In [None]:
# `ddf` is not defined anywhere in this notebook (NameError); the DataFrame loaded
# from the CSV is named `df`. Show only the first rows to keep the output small.
df.head()

In [6]:
"""
step1_ingest_embed_pinecone_improved.py

WHAT IT DOES (Step 1):
 - Reads your CSV with columns: Class, Subject, Chapter, File_Data, Board.
 - Cleans text, chunks it into overlapping snippets.
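 - Computes embeddings using a local Hugging Face model selected by `MODEL_NAME`
   (currently 'sentence-transformers/all-MiniLM-L6-v2', which produces 384-dimensional vectors).
 - Normalizes embeddings for cosine similarity.
 - Creates a Pinecone index if it doesn't exist (dimension taken from the computed embeddings, metric='cosine').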
 - Upserts the vectors into Pinecone in batches.

USAGE:
  1) pip install -r requirements.txt
  2) Set your PINECONE_API_KEY as an environment variable or directly in the script.
  3) Update the `INPUT_CSV_PATH` and `PINECONE_INDEX_NAME` variables in the `if __name__ == '__main__':` block.
  4) Run the script: python step1_ingest_embed_pinecone_improved.py

REQUIREMENTS (example):
  pip install "transformers" "torch" "accelerate" "pinecone-client" pandas tqdm "numpy" "safetensors"
"""

import os
import re
import uuid
from typing import List
from tqdm import tqdm
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from pinecone import Pinecone, ServerlessSpec

# Suppress the symlinks warning if you cannot enable Developer Mode on Windows
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

# -----------------------
# Utilities: text cleaning
# -----------------------
def clean_text(s: str) -> str:
    """Normalize line endings and strip layout artifacts from extracted text.

    Returns an empty string for ``None``. Otherwise: converts ``\\r\\n`` / ``\\r``
    to ``\\n``, collapses runs of blank lines to a single blank line, removes a
    leading number-only line at the very start of the text, removes
    "CHAPTER" (any case) followed by a line break, and trims surrounding whitespace.
    """
    if s is None:
        return ""

    # Carriage-return normalization: "\r\n" and lone "\r" both become "\n".
    text = re.sub(r"\r\n?", "\n", s)

    # (pattern, replacement, flags) applied in order.
    rewrites = (
        (r"\n{2,}", "\n\n", 0),
        (r"^\s*\d+\s*\n", "", 0),
        (r"CHAPTER\s*\n", "", re.IGNORECASE),
    )
    for pattern, replacement, flags in rewrites:
        text = re.sub(pattern, replacement, text, flags=flags)

    return text.strip()

def sentence_split(text: str) -> List[str]:
    """Break text into chunking units.

    The text is split into paragraphs on blank lines. A paragraph with more than
    three sentences (split after ``.``, ``?`` or ``!`` followed by whitespace) is
    emitted sentence by sentence; shorter paragraphs are emitted whole.
    """
    if not text:
        return []

    pieces = []
    for block in text.split("\n\n"):
        block = block.strip()
        if not block:
            continue
        candidates = (s.strip() for s in re.split(r'(?<=[\.\?\!])\s+', block))
        sentences = [s for s in candidates if s]
        if len(sentences) > 3:
            pieces.extend(sentences)
        else:
            pieces.append(block)
    return pieces

def chunk_sentences_to_snippets(sentences: List[str], max_words=120, overlap_words=20) -> List[str]:
    """Pack sentences into space-joined snippets of roughly ``max_words`` words.

    A sentence is added to the open snippet while the word total stays within
    ``max_words`` (an empty snippet always accepts the next sentence). When a
    sentence does not fit, the open snippet is emitted and the next one is seeded
    with whole trailing sentences of the emitted snippet until at least
    ``overlap_words`` words are carried over (nothing is carried when
    ``overlap_words`` is 0), followed by the sentence that did not fit.
    """
    if not sentences:
        return []

    def _n_words(piece):
        return len(piece.split())

    finished = []
    pending = []
    pending_words = 0

    for sentence in sentences:
        size = _n_words(sentence)
        if not pending or pending_words + size <= max_words:
            pending.append(sentence)
            pending_words += size
            continue

        # The sentence does not fit: close the open snippet.
        finished.append(" ".join(pending))

        # Walk backwards over the closed snippet collecting whole sentences for the overlap.
        carried = []
        if overlap_words > 0:
            carried_words = 0
            for earlier in pending[::-1]:
                carried.insert(0, earlier)
                carried_words += _n_words(earlier)
                if carried_words >= overlap_words:
                    break

        pending = carried + [sentence]
        pending_words = sum(_n_words(p) for p in pending)

    if pending:
        finished.append(" ".join(pending))
    return finished

# -----------------------
# Embedding helpers
# -----------------------
def load_bge_model(device: torch.device, model_name: str):
    """Load the Hugging Face tokenizer and model named ``model_name``.

    The model is moved to ``device`` and switched to evaluation mode.
    Returns a ``(tokenizer, model)`` tuple.
    """
    print(f"Loading tokenizer and model '{model_name}' on device '{device}'...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # .to() and .eval() both return the module itself, so they can be chained.
    model = AutoModel.from_pretrained(model_name).to(device).eval()
    print("Model loaded successfully.")
    return tokenizer, model

# --- CHANGED: This function is now simplified to use CLS Pooling for the new model ---
def embed_texts(texts: List[str], tokenizer, model, device: torch.device, batch_size=32,
                pooling: str = "cls") -> np.ndarray:
    """Compute L2-normalized embeddings for a list of texts.

    Args:
        texts: the strings to embed.
        tokenizer: Hugging Face tokenizer; called with padding/truncation to 512 tokens.
        model: Hugging Face model; element 0 of its output is used as the
            per-token embeddings.
        device: device the tokenized batch is moved to.
        batch_size: number of texts per forward pass.
        pooling: ``"cls"`` (default, the original behavior) takes the first token's
            embedding; ``"mean"`` averages token embeddings weighted by the
            attention mask, which is the pooling sentence-transformers models such
            as all-MiniLM-L6-v2 are documented to use. Queries must be embedded
            with the same pooling as the indexed snippets.

    Returns:
        Array with one normalized row per input text. For empty ``texts`` an
        array with zero rows is returned instead of failing in ``np.vstack``.

    Raises:
        ValueError: if ``pooling`` is not ``"cls"`` or ``"mean"``.
    """
    if pooling not in ("cls", "mean"):
        raise ValueError(f"Unsupported pooling '{pooling}'; expected 'cls' or 'mean'.")

    if len(texts) == 0:
        # np.vstack([]) raises, so return an empty matrix; use the model's hidden
        # size for the column count when the model exposes it.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0) or 0
        return np.empty((0, hidden_size), dtype=np.float32)

    all_embs = []
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Embedding batches"):
            batch_texts = texts[i:i+batch_size]

            # Tokenize the sentences
            encoded_input = tokenizer(batch_texts, max_length=512, padding=True, truncation=True, return_tensors='pt').to(device)

            # Compute token embeddings
            model_output = model(**encoded_input)
            token_embeddings = model_output[0]

            if pooling == "mean":
                # Average only over real tokens; padded positions have mask 0.
                mask = encoded_input["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
                sentence_embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
            else:
                # CLS pooling (take the embedding of the first token)
                sentence_embeddings = token_embeddings[:, 0]

            # Normalize embeddings
            normalized_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

            all_embs.append(normalized_embeddings.cpu().numpy())

    return np.vstack(all_embs)

# -----------------------
# Pinecone helpers (Updated SDK)
# -----------------------
def get_pinecone_client(api_key: str) -> Pinecone:
    """Build a Pinecone client for ``api_key``; raise ValueError when the key is empty."""
    if api_key:
        return Pinecone(api_key=api_key)
    raise ValueError("Pinecone API key is required. Set the PINECONE_API_KEY environment variable.")

def ensure_index(pc: Pinecone, index_name: str, dimension: int, metric: str = "cosine"):
    """Create the serverless index ``index_name`` unless one with that name is already listed.

    An existing index is left untouched (its dimension and metric are not checked).
    New indexes are created on AWS in us-east-1 with the given dimension and metric.
    """
    already_present = any(info["name"] == index_name for info in pc.list_indexes())
    if already_present:
        print(f"Index '{index_name}' already exists.")
        return

    print(f"Creating a new serverless index '{index_name}' with dimension={dimension} and metric='{metric}'.")
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(name=index_name, dimension=dimension, metric=metric, spec=serverless_spec)
    print(f"Index '{index_name}' created successfully.")

def upsert_embeddings(index: object, records: List[dict], namespace: str = None, batch_size=100):
    """Send ``records`` to ``index.upsert`` in batches of ``batch_size``.

    Each record must provide ``"snippet_id"`` and ``"vector"`` (a list or a numpy
    array, which is converted to a list); ``"metadata"`` is optional and defaults
    to an empty dict. Prints the total number of records when done.
    """
    def _to_payload(record):
        values = record["vector"]
        if isinstance(values, np.ndarray):
            values = values.tolist()
        return {
            "id": record["snippet_id"],
            "values": values,
            "metadata": record.get("metadata", {}),
        }

    for start in tqdm(range(0, len(records), batch_size), desc="Upserting to Pinecone"):
        payload = [_to_payload(record) for record in records[start:start + batch_size]]
        if payload:
            index.upsert(vectors=payload, namespace=namespace)

    print(f"Upserted {len(records)} vectors to index.")

# -----------------------
# Data Loading
# -----------------------
def load_data_from_csv(input_csv: str) -> pd.DataFrame:
    """Load the input CSV and check that the required columns are present.

    Args:
        input_csv: path of the CSV file to read.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: if any of Class, Subject, Chapter, File_Data, Board is missing.
            (An explicit exception is used instead of ``assert`` so the check is
            not removed when Python runs with ``-O``.)
        FileNotFoundError: propagated from ``pd.read_csv`` when the file does not exist.
    """
    print(f"Loading data from {input_csv}...")
    df = pd.read_csv(input_csv)
    required_columns = {"Class", "Subject", "Chapter", "File_Data", "Board"}

    missing_columns = sorted(required_columns - set(df.columns))
    if missing_columns:
        raise ValueError(
            f"CSV must contain the following columns: {sorted(required_columns)}; "
            f"missing: {missing_columns}"
        )

    print(f"Loaded {len(df)} rows.")
    return df

# -----------------------
# Main pipeline
# -----------------------
# REPLACE this whole function in step1_ingest_embed_pinecone_final.py

def process(df: pd.DataFrame, index_name: str, pinecone_api_key: str, model_name: str, namespace: str = None,
            device_str: str = None, max_words=120, overlap=20):
    """
    Main processing pipeline to chunk, embed, and upsert data to Pinecone.

    Args:
        df: rows with Class, Subject, Chapter, File_Data and Board columns.
        index_name: Pinecone index to upsert into (created if it is not listed);
            also used as the prefix of the snippets CSV written at the end.
        pinecone_api_key: API key for the Pinecone client.
        model_name: Hugging Face model name used for embedding.
        namespace: Pinecone namespace passed to the upsert (None for the default).
        device_str: torch device string; defaults to "cuda" when available, else "cpu".
        max_words: target maximum number of words per snippet.
        overlap: number of words carried over between consecutive snippets.

    Returns:
        None. Prints a message and returns early when no snippets are produced.
    """
    # 1. Chunk data into snippets
    snippet_rows = []
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Generating Snippets"):
        raw_value = row.get("File_Data", "")
        # Empty CSV cells are read by pandas as NaN; str(NaN) is the text "nan",
        # which would otherwise be chunked, embedded and stored as a bogus snippet.
        if raw_value is None or (not isinstance(raw_value, str) and pd.isna(raw_value)):
            continue
        raw_text = str(raw_value)
        if not raw_text.strip():
            continue

        cleaned_text = clean_text(raw_text)
        sentences = sentence_split(cleaned_text)
        snippets = chunk_sentences_to_snippets(sentences, max_words=max_words, overlap_words=overlap)

        # Keep only the digits of the class label ("Class 12th" -> "12"). The value is
        # coerced to str first so numeric cells (12 or 12.0) do not crash re.sub or
        # pick up the digit after the decimal point.
        class_value = row["Class"]
        if isinstance(class_value, float) and class_value.is_integer():
            class_value = int(class_value)
        class_digits = re.sub(r'[^0-9]', '', str(class_value))

        for i, snippet_text in enumerate(snippets):
            # Deterministic ID: re-running on the same data overwrites instead of duplicating.
            snippet_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{row['Chapter']}-{idx}-{i}"))
            snippet_rows.append({
                "snippet_id": snippet_id,
                "text": snippet_text, # We keep this here for the embedding step
                "metadata": {
                    # The snippet text is stored in the metadata so retrieval can return it.
                    "text": snippet_text,
                    "row_index": int(idx),
                    "class": class_digits,
                    "subject": row["Subject"],
                    "chapter": row["Chapter"],
                    "board": row["Board"],
                    "snippet_index": int(i)
                }
            })

    if not snippet_rows:
        print("No snippets were generated. Please check your input data.")
        return

    print(f"Created {len(snippet_rows)} snippets.")

    # 2. Create the Pinecone client first so a missing API key fails before the
    #    (slow) model load and embedding step rather than after it.
    pc = get_pinecone_client(api_key=pinecone_api_key)

    # 3. Load embedding model
    device = torch.device(device_str or ("cuda" if torch.cuda.is_available() else "cpu"))
    tokenizer, model = load_bge_model(device=device, model_name=model_name)

    # 4. Embed snippets in batches
    texts_to_embed = [row["text"] for row in snippet_rows]
    embeddings = embed_texts(texts_to_embed, tokenizer, model, device)

    embedding_dim = embeddings.shape[1]
    print(f"Embeddings created with dimension {embedding_dim}.")

    # 5. Create the index if needed (dimension comes from the embeddings) and open it
    ensure_index(pc, index_name, dimension=embedding_dim, metric="cosine")
    pinecone_index = pc.Index(index_name)

    # 6. Attach vectors to their records and upsert
    for i, row in enumerate(snippet_rows):
        row["vector"] = embeddings[i]

    upsert_embeddings(pinecone_index, snippet_rows, namespace=namespace, batch_size=100)

    # 7. Save snippets for reference
    snippets_df = pd.DataFrame([
        {
            "snippet_id": r["snippet_id"],
            "text": r["text"],
            **r["metadata"]
        } for r in snippet_rows
    ])

    output_filename = f"{index_name}_snippets.csv"
    snippets_df.to_csv(output_filename, index=False)
    print(f"Step 1 complete: Snippets saved to '{output_filename}' and uploaded to Pinecone.")
# -----------------------
# Script Execution
# -----------------------
if __name__ == "__main__":
    # --- Configuration ---
    # The API key is read from the environment only. Never hardcode it as a default:
    # any key that was previously committed in this cell should be rotated.
    PINECONE_HOST = os.getenv("PINECONE_HOST", "https://papershapers-y07ysa9.svc.aped-4627-b74a.pinecone.io")
    PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "")
    PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX", "papershapers2")
    # NOTE(review): PINECONE_HOST, PINECONE_NAMESPACE and PINECONE_DIM are defined but
    # not passed to process() below; vectors are upserted without a namespace and the
    # index dimension is taken from the embeddings.
    PINECONE_NAMESPACE = os.getenv("PINECONE_NAMESPACE", "default")
    PINECONE_DIM = int(os.getenv("PINECONE_DIM", "1024"))

    # Path to the input CSV file
    INPUT_CSV_PATH = "text_files_data2 copy.csv"

    # Embedding model (a sentence-transformers MiniLM model, not a BGE model)
    MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

    if not PINECONE_API_KEY:
        raise ValueError("PINECONE_API_KEY environment variable not set.")

    # --- Run the pipeline ---
    try:
        data_df = load_data_from_csv(INPUT_CSV_PATH)

        process(
            df=data_df,
            index_name=PINECONE_INDEX_NAME,
            pinecone_api_key=PINECONE_API_KEY,
            model_name=MODEL_NAME
        )

    except FileNotFoundError:
        print(f"Error: The file '{INPUT_CSV_PATH}' was not found. Please check the path.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


Loading data from text_files_data2 copy.csv...
Loaded 254 rows.


Generating Snippets: 100%|██████████| 254/254 [00:01<00:00, 188.15it/s]


Created 19953 snippets.
Loading tokenizer and model 'sentence-transformers/all-MiniLM-L6-v2' on device 'cuda'...
Model loaded successfully.


Embedding batches: 100%|██████████| 624/624 [01:27<00:00,  7.13it/s]


Embeddings created with dimension 384.
Index 'papershapers2' already exists.


Upserting to Pinecone: 100%|██████████| 200/200 [03:25<00:00,  1.03s/it]


Upserted 19953 vectors to index.
Step 1 complete: Snippets saved to 'papershapers2_snippets.csv' and uploaded to Pinecone.


In [8]:
# test_retrieval.py

import os
from pinecone import Pinecone

# Initialize Pinecone
# NOTE: PINECONE_API_KEY and PINECONE_INDEX_NAME are not defined in this cell; they
# must already exist in the kernel (the ingestion cell's configuration block sets them).
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX_NAME)

# Load the tokenizer and model used to embed test queries
# (same model name as the ingestion step).
from transformers import AutoTokenizer, AutoModel
import torch

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)

def embed_query(text: str):
    """Embed one query string and return it as a plain list of floats.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The vector is
    the first-token (CLS) embedding, L2-normalized.
    """
    encoded = tokenizer([text], padding=True, truncation=True, return_tensors="pt")
    encoded = encoded.to(device)
    with torch.no_grad():
        model_out = model(**encoded)
    # Element 0 holds the per-token embeddings; position 0 is the CLS token.
    cls_vectors = model_out[0][:, 0]
    unit_vectors = torch.nn.functional.normalize(cls_vectors, p=2, dim=1)
    as_rows = unit_vectors.cpu().numpy().tolist()
    return as_rows[0]

if __name__ == "__main__":
    # Embed a sample question and fetch its five nearest snippets (no metadata filter).
    query = "Fleming’s right-hand rule?"
    query_vec = embed_query(query)

    results = index.query(
        vector=query_vec,
        top_k=5,
        include_metadata=True
    )

    # Print the similarity score and stored metadata of every match.
    print("Top matches for query:", query)
    for match in results['matches']:
        print(f"Score: {match['score']:.4f}")
        print("Metadata:", match['metadata'])
        print("------")


Top matches for query: Fleming’s right-hand rule?
Score: 0.8460
Metadata: {'board': 'CBSE', 'chapter': 'chapter-13 Magnetic Effects of Electric Current', 'class': '10', 'row_index': 7.0, 'snippet_index': 64.0, 'subject': 'Science', 'text': 'If the forefinger indicates the direction of the\nmagnetic field and the thumb shows the direction of motion\nof conductor, then the middle finger will show the direction\nFFFFFiiiiiggggguuuuurrrrreeeee 1111133333.....1111188888\nof induced current. This simple rule is called Fleming’s\nFleming’s right-hand rule\nright-hand rule. Magnetic Effects of Electric Current 235\n2022-23 Q U E S T I O N\n? 1. Explain different ways to induce current in a coil. 1111133333.....66666 EEEEELLLLLEEEEECCCCCTTTTTRRRRRIIIIICCCCC GGGGGEEEEENNNNNEEEEERRRRRAAAAATTTTTOOOOORRRRR\nBased on the phenomenon of electromagnetic induction, the experiments\nstudied above generate induced current, which is usually very small. This principle is also employed to produce large curre

In [None]:
# query = "Explain Mendel's law of inheritance"
# query_vec = embed_query(query)

# results = index.query(
#     vector=query_vec, 
#     top_k=5, 
#     include_metadata=True,
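#     filter={
#         "board": {"$eq": "CBSE"},
#         # NOTE: this value returns no matches (see the empty result below). The
#         # ingestion step stores only the digits of the class, e.g. "12", not "Class 12th".
#         "class": {"$eq": "Class 12th"},
#         "subject": {"$eq": "Biology"}
#     }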
# )
# results

{'matches': [], 'namespace': '', 'usage': {'read_units': 1}}

In [10]:
"""
step2_retrieval_final.py

WHAT IT DOES (Step 2):
 - Connects to your existing Pinecone index.
 - Loads the embedding model.
 - Takes a user query and metadata filters to retrieve relevant text snippets.

USAGE:
  1) Make sure you have run the corrected Step 1 ingestion script.
  2) Set your PINECONE_API_KEY environment variable.
  3) Configure the QUERY and FILTER_CRITERIA variables below.
  4) Run the script: python step2_retrieval_final.py
"""

import os
import torch
from pinecone import Pinecone
from transformers import AutoTokenizer, AutoModel
from typing import Dict

# --- 1. Embedding Functionality ---

# Model name, device, tokenizer and model used by embed_query() below.
# These are module-level names and are loaded once when the cell runs.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)

def embed_query(text: str) -> list:
    """Embed one query string and return its unit-length vector as a list.

    The text is tokenized as a batch of one, passed through the module-level
    ``model`` with gradient tracking disabled, pooled by taking the first
    token position (CLS pooling) and L2-normalized.

    NOTE(review): the pooling must match whatever the ingestion step used to
    build the index -- confirm before changing it.
    """
    batch = tokenizer([text], padding=True, truncation=True, return_tensors="pt")
    batch = batch.to(device)

    with torch.no_grad():
        model_out = model(**batch)

    # First element of the model output, first token position of every item.
    cls_state = model_out[0][:, 0]
    unit_vec = torch.nn.functional.normalize(cls_state, p=2, dim=1)

    # The batch holds a single item, so hand back row 0 only.
    rows = unit_vec.cpu().numpy().tolist()
    return rows[0]

# --- 2. Main Retrieval Logic ---

def format_score(score) -> str:
    """Render a match score for display.

    Numeric scores are formatted with four decimal places. A missing score
    (``None``) is shown as ``N/A``; any other value that does not accept the
    ``.4f`` format is shown via ``str()`` instead of raising.
    """
    if score is None:
        return "N/A"
    try:
        return f"{score:.4f}"
    except (TypeError, ValueError):
        return str(score)


if __name__ == "__main__":
    # --- Configuration ---
    # The API key is read from the environment, as the usage notes describe.
    # A key was previously hard-coded on this line; treat it as compromised
    # and rotate it.
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    PINECONE_INDEX_NAME = "papershapers2"

    # --- What to search for ---
    QUERY = "What is Fleming’s right-hand rule?"
    TOP_K = 5  # Number of relevant snippets to retrieve

    # --- Filters to narrow down the search (Uses the corrected data format) ---
    # To search everything, leave the dictionary empty: {}
    FILTER_CRITERIA = {
        "subject": {"$eq": "Science"},
        "class": {"$eq": "10"} # IMPORTANT: Use the clean, numeric string
    }

    # --- Initialization & Execution ---
    if not PINECONE_API_KEY:
        raise ValueError("PINECONE_API_KEY environment variable not set.")

    try:
        # Connect to Pinecone
        pc = Pinecone(api_key=PINECONE_API_KEY)
        index = pc.Index(PINECONE_INDEX_NAME)
        print(f"Successfully connected to Pinecone index '{PINECONE_INDEX_NAME}'.")

        # Embed the query
        print(f"Embedding query: '{QUERY}'...")
        query_vector = embed_query(QUERY)

        # Query Pinecone
        print(f"Applying filters: {FILTER_CRITERIA}")
        results = index.query(
            vector=query_vector,
            filter=FILTER_CRITERIA,
            top_k=TOP_K,
            include_metadata=True
        )

        # Display Results
        print("\n--- Retrieved Snippets ---")
        if not results['matches']:
            print("No relevant snippets found for the given query and filters.")
        else:
            for i, match in enumerate(results['matches']):
                # `or {}` also covers a match whose metadata is present but null.
                metadata = match.get('metadata') or {}
                text = metadata.get('text', 'No text found in metadata.') # Graceful handling
                score = match.get('score')

                # format_score() keeps a missing score from crashing the ':.4f' format.
                print(f"Result {i+1}: (Score: {format_score(score)})")
                print(f"  Source: Class {metadata.get('class')}, {metadata.get('subject')}, Ch: {metadata.get('chapter')}")
                print(f"  Snippet: \"{text}\"\n")

    except Exception as e:
        # Report the failure instead of a traceback so the notebook keeps running.
        print(f"An unexpected error occurred: {e}")

Successfully connected to Pinecone index 'papershapers2'.
Embedding query: 'What is Fleming’s right-hand rule?'...
Applying filters: {'subject': {'$eq': 'Science'}, 'class': {'$eq': '10'}}

--- Retrieved Snippets ---
Result 1: (Score: 0.8502)
  Source: Class 10, Science, Ch: chapter-13 Magnetic Effects of Electric Current
  Snippet: "If the forefinger indicates the direction of the
magnetic field and the thumb shows the direction of motion
of conductor, then the middle finger will show the direction
FFFFFiiiiiggggguuuuurrrrreeeee 1111133333.....1111188888
of induced current. This simple rule is called Fleming’s
Fleming’s right-hand rule
right-hand rule. Magnetic Effects of Electric Current 235
2022-23 Q U E S T I O N
? 1. Explain different ways to induce current in a coil. 1111133333.....66666 EEEEELLLLLEEEEECCCCCTTTTTRRRRRIIIIICCCCC GGGGGEEEEENNNNNEEEEERRRRRAAAAATTTTTOOOOORRRRR
Based on the phenomenon of electromagnetic induction, the experiments
studied above generate induced current

In [27]:
# ddf = pd.read_csv('papershapers_snippets.csv')
# ddf  

In [None]:
"""
step3_qa_generation_final_corrected.py

WHAT IT DOES (Step 3):
 - Retrieves relevant context snippets from Pinecone using a query and filters.
 - Joins the snippets into a single context string.
 - Sends the context to the Google Gemini API with a structured prompt.
 - Generates and prints structured Question/Answer pairs.

USAGE:
  1) Make sure the retrieval script (Step 2) works correctly.
  2) Export PINECONE_API_KEY and GOOGLE_API_KEY as environment variables (never paste real keys into the notebook).
  3) Configure the QUERY, FILTERS, and NUM_QUESTIONS variables.
  4) Run the script: python step3_qa_generation_final_corrected.py
"""
import os
import json
import torch
from pinecone import Pinecone
from transformers import AutoTokenizer, AutoModel
import google.generativeai as genai

# --- 1. Embedding and Retrieval Functionality ---

# Hugging Face checkpoint name shared by the tokenizer and the encoder.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Prefer CUDA when it is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Module-level instances reused by embed_query().
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)

def embed_query(text: str) -> list:
    """Turn a single query string into an L2-normalized embedding (list of floats).

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The sentence
    vector is the first-token slice of the first model output.

    NOTE(review): keep this pooling in sync with the ingestion step that
    populated the index -- confirm before changing it.
    """
    encoded = tokenizer([text], padding=True, truncation=True, return_tensors="pt")
    encoded = encoded.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        forward_out = model(**encoded)

    first_token = forward_out[0][:, 0]
    normalized = torch.nn.functional.normalize(first_token, p=2, dim=1)

    # One input text -> one row; return that row.
    as_rows = normalized.cpu().numpy().tolist()
    return as_rows[0]

def retrieve_contexts(index, query_text: str, filters: dict, top_k: int) -> list[str]:
    """Query the index for ``query_text`` and return the matched snippet texts.

    Args:
        index: object exposing ``query(vector=, filter=, top_k=, include_metadata=)``.
        query_text: text that is embedded with ``embed_query`` and searched for.
        filters: metadata filter passed through to ``index.query`` unchanged.
        top_k: number of matches to request.

    Returns:
        One string per match, in the order returned by the index. A match
        with no metadata, or with no ``text`` field, contributes ``''``.
    """
    query_vector = embed_query(query_text)
    results = index.query(vector=query_vector, filter=filters, top_k=top_k, include_metadata=True)

    contexts = []
    for match in results.get('matches') or []:
        # A match may lack metadata entirely or carry a null value; treat
        # both as "no text" instead of raising.
        metadata = match.get('metadata') or {}
        contexts.append(metadata.get('text', ''))
    return contexts

# --- 2. Gemini LLM for Q&A Generation ---

def _extract_json_text(raw: str) -> str:
    """Return the JSON payload of a model reply with any Markdown code fence removed."""
    text = raw.strip()
    fence_start = text.find("```")
    if fence_start == -1:
        # No fence at all: the reply is expected to be bare JSON.
        return text

    body = text[fence_start + 3:]
    fence_end = body.rfind("```")
    if fence_end != -1:
        body = body[:fence_end]

    # Drop an optional "json" language tag after the opening fence. Valid JSON
    # never starts with these letters, so this cannot eat real payload.
    body = body.lstrip()
    if body[:4].lower() == "json":
        body = body[4:]
    return body.strip()


def generate_qa_with_gemini(contexts: list[str], num_questions: int, api_key: str):
    """Generates Q&A pairs from context using Google Gemini.

    Args:
        contexts: retrieved text snippets; they are joined with blank lines
            and embedded in the prompt.
        num_questions: number of question/answer pairs requested in the prompt.
        api_key: Google API key passed to ``genai.configure``.

    Returns:
        The list parsed from the model's JSON reply. An empty list is returned
        when every context is empty, when the API call or JSON parsing fails,
        or when the reply is valid JSON but not a list.
    """
    if not any(contexts):
        print("No context was retrieved. Halting generation.")
        return []

    print(f"\n--- Generating {num_questions} Q/A pairs with Google Gemini... ---")

    genai.configure(api_key=api_key)
    llm = genai.GenerativeModel('models/gemini-2.5-flash-lite')

    full_context = "\n\n".join(contexts)

    prompt = f"""
    You are an expert AI assistant creating educational materials. Based ONLY on the CONTEXT below, generate exactly {num_questions} unique question-and-answer pairs.

    **CONTEXT:**
    ---
    {full_context}
    ---

    **INSTRUCTIONS:**
    Return a valid JSON list of objects. Each object must have keys: "question", "answer", "type" (Short Answer, True/False, or Multiple Choice), "difficulty" (Easy, Medium, or Hard), and "marks" (a number).
    """

    try:
        response = llm.generate_content(prompt)
        # str.lstrip/rstrip strip *character sets*, not prefixes, so the fence
        # is removed with an explicit helper instead.
        qa_pairs = json.loads(_extract_json_text(response.text))
    except Exception as e:
        # Print the underlying error so API and parsing failures are diagnosable.
        print(f"An error occurred while calling the Gemini API or parsing the response.")
        print(f"THE REAL ERROR IS: {e}")
        return []

    if not isinstance(qa_pairs, list):
        # Callers iterate the result as a list of dicts; reject other shapes here.
        print("Gemini returned JSON that is not a list. Discarding the response.")
        return []
    return qa_pairs

# --- 3. Main Execution Logic ---

# Credentials are read from the environment and never written into the notebook.
# Keys were previously hard-coded here; treat them as compromised and rotate them.
# Set PINECONE_API_KEY and GOOGLE_API_KEY before starting the kernel
# (for example with `export` in the shell that launches Jupyter).


if __name__ == "__main__":
    # --- Configuration ---
    PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
    GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
    PINECONE_INDEX_NAME = "papershapers2"

    # --- What to search for ---
    QUERY = "What are the magnetic effects of electric current?"
    TOP_K = 4
    FILTER_CRITERIA = { "class": {"$eq": "10"}, "subject": {"$eq": "Science"} }

    # --- What to generate ---
    NUM_QUESTIONS_TO_GENERATE = 3
    if not (PINECONE_API_KEY and GOOGLE_API_KEY):
        raise ValueError("Ensure both PINECONE_API_KEY and GOOGLE_API_KEY environment variables are set.")

    try:
        pc = Pinecone(api_key=PINECONE_API_KEY)
        index = pc.Index(PINECONE_INDEX_NAME)
        print(f"Successfully connected to Pinecone index '{PINECONE_INDEX_NAME}'.")

        retrieved_contexts = retrieve_contexts(index, QUERY, FILTER_CRITERIA, TOP_K)

        generated_qa = generate_qa_with_gemini(
            contexts=retrieved_contexts,
            num_questions=NUM_QUESTIONS_TO_GENERATE,
            api_key=GOOGLE_API_KEY
        )

        print("\n--- Generated Question and Answer Items ---")
        if generated_qa:
            for i, qa in enumerate(generated_qa):
                print(f"\n--- Item {i+1} ---")
                print(f"  Question:   {qa.get('question')}")
                print(f"  Answer:     {qa.get('answer')}")
                print(f"  Type:       {qa.get('type')}")
                print(f"  Difficulty: {qa.get('difficulty')}")
                print(f"  Marks:      {qa.get('marks')}")
        else:
            # Make the empty case explicit instead of printing only the header.
            print("No Q/A items were generated.")

    except Exception as e:
        # Report the failure instead of a traceback so the notebook keeps running.
        print(f"An unexpected error occurred in the main pipeline: {e}")

Successfully connected to Pinecone index 'papershapers2'.

--- Generating 3 Q/A pairs with Google Gemini... ---

--- Generated Question and Answer Items ---

--- Item 1 ---
  Question:   What accidental discovery did Hans Christian Oersted make in 1820?
  Answer:     Hans Christian Oersted accidentally discovered that a compass needle got deflected when an electric current passed through a metallic wire placed nearby.
  Type:       Short Answer
  Difficulty: Easy
  Marks:      2

--- Item 2 ---
  Question:   What is electromagnetic induction?
  Answer:     Electromagnetic induction is the process by which a changing magnetic field in a conductor induces a current in another conductor.
  Type:       Short Answer
  Difficulty: Medium
  Marks:      2

--- Item 3 ---
  Question:   Choose the correct option: The magnetic field inside a long straight solenoid-carrying current
  Answer:     (d) is the same at all points.
  Type:       Multiple Choice
  Difficulty: Easy
  Marks:      1
