In [19]:
# Core libs
# NOTE(review): in notebooks `%pip install ...` is generally preferred over `!pip ...`
# so the install targets the running kernel's environment; the packages below are
# unpinned (-U upgrades to latest), so results may drift between runs — consider pinning.
!pip install -qU pinecone sentence-transformers transformers accelerate python-dotenv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
import os
from dotenv import load_dotenv
load_dotenv()  # expects .env with PINECONE_API_KEY=...

from pinecone import Pinecone, ServerlessSpec

# --- Index configuration -------------------------------------------------
INDEX_NAME = "squad-e5-dev"
DIM = 768            # E5-base-v2
METRIC = "cosine"    # we normalized vectors when building, so cosine is right
NAMESPACE = "squad-dev-v1.1"   # keep dev/train separate

# --- Client --------------------------------------------------------------
# Stop immediately when the key was not picked up from the environment/.env.
API_KEY = os.getenv("PINECONE_API_KEY")
assert API_KEY, "PINECONE_API_KEY missing from your .env"

pc = Pinecone(api_key=API_KEY)

# --- Create the serverless index only when it is not listed yet ----------
existing_index_names = {idx.name for idx in pc.list_indexes()}
if INDEX_NAME not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIM,
        metric=METRIC,
        spec=serverless_spec,
        deletion_protection="disabled",
    )

# Handle used by every later upsert/query cell.
index = pc.Index(INDEX_NAME)

In [21]:
import numpy as np
import pandas as pd

CHUNKS_CSV = "../Data/squad_prepared/processed_chunks.csv"
EMBS_NPY   = "../Data/squad_prepared/embeddings.npy"

# Chunk metadata (one row per chunk) and the matching embedding matrix.
chunks = pd.read_csv(CHUNKS_CSV)
embs   = np.load(EMBS_NPY)  # expected shape = (num_chunks, DIM)

print(chunks.shape, embs.shape)

# Validate the matrix before anything is sent to the index:
#  - it must be 2-D so that shape[1] is the embedding width,
#  - the width must match the index dimension,
#  - row i of `embs` must belong to row i of `chunks`; a length mismatch means
#    texts and vectors would be paired incorrectly during the upsert.
assert embs.ndim == 2, f"Expected a 2-D embedding matrix, got shape {embs.shape}"
assert embs.shape[1] == DIM, f"Embedding dim {embs.shape[1]} != {DIM}"
assert len(chunks) == embs.shape[0], (
    f"Row mismatch: {len(chunks)} chunks vs {embs.shape[0]} embeddings"
)

(2099, 11) (2099, 768)


In [22]:
# You can rerun safely; upsert overwrites by ID.
# IDs should be unique. Use your existing chunk_id column if present.

BATCH = 200  # number of vectors sent per upsert request


def _or_default(value, default):
    """Return `default` when `value` is missing (NaN/None/NA), else `value`."""
    return default if pd.isna(value) else value


def row_to_item(row, vec):
    """Build one upsert record from a chunk row and its embedding vector.

    Args:
        row: Mapping-like row (e.g. a pandas Series). Must provide "chunk_id"
            and "chunk_text"; "contract_title"/"title", "paragraph_index" and
            "chunk_index" are optional.
        vec: Array-like with a ``tolist()`` method holding the embedding.

    Returns:
        dict with "id" (str), "values" (list) and "metadata" (text, title,
        para_idx, chunk_idx).

    Missing cells (NaN/None) are treated exactly like missing columns:
    strings become "" and indices become -1. Previously a NaN index crashed
    ``int()`` and a NaN text/title was stored as the literal string "nan".
    """
    # Prefer "contract_title" when that column exists, otherwise fall back to "title".
    raw_title = row["contract_title"] if "contract_title" in row else row.get("title", "")
    md = {
        "text": str(_or_default(row["chunk_text"], "")),
        "title": str(_or_default(raw_title, "")),
        "para_idx": int(_or_default(row.get("paragraph_index", -1), -1)),
        "chunk_idx": int(_or_default(row.get("chunk_index", -1), -1)),
    }
    return {"id": str(row["chunk_id"]), "values": vec.tolist(), "metadata": md}

# Stream the chunks to the index in batches of BATCH records.
# Rows are paired with embeddings by *position*, not by the DataFrame index
# label: labels only coincide with positions for a fresh default index, so
# using them would silently mispair text and vectors once `chunks` has been
# filtered, sorted or re-indexed.
vectors = []
for pos, (_, row) in enumerate(chunks.iterrows()):
    vectors.append(row_to_item(row, embs[pos]))
    if len(vectors) == BATCH:
        index.upsert(vectors=vectors, namespace=NAMESPACE)
        vectors = []
# Flush the final, partially filled batch (if any).
if vectors:
    index.upsert(vectors=vectors, namespace=NAMESPACE)

print("Upsert complete.")

Upsert complete.


In [23]:
from sentence_transformers import SentenceTransformer
# Query-side encoder; loaded once and reused by every retrieval call.
e5 = SentenceTransformer("intfloat/e5-base-v2")


def encode_query(q: str):
    """Embed a single question with the E5 model.

    The text is prefixed with "query: " before encoding, and the embedding is
    requested normalized and as a numpy array. The model is called with a
    one-element batch, so the first (only) row is returned.
    """
    prefixed = "query: " + q
    batch = e5.encode([prefixed], normalize_embeddings=True, convert_to_numpy=True)
    return batch[0]

In [24]:
def dense_retrieve(query, topk=30, filt=None, namespace=NAMESPACE):
    """Return the `topk` nearest index matches for `query`.

    Args:
        query: Question text; embedded via `encode_query`.
        topk: Number of matches to request from the index.
        filt: Optional metadata filter forwarded to the index query as `filter`.
        namespace: Index namespace to search (defaults to NAMESPACE).

    Returns:
        The "matches" entry of the query response, with metadata included.
    """
    query_kwargs = {
        "vector": encode_query(query).tolist(),
        "top_k": topk,
        "include_metadata": True,
        "namespace": namespace,
        "filter": filt,
    }
    response = index.query(**query_kwargs)
    return response["matches"]

In [25]:
from sentence_transformers import CrossEncoder
# A strong, fast reranker:
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def retrieve_and_rerank(query, topk_dense=30, topk_final=5, filt=None):
    """Dense-retrieve candidates, then reorder them with the cross-encoder.

    Args:
        query: Question text.
        topk_dense: Number of candidates fetched by `dense_retrieve`.
        topk_final: Number of reranked candidates to keep.
        filt: Optional metadata filter forwarded to `dense_retrieve`.

    Returns:
        Up to `topk_final` dicts, best cross-encoder score first, each with
        "text", "title", "dense" (retrieval score) and "ce" (reranker score).
        An empty list is returned when retrieval yields no matches.
    """
    matches = dense_retrieve(query, topk=topk_dense, filt=filt)
    # Nothing retrieved (e.g. restrictive filter or empty namespace): skip the
    # reranker instead of handing it an empty batch.
    if not matches:
        return []

    pairs = [(query, m["metadata"]["text"]) for m in matches]
    ce_scores = reranker.predict(pairs)  # one score per (query, passage) pair
    ranked = sorted(zip(matches, ce_scores), key=lambda x: x[1], reverse=True)[:topk_final]
    return [{
        "text": m["metadata"]["text"],
        "title": m["metadata"].get("title"),
        "dense": float(m["score"]),
        "ce": float(s)
    } for (m, s) in ranked]

In [26]:
from transformers import pipeline
# Extractive QA reader shared by all evaluation cells.
reader = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",
    tokenizer="deepset/roberta-base-squad2",
)


def answer_from_contexts(question, contexts, max_ctx=5):
    """Run the reader on the first `max_ctx` contexts and rank the answers.

    Args:
        question: Question text.
        contexts: Sequence of dicts with "text", "title", "ce" and "dense".
        max_ctx: Maximum number of contexts to read.

    Returns:
        One dict per context read (title, ce, dense, answer, score and a
        240-character single-line "context" preview), sorted by reader score,
        highest first.
    """
    def _read_one(c):
        out = reader(question=question, context=c["text"])
        passage = c["text"]
        # Preview: first 240 chars, newlines flattened, ellipsis when truncated.
        preview = passage[:240].replace("\n", " ")
        if len(passage) > 240:
            preview = preview + "…"
        return {
            "title": c["title"],
            "ce": c["ce"],
            "dense": c["dense"],
            "answer": out.get("answer", ""),
            "score": float(out.get("score", 0.0)),
            "context": preview,
        }

    preds = [_read_one(c) for c in contexts[:max_ctx]]
    preds.sort(key=lambda x: x["score"], reverse=True)
    return preds

# Smoke test: one question through retrieve -> rerank -> read, showing the
# three highest-scoring reader answers with their scores and a context preview.
q = "Who developed the theory of relativity?"
ctx = retrieve_and_rerank(q, topk_dense=30, topk_final=5)
preds = answer_from_contexts(q, ctx, max_ctx=5)
for p in preds[:3]:
    line = f"Ans={p['answer']!r}  (reader={p['score']:.3f}) | CE={p['ce']:.3f} | dense={p['dense']:.3f} | Title={p['title']}\n→ {p['context']}\n"
    print(line)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use mps:0


Ans='einstein'  (reader=0.934) | CE=3.457 | dense=0.848 | Title=Force
→ philosophers in antiquity used the concept of force in the study of stationary and moving objects and simple machines, but thinkers such as aristotle and archimedes retained fundamental errors in understanding force. in part this was due to…

Ans='albert einstein'  (reader=0.558) | CE=3.944 | dense=0.833 | Title=Force
→ it was only the orbit of the planet mercury that newton's law of gravitation seemed not to fully explain. some astrophysicists predicted the existence of another planet ( vulcan ) that would explain the discrepancies ; however, despite some…

Ans='isaac newton'  (reader=0.005) | CE=-0.279 | dense=0.792 | Title=Force
→ the development of fundamental theories for forces proceeded along the lines of unification of disparate ideas. for example, isaac newton unified the force responsible for objects falling at the surface of the earth with the force responsib…



In [27]:
import json
SQUAD_DEV = "../Data/SQuAD/dev-v1.1.json"

with open(SQUAD_DEV, "r", encoding="utf-8") as f:
    raw = json.load(f)

# Flatten article -> paragraph -> question into (question, gold answers, title)
# triples, skipping questions that are empty or have no non-empty gold answer.
eval_queries = []
for article in raw["data"]:
    article_title = article["title"]
    for paragraph in article["paragraphs"]:
        for qa_item in paragraph["qas"]:
            question = qa_item["question"]
            gold_texts = [ans["text"] for ans in qa_item["answers"] if ans.get("text")]
            if not question or not gold_texts:
                continue
            eval_queries.append((question, gold_texts, article_title))

len(eval_queries)

10570

In [29]:
import math
from tqdm.auto import tqdm
def contains_any_answer(text, answers):
    """Return True if any non-empty answer occurs in `text`, ignoring case.

    Matching is plain substring containment on lower-cased strings; falsy
    answers (empty string, None) are skipped.
    """
    haystack = text.lower()
    for candidate in answers:
        if candidate and candidate.lower() in haystack:
            return True
    return False

def eval_retrieval_string_only(queries, ks=(1,3,5,10), per_doc=False, topk_dense=30):
    """Compute Recall@K using case-insensitive answer-substring matching.

    A query counts as a hit at K when any gold answer string occurs in one of
    the top-K reranked contexts.

    Args:
        queries: Iterable of (question, answers, title) triples.
        ks: Cut-offs to report. Duplicates are counted once.
        per_doc: Accepted for backward compatibility. It has no effect: both
            branches of the original implementation issued the same call.
        topk_dense: Number of dense candidates passed to the reranker. It is
            raised to max(ks) when smaller, so the largest cut-off can
            actually be filled.

    Returns:
        dict mapping each K to the fraction of queries with a hit at rank <= K
        (0.0 for every K when `queries` is empty; {} when `ks` is empty).
    """
    hits = dict.fromkeys(ks, 0)
    if not hits:
        return {}
    max_k = max(hits)
    n_candidates = max(topk_dense, max_k)

    total = 0
    for q, answers, _title in tqdm(queries):
        total += 1
        ctxs = retrieve_and_rerank(q, topk_dense=n_candidates, topk_final=max_k)
        # Rank (1-based) of the first context containing a gold answer, if any.
        first_hit_rank = next(
            (rank for rank, c in enumerate(ctxs, start=1)
             if contains_any_answer(c["text"], answers)),
            None,
        )
        if first_hit_rank is None:
            continue
        # Iterating the dict keys (not `ks`) keeps duplicate cut-offs from
        # being incremented twice for the same query.
        for k in hits:
            if first_hit_rank <= k:
                hits[k] += 1
    return {k: hits[k] / max(1, total) for k in hits}

print("Recall@K (string-only):", eval_retrieval_string_only(eval_queries, ks=(1,3,5,10)))

  0%|          | 0/10570 [00:00<?, ?it/s]

Recall@K (string-only): {1: 0.8123935666982025, 3: 0.8727530747398297, 5: 0.8837275307473983, 10: 0.8909176915799433}


In [30]:
import re

def normalize_text(s):
    """Normalize an answer string for comparison.

    Lower-cases, replaces the standalone words a/an/the with a space, replaces
    every run of characters outside [a-z0-9 ] with a space, then collapses
    whitespace and strips the ends.
    """
    s = s.lower()
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    s = re.sub(r"[^a-z0-9 ]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def em_f1(pred, gold_texts):
    """Score a prediction against several gold answers.

    Args:
        pred: Predicted answer string.
        gold_texts: Iterable of acceptable gold answer strings.

    Returns:
        (em, f1): `em` is 1 if the normalized prediction equals any normalized
        gold answer, else 0; `f1` is the best token-level F1 over the gold
        answers. Both are 0 / 0.0 when no gold answers are given.

    Token overlap is counted as a multiset (bag-of-words) intersection, so a
    token repeated in both strings contributes once per shared occurrence.
    A plain set intersection would undercount such overlap.
    """
    # Local import keeps this cell self-contained.
    from collections import Counter

    pred_n = normalize_text(pred)
    gold_ns = [normalize_text(g) for g in gold_texts]
    # No references: nothing can match (and max() of an empty sequence would raise).
    if not gold_ns:
        return 0, 0.0

    p_tokens = pred_n.split()
    p_counts = Counter(p_tokens)

    def f1_single(g):
        g_tokens = g.split()
        # Two empty strings agree perfectly.
        if not p_tokens and not g_tokens:
            return 1.0
        common = sum((p_counts & Counter(g_tokens)).values())
        if not common:
            return 0.0
        prec = common / len(p_tokens)
        rec = common / len(g_tokens)
        return 2 * prec * rec / (prec + rec)

    em = max(int(pred_n == g) for g in gold_ns)
    f1 = max(f1_single(g) for g in gold_ns)
    return em, f1

def eval_reader(queries, topk_dense=30, topk_final=5):
    """End-to-end reader evaluation: retrieve, rerank, read, then score.

    For each (question, answers, title) triple the highest-scoring reader
    answer (or "" when there is none) is compared with the gold answers via
    `em_f1`.

    Returns:
        dict with mean "EM", mean "F1" and the number of queries "N".
    """
    em_scores = []
    f1_scores = []
    for q, answers, title in tqdm(queries):
        ctxs = retrieve_and_rerank(q, topk_dense=topk_dense, topk_final=topk_final)
        preds = answer_from_contexts(q, ctxs, max_ctx=topk_final)
        if preds:
            pred_ans = preds[0]["answer"]
        else:
            pred_ans = ""
        em, f1 = em_f1(pred_ans, answers)
        em_scores.append(em)
        f1_scores.append(f1)

    total = len(em_scores)
    denom = max(1, total)  # avoid dividing by zero on an empty query list
    return {
        "EM": sum(em_scores, 0.0) / denom,
        "F1": sum(f1_scores, 0.0) / denom,
        "N": total,
    }

reader_scores = eval_reader(eval_queries[:1000])  # start with a subset for speed
reader_scores

  0%|          | 0/1000 [00:00<?, ?it/s]

{'EM': 0.717, 'F1': 0.7691748740148737, 'N': 1000}