In [None]:
!pip install -q transformers sentence-transformers spacy protobuf==3.20.3
!pip install torch torchvision torchaudio torch_geometric

import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import pandas as pd
import numpy as np
import spacy
import ast
from typing import List, Dict
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import pipeline
import torch_geometric


In [None]:
import numpy as np

# Precomputed embedding matrix for the ConceptNet edges. recall_edges() indexes this
# array with positional row numbers of conceptnet_df and takes dot products with the
# query embedding, so row i here must correspond to row i of that frame.
# NOTE(review): presumably produced by encoding each edge's text with the same
# bi-encoder used for queries, with normalised vectors — confirm against the
# notebook that generated the .npy file.
edge_embeddings = np.load("/kaggle/input/edge-embedding/conceptnet_edge_embeddings (1).npy")
 

In [None]:
# QA pairs with pre-extracted features. extract_and_store_graph_edges() reads the
# columns "question", "answer", "concepts", "numbers", "units" and "relations".
pairs_df = pd.read_csv("/kaggle/input/canndy/pairs_extracted.csv")

# Cleaned ConceptNet edge table, loaded exactly once. The file used to be read twice
# (first tab-separated without a header, then again with the defaults); the first
# result was overwritten immediately, so only the default read is kept.
# Columns used later in this notebook: "head", "relation", "tail", "weight", "edge_text".
conceptnet_df = pd.read_csv("/kaggle/input/mintoc/conceptnet_df_clean.csv")

# Preview a few rows instead of rendering the whole table.
conceptnet_df.head()

***Recall candidates (high recall, low precision)***

In [None]:
def _as_list(value):
    """Coerce a feature cell into a plain Python list.

    Feature columns that went through a CSV round-trip arrive as strings such as
    "['cat', 'dog']" (or as NaN for empty cells) rather than as lists. Iterating
    such a string would yield single characters, so it is parsed here instead.

    - None / NaN / pd.NA      -> []
    - "[...]", "(...)", "{...}" -> parsed with ast.literal_eval
    - any other string        -> [that string]
    - any other iterable      -> list(value)
    - any other scalar        -> [value]
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return []
    if isinstance(value, str):
        text = value.strip()
        if not text:
            return []
        if text[0] in "[({":
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                # Not a valid literal after all: keep it as one opaque item.
                return [text]
            if isinstance(parsed, (list, tuple, set, frozenset, dict)):
                return list(parsed)
            return [parsed]
        return [text]
    try:
        return list(value)
    except TypeError:
        return [value]


def recall_edges(concepts,
                 numbers,
                 units,
                 relations,
                 qa_text,
                 sim_threshold=0.1,
                 recall_top_k=60):
    """Recall candidate ConceptNet edges for one QA pair (high recall, low precision).

    Args:
      concepts, numbers, units, relations: list-like features of the QA pair.
          String-serialised lists (as read back from a CSV) and missing values
          are accepted and converted to real lists first.
      qa_text: question and answer text; the features are appended to it before
          it is embedded with the global ``bi_encoder``.
      sim_threshold: minimum weighted similarity an edge needs to be kept.
      recall_top_k: maximum number of edges returned.

    Returns:
      list of (row_position_in_conceptnet_df, score), best score first. Empty
      when no edge has one of the concepts as its head or tail.

    Reads the globals ``conceptnet_df``, ``edge_embeddings`` and ``bi_encoder``.
    """
    concepts = _as_list(concepts)
    numbers = _as_list(numbers)
    units = _as_list(units)
    relations = _as_list(relations)

    # --- 1. Concept-based candidate filtering ---
    concept_set = set(concepts)
    mask = (
        conceptnet_df["head"].isin(concept_set) |
        conceptnet_df["tail"].isin(concept_set)
    )

    # Positional row numbers: they index both conceptnet_df and edge_embeddings.
    candidate_idx = np.flatnonzero(mask.to_numpy())
    if len(candidate_idx) == 0:
        return []

    # --- 2. Enriched QA text (semantic signal injection) ---
    # str() keeps the join working when parsed features contain numbers.
    extra_signal = " ".join(
        [str(c) for c in concepts] +
        [str(r) for r in relations] +
        [f"{n} {u}" for n, u in zip(numbers, units)]
    )
    enriched_qa_text = qa_text + " " + extra_signal

    # --- 3. QA embedding ---
    qa_emb = bi_encoder.encode(
        enriched_qa_text,
        convert_to_numpy=True,
        normalize_embeddings=True
    ).reshape(-1)

    # --- 4. Edge similarity (edge text is already embedded) ---
    sims = np.dot(edge_embeddings[candidate_idx], qa_emb)

    # --- 5. Edge weight as a soft multiplicative prior ---
    weights = np.log1p(conceptnet_df["weight"].to_numpy()[candidate_idx])
    sims = sims * (1.0 + 0.3 * weights)

    # --- 6. Filter + rank ---
    # Scores that are NaN (e.g. from a missing weight) fail this comparison and
    # are therefore dropped.
    keep = sims >= sim_threshold
    idx = candidate_idx[keep]
    sims = sims[keep]

    order = np.argsort(-sims)[:recall_top_k]

    return [(int(idx[i]), float(sims[i])) for i in order]


In [None]:
from sentence_transformers import SentenceTransformer

# Bi-encoder that embeds the enriched QA text inside recall_edges(). Its output is
# dot-multiplied with rows of edge_embeddings, so both must live in the same vector space.
# NOTE(review): presumably the same model that produced edge_embeddings — confirm.
bi_encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


**Cross-encoder reranker**

In [None]:
from typing import List, Tuple, Optional
import math

from sentence_transformers import CrossEncoder

# Cross-encoder used by rerank_edges() to score (QA text, edge text) pairs.
# No explicit device is passed, so sentence-transformers picks one itself (a GPU
# when one is available, otherwise the CPU). The previous hard-coded
# device="cuda" made this cell fail on sessions without a GPU.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


def rerank_edges(
    qa_text: str,
    candidates: List[Tuple[int, float, Optional[List[str]]]],  # (idx, prior_score, path_opt)
    top_k: int = 40,
    batch_size: int = 32,
    include_path_in_prompt: bool = False,
    combine_with_prior: bool = True,
    prior_weight: float = 0.3  # only used if combine_with_prior True
) -> List[Tuple[int, float, Optional[List[str]]]]:
    """Re-score recalled edges with the global ``cross_encoder`` and keep the best ones.

    Args:
      qa_text: question + answer text, used as the left side of every scored pair.
      candidates: tuples of (idx, prior_score) or (idx, prior_score, path); idx is a
          positional row number of ``conceptnet_df``, path is None or a list of str.
      top_k: maximum number of results returned.
      batch_size: number of pairs sent to ``cross_encoder.predict`` per call.
      include_path_in_prompt: when True and a path is present, " Context path: a -> b"
          is appended to the QA side of the pair.
      combine_with_prior: when True the final score is
          cross_score * (1 + prior_weight * prior), with prior min-max scaled to
          [0, 1] over this candidate list (all zeros if every prior is equal).
      prior_weight: strength of the prior in the formula above.

    Returns:
      list of (idx, final_score, path), highest final_score first, at most top_k long.

    NOTE(review): with this multiplicative form a negative cross-encoder score is
    pushed further down by a high prior — confirm that is intended for the model in use.
    """
    if not candidates:
        return []

    # Missing priors count as 0.0 when computing the scaling range.
    prior_values = [
        0.0 if (len(cand) <= 1 or cand[1] is None) else cand[1]
        for cand in candidates
    ]
    hi = max(prior_values) if prior_values else 1.0
    lo = min(prior_values) if prior_values else 0.0

    def scaled(value):
        """Min-max scale one prior into [0, 1]; a degenerate range maps to 0."""
        return 0.0 if hi == lo else (value - lo) / (hi - lo)

    # One [query side, edge side] pair per candidate, plus its bookkeeping record.
    pairs = []
    records = []
    for cand in candidates:
        idx, prior, *extra = cand
        path = extra[0] if extra else None
        query_side = qa_text
        if include_path_in_prompt and path:
            query_side = qa_text + " Context path: " + " -> ".join(path)
        edge_side = conceptnet_df.iloc[int(idx)]["edge_text"]
        pairs.append([query_side, edge_side])
        records.append((int(idx), float(0.0 if prior is None else prior), path))

    # Score in fixed-size chunks.
    cross_scores = []
    for start in range(0, len(pairs), batch_size):
        cross_scores.extend(cross_encoder.predict(pairs[start:start + batch_size]))

    # Blend with the prior (optional) and rank.
    ranked = []
    for (idx, prior, path), raw_score in zip(records, cross_scores):
        score = float(raw_score)
        if combine_with_prior:
            score = score * (1.0 + prior_weight * scaled(prior))
        ranked.append((idx, score, path))

    ranked.sort(key=lambda entry: -entry[1])
    return ranked[:top_k]


**CoT (LLM) disambiguation, only if the top scores are ambiguous**

In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub (the Llama weights are loaded below).
# The access token is read from the HF_TOKEN environment variable; on Kaggle,
# expose it from a notebook secret rather than pasting it here. When the variable
# is not set, login() falls back to its interactive prompt.
# SECURITY: a literal access token used to be hard-coded on this line. Treat that
# token as leaked and revoke it in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

# Text-generation pipeline used by cot_disambiguate(): greedy decoding, only the
# newly generated text is returned, and device placement is left to device_map="auto".
llama = pipeline(
    "text-generation",
    model="meta-llama/Llama-3.1-8B",
    max_new_tokens=120,
    do_sample=False,
    device_map="auto",
    return_full_text=False
)



In [None]:
import re
import json

# llama pipeline already created. Recommend calling with deterministic generation params at call-time:
# e.g. llama(prompt, temperature=0.0, max_new_tokens=120, do_sample=False)

def extract_json_from_text(text: str):
    """Return the first JSON object embedded in *text*, or None if there is none.

    Every "{" is tried in order of appearance:
      1. strict parse of the JSON value starting at that brace (anything after
         the object is ignored);
      2. lenient parse of each "{...}" span starting there, shortest first, after
         replacing single quotes with double quotes and removing trailing commas.

    Matching greedily from the first "{" to the last "}" (the previous behaviour)
    failed whenever the text held a second object or a stray "}" after the first.
    """
    if not isinstance(text, str):
        return None

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        # Strict pass: raw_decode stops at the end of the first complete value.
        try:
            obj, _ = decoder.raw_decode(text, start)
            return obj
        except ValueError:
            pass

        # Lenient pass: minimal cleanup of near-JSON such as {'score': 0.5,}.
        end = text.find("}", start + 1)
        while end != -1:
            candidate = text[start:end + 1].replace("'", '"')
            candidate = re.sub(r",\s*}", "}", candidate)
            candidate = re.sub(r",\s*]", "]", candidate)
            try:
                return json.loads(candidate)
            except ValueError:
                end = text.find("}", end + 1)

        start = text.find("{", start + 1)

    return None

# Fallback when no JSON is found: a standalone number starting with 0 or 1 that is
# not part of a longer number (so "10" or the "1" in "3.1" are not picked up).
_SCORE_FALLBACK_RE = re.compile(r"(?<![\d.])([01](?:\.\d+)?)(?!\d)")


def _parse_llm_score(generated: str) -> float:
    """Turn raw LLM output into a support score clamped to [0.0, 1.0] (0.0 if unparseable)."""
    parsed = extract_json_from_text(generated)
    score = 0.0
    if isinstance(parsed, dict) and "score" in parsed:
        try:
            score = float(parsed["score"])
        except (TypeError, ValueError):
            score = 0.0
    else:
        match = _SCORE_FALLBACK_RE.search(generated)
        if match:
            score = float(match.group(1))

    # NaN compares unequal to itself; without this check min(1.0, nan) yields 1.0.
    if score != score:
        score = 0.0
    return max(0.0, min(1.0, score))


def cot_disambiguate(
    qa_text: str,
    edges: List[Tuple[int, float, Optional[List[str]]]],
    top_n: Optional[int] = None,
    llama_batch: bool = False,
    temperature: float = 0.0
) -> List[Tuple[int, float]]:
    """Ask the global ``llama`` pipeline how strongly each edge supports the answer.

    Args:
      qa_text: question and answer text placed at the top of every prompt.
      edges: tuples whose first item is a positional row number of ``conceptnet_df``;
          any further items (prior score, path, ...) are ignored. Prefer a top-k
          filtered list, since one LLM call is made per edge.
      top_n: if set, only the first top_n edges are scored (recommended small, e.g. 10).
      llama_batch: not used (left for advanced batching).
      temperature: 0 (default) means greedy decoding and no temperature is sent to
          the pipeline; a positive value switches to sampling at that temperature.

    Returns:
      list of (idx, llm_score) sorted by llm_score descending, each score in [0.0, 1.0].
    """
    if not edges:
        return []

    # limit
    if top_n is not None:
        edges = edges[:top_n]

    # Temperature only has meaning when sampling, so it is omitted for greedy decoding.
    gen_kwargs = {"max_new_tokens": 120, "do_sample": False}
    if temperature is not None and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature)

    refined = []
    for item in edges:
        # Only the row index is needed; this accepts tuples of any length >= 1.
        idx = int(item[0])
        edge = conceptnet_df.iloc[idx]

        # prompt: be explicit that ONLY a JSON object should be returned
        prompt = f"""Question & Answer:
{qa_text}

Edge:
{edge['head']} {edge['relation']} {edge['tail']}

Task: Decide whether the above edge logically supports the answer. Output ONLY a single JSON object with one key "score" whose value is a number between 0.0 (no support) and 1.0 (strong support).
Example valid output:
{{"score": 0.75}}
Do not add any other text outside the JSON object.
"""
        out = llama(prompt, **gen_kwargs)[0]["generated_text"]
        refined.append((idx, _parse_llm_score(out)))

    refined.sort(key=lambda x: -x[1])
    return refined


In [None]:
def norm(s: str) -> str:
    """Return *s* stripped of surrounding whitespace and lower-cased.

    Non-string values are converted with str() first. The return statement used to
    sit inside the ``if`` branch, so every real string came back as None.
    """
    if not isinstance(s, str):
        s = str(s)
    return s.strip().lower()

**Full extraction per QA Pair**

In [None]:
import pickle

def extract_and_store_graph_edges(row):
    """Run recall -> rerank -> (optional) LLM disambiguation for one QA row.

    Args:
      row: mapping / DataFrame row with the keys "question", "answer", "concepts",
          "numbers", "units" and "relations".

    Returns:
      dict with
        "graph_edges": up to 40 (idx, score, path) tuples kept for multi-hop use,
        "display": the first 3 of those as {"head", "relation", "tail", "score"} dicts.

    The LLM is consulted only when the two best reranked scores differ by less than
    0.05. The candidates it scored are then re-ordered by LLM score; previously the
    scores were swapped in but the order was left unchanged, so the disambiguation
    could not affect which edges were selected. Candidates the LLM did not score
    keep their reranker score and stay behind the scored ones, because the two
    kinds of score are not produced on a common scale.
    """
    def _text(value):
        # Empty CSV cells are read as NaN (a float); treat them as empty text
        # instead of failing on str + float.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    qa_text = _text(row["question"]) + " " + _text(row["answer"])

    recalled = recall_edges(
        concepts=row["concepts"],
        numbers=row["numbers"],
        units=row["units"],
        relations=row["relations"],
        qa_text=qa_text,
        sim_threshold=0.45,
        recall_top_k=80
    )
    reranked = rerank_edges(qa_text, recalled, top_k=50)

    if len(reranked) > 1 and abs(reranked[0][1] - reranked[1][1]) < 0.05:
        cot_edges = cot_disambiguate(qa_text, reranked, top_n=15)
        cot_dict = {idx: score for idx, score in cot_edges}

        judged = [
            (idx, cot_dict[idx], *rest)
            for idx, score, *rest in reranked
            if idx in cot_dict
        ]
        unjudged = [cand for cand in reranked if cand[0] not in cot_dict]

        # Stable sort: equal LLM scores keep their reranker order.
        judged.sort(key=lambda cand: -cand[1])
        reranked = judged + unjudged

    graph_edges = reranked[:40]   # store 40 for multi-hop
    final_display = graph_edges[:3]

    display_list = []
    for idx, score, *rest in final_display:
        edge = conceptnet_df.iloc[idx]  # one row lookup per displayed edge
        display_list.append({
            "head": edge["head"],
            "relation": edge["relation"],
            "tail": edge["tail"],
            "score": score
        })

    # store both: display (pruned) and full graph_edges
    return {"display": display_list, "graph_edges": graph_edges}

# Run the full pipeline on every QA pair (row-wise apply). Each cell of "cn_result"
# holds the dict returned by extract_and_store_graph_edges for that row.
pairs_df["cn_result"] = pairs_df.apply(extract_and_store_graph_edges, axis=1)

# Persist the per-row result dicts as a plain Python list, in row order, so the
# nested tuples/lists can be reloaded with pickle without any parsing.
with open("pairs_conceptnet_graph_edges.pkl", "wb") as f:
    pickle.dump(pairs_df["cn_result"].tolist(), f)


In [None]:
# Export the augmented QA table without the index column.
# NOTE(review): the dict-valued "cn_result" column is presumably written as its
# string representation, so the pickle written earlier is the copy to reload
# for structured access — confirm before relying on this CSV.
pairs_df.to_csv("pairs_df.csv", index=False)


In [None]:
# Display the per-row results as a visual sanity check.
pairs_df["cn_result"]