In [None]:
# Cell 1 - installs & imports (run once)
!pip install -q transformers sentence-transformers spacy protobuf==3.20.3
!pip install -q torch torchvision torchaudio torch_geometric sentencepiece
# (installing sentencepiece helps some models; adjust as needed)

import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import pandas as pd
import numpy as np
import spacy
import ast
from typing import List, Dict, Tuple, Optional
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import pipeline
import torch
from tqdm.auto import tqdm
import pickle

import os, ast, re, json, pickle
from tqdm.auto import tqdm
from collections import defaultdict
from typing import List, Tuple, Optional

import numpy as np
import pandas as pd
import torch

# Compute device: the GPU when torch can see one, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Device =", DEVICE)


In [None]:
from huggingface_hub import login

# SECURITY: an access token used to be hard-coded on this line. Treat that token as
# leaked: revoke it in the Hugging Face account settings and create a new one.
# The token is read from the HF_TOKEN environment variable instead; when the
# variable is unset or empty, None is passed and login() asks for the token itself.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
# Input files (Kaggle datasets attached to this notebook).
pairs_path = "/kaggle/input/canndy/pairs_extracted.csv"
conceptnet_path = "/kaggle/input/mintoc/conceptnet_df_clean.csv"

# Read the QA pairs and show a quick overview:
# shape, up to 20 column names, and the first two rows.
pairs_df = pd.read_csv(pairs_path)
print(f"pairs_df shape: {pairs_df.shape}")
print(pairs_df.columns[:20].tolist())
display(pairs_df.head(2))

In [None]:
# The required columns are looked up by name, so the header row of the file has to
# be parsed. With header=None pandas labels the columns 0..N-1 and the check below
# could never pass.
# NOTE(review): assumes the file starts with a header row and is tab-separated — confirm.
conceptnet_df = pd.read_csv(conceptnet_path, sep="\t")
required_cols = ["relation", "head", "tail", "weight", "edge_text"]
missing = [c for c in required_cols if c not in conceptnet_df.columns]

if missing:
    raise ValueError(f"Missing columns: {missing}")

# ensure string type (later cells match concepts against head/tail with isin)
for col in ["head", "tail", "relation", "edge_text"]:
    conceptnet_df[col] = conceptnet_df[col].astype(str)

# ensure weight exists: rows without a weight get the neutral value 1.0
conceptnet_df["weight"] = conceptnet_df["weight"].fillna(1.0)

print("conceptnet_df:", conceptnet_df.shape)
display(conceptnet_df.head(3))

In [None]:
# load edge embeddings from Kaggle dataset
import numpy as np
from sentence_transformers import SentenceTransformer

# Precomputed edge embeddings, stored as a .npy array in a Kaggle dataset.
edge_embeddings = np.load("/kaggle/input/edge-embedding/conceptnet_edge_embeddings (1).npy")

print(f"Loaded edge_embeddings. Shape: {edge_embeddings.shape}")

# The bi-encoder is loaded for embedding queries later on; the edges themselves
# are not re-encoded here.
print("Loading bi-encoder model...")
bi_encoder = SentenceTransformer("all-MiniLM-L6-v2", device=DEVICE)

# Sanity check: there must be exactly one embedding row per conceptnet_df row.
assert len(conceptnet_df) == edge_embeddings.shape[0], "Mismatch with conceptnet_df rows"
print(f"edge_embeddings OK. dim = {edge_embeddings.shape[1]}")


In [None]:
# Cell 4 - cross-encoder used for reranking.
# The model is placed on the GPU only when DEVICE is "cuda"; if loading fails for
# any reason, a second attempt is made explicitly on the CPU.
cross_encoder_model = "cross-encoder/ms-marco-MiniLM-L-6-v2"
try:
    ce_device = "cuda" if DEVICE == "cuda" else "cpu"
    cross_encoder = CrossEncoder(cross_encoder_model, device=ce_device)
except Exception as load_error:
    print("CrossEncoder load failed; falling back to CPU. Error:", load_error)
    cross_encoder = CrossEncoder(cross_encoder_model, device="cpu")
    print("CrossEncoder loaded on cpu")
else:
    print("Loaded CrossEncoder on", ce_device)


In [None]:
# Cell 5 - small helpers to parse list-like fields
def ensure_list_field(x):
    """Coerce a list-like cell value into a plain Python list.

    Accepted inputs:
      * list                -> returned unchanged
      * tuple / set         -> converted with list()
      * str                 -> parsed with ast.literal_eval when it is the repr of a
                               list/tuple/set; otherwise split on commas when it
                               contains one; otherwise wrapped as a single item.
                               Blank strings give [].
      * missing (None, NaN) -> []
      * anything else       -> wrapped as [x]
    """
    # Containers are handled BEFORE the missing-value test: pd.isna() on a list or
    # tuple returns an element-wise array, and using that array in an `if` raises
    # "truth value ... is ambiguous" as soon as it has more than one element.
    if isinstance(x, list):
        return x
    if isinstance(x, (tuple, set)):
        return list(x)

    if isinstance(x, str):
        if not x.strip():
            return []
        try:
            parsed = ast.literal_eval(x.strip())
        except Exception:
            # literal_eval raises several exception types on free text
            # (ValueError, SyntaxError, ...); any failure means "not a literal".
            parsed = None
        if isinstance(parsed, (list, tuple, set)):
            return list(parsed)
        # Not a list literal: fall back to comma splitting, then to a single item.
        if "," in x:
            return [part.strip() for part in x.split(",") if part.strip()]
        return [x]

    try:
        is_missing = bool(pd.isna(x))
    except (TypeError, ValueError):
        # Non-scalar objects (e.g. arrays) have no single truth value; keep them.
        is_missing = False
    if is_missing:
        return []
    return [x]


***Recall candidates (high recall, low precision)***

In [None]:
# Cell 6 - recall edges (uses bi_encoder for QA embedding and edge_embeddings for similarity)
def recall_edges(concepts, numbers, units, relations, qa_text,
                 sim_threshold=0.05, recall_top_k=120):
    """Recall candidate ConceptNet edges for one QA pair.

    Edges whose head or tail equals one of `concepts` are scored by the dot product
    between their precomputed embedding and the embedding of the QA text enriched
    with the concepts, relations and "number unit" pairs. The score is then scaled
    by a soft prior derived from the edge weight.

    Args:
        concepts: iterable of concept labels; items are compared as strings and
            None items are ignored.
        numbers, units: parallel iterables; they are paired with zip(), so extra
            items of the longer one are not used.
        relations: iterable of relation labels; None items are ignored.
        qa_text: question + answer text (None is treated as an empty string).
        sim_threshold: minimum weighted similarity an edge needs to be kept.
        recall_top_k: maximum number of edges returned.

    Returns:
        List of (row position in conceptnet_df, weighted similarity) tuples,
        best first; empty when nothing matches or passes the threshold.
    """
    # Stringify once and reuse for both the filter and the enriched text, so that
    # non-string items (e.g. ints produced by literal parsing) cannot break join().
    concept_terms = [str(c) for c in concepts if c is not None]
    relation_terms = [str(r) for r in relations if r is not None]

    # 1. concept filtering
    concept_set = set(concept_terms)
    mask = conceptnet_df["head"].isin(concept_set) | conceptnet_df["tail"].isin(concept_set)
    candidate_idx = np.where(mask)[0]
    if len(candidate_idx) == 0:
        return []

    # 2. enriched QA text
    quantity_terms = [f"{n} {u}" for n, u in zip(numbers, units)]
    extra_signal = " ".join(concept_terms + relation_terms + quantity_terms)
    enriched_qa_text = (qa_text or "") + " " + extra_signal

    # 3. encode QA
    qa_emb = bi_encoder.encode(
        enriched_qa_text, convert_to_numpy=True, normalize_embeddings=True
    ).reshape(-1)

    # 4. similarity; this is a cosine only if the rows of edge_embeddings are
    #    L2-normalised as well — TODO confirm how the .npy file was produced.
    edge_embs = edge_embeddings[candidate_idx]
    sims = np.dot(edge_embs, qa_emb)

    # 5. soft prior by weight
    weights = conceptnet_df.iloc[candidate_idx]["weight"].astype(float).values
    sims = sims * (1.0 + 0.3 * np.log1p(weights))

    # 6. filter & rank
    keep_mask = sims >= sim_threshold
    if keep_mask.sum() == 0:
        return []
    idx_kept = candidate_idx[keep_mask]
    sims_kept = sims[keep_mask]
    order = np.argsort(-sims_kept)[:recall_top_k]
    return [(int(idx_kept[i]), float(sims_kept[i])) for i in order]


In [None]:
# Cell 7 - cross-encoder reranker
def rerank_edges(qa_text: str,
                 candidates: List[Tuple[int, float, Optional[List[str]]]],
                 top_k: int = 40,
                 batch_size: int = 32,
                 include_path_in_prompt: bool = False,
                 combine_with_prior: bool = True,
                 prior_weight: float = 0.3):
    """Rerank recalled edges with the cross-encoder.

    Args:
        qa_text: question + answer text used as the left side of every pair.
        candidates: tuples of (row position in conceptnet_df, prior score[, path]).
            The prior may be None (treated as 0.0) and the path may be omitted.
        top_k: number of results returned.
        batch_size: number of (qa, edge_text) pairs sent to the model per call.
        include_path_in_prompt: when True and a path is present, the path is
            appended to the QA text as " Context path: a -> b".
        combine_with_prior: when True, the min-max normalised prior boosts the
            cross-encoder score.
        prior_weight: strength of that boost (a normalised prior of 1.0 scales a
            positive score by 1 + prior_weight).

    Returns:
        Up to top_k tuples (idx, final_score, path), best first.
    """
    if not candidates:
        return []

    # Normalise every candidate to (idx, prior, path) once, so the prior statistics
    # and the scoring loop agree on missing priors / missing paths.
    metas = []
    for cand in candidates:
        prior = cand[1] if len(cand) > 1 and cand[1] is not None else 0.0
        path = cand[2] if len(cand) > 2 else None
        metas.append((int(cand[0]), float(prior), path))

    priors = [prior for _, prior, _ in metas]
    min_prior = min(priors)
    prior_span = max(priors) - min_prior

    edge_texts = conceptnet_df["edge_text"]  # hoisted: one column lookup, not one per row
    pairs = []
    for idx, _, path in metas:
        left = qa_text
        if include_path_in_prompt and path:
            left = qa_text + " Context path: " + " -> ".join(path)
        pairs.append([left, edge_texts.iloc[idx]])

    scores = []
    step = max(1, int(batch_size))  # a non-positive batch size would break range()
    for start in range(0, len(pairs), step):
        scores.extend(cross_encoder.predict(pairs[start:start + step]))

    results = []
    for (idx, prior, path), cross_score in zip(metas, scores):
        final_score = float(cross_score)
        if combine_with_prior:
            # all priors equal -> no information -> no boost
            norm = (prior - min_prior) / prior_span if prior_span else 0.0
            boost = 1.0 + prior_weight * norm
            # The cross-encoder score may be negative (e.g. a raw logit). Multiplying
            # a negative score by boost > 1 would push a high-prior edge DOWN, so
            # negative scores are divided instead: the boost always moves the score
            # upward and never changes its sign.
            if final_score >= 0:
                final_score = final_score * boost
            else:
                final_score = final_score / boost
        results.append((idx, final_score, path))

    results.sort(key=lambda item: -item[1])
    return results[:top_k]


In [None]:
# Cell 8 - CoT disambiguation using an LLM pipeline if available.
# If the LLM pipeline (llama) is not available or fails, we fall back to a simple lexical heuristic score.

from transformers import pipeline
# Defaults used when no LLM can be created: CoT stays disabled and llm is None.
LLM_AVAILABLE = False
llm = None
try:
    # Greedy (do_sample=False) text-generation pipeline. The model name may not be
    # usable in every environment; change it to an LLM that is available to you.
    llm = pipeline(
        "text-generation",
        model="meta-llama/Llama-3.1-8B",
        device_map="auto",
        do_sample=False,
        max_new_tokens=120,
    )
except Exception as llm_error:
    print("LLM not available in this environment, CoT will be skipped. Error:", llm_error)
    LLM_AVAILABLE = False
else:
    LLM_AVAILABLE = True
    print("LLM pipeline created (CoT enabled).")

def extract_json_from_text(text: str):
    """Return the first JSON object found in `text`, or None.

    Two passes are made:
      1. Strict: every "{" is tried as the start of a JSON object with
         JSONDecoder.raw_decode, which stops at the end of the object. Surrounding
         prose, further objects and stray braces therefore do not matter.
      2. Lenient: if nothing parses strictly, brace-delimited spans are repaired
         (single quotes -> double quotes, trailing commas removed) and parsed. The
         widest "{ ... }" span is tried first, then each innermost "{...}" span.

    Args:
        text: arbitrary text, e.g. the raw output of a language model.

    Returns:
        The decoded dict, or None when the text is empty or holds no parsable object.
    """
    if not text:
        return None

    # Pass 1: strict decoding from each opening brace.
    decoder = json.JSONDecoder()
    for brace in re.finditer(r"\{", text):
        try:
            obj, _ = decoder.raw_decode(text, brace.start())
        except (ValueError, RecursionError):
            continue
        return obj

    # Pass 2: lenient repair of almost-JSON (e.g. Python-style dict reprs).
    spans = []
    widest = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if widest:
        spans.append(widest.group())
    spans.extend(re.findall(r"\{[^{}]*\}", text))
    for span in spans:
        repaired = span.replace("'", '"')
        repaired = re.sub(r",\s*}", "}", repaired)
        repaired = re.sub(r",\s*]", "]", repaired)
        try:
            return json.loads(repaired)
        except (ValueError, RecursionError):
            continue
    return None

def _parse_llm_score(completion: str) -> float:
    """Extract a score in [0.0, 1.0] from an LLM completion; 0.0 when none is found."""
    parsed = extract_json_from_text(completion)
    raw = None
    if isinstance(parsed, dict) and "score" in parsed:
        raw = parsed["score"]
    else:
        # Fallback: first standalone number starting with 0 or 1. The look-arounds
        # stop this from matching a digit inside a longer number such as "10" or "2.1".
        m = re.search(r"(?<![\d.])[01](?:\.\d+)?(?!\.?\d)", completion)
        if m:
            raw = m.group(0)
    try:
        score = float(raw)
    except (TypeError, ValueError):
        score = 0.0
    if score != score:  # NaN would otherwise be clamped to 1.0 by min()/max()
        score = 0.0
    return max(0.0, min(1.0, score))


def cot_disambiguate(qa_text: str, edges: List[Tuple[int, float, Optional[List[str]]]], top_n: int = 10, temperature: float = 0.0):
    """Re-score the first `top_n` edges, with the LLM when available.

    With LLM_AVAILABLE set, each edge is shown to `llm` together with the QA text
    and the model is asked for a JSON object {"score": x}. Otherwise a lexical
    heuristic is used: 0.6 * (share of edge_text tokens found in the QA text)
    + 0.4 * prior / (|prior| + 1).

    Args:
        qa_text: question + answer text.
        edges: tuples of (row position in conceptnet_df, prior[, path]).
        top_n: number of leading edges that are re-scored.
        temperature: 0.0 (default) keeps greedy decoding; a value > 0 switches
            sampling on with that temperature.

    Returns:
        List of (idx, score) with score clamped to [0.0, 1.0], best first.
    """
    if not edges:
        return []
    edges = edges[:top_n]
    refined = []
    if LLM_AVAILABLE:
        # return_full_text=False: only the completion is returned. Otherwise the
        # prompt (which contains the example {"score": 0.75} and the text "0.0-1.0")
        # would be parsed instead of the model's answer.
        gen_kwargs = {"max_new_tokens": 40, "do_sample": False, "return_full_text": False}
        if temperature and temperature > 0:
            gen_kwargs.update(do_sample=True, temperature=temperature)
        for item in edges:
            idx = int(item[0])
            edge = conceptnet_df.iloc[idx]
            prompt = f"""Question & Answer:
{qa_text}

Edge:
{edge['head']} {edge['relation']} {edge['tail']}

Task: Decide whether the above edge logically supports the answer. Output ONLY a single JSON object with one key "score" (0.0-1.0).
Example: {{"score": 0.75}}
Do not add any other text.
"""
            out = llm(prompt, **gen_kwargs)[0]["generated_text"]
            # Defensive: drop the prompt if the backend echoed it anyway.
            if out.startswith(prompt):
                out = out[len(prompt):]
            refined.append((idx, float(_parse_llm_score(out))))
    else:
        # fallback: lexical overlap + normalized prior (fast heuristic)
        qa_tokens = set(re.findall(r"\w+", qa_text.lower()))  # same for every edge
        for item in edges:
            idx = int(item[0])
            prior = item[1] if len(item) > 1 and item[1] is not None else 0.0
            prior = float(prior)
            edge_text = str(conceptnet_df.iloc[idx]["edge_text"])
            # count overlap tokens between QA and edge_text
            edge_tokens = set(re.findall(r"\w+", edge_text.lower()))
            overlap = len(qa_tokens & edge_tokens) / max(1, len(edge_tokens))
            # combine with prior (scaled)
            score = 0.6 * overlap + 0.4 * (prior / (abs(prior) + 1.0))  # simple scaling
            score = max(0.0, min(1.0, score))
            refined.append((idx, float(score)))
    refined.sort(key=lambda pair: -pair[1])
    return refined


In [None]:
# Cell 9 - main extraction function + loop through pairs_df, saving results for GCN later
import math

def _text_or_blank(value) -> str:
    """str(value), except that missing values (None / float NaN) become ''."""
    if value is None:
        return ""
    if isinstance(value, float) and math.isnan(value):
        return ""
    return str(value)


def extract_and_store_graph_edges(row,
                                  sim_threshold=0.08,
                                  recall_top_k=120,
                                  rerank_top_k=50,
                                  cot_top_n=15,
                                  final_keep=40,
                                  cot_margin=0.05):
    """Run recall -> rerank -> optional CoT disambiguation for one QA row.

    Args:
        row: mapping-like object with a .get() method (e.g. a DataFrame row) that
            provides "question", "answer", "concepts", "numbers", "units", "relations".
        sim_threshold, recall_top_k: forwarded to recall_edges.
        rerank_top_k: forwarded to rerank_edges as top_k.
        cot_top_n: forwarded to cot_disambiguate as top_n.
        final_keep: maximum number of edges kept in the result.
        cot_margin: CoT disambiguation runs only when the two best reranked scores
            differ by less than this value.

    Returns:
        {"display": [up to 3 dicts with head/relation/tail/score],
         "graph_edges": [up to final_keep tuples (idx, score, path)]}.
    """
    # Missing question/answer cells must not leak the literal text "nan" into the query.
    qa_text = (_text_or_blank(row.get("question", "")) + " "
               + _text_or_blank(row.get("answer", ""))).strip()
    concepts = ensure_list_field(row.get("concepts", []))
    numbers = ensure_list_field(row.get("numbers", []))
    units = ensure_list_field(row.get("units", []))
    relations = ensure_list_field(row.get("relations", []))

    # 1. recall
    recalled = recall_edges(concepts, numbers, units, relations, qa_text,
                            sim_threshold=sim_threshold, recall_top_k=recall_top_k)
    if not recalled:
        return {"display": [], "graph_edges": []}

    # 2. rerank
    # rerank expects candidates as (idx, prior, path_opt) - our recall returns (idx, score)
    candidates = [(idx, score, None) for idx, score in recalled]
    reranked = rerank_edges(qa_text, candidates, top_k=rerank_top_k)
    ranked = sorted(reranked, key=lambda edge: -edge[1])

    # 3. If top two are close, use CoT / LLM to disambiguate
    if len(reranked) > 1 and abs(reranked[0][1] - reranked[1][1]) < cot_margin:
        cot_edges = cot_disambiguate(qa_text, reranked, top_n=cot_top_n)
        if cot_edges:
            cot_scores = {idx: score for idx, score in cot_edges}
            # CoT scores live in [0, 1] while cross-encoder scores are unbounded, so
            # the two must not be sorted together. The disambiguated edges are
            # re-ordered among themselves by CoT score and stay ahead of the edges
            # CoT never looked at, which keep their rerank order.
            head = [(idx, float(cot_scores[idx]), *rest)
                    for idx, score, *rest in ranked if idx in cot_scores]
            tail = [edge for edge in ranked if edge[0] not in cot_scores]
            head.sort(key=lambda edge: -edge[1])
            ranked = head + tail

    # 4. prune: keep the best final_keep edges
    pruned = ranked[:final_keep]

    # build human display for top 3
    display_list = []
    for idx, score, *rest in pruned[:3]:
        edge = conceptnet_df.iloc[idx]
        display_list.append({"head": edge["head"],
                             "relation": edge["relation"],
                             "tail": edge["tail"],
                             "score": float(score)})

    # return both pruned graph edges and display
    return {"display": display_list, "graph_edges": pruned}

# Process every QA pair (with a progress bar). A row that raises is reported and
# recorded as an empty result, so `results` stays aligned with pairs_df.
results = []
for row_label, pair_row in tqdm(pairs_df.iterrows(), total=len(pairs_df)):
    try:
        row_result = extract_and_store_graph_edges(
            pair_row,
            sim_threshold=0.08,
            recall_top_k=120,
            rerank_top_k=50,
            cot_top_n=15,
            final_keep=40,
        )
    except Exception as err:
        print("Row", row_label, "failed:", err)
        row_result = {"display": [], "graph_edges": []}
    results.append(row_result)

pairs_df["cn_result"] = results

# Persist the outcome twice: a CSV of the whole frame, and a pickle holding just
# the list of per-row result dicts.
pairs_df.to_csv("pairs_with_cn_results.csv", index=False)
with open("pairs_conceptnet_graph_edges.pkl", "wb") as pkl_file:
    pickle.dump(pairs_df["cn_result"].tolist(), pkl_file)
print("Saved pairs_with_cn_results.csv and pairs_conceptnet_graph_edges.pkl")
