In [None]:
!pip install -q torch torch-geometric transformers pandas networkx scikit-learn
!pip install -q --upgrade pip
!pip install -q --upgrade transformers huggingface_hub


In [None]:
!pip install -q transformers sentence-transformers spacy protobuf==3.20.3

import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import pandas as pd
import numpy as np
import spacy
import ast
from typing import List, Dict
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import pipeline

In [None]:
!pip install -q torch torchvision torchaudio
!pip install -q torch-geometric  # follow PyG install page for CUDA/CPU matching your env
!pip install -q sentence-transformers networkx pandas

In [2]:
import pandas as pd

# Raw multiple-choice dataset and the columns this notebook works with.
QA_CSV_PATH = "/kaggle/input/common2/10th-Biology.csv"
QA_COLUMNS = ['English Question', 'A', 'B', 'C', 'D', 'Correct Answer']

# Read the CSV and keep only the question, the four options and the answer key.
qa_df = pd.read_csv(QA_CSV_PATH)[QA_COLUMNS]
qa_df.head(2)

Unnamed: 0,English Question,A,B,C,D,Correct Answer
0,Which is the suitable temperature for photosyn...,12\degree C - 25\degree C,17\degree C - 30\degree C,22\degree C - 35\degree C,27\degree C - 40\degree C,c
1,Where is the solar energy stored as the potent...,Oxygen,Water,Carbon-dioxide,Carbohydrate,d


In [3]:
# Normalise every text field the same way: lower-case, then trim surrounding whitespace.
for column in ('English Question', 'A', 'B', 'C', 'D', 'Correct Answer'):
    qa_df[column] = qa_df[column].str.lower().str.strip()


In [4]:
# Display the normalised question frame as the cell output.
qa_df

Unnamed: 0,English Question,A,B,C,D,Correct Answer
0,which is the suitable temperature for photosyn...,12\degree c - 25\degree c,17\degree c - 30\degree c,22\degree c - 35\degree c,27\degree c - 40\degree c,c
1,where is the solar energy stored as the potent...,oxygen,water,carbon-dioxide,carbohydrate,d
2,mr. rahim's youngest daughter tania is 20 year...,1801.5 calorie,1705.0 calorie,1595.0 calorie,1407.0 calorie,d
3,rashed's-\ni. bmr 1596.5 calorie\nii. bmi 24.7...,i and ii,i and iii,ii and iii,"i, ii and iii",a
4,how many macro-nutrients are needed for normal...,6,10,16,20,b
...,...,...,...,...,...,...
346,rafiq crossed between a tall and a short bean ...,100,75,50,25,a
347,rafiq crossed between a tall and a short bean ...,i and ii,i and iii,ii and iii,"i, ii and iii",c
348,which of the following is destroyed in thalass...,white blood cell,red blood cell,platelets,plasma,b
349,what is the reason for the extinction of north...,variation of variants,inter specific struggle,intra specific struggle,struggle with environment,d


In [5]:
import itertools

# Expand each multiple-choice question into four (question, option) records,
# each tagged with the index label of the row it came from.
pairs = [
    {
        "qid": row_label,
        "question": record['English Question'],
        "answer": record[letter],
    }
    for row_label, record in qa_df.iterrows()
    for letter in ('A', 'B', 'C', 'D')
]


In [6]:
# One row per (question, option) pair.
pairs_df = pd.DataFrame(pairs)
pairs_df.to_csv("qa_pairs.csv", index=False)

# The preview has to be the cell's last expression: a bare expression in the
# middle of a cell is evaluated but never displayed.
pairs_df.head()


In [None]:
#pairs_df=pairs_df.head(8)
pairs_df

text_cols = ['English Question']

option_cols = ['A', 'B', 'C', 'D']


def _lower_strip_unless_backslash(value):
    """Lower-case and trim plain strings; return non-strings and strings containing a backslash unchanged."""
    if isinstance(value, str) and '\\' not in value:
        return value.lower().strip()
    return value


# Apply the rule to the question column and to each of the four option columns.
for col in text_cols + option_cols:
    qa_df[col] = qa_df[col].apply(_lower_strip_unless_backslash)


**Base extractor (math + text + riddle)**

In [None]:
import re
import json
import spacy
from typing import Dict, List

# Load the `en_core_web_sm` spaCy pipeline once at module level;
# extract_base_concepts() calls `nlp` on every text it processes.
nlp = spacy.load("en_core_web_sm")

# Surface forms of measurement units mapped to the canonical tag stored in the output.
MEASUREMENT_UNITS = dict(
    m="m", meter="m", meters="m",
    cm="cm", mm="mm", km="km",
    degree="deg", degrees="deg",
)

# Instruction-style words that are excluded from the extracted concepts.
STOP_EXTRA = {
    "find", "calculate", "what", "which",
    "value", "determine", "choose",
}

def extract_base_concepts(text: str) -> Dict[str, List[str]]:
    """Extract concepts, numbers, units and relation cues from a piece of text.

    The text is lower-cased, run through the module-level spaCy pipeline `nlp`,
    and then scanned with a few keyword rules.

    Args:
        text: Free text, e.g. a question joined with one candidate answer.
            Non-string input (such as NaN from an empty cell) is treated as
            empty text instead of raising.

    Returns:
        A dict with the keys "concepts", "numbers", "units" and "relations".
        Each value is a sorted list of unique strings, so repeated runs give
        the same ordering.
    """
    import re  # local import keeps the function usable on its own

    text = text.lower() if isinstance(text, str) else ""
    doc = nlp(text)

    concepts = set()
    numbers = set()

    # POS-based extraction: content words become concepts (by lemma),
    # number-like tokens are collected separately.
    for token in doc:
        is_content_word = (
            token.pos_ in ("NOUN", "PROPN", "VERB", "ADJ")
            and not token.is_stop
            and token.text not in STOP_EXTRA
        )
        if is_content_word:
            concepts.add(token.lemma_)

        if token.like_num:
            numbers.add(token.text)

    # Named entities: people/organisations/places count as concepts,
    # cardinal and quantity entities count as numbers.
    for ent in doc.ents:
        if ent.label_ in ("PERSON", "ORG", "GPE", "NORP"):
            concepts.add(ent.text.lower())
        elif ent.label_ in ("CARDINAL", "QUANTITY"):
            numbers.add(ent.text)

    # Units must match as whole words. A plain substring test would report the
    # unit "m" for almost every sentence and "mm" for words such as "mammals".
    # A digit directly before the unit ("25cm") is still accepted.
    units = {
        canonical
        for surface, canonical in MEASUREMENT_UNITS.items()
        if re.search(r"(?<![a-z])" + re.escape(surface) + r"(?![a-z])", text)
    }

    # Lightweight relation heuristics: phrases that hint at a relation.
    relation_triggers = (
        "because", "capable of", "used for",
        "part of", "made of", "causes",
        "diagonal of", "type of",
    )
    relations = {trigger for trigger in relation_triggers if trigger in text}

    return {
        "concepts": sorted(concepts),
        "numbers": sorted(numbers),
        "units": sorted(units),
        "relations": sorted(relations),
    }


**LLaMA prompt (domain-independent)**

In [None]:
def llama_prompt(question: str, answer: str) -> str:
    """Build the extraction prompt for one question / candidate-answer pair.

    The prompt asks the model for a JSON object with "concepts" and
    "relations" lists and ends with a one-line example of that format.
    """
    prompt_lines = [
        "",
        "Extract key reasoning concepts and relations from the QA pair.",
        "",
        "Question:",
        f"{question}",
        "",
        "Candidate Answer:",
        f"{answer}",
        "",
        "Return ONLY valid JSON with keys using double quotes around all property names.",
        "Do NOT add any extra text.",
        'If you cannot find concepts or relations, return empty lists like {"concepts": [], "relations": []}.',
        "",
        "Example:",
        '{"concepts":["bird","wing"],"relations":["capable of flying"]}',
        "",
    ]
    return "\n".join(prompt_lines)


**Load meta-llama/Llama-3.1-8B**

In [None]:
import os
from getpass import getpass

from transformers import pipeline
from huggingface_hub import login

# SECURITY: never paste an access token into the notebook. A literal token was
# previously committed in this cell; treat it as compromised and revoke it in the
# Hugging Face account settings. The token is now read from the HF_TOKEN
# environment variable, with an interactive prompt as the fallback.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

# Text-generation pipeline used by the concept extractor.
llama = pipeline(
    "text-generation",
    model="meta-llama/Llama-3.1-8B",
    max_new_tokens=200,
    do_sample=False,          # greedy decoding, so sampling knobs are not needed
    device_map="auto",
    return_full_text=False,   # return only the continuation, not the prompt
)


2025-12-25 15:39:36.075315: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766677176.277566      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766677176.339334      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766677176.811865      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766677176.811907      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766677176.811910      55 computation_placer.cc:177] computation placer alr

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

**Call LLaMA and parse JSON**

In [None]:
import json

def _first_json_object(text):
    """Return the first decodable JSON object embedded in `text`, or None if there is none."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete value, so any
            # text the model generates after the object is ignored.
            value, _end = decoder.raw_decode(text, start)
            return value
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
    return None


def _string_items(value):
    """Coerce a JSON value into a list of non-empty strings; anything unusable is dropped."""
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, list):
        return []
    return [item.strip() for item in value if isinstance(item, str) and item.strip()]


def extract_with_llama(question: str, answer: str) -> Dict[str, List[str]]:
    """Ask the LLaMA pipeline for concepts and relations and parse its JSON reply.

    Args:
        question: Question text.
        answer: Candidate answer text.

    Returns:
        A dict with exactly the keys "concepts" and "relations", each a list
        of strings. Both lists are empty when the model returns nothing or
        nothing that can be parsed as a JSON object.
    """
    prompt = llama_prompt(question, answer)
    output = llama(prompt)[0]["generated_text"]

    if not output:
        return {"concepts": [], "relations": []}

    parsed = _first_json_object(output)
    if parsed is None:
        # Fallback: the model sometimes emits Python-style single quotes.
        parsed = _first_json_object(output.replace("'", '"'))

    if not isinstance(parsed, dict):
        return {"concepts": [], "relations": []}

    return {
        "concepts": _string_items(parsed.get("concepts")),
        "relations": _string_items(parsed.get("relations")),
    }

**Unified extractor (THIS is what you use)**

In [None]:
def _qa_text_part(value):
    """Return `value` as text: strings unchanged, None/NaN as "", anything else via str()."""
    if isinstance(value, str):
        return value
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def extract_concepts_unified(question: str, answer: str) -> Dict[str, List[str]]:
    """Extract concepts for a QA pair, falling back to LLaMA when the rule-based result is weak.

    The rule-based extractor runs first on "question answer". If it finds
    fewer than three concepts or no relation cue, the LLaMA extractor is
    called and its string concepts/relations are merged in.

    Args:
        question: Question text. Missing values (None/NaN) are treated as "".
        answer: Candidate answer text. Missing values are treated as "",
            other non-strings are converted with str().

    Returns:
        The dict produced by extract_base_concepts. When the LLaMA fallback
        ran, "concepts" and "relations" are the sorted union of both sources.
        A failing LLaMA call is reported with print() and the rule-based
        result is returned unchanged.
    """
    question = _qa_text_part(question)
    answer = _qa_text_part(answer)
    base = extract_base_concepts(question + " " + answer)

    # Heuristic: weak extraction -> use LLaMA
    weak = len(base["concepts"]) < 3 or len(base["relations"]) == 0
    if not weak:
        return base

    try:
        llama_out = extract_with_llama(question, answer)
    except Exception as e:
        print("LLaMA extraction failed:", e)
        return base

    if not isinstance(llama_out, dict):
        return base

    for key in ("concepts", "relations"):
        extra = llama_out.get(key) or []
        if isinstance(extra, str):
            extra = [extra]
        if not isinstance(extra, (list, tuple, set)):
            extra = []
        # Keep only strings: nested lists or dicts from the model are
        # unhashable and would otherwise abort the whole merge.
        merged = set(base[key]) | {item for item in extra if isinstance(item, str)}
        base[key] = sorted(merged)

    return base


In [None]:
# Run the unified extractor over every (question, option) pair and keep the
# identifying fields next to the extracted ones.
all_extracted = []

for item in pairs:
    out = extract_concepts_unified(
        item["question"],
        item["answer"]
    )
    all_extracted.append({
        "qid": item["qid"],
        "question": item["question"],
        "answer": item["answer"],
        **out
    })
extracted_df = pd.DataFrame(all_extracted)

# extracted_df already contains qid/question/answer. Concatenating it next to
# the old pairs_df would duplicate those column names, after which
# row["question"] returns a Series instead of a string, and every re-run of
# the cell would add yet another copy. Rebuild pairs_df from the results instead.
pairs_df = extracted_df.copy()

In [None]:
# Persist the per-pair extraction results to CSV (row index not written).
pairs_df.to_csv("pairs_extracted.csv", index=False)

# Show the resulting frame as the cell output.
pairs_df

**Next step: retrieve and prune ConceptNet edges for each QA pair**

In [None]:
# The ConceptNet file is tab-separated and is read without a header row;
# the four columns are named right after loading.
CONCEPTNET_CSV_PATH = "/kaggle/input/common2/conceptnet.en (1).csv"

conceptnet_df = pd.read_csv(CONCEPTNET_CSV_PATH, header=None, sep="\t")
conceptnet_df.columns = ["relation", "head", "tail", "weight"]
conceptnet_df

In [None]:
def ensure_list(x):
    """Coerce a cell value into a Python list.

    Args:
        x: A list, another container, a string holding a Python literal
            (e.g. "['a', 'b']"), a plain string, a missing value, or any scalar.

    Returns:
        - lists are returned unchanged;
        - tuples / sets are converted to a list;
        - None / NaN / empty strings give [];
        - strings are parsed with ast.literal_eval; a parsed container is
          returned as a list, a parsed scalar is wrapped in a list, and a
          string that is not a literal is kept as a single-item list;
        - any other value is wrapped in a single-item list.
    """
    if isinstance(x, list):
        return x
    if isinstance(x, (tuple, set, frozenset)):
        return list(x)
    if x is None:
        return []

    if isinstance(x, str):
        text = x.strip()
        if not text:
            return []
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # Not a Python literal (e.g. a bare word): keep it as one item.
            return [text]
        if isinstance(parsed, (list, tuple, set, frozenset)):
            return list(parsed)
        return [parsed]

    try:
        missing = bool(pd.isna(x))
    except (TypeError, ValueError):
        # pd.isna returns an array for array-likes, which has no single truth value.
        missing = False
    return [] if missing else [x]

# Make every "concepts" cell a real list (ensure_list parses string cells with
# ast.literal_eval — presumably needed when the frame was reloaded from CSV; TODO confirm).
pairs_df["concepts"] = pairs_df["concepts"].apply(ensure_list)

In [None]:
import os

import torch

# Prefer the GPU, but fall back to CPU so the cell also runs without CUDA.
_bi_encoder_device = "cuda" if torch.cuda.is_available() else "cpu"
bi_encoder = SentenceTransformer("all-MiniLM-L6-v2", device=_bi_encoder_device)

# Verbalise every edge as "head relation tail". Missing or non-string fields
# are converted to text first so the concatenation cannot produce NaN, which
# the encoder cannot consume.
conceptnet_df["edge_text"] = (
    conceptnet_df["head"].fillna("").astype(str) + " " +
    conceptnet_df["relation"].fillna("").astype(str) + " " +
    conceptnet_df["tail"].fillna("").astype(str)
)

EDGE_EMBEDDINGS_PATH = "conceptnet_edge_embeddings.npy"

# Reuse the embeddings saved by a previous run when they match the current
# edge table row-for-row; encoding every edge is the slow part of this cell.
edge_embeddings = None
if os.path.exists(EDGE_EMBEDDINGS_PATH):
    _cached_embeddings = np.load(EDGE_EMBEDDINGS_PATH)
    if _cached_embeddings.shape[0] == len(conceptnet_df):
        edge_embeddings = _cached_embeddings

if edge_embeddings is None:
    edge_embeddings = bi_encoder.encode(
        conceptnet_df["edge_text"].tolist(),
        convert_to_numpy=True,
        show_progress_bar=True,
        batch_size=128
    )
    np.save(EDGE_EMBEDDINGS_PATH, edge_embeddings)

# Save conceptnet_df separately (optional)
conceptnet_df.to_csv("conceptnet_df.csv", index=False)


In [None]:
def _concept_lookup_keys(concept):
    """Return the lookup keys for one concept: trimmed, lower-cased, in space- and underscore-joined form."""
    text = str(concept).strip().lower()
    if not text:
        return set()
    # NOTE(review): multi-word ConceptNet nodes are assumed to be joined with
    # either spaces or underscores; both spellings are tried — confirm against the file.
    return {text, text.replace(" ", "_"), text.replace("_", " ")}


def recall_edges(concepts, qa_text,
                 sim_threshold=0.45,
                 recall_top_k=20):
    """Recall candidate ConceptNet edges for one QA pair.

    Edges whose head or tail equals one of the (normalised) concepts are
    scored by the dot product of their embedding with the QA-text embedding,
    multiplied by the edge weight. Edges scoring below `sim_threshold` are dropped.

    Args:
        concepts: Iterable of concept strings for the QA pair (None is treated as empty).
        qa_text: Question and answer text to embed.
        sim_threshold: Minimum weighted similarity an edge needs to be kept.
        recall_top_k: Maximum number of edges returned.

    Returns:
        A list of (row position in conceptnet_df, weighted similarity) tuples,
        best first; empty when no edge matches or passes the threshold.
    """
    # The original called an undefined `norm` helper here, which raised
    # NameError on a fresh kernel; normalisation is now done locally.
    concept_set = set()
    for concept in (concepts if concepts is not None else []):
        concept_set |= _concept_lookup_keys(concept)

    mask = (
        conceptnet_df["head"].isin(concept_set) |
        conceptnet_df["tail"].isin(concept_set)
    )

    candidate_idx = np.where(mask)[0]
    if len(candidate_idx) == 0:
        return []

    qa_emb = bi_encoder.encode(qa_text, convert_to_numpy=True)
    sims = np.dot(edge_embeddings[candidate_idx], qa_emb)
    weights = conceptnet_df.iloc[candidate_idx]["weight"].values
    sims = sims * weights
    keep = sims >= sim_threshold
    idx = candidate_idx[keep]
    sims = sims[keep]

    order = np.argsort(-sims)[:recall_top_k]
    return [(int(idx[i]), float(sims[i])) for i in order]


In [None]:
import torch

# Cross-encoder used to re-score recalled edges against the QA text.
# Prefer the GPU, but fall back to CPU so the cell also runs without CUDA.
cross_encoder = CrossEncoder(
    "cross-encoder/ms-marco-MiniLM-L-6-v2",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

def rerank_edges(qa_text, candidates, top_k=5):
    """Re-score recalled edges with the cross-encoder and keep the best `top_k`.

    Args:
        qa_text: Question and answer text.
        candidates: List of (row position in conceptnet_df, recall score) tuples.
        top_k: Number of edges to return.

    Returns:
        A list of (row position, cross-encoder score) tuples, highest score
        first. The recall score is discarded. An empty input gives [].
    """
    if not candidates:
        return []

    # One [query, edge sentence] pair per candidate.
    sentence_pairs = []
    for edge_pos, _recall_score in candidates:
        sentence_pairs.append([qa_text, conceptnet_df.iloc[edge_pos]["edge_text"]])

    relevance = cross_encoder.predict(sentence_pairs)

    scored = list(zip(candidates, relevance))
    scored.sort(key=lambda pair: -pair[1])

    reranked = []
    for (edge_pos, _recall_score), relevance_score in scored[:top_k]:
        reranked.append((edge_pos, float(relevance_score)))
    return reranked


In [None]:
# An earlier cell already builds a `llama` pipeline for the same checkpoint
# (meta-llama/Llama-3.1-8B). Building a second one loads another copy of the
# model, so reuse the existing pipeline when it is present and only load here
# when this cell is run on its own.
# NOTE(review): the earlier pipeline is configured with max_new_tokens=200
# rather than 120, so reused generations may be longer; cot_disambiguate only
# reads the score from the reply — confirm this is acceptable.
if "llama" not in globals():
    llama = pipeline(
        "text-generation",
        model="meta-llama/Llama-3.1-8B",
        max_new_tokens=120,
        do_sample=False,
        device_map="auto",
        return_full_text=False
    )

def cot_disambiguate(qa_text, edges):
    """Ask LLaMA to score how well each edge supports the answer and re-sort the edges.

    Args:
        qa_text: Question and answer text.
        edges: List of (row position in conceptnet_df, previous score) tuples;
            the previous score is ignored.

    Returns:
        A list of (row position, LLaMA score) tuples sorted by score, highest
        first. Scores are clamped to the range [0.0, 1.0]; a reply without a
        readable score counts as 0.0.
    """
    import re  # local import keeps the function usable on its own

    # Accepts `"score": 0.7`, `score: 0.7` and a quoted number (`"score": "0.7"`).
    score_pattern = re.compile(
        r'"?score"?\s*:\s*"?\s*(-?(?:\d+(?:\.\d*)?|\.\d+))',
        re.IGNORECASE,
    )

    refined = []
    for idx, _ in edges:
        edge = conceptnet_df.iloc[idx]
        prompt = f"""
Question & Answer:
{qa_text}

Edge:
{edge['head']} {edge['relation']} {edge['tail']}

Does this edge logically support the answer?
Return JSON only:
{{"score": 0.0 to 1.0}}
"""
        out = llama(prompt)[0]["generated_text"]
        match = score_pattern.search(out or "")
        if match:
            # Clamp: the model is asked for 0.0-1.0 but nothing enforces it.
            score = min(1.0, max(0.0, float(match.group(1))))
        else:
            score = 0.0
        refined.append((idx, score))

    refined.sort(key=lambda x: -x[1])
    return refined


In [None]:
def extract_conceptnet_edges(row, tie_margin=0.05, final_top_k=3):
    """Recall, re-rank and prune ConceptNet edges for one QA-pair row.

    Args:
        row: Mapping / Series with the keys "question", "answer" and "concepts".
            A missing (None/NaN) question or answer is treated as empty text.
        tie_margin: When the two best re-ranked scores differ by less than
            this, the edges are re-scored with cot_disambiguate.
        final_top_k: Number of edges kept in the result.

    Returns:
        A list of dicts with the keys "head", "relation", "tail" and "score",
        best edge first. "score" is the cross-encoder score, or the LLaMA
        score when the disambiguation step ran.
    """
    def _as_text(value):
        # Strings pass through; None/NaN become "" so concatenation cannot fail.
        if isinstance(value, str):
            return value
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    qa_text = _as_text(row["question"]) + " " + _as_text(row["answer"])

    recalled = recall_edges(row["concepts"], qa_text)
    reranked = rerank_edges(qa_text, recalled)

    # Near-tie between the two best edges: let the LLM break it.
    if len(reranked) > 1 and abs(reranked[0][1] - reranked[1][1]) < tie_margin:
        reranked = cot_disambiguate(qa_text, reranked)

    pruned = []
    for idx, score in reranked[:final_top_k]:
        edge = conceptnet_df.iloc[idx]  # one row lookup per edge
        pruned.append({
            "head": edge["head"],
            "relation": edge["relation"],
            "tail": edge["tail"],
            "score": score
        })
    return pruned

# Run the edge pipeline row by row and store each row's result
# (a list of edge dicts) in a new "conceptnet_pruned" column.
pairs_df["conceptnet_pruned"] = pairs_df.apply(extract_conceptnet_edges, axis=1)


In [None]:
import os

import networkx as nx

# NOTE(review): `subgraph` (an edge table with head/tail/relation/weight columns)
# and `qid` are not defined anywhere in the visible notebook. An earlier cell
# must create them before this one can run — TODO confirm where they come from.
G = nx.from_pandas_edgelist(
    subgraph,
    source="head",
    target="tail",
    edge_attr=["relation", "weight"],
    create_using=nx.DiGraph()
)

# write_graphml opens the target path directly and does not create parent
# directories, so make sure the output folder exists first.
os.makedirs("graphs", exist_ok=True)
nx.write_graphml(G, f"graphs/qid_{qid}.graphml")


In [None]:
import os
from getpass import getpass

# SECURITY: never store an access token in the notebook. A literal token was
# previously committed in this cell; treat it as compromised and revoke it in
# the Hugging Face account settings. Keep a value that is already present in
# the environment, otherwise ask for one interactively.
if not os.environ.get("HUGGINGFACE_HUB_TOKEN"):
    os.environ["HUGGINGFACE_HUB_TOKEN"] = getpass("Hugging Face access token: ")
