<a href="https://colab.research.google.com/github/mahimahi18/LegalQA_RAG/blob/main/ContradictionDetection_Chunking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the source workbook into a DataFrame. The rest of this cell reads its
# 'answers' and 'updated_answers' columns (see prepend_answer).
df = pd.read_excel("Full_answer_Dataset.xlsx")

# Function to prepend answer to updated_answers
def prepend_answer(row):
    """Join a row's 'answers' and 'updated_answers' with the "|||" separator.

    Args:
        row: Mapping-like row (e.g. one row passed by ``DataFrame.apply`` with
            ``axis=1``) exposing the keys 'answers' and 'updated_answers'.

    Returns:
        str: ``"<answers>|||<updated_answers>"`` when both cells are present,
        the single present value when the other is missing, and ``""`` when
        both are missing.
    """
    # Skip missing cells on BOTH sides: str(NaN) is the literal text "nan",
    # which would otherwise be written into the output as if it were an answer.
    present = [
        str(row[column])
        for column in ("answers", "updated_answers")
        if not pd.isna(row[column])
    ]
    return "|||".join(present)

# Call prepend_answer once per row (axis=1) and overwrite the
# 'updated_answers' column with the combined string it returns.
df['updated_answers'] = df.apply(prepend_answer, axis=1)

# Save to a new workbook so the original input file is left untouched;
# index=False keeps the DataFrame index out of the sheet.
df.to_excel("updated_file.xlsx", index=False)

In [None]:
# ===========================
# STEP 1: Install dependencies
# ===========================
!pip install transformers datasets torch networkx pandas openpyxl -q

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
import networkx as nx
from tqdm import tqdm

# ===========================
# STEP 2: Load Pretrained NLI Model
# ===========================
model_name = "roberta-large-mnli"   # if Colab runs out of memory, switch to "roberta-base-mnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()  # inference only

# Label mapping: read it from the checkpoint's own config instead of
# hard-coding it, so swapping `model_name` cannot silently mislabel outputs.
# Names are lower-cased because the grouping code compares against
# "entailment" / "contradiction".
id2label = {int(idx): str(name).lower() for idx, name in model.config.id2label.items()}

def classify_pair(premise, hypothesis):
    """Run NLI on a single (premise, hypothesis) pair.

    Args:
        premise: Text used as the NLI premise.
        hypothesis: Text used as the NLI hypothesis.

    Returns:
        tuple: ``(label, probs)`` where ``label`` is the ``id2label`` name of
        the highest-scoring class and ``probs`` is the list of softmax
        probabilities in the model's class-index order.
    """
    # Call the tokenizer directly (encode_plus is deprecated), and move the
    # encoded batch to whatever device the model lives on so the function
    # keeps working if the model is later moved to a GPU.
    inputs = tokenizer(
        premise,
        hypothesis,
        return_tensors="pt",
        truncation=True,   # inputs longer than max_length are cut, not rejected
        max_length=256,
    ).to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    probs = F.softmax(logits, dim=-1).flatten().tolist()
    label = id2label[torch.argmax(logits, dim=-1).item()]
    return label, probs

# ===========================
# STEP 3: Grouping Logic
# ===========================
def group_responses(responses):
    """Split responses into a majority group and a group contradicting it.

    Every unordered pair (i, j) with i < j is classified once with
    ``classify_pair(responses[i], responses[j])``; "entailment" becomes an
    "agree" edge, "contradiction" a "contradict" edge, and neutral pairs get
    no edge. The majority group is the largest connected component of the
    agree-only graph; the contradicting group holds every other response that
    has a contradict edge to at least one majority member.

    Args:
        responses: List of response strings.

    Returns:
        tuple: ``(majority_responses, contradict_responses)``, each a
        " ||| "-joined string of stripped responses in their original order
        (``""`` when the group is empty).
    """
    n = len(responses)
    if n == 0:  # nothing to group
        return "", ""
    if n == 1:  # single response → no grouping
        return responses[0].strip(), ""

    # Step 1: Build graph with "agree" and "contradict" edges
    G = nx.Graph()
    G.add_nodes_from(range(n))
    for i in range(n):
        for j in range(i + 1, n):
            label, _ = classify_pair(responses[i], responses[j])
            if label == "entailment":   # treat entailment as "agree"
                G.add_edge(i, j, type="agree")
            elif label == "contradiction":
                G.add_edge(i, j, type="contradict")
            # neutral pairs are left unconnected

    # Step 2: Largest "agree" component = majority group
    agree_subgraph = nx.Graph()
    agree_subgraph.add_nodes_from(G.nodes())
    agree_subgraph.add_edges_from(
        (u, v) for u, v, d in G.edges(data=True) if d["type"] == "agree"
    )
    # Every node is in the subgraph, so there is always at least one
    # component. With no agreements all components are singletons and max()
    # returns the first one, i.e. the first response.
    majority_group = max(nx.connected_components(agree_subgraph), key=len)

    # Step 3: Contradicting group = responses linked by contradiction to majority
    contradict_group = [
        node
        for node in range(n)
        if node not in majority_group
        and any(
            G.has_edge(node, maj) and G[node][maj]["type"] == "contradict"
            for maj in majority_group
        )
    ]

    # Sort the indices: set iteration order is not a stable output order.
    majority_responses = " ||| ".join(responses[i].strip() for i in sorted(majority_group))
    contradict_responses = " ||| ".join(responses[i].strip() for i in contradict_group)

    return majority_responses, contradict_responses

# ===========================
# STEP 4: Load Your Excel File
# ===========================
filename = "Mini_FullAnswers.xlsx"   # <-- fixed file name
df = pd.read_excel(filename)

# Validate with an explicit exception rather than `assert`, which is skipped
# when Python runs with -O.
if "Final_answers" not in df.columns:
    raise ValueError("❌ Your file must contain a column named 'Final_answers'")

# ===========================
# STEP 5: Process Each Row
# ===========================
majority_col, contradict_col = [], []

for responses_str in tqdm(df["Final_answers"], desc="Processing questions"):
    # Missing cells arrive as NaN. Other non-text cells (e.g. a bare number)
    # are coerced with str() so .split() cannot raise AttributeError.
    if pd.isna(responses_str):
        responses = []
    else:
        responses = [r.strip() for r in str(responses_str).split("|||") if r.strip()]

    if responses:
        maj, contra = group_responses(responses)
    else:
        maj, contra = "", ""  # empty / whitespace-only cell → empty groups

    majority_col.append(maj)
    contradict_col.append(contra)

# ===========================
# STEP 6: Save Output
# ===========================
output_file = "grouped_responses.xlsx"
df["Majority_group"] = majority_col
df["Contradicting_group"] = contradict_col

df.to_excel(output_file, index=False)
print(f"✅ Processing complete! Results saved to {output_file}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Processing questions:  25%|██▌       | 6/24 [01:51<05:33, 18.50s/it]


KeyboardInterrupt: 

In [None]:

# FASTER VERSION: BATCH PROCESSING ON GPU

# ===========================
# STEP 1: Install dependencies
# ===========================
!pip install transformers datasets torch networkx pandas openpyxl -q

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
import networkx as nx
from tqdm import tqdm

# ===========================
# STEP 2: Load Pretrained NLI Model (choose large or base)
# ===========================
model_name = "roberta-large-mnli"   # more accurate, slower
#model_name = "roberta-base-mnli"     # lighter, faster

# Run on the GPU when one is available; batches are moved to this device too.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("✅ Using device:", device)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
model.eval()  # inference only

# Label mapping: read it from the checkpoint's own config instead of
# hard-coding it, so switching `model_name` cannot silently mislabel outputs.
# Names are lower-cased because the grouping code compares against
# "entailment" / "contradiction".
id2label = {int(idx): str(name).lower() for idx, name in model.config.id2label.items()}


# ===========================
# STEP 3: Batched Pair Classification
# ===========================
def classify_pairs_batch(pairs, batch_size=32):
    """Label (premise, hypothesis) pairs with the NLI model, batch by batch.

    Args:
        pairs: Sequence of ``(premise, hypothesis)`` string tuples.
        batch_size: Number of pairs encoded and scored per forward pass.

    Returns:
        list: One ``id2label`` name per input pair, in input order.
    """
    predicted = []
    for start in range(0, len(pairs), batch_size):
        window = pairs[start:start + batch_size]
        premises = [premise for premise, _ in window]
        hypotheses = [hypothesis for _, hypothesis in window]

        # Pad to the longest pair in the batch; truncate anything over 256 tokens.
        encoded = tokenizer(
            premises,
            hypotheses,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=256,
        ).to(device)

        with torch.no_grad():
            class_ids = torch.argmax(model(**encoded).logits, dim=-1).cpu().tolist()

        predicted += [id2label[class_id] for class_id in class_ids]

    return predicted


# ===========================
# STEP 4: Grouping Logic (now batched)
# ===========================
def group_responses(responses):
    """Split responses into a majority group and a group contradicting it.

    All unordered pairs (i, j) with i < j are classified in one batched call
    to ``classify_pairs_batch``; "entailment" becomes an "agree" edge,
    "contradiction" a "contradict" edge, and other labels add no edge. The
    majority group is the largest connected component of the agree-only
    graph; the contradicting group holds every other response that has a
    contradict edge to at least one majority member.

    Args:
        responses: List of response strings.

    Returns:
        tuple: ``(majority_responses, contradict_responses)``, each a
        " ||| "-joined string of stripped responses in their original order
        (``""`` when the group is empty).
    """
    n = len(responses)
    if n == 0:  # nothing to group
        return "", ""
    if n == 1:
        return responses[0].strip(), ""

    # Keep the index pairs next to the texts so labels can be zipped back
    # without a hand-maintained counter.
    index_pairs = [(i, j) for i in range(n) for j in range(i + 1, n)]
    labels = classify_pairs_batch([(responses[i], responses[j]) for i, j in index_pairs])

    # Build graph
    G = nx.Graph()
    G.add_nodes_from(range(n))
    for (i, j), label in zip(index_pairs, labels):
        if label == "entailment":
            G.add_edge(i, j, type="agree")
        elif label == "contradiction":
            G.add_edge(i, j, type="contradict")

    # Majority group = largest agree component
    agree_subgraph = nx.Graph()
    agree_subgraph.add_nodes_from(G.nodes())
    agree_subgraph.add_edges_from(
        (u, v) for u, v, d in G.edges(data=True) if d["type"] == "agree"
    )
    # Every node is in the subgraph, so there is always at least one
    # component. With no agreements max() returns the first singleton, i.e.
    # the first response.
    majority_group = max(nx.connected_components(agree_subgraph), key=len)

    # Contradict group
    contradict_group = [
        node
        for node in range(n)
        if node not in majority_group
        and any(
            G.has_edge(node, maj) and G[node][maj]["type"] == "contradict"
            for maj in majority_group
        )
    ]

    # Sort the indices: set iteration order is not a stable output order.
    majority_responses = " ||| ".join(responses[i].strip() for i in sorted(majority_group))
    contradict_responses = " ||| ".join(responses[i].strip() for i in contradict_group)

    return majority_responses, contradict_responses


# ===========================
# STEP 5: Load Excel
# ===========================
filename = "Mini_FullAnswers.xlsx"   # <-- your file
df = pd.read_excel(filename)

# Validate with an explicit exception rather than `assert`, which is skipped
# when Python runs with -O.
if "Final_answers" not in df.columns:
    raise ValueError("❌ Column 'Final_answers' not found!")


# ===========================
# STEP 6: Process Each Row
# ===========================
majority_col, contradict_col = [], []

for responses_str in tqdm(df["Final_answers"], desc="Processing questions"):
    # Missing cells arrive as NaN. Other non-text cells (e.g. a bare number)
    # are coerced with str() so .split() cannot raise AttributeError.
    if pd.isna(responses_str):
        responses = []
    else:
        responses = [r.strip() for r in str(responses_str).split("|||") if r.strip()]

    if responses:
        maj, contra = group_responses(responses)
    else:
        maj, contra = "", ""  # empty / whitespace-only cell → empty groups

    majority_col.append(maj)
    contradict_col.append(contra)


# ===========================
# STEP 7: Save Output
# ===========================
output_file = "grouped_responses.xlsx"
df["Majority_group"] = majority_col
df["Contradicting_group"] = contradict_col

df.to_excel(output_file, index=False)
print(f"✅ Done! Results saved to {output_file}")


✅ Using device: cuda


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Processing questions: 100%|██████████| 24/24 [00:10<00:00,  2.30it/s]

✅ Done! Results saved to grouped_responses.xlsx





# Model 3
1.   Uses softmax scores for classification
2.   Pipeline flow:

        *   First, take a response pair from the response set
        *   Then split each response into chunks of up to 240 tokens (`CHUNK_SIZE`), starting a new chunk every 128 tokens (`STRIDE`) so that consecutive chunks overlap
        *   Calculate the softmax scores between all chunk pairs of the two responses
        *   If avg(entailment score) >= 0.3 (`AGREE_THRESHOLD`), add an agreement edge
        *   If any chunk pair has a contradiction score > 0.8 (`CONTRADICT_CHUNK_THRESH`), the whole response pair is marked as contradicting
        *   Builds the graph, chooses the majority group = largest connected component of agree edges, and marks nodes outside majority that contradict the majority as the contradicting/minor group.

        *   Writes two new columns to the spreadsheet: Majority_group and Contradicting_group.














In [None]:
# ===========================
# CONTRADICTION DETECTION WITH CHUNKING + SOFTMAX (FIXED)
# ===========================
!pip install transformers datasets torch networkx pandas openpyxl -q

import pandas as pd
import torch
import numpy as np
import networkx as nx
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ===========================
# STEP 1: GLOBAL CONFIG (easy tuning)
# ===========================

# 🔹 Model to use (default: RoBERTa fine-tuned on NLI)
MODEL_NAME = "roberta-large-mnli"

# 🔹 Device (GPU if available, else CPU)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 🔹 Chunking params - FIXED: Reduced chunk size to account for pair concatenation
CHUNK_SIZE = 240        # max tokens per chunk (leaving room for 240+240+32 = 512)
STRIDE = 128            # step between consecutive chunk starts (chunk_input_ids does `start += stride`), so neighbouring chunks overlap by CHUNK_SIZE - STRIDE = 112 tokens
MAX_PAIR_TOKENS = 512   # hard limit for model input (passed to the tokenizer as max_length)

# 🔹 Batch params
CHUNK_BATCH_SIZE = 64   # number of chunk pairs tokenized and scored per forward pass on DEVICE

# 🔹 Thresholds for classification
AGREE_THRESHOLD = 0.30           # if avg entailment >= this → agreement
CONTRADICT_CHUNK_THRESH = 0.8   # if any chunk pair has contradiction > this → contradiction

# 🔹 Input/Output files
INPUT_FILE = "Mini_FullAnswers.xlsx"
OUTPUT_FILE = "grouped_responses_chunked.xlsx"

# 🔹 Logging settings
VERBOSE_TRUNCATION = True  # Set to False to reduce truncation logs

# ===========================
# STEP 2: Load Model + Tokenizer
# ===========================
print("✅ Using device:", DEVICE)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()

# Label mappings (from Hugging Face model config). Names are upper-cased so
# the lookups below do not depend on how the checkpoint capitalizes them.
id2label = {int(k): v.upper() for k, v in model.config.id2label.items()}
label2id = {v: k for k, v in id2label.items()}
print("Model label mapping:", id2label)

# Class indices used to slice the softmax output.
# NOTE(review): if the checkpoint does not use these exact label names, .get()
# silently falls back to 0/1/2 — check the mapping printed above when
# changing MODEL_NAME.
CONTRA_IDX = label2id.get("CONTRADICTION", 0)
NEUTRAL_IDX = label2id.get("NEUTRAL", 1)  # not referenced by the functions visible in this cell
ENTAIL_IDX = label2id.get("ENTAILMENT", 2)

# ===========================
# STEP 3: Helper Functions
# ===========================

def chunk_input_ids(input_ids, chunk_size=None, stride=None):
    """Split a token-ID sequence into overlapping, fixed-size windows.

    Windows start every ``stride`` tokens and hold at most ``chunk_size``
    tokens; the last window ends exactly at the end of the sequence. This
    function bounds the length of each individual chunk only — the length of
    a chunk *pair* is handled elsewhere.

    Args:
        input_ids: Sequence of token IDs (anything sliceable with a length).
        chunk_size: Maximum tokens per chunk. ``None`` (default) uses the
            module-level ``CHUNK_SIZE``.
        stride: Step between consecutive chunk starts. ``None`` (default)
            uses the module-level ``STRIDE``. Values larger than
            ``chunk_size`` are clamped to ``chunk_size`` so no token is
            skipped between chunks.

    Returns:
        list: Slices of ``input_ids`` in order; ``[]`` for empty input.

    Raises:
        ValueError: If ``chunk_size`` or ``stride`` is not positive.
    """
    # Resolve defaults at call time so re-running the config cell with new
    # values takes effect without re-defining this function.
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if stride is None:
        stride = STRIDE

    if chunk_size <= 0 or stride <= 0:
        # A non-positive stride would never advance `start` (infinite loop).
        raise ValueError(
            f"chunk_size and stride must be positive (got {chunk_size}, {stride})"
        )

    # A step wider than the window would leave gaps of unseen tokens.
    step = min(stride, chunk_size)

    total = len(input_ids)
    chunks = []
    for start in range(0, total, step):
        end = min(start + chunk_size, total)
        chunks.append(input_ids[start:end])
        if end == total:  # this window already reaches the end of the text
            break
    return chunks

def decode_chunk(chunk_ids):
    """Turn a chunk of token IDs back into text that can be fed to the model.

    Special tokens are dropped and tokenization spacing is cleaned up.
    """
    decode_options = {
        "skip_special_tokens": True,
        "clean_up_tokenization_spaces": True,
    }
    return tokenizer.decode(chunk_ids, **decode_options)

def safe_chunk_length(chunk_a, chunk_b, max_total=480):
    """Shrink two chunks so that together they hold at most ``max_total`` tokens.

    Pairs that already fit are returned untouched. Otherwise both chunks are
    cut from the end, roughly in proportion to their original lengths, so
    that the new lengths add up to exactly ``max_total``. Progress messages
    are printed when ``VERBOSE_TRUNCATION`` is enabled.

    Args:
        chunk_a: First chunk (sliceable sequence of token IDs).
        chunk_b: Second chunk (sliceable sequence of token IDs).
        max_total: Token budget for the pair.

    Returns:
        tuple: ``(chunk_a, chunk_b)``, possibly shortened.
    """
    len_a = len(chunk_a)
    len_b = len(chunk_b)

    if len_a + len_b > max_total:
        # Announce the truncation (only if verbose logging is enabled)
        if VERBOSE_TRUNCATION:
            print(f"⚠️  TRUNCATION: Chunk pair too long ({len_a}+{len_b}={len_a+len_b} tokens > {max_total})")
            print(f"   → Reducing to proportional lengths...")

        # Scale A down by the shared ratio; B takes whatever budget remains.
        ratio = max_total / (len_a + len_b)
        new_len_a = int(len_a * ratio)
        new_len_b = max_total - new_len_a

        if VERBOSE_TRUNCATION:
            print(f"   → New lengths: {new_len_a} + {new_len_b} = {new_len_a + new_len_b} tokens")

        return chunk_a[:new_len_a], chunk_b[:new_len_b]

    return chunk_a, chunk_b

def classify_chunkpairs_probs(chunk_pairs_texts, batch_size=CHUNK_BATCH_SIZE):
    """Score (premise, hypothesis) chunk pairs with the NLI model in batches.

    Args:
        chunk_pairs_texts: Sequence of ``(premise_text, hypothesis_text)`` pairs.
        batch_size: Number of pairs per forward pass.

    Returns:
        numpy.ndarray: One row of softmax probabilities per input pair, with
        one column per entry of ``id2label``. A batch that fails with a
        "sequence length" RuntimeError contributes all-zero rows instead of
        aborting the run; an input with no pairs yields a ``(0, n_labels)``
        array.

    Raises:
        RuntimeError: Any runtime error that is not a sequence-length problem.
    """
    collected = []
    for offset in range(0, len(chunk_pairs_texts), batch_size):
        window = chunk_pairs_texts[offset:offset + batch_size]
        premises = [pair[0] for pair in window]
        hypotheses = [pair[1] for pair in window]

        try:
            # The tokenizer itself truncates to MAX_PAIR_TOKENS as a last
            # line of defence against over-long pairs.
            encoded = tokenizer(
                premises, hypotheses,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=MAX_PAIR_TOKENS
            ).to(DEVICE)

            with torch.no_grad():
                batch_logits = model(**encoded).logits
                batch_probs = torch.softmax(batch_logits, dim=-1).cpu().numpy()

            collected.append(batch_probs)

        except RuntimeError as e:
            if "sequence length" not in str(e).lower():
                raise e

            print(f"🚨 BATCH ERROR: Sequence length issue despite safety checks!")
            print(f"   → Error: {str(e)}")
            print(f"   → Batch size: {len(window)} pairs")
            print(f"   → Skipping this batch and using zero probabilities...")
            # Keep row alignment with the input by filling the batch with zeros.
            collected.append(np.zeros((len(window), len(id2label)), dtype=float))

    if collected:
        return np.vstack(collected)

    return np.zeros((0, len(id2label)), dtype=float)

def tokenize_no_trunc(text):
    """Encode text to token IDs with no special tokens added.

    No truncation is requested, so the full sequence is returned and can be
    chunked manually afterwards.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return token_ids

def compute_pair_scores(respA, respB):
    """Score how strongly response A entails or contradicts response B.

    Both responses are tokenized and chunked, every chunk of A is paired with
    every chunk of B (each pair passing through ``safe_chunk_length``), and
    the NLI probabilities of all pairs are aggregated.

    Args:
        respA: Response text used for the premise chunks.
        respB: Response text used for the hypothesis chunks.

    Returns:
        dict: ``avg_entail`` (mean entailment probability), ``max_entail``,
        ``max_contradict`` (highest contradiction probability over all chunk
        pairs) and ``n_pairs`` (number of chunk pairs scored). All values are
        zero when either response yields no chunks or nothing was scored.
    """
    ids_a, ids_b = tokenize_no_trunc(respA), tokenize_no_trunc(respB)
    chunks_a, chunks_b = chunk_input_ids(ids_a), chunk_input_ids(ids_b)

    if not (chunks_a and chunks_b):
        return {"avg_entail": 0.0, "max_entail": 0.0, "max_contradict": 0.0, "n_pairs": 0}

    # Cross product of chunks, each pair trimmed to the token budget and then
    # decoded back to text for the tokenizer's pair encoding.
    pair_texts = []
    for piece_a in chunks_a:
        for piece_b in chunks_b:
            trimmed_a, trimmed_b = safe_chunk_length(piece_a, piece_b)
            pair_texts.append((decode_chunk(trimmed_a), decode_chunk(trimmed_b)))

    probs = classify_chunkpairs_probs(pair_texts, batch_size=CHUNK_BATCH_SIZE)
    n_scored = probs.shape[0]
    if n_scored == 0:
        return {"avg_entail": 0.0, "max_entail": 0.0, "max_contradict": 0.0, "n_pairs": 0}

    entail_col = probs[:, ENTAIL_IDX]
    contradict_col = probs[:, CONTRA_IDX]

    return {
        "avg_entail": float(np.mean(entail_col)),
        "max_entail": float(np.max(entail_col)),
        "max_contradict": float(np.max(contradict_col)),
        "n_pairs": n_scored,
    }

def group_responses_with_chunking(responses):
    """Group responses into a majority cluster and those contradicting it.

    Each unordered pair (i, j), i < j, is scored with ``compute_pair_scores``.
    A pair whose strongest chunk-level contradiction exceeds
    ``CONTRADICT_CHUNK_THRESH`` gets a "contradict" edge; otherwise, a pair
    whose average entailment reaches ``AGREE_THRESHOLD`` gets an "agree"
    edge. The majority is the largest connected component of the agree-only
    graph. When no pair agrees at all, the majority is the single response
    with the highest summed entailment (lowest index on ties).

    Args:
        responses: List of response strings.

    Returns:
        tuple: ``(majority_responses, contradict_responses)``, each a
        " ||| "-joined string of stripped responses in their original order
        (``""`` when the group is empty).
    """
    n = len(responses)
    if n == 0:  # nothing to group
        return "", ""
    if n == 1:
        return responses[0].strip(), ""

    # Step 1: Compute all pair scores
    pair_scores = {
        (i, j): compute_pair_scores(responses[i], responses[j])
        for i in range(n)
        for j in range(i + 1, n)
    }

    # Step 2: Build graph with agree/contradict edges
    G = nx.Graph()
    G.add_nodes_from(range(n))

    for (i, j), s in pair_scores.items():
        # A Graph holds one edge per pair, so the branches must be exclusive:
        # with two independent `if`s an "agree" edge would overwrite a
        # "contradict" edge on the same pair and hide the contradiction.
        if s["max_contradict"] > CONTRADICT_CHUNK_THRESH:
            G.add_edge(i, j, type="contradict", weight=s["max_contradict"])
        elif s["avg_entail"] >= AGREE_THRESHOLD:
            G.add_edge(i, j, type="agree", weight=s["avg_entail"])

    # Step 3: Find largest agree-connected component (majority)
    agree_subgraph = nx.Graph()
    agree_subgraph.add_nodes_from(G.nodes())
    agree_subgraph.add_edges_from(
        (u, v) for u, v, d in G.edges(data=True) if d["type"] == "agree"
    )

    if agree_subgraph.number_of_edges() > 0:
        majority_group = max(nx.connected_components(agree_subgraph), key=len)
    else:
        # Fallback: no agreements at all. The component list is never empty
        # (every node is its own component), so test for agree edges instead.
        # Pick the one node with the highest summed entailment; max() keeps
        # the lowest index on ties.
        node_scores = {i: 0.0 for i in range(n)}
        for (i, j), s in pair_scores.items():
            node_scores[i] += s["avg_entail"]
            node_scores[j] += s["avg_entail"]
        majority_group = {max(range(n), key=node_scores.__getitem__)}

    # Step 4: Find contradictions to majority
    contradict_group = [
        node
        for node in range(n)
        if node not in majority_group
        and any(
            G.has_edge(node, maj) and G[node][maj]["type"] == "contradict"
            for maj in majority_group
        )
    ]

    majority_responses = " ||| ".join(responses[i].strip() for i in sorted(majority_group))
    contradict_responses = " ||| ".join(responses[i].strip() for i in contradict_group)

    return majority_responses, contradict_responses

# ===========================
# STEP 4: Run on Excel
# ===========================
df = pd.read_excel(INPUT_FILE)

# Validate with an explicit exception rather than `assert`, which is skipped
# when Python runs with -O.
if "Final_answers" not in df.columns:
    raise ValueError("❌ Column 'Final_answers' not found!")

majority_col, contradict_col = [], []
for responses_str in tqdm(df["Final_answers"], desc="Processing questions"):
    # Missing cells arrive as NaN. Other non-text cells (e.g. a bare number)
    # are coerced with str() so .split() cannot raise AttributeError.
    if pd.isna(responses_str):
        responses = []
    else:
        responses = [r.strip() for r in str(responses_str).split("|||") if r.strip()]

    if responses:
        maj, contra = group_responses_with_chunking(responses)
    else:
        maj, contra = "", ""  # empty / whitespace-only cell → empty groups

    majority_col.append(maj)
    contradict_col.append(contra)

# One output row per input row, in the same order as the sheet.
df["Majority_group"] = majority_col
df["Contradicting_group"] = contradict_col
df.to_excel(OUTPUT_FILE, index=False)
print(f"✅ Done! Results saved to {OUTPUT_FILE}")

✅ Using device: cuda


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model label mapping: {0: 'CONTRADICTION', 1: 'NEUTRAL', 2: 'ENTAILMENT'}


Processing questions:   0%|          | 0/24 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (518 > 512). Running this sequence through the model will result in indexing errors
Processing questions: 100%|██████████| 24/24 [00:21<00:00,  1.13it/s]

✅ Done! Results saved to grouped_responses_chunked.xlsx



