News retrieval with stance detection

In [78]:
import pandas as pd
import numpy as np
import re
import nltk
import xgboost as xgb
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from rank_bm25 import BM25Okapi
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.svm import SVC # Included for consistency, though only LTR is used for ranking
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# --- 0. Dependency Installation & Setup ---
try:
    print("Setting up NLTK resources and loading data...")
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('omw-1.4', quiet=True)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Load FNC-1 Dataset
    bodies_df = pd.read_csv("train_bodies.csv")
    stances_df = pd.read_csv("train_stances.csv")

    # 1. Deduplication (CRITICAL FIX)
    bodies_df = bodies_df.drop_duplicates(subset=['articleBody'], keep='first').reset_index(drop=True)
    
    # 2. Merge data for context (full stance pairs)
    data = pd.merge(stances_df, bodies_df, on='Body ID', how='left')
    data.dropna(subset=['articleBody'], inplace=True) 
    data.reset_index(drop=True, inplace=True)
    
except FileNotFoundError:
    print("\nFATAL ERROR: Ensure 'train_bodies.csv' and 'train_stances.csv' are in the script directory.")
    exit()
except Exception as e:
    print(f"Error during setup: {e}")
    exit()

# --- Preprocessing Function ---
def preprocess(text):
    """Clean, tokenize, remove stop words, and lemmatize text."""
    if pd.isna(text) or text is None:
        return []
    text = re.sub(r'[^a-zA-Z0-9\s]', '', str(text).lower())
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return tokens

# Apply preprocessing
data['processed_headline'] = data['Headline'].apply(preprocess)
data['processed_body'] = data['articleBody'].apply(preprocess)
corpus = data['processed_headline'].tolist() + data['processed_body'].tolist()


# --- 1. Train Word2Vec and Vectorize (Features for LTR) ---
print("Training Word2Vec Skip-gram model...")
w2v_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=2, sg=1, negative=5, workers=4)

def get_doc_vector(tokens):
    vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
    return np.mean(vectors, axis=0) if len(vectors) > 0 else np.zeros(100)

body_vectors = np.array([get_doc_vector(tokens) for tokens in data['processed_body']])
headline_vectors = np.array([get_doc_vector(tokens) for tokens in data['processed_headline']])
print("Vectorization complete.")


# --- 2. Train BM25 Model (Phase 1 Retriever & Feature) ---
print("Building the BM25 index on unique bodies...")
corpus_tokens_unique = bodies_df['articleBody'].apply(preprocess).tolist()
bm25_model = BM25Okapi(corpus_tokens_unique)
print("BM25 index built.")

def get_single_bm25_score(bm25_model, q_tokens, b_tokens):
    # Calculate the score for a query against a single document
    try:
        temp_bm25 = BM25Okapi([b_tokens])
        return temp_bm25.get_scores(q_tokens)[0]
    except ZeroDivisionError:
        return 0.0


# --- 3. Feature Extraction (X_hybrid) ---
def extract_hybrid_features(headline_vecs, body_vecs, headline_tokens, body_tokens, bm25_model):
    """Generates the 202-dimensional feature matrix."""
    cos_sims = [cosine_similarity(h.reshape(1, -1), b.reshape(1, -1))[0][0] 
                for h, b in zip(headline_vecs, body_vecs)]
    
    # Calculate BM25 score for every headline-body pair (Heavy step)
    bm25_scores = [get_single_bm25_score(bm25_model, h_tokens, b_tokens) 
                   for h_tokens, b_tokens in zip(headline_tokens, body_tokens)]

    # [H_vec(100), B_vec(100), CosSim(1), BM25(1)] = 202 features
    X_hybrid = np.hstack([
        headline_vecs, 
        body_vecs, 
        np.array(cos_sims).reshape(-1, 1), 
        np.array(bm25_scores).reshape(-1, 1)
    ])
    return X_hybrid

print("Extracting X_hybrid (202 features)...")
X_hybrid = extract_hybrid_features(
    headline_vectors, 
    body_vectors, 
    data['processed_headline'].tolist(), 
    data['processed_body'].tolist(),
    bm25_model
)


# --- 4. LTR Training (SelectKBest & XGBRanker) ---
print("Training LTR Ranker (XGBoost) with correct grouping...")
K_FEATURES = 150
relevance_map = {'agree': 3, 'disagree': 3, 'discuss': 2, 'unrelated': 0}

# 1. Split data, including the 'Headline' column for grouping
X_train_hybrid, _, y_train_stance, _, train_headlines, _ = train_test_split(
    X_hybrid,
    data['Stance'],
    data['Headline'], 
    test_size=0.2,
    random_state=42
)

# 2. Feature Selection
selector = SelectKBest(k=K_FEATURES)
# We must fit the selector on the training data first
selector.fit(X_train_hybrid, pd.Series(y_train_stance).map(relevance_map)) 
X_train_selected = selector.transform(X_train_hybrid)

# 3. Create Relevance Labels
y_train_rel = np.array([relevance_map[label] for label in y_train_stance])

# 4. CRITICAL FIX: Create the Group Information for LTR
# Sort training data by Headline to group queries and create the group list
train_df = pd.DataFrame({
    'features_index': range(len(X_train_selected)),
    'Headline': train_headlines.reset_index(drop=True)
})
train_df = train_df.sort_values(by='Headline').reset_index(drop=True)

# Reorder features and labels based on the sorted index
X_train_selected_sorted = X_train_selected[train_df['features_index'].values]
y_train_rel_sorted = y_train_rel[train_df['features_index'].values]

# Count the number of documents (rows) for each unique headline (group)
groups_train = train_df.groupby('Headline').size().tolist()

# 5. Train XGBoost Ranker
ranker = xgb.XGBRanker(objective='rank:pairwise', learning_rate=0.1, n_estimators=100)
ranker.fit(X_train_selected_sorted, y_train_rel_sorted, group=groups_train) 
print("LTR Ranker Training Complete.")

# 6. Train SVM Classifier (Required for Stance Prediction)
svm_clf = SVC(kernel='linear', C=1.0, random_state=42, probability=True, class_weight='balanced')
svm_clf.fit(X_train_selected, y_train_stance)
print("SVM Classifier Training Complete.")

print("-" * 70)

# ====================================================================
# --- INTERACTIVE SEARCH EXECUTION (Phase 1: BM25, Phase 2: LTR/SVM) ---
# ====================================================================

def two_phase_hybrid_search3():
    print("\n" + "="*70)
    print("  FINAL HYBRID SYSTEM: BM25 (Phase 1) + LTR/SVM (Phase 2)")
    print("  Results are ranked by LTR Score (Hybrid Relevance).")
    print("  Type 'exit' or 'quit' to stop.")
    print("="*70)

    while True:
        user_query = input("\n>> Enter Headline Claim: ")
        
        if user_query.lower() in ['exit', 'quit']:
            print("Exiting system. Goodbye!")
            break
        if not user_query.strip():
            continue

        q_tokens = preprocess(user_query)
        q_vec = get_doc_vector(q_tokens).astype('float32')

        # --- PHASE 1: LEXICAL RETRIEVAL (BM25) ---
        CANDIDATE_POOL_SIZE = 50 
        bm25_all_scores = bm25_model.get_scores(q_tokens)
        bm25_indices = np.argsort(bm25_all_scores)[::-1][:CANDIDATE_POOL_SIZE]

        if len(bm25_indices) == 0 or bm25_all_scores[bm25_indices[0]] < 0.1:
            print("   [!] No relevant documents found.")
            continue
            
        print(f"--- Phase 1: Retrieved {len(bm25_indices)} candidates via BM25.")
        
        # --- FEATURE ALIGNMENT FIX ---
        # Find the vector of the closest *training headline* to the query vector 
        # to ensure features align with SVM/LTR training.
        headline_similarities = cosine_similarity(q_vec.reshape(1, -1), headline_vectors)[0]
        closest_headline_idx = np.argmax(headline_similarities)
        
        best_h_vec = headline_vectors[closest_headline_idx]
        best_h_text = data.iloc[closest_headline_idx]['Headline']
        
        print(f"   [Classifier Reference] Using closest training headline: '{best_h_text}'")
        
        # --- PHASE 2: HYBRID RANKING & CLASSIFICATION (LTR/SVM) ---
        
        candidate_bodies = []
        candidate_features = [] # Stores 202-dim vector

        # A. Feature Extraction
        for idx in bm25_indices:
            # Get body information from the unique bodies_df
            body_id = bodies_df.loc[idx, 'Body ID']
            body_row_full = data[data['Body ID'] == body_id].iloc[0]

            b_vec = get_doc_vector(body_row_full['processed_body'])
            body_text = body_row_full['articleBody']
            b_tokens = body_row_full['processed_body'] 
            
            # Generate Hybrid Feature Vector using the Reference Headline Vector
            cos_sim = cosine_similarity(best_h_vec.reshape(1, -1), b_vec.reshape(1, -1))[0][0]
            bm25_score = bm25_all_scores[idx] 
            
            feat = np.concatenate([best_h_vec, b_vec, [cos_sim], [bm25_score]])
            
            candidate_features.append(feat)
            candidate_bodies.append(body_text)

        candidate_features = np.array(candidate_features)
        
        # B. Feature Selection
        candidate_features_selected = selector.transform(candidate_features)
            
        # C. Stance Classification (SVM)
        predicted_stances = svm_clf.predict(candidate_features_selected)
        
        # D. Hybrid Re-Ranking (LTR)
        ranking_scores = ranker.predict(candidate_features_selected)
        
        # E. Combine and Sort Results (Sorted by LTR Score)
        results = zip(ranking_scores, predicted_stances, candidate_bodies)
        sorted_results = sorted(results, key=lambda x: x[0], reverse=True)

        # --- Display Results ---
        
        print("\n--- Phase 2: Hybrid Re-ranked Results (Sorted by LTR Score) ---")
        print(f"   Showing top 5 of {len(sorted_results)} results:\n")
        
        for i, (score, stance, body) in enumerate(sorted_results[:5]):
            snippet = " ".join(body.split()[:40]) + "..."
            stance_display = f"[{stance.upper()}]"
            
            print(f"   {i+1}. {stance_display} (LTR Score: {score:.4f})")
            print(f"      Evidence: \"{snippet}\"")
            print("-" * 50)



Setting up NLTK resources and loading data...
Training Word2Vec Skip-gram model...
Vectorization complete.
Building the BM25 index on unique bodies...
BM25 index built.
Extracting X_hybrid (202 features)...
Training LTR Ranker (XGBoost) with correct grouping...
LTR Ranker Training Complete.
SVM Classifier Training Complete.
----------------------------------------------------------------------


In [65]:
# Run the interactive loop
two_phase_hybrid_search3()


  FINAL HYBRID SYSTEM: BM25 (Phase 1) + LTR/SVM (Phase 2)
  Results are ranked by LTR Score (Hybrid Relevance).
  Type 'exit' or 'quit' to stop.



>> Enter Headline Claim:  NASA discover the crater


--- Phase 1: Retrieved 50 candidates via BM25.
   [Classifier Reference] Using closest training headline: 'NASA Raises Doubts About Reports of Nicaraguan Meteorite'

--- Phase 2: Hybrid Re-ranked Results (Sorted by LTR Score) ---
   Showing top 5 of 50 results:

   1. [UNRELATED] (LTR Score: 3.2557)
      Evidence: "We reported on Monday that a meteor, thought possibly to be a chunk of an Earth-passing asteroid, was the cause of a 40-foot crater outside the international airport in the Nicaraguan capital. But astronomers and NASA scientists are now casting..."
--------------------------------------------------
   2. [UNRELATED] (LTR Score: 3.1037)
      Evidence: "An explosion and a crater reported near the capital of Nicaragua raised suspicions on Monday that a meteorite had split off from an asteroid that passed by Earth this weekend and struck our planet. But NASA scientists have now cast..."
--------------------------------------------------
   3. [UNRELATED] (LTR Score: 3.0165)
  


>> Enter Headline Claim:  NASA questions the crater


--- Phase 1: Retrieved 50 candidates via BM25.
   [Classifier Reference] Using closest training headline: 'Nasa questions whether crater in Nicaragua caused by meteorite'

--- Phase 2: Hybrid Re-ranked Results (Sorted by LTR Score) ---
   Showing top 5 of 50 results:

   1. [UNRELATED] (LTR Score: 3.2360)
      Evidence: "We reported on Monday that a meteor, thought possibly to be a chunk of an Earth-passing asteroid, was the cause of a 40-foot crater outside the international airport in the Nicaraguan capital. But astronomers and NASA scientists are now casting..."
--------------------------------------------------
   2. [UNRELATED] (LTR Score: 3.1898)
      Evidence: "An explosion and a crater reported near the capital of Nicaragua raised suspicions on Monday that a meteorite had split off from an asteroid that passed by Earth this weekend and struck our planet. But NASA scientists have now cast..."
--------------------------------------------------
   3. [DISAGREE] (LTR Score: 3.031


>> Enter Headline Claim:  bunch of students set college on fire


--- Phase 1: Retrieved 50 candidates via BM25.
   [Classifier Reference] Using closest training headline: 'Student accidentally sets college on fire during fireworks proposal'

--- Phase 2: Hybrid Re-ranked Results (Sorted by LTR Score) ---
   Showing top 5 of 50 results:

   1. [UNRELATED] (LTR Score: 1.6485)
      Evidence: "THIS is definitely NOT how to propose to your girlfriend. A hopeless romantic determined to see his proposal go off with a bang ended up burning down his entire college sports hall with a box of fireworks - bought to..."
--------------------------------------------------
   2. [UNRELATED] (LTR Score: 1.5331)
      Evidence: "The proposal went off with a different kind of bang than Dim Xiong Chien had expected when his fireworks set the college ablaze A blundering Romeo who wanted to propose to his girlfriend with a big bang burnt down his..."
--------------------------------------------------
   3. [UNRELATED] (LTR Score: 1.4232)
      Evidence: "He popped the qu


>> Enter Headline Claim:  students set college on fire


--- Phase 1: Retrieved 50 candidates via BM25.
   [Classifier Reference] Using closest training headline: 'Student accidentally sets college on fire during fireworks proposal'

--- Phase 2: Hybrid Re-ranked Results (Sorted by LTR Score) ---
   Showing top 5 of 50 results:

   1. [UNRELATED] (LTR Score: 1.6485)
      Evidence: "THIS is definitely NOT how to propose to your girlfriend. A hopeless romantic determined to see his proposal go off with a bang ended up burning down his entire college sports hall with a box of fireworks - bought to..."
--------------------------------------------------
   2. [UNRELATED] (LTR Score: 1.5331)
      Evidence: "The proposal went off with a different kind of bang than Dim Xiong Chien had expected when his fireworks set the college ablaze A blundering Romeo who wanted to propose to his girlfriend with a big bang burnt down his..."
--------------------------------------------------
   3. [UNRELATED] (LTR Score: 1.4232)
      Evidence: "He popped the qu


>> Enter Headline Claim:  student set college on fire accidentally


--- Phase 1: Retrieved 50 candidates via BM25.
   [Classifier Reference] Using closest training headline: 'Student accidentally sets college on fire during fireworks proposal'

--- Phase 2: Hybrid Re-ranked Results (Sorted by LTR Score) ---
   Showing top 5 of 50 results:

   1. [UNRELATED] (LTR Score: 1.6485)
      Evidence: "THIS is definitely NOT how to propose to your girlfriend. A hopeless romantic determined to see his proposal go off with a bang ended up burning down his entire college sports hall with a box of fireworks - bought to..."
--------------------------------------------------
   2. [UNRELATED] (LTR Score: 1.5331)
      Evidence: "The proposal went off with a different kind of bang than Dim Xiong Chien had expected when his fireworks set the college ablaze A blundering Romeo who wanted to propose to his girlfriend with a big bang burnt down his..."
--------------------------------------------------
   3. [UNRELATED] (LTR Score: 1.4232)
      Evidence: "He popped the qu


>> Enter Headline Claim:  Student accidentally sets college on fire during fireworks proposal


--- Phase 1: Retrieved 50 candidates via BM25.
   [Classifier Reference] Using closest training headline: 'Student accidentally sets college on fire during fireworks proposal'

--- Phase 2: Hybrid Re-ranked Results (Sorted by LTR Score) ---
   Showing top 5 of 50 results:

   1. [UNRELATED] (LTR Score: 1.6485)
      Evidence: "THIS is definitely NOT how to propose to your girlfriend. A hopeless romantic determined to see his proposal go off with a bang ended up burning down his entire college sports hall with a box of fireworks - bought to..."
--------------------------------------------------
   2. [UNRELATED] (LTR Score: 1.5331)
      Evidence: "The proposal went off with a different kind of bang than Dim Xiong Chien had expected when his fireworks set the college ablaze A blundering Romeo who wanted to propose to his girlfriend with a big bang burnt down his..."
--------------------------------------------------
   3. [UNRELATED] (LTR Score: 1.4232)
      Evidence: "He popped the qu


>> Enter Headline Claim:  flooding the dam


--- Phase 1: Retrieved 50 candidates via BM25.
   [Classifier Reference] Using closest training headline: 'Palestine accuses Israel of opening dams, flooding Gaza, forcing evacuations'

--- Phase 2: Hybrid Re-ranked Results (Sorted by LTR Score) ---
   Showing top 5 of 50 results:

   1. [UNRELATED] (LTR Score: 2.0704)
      Evidence: "GAZA, Feb. 22 (Xinhua) -- A Palestinian minister lashed out at Israel on Sunday after it opened its dams near the border with the Gaza Strip, flooding the central area of the besieged enclave with huge amounts of water. Mufid..."
--------------------------------------------------
   2. [UNRELATED] (LTR Score: 1.7848)
      Evidence: "Israel has rejected allegations by government officials in the Gaza strip that authorities were responsible for released storm waters flooding parts of the besieged area. "The claim is entirely false, and southern Israel does not have any dams," said a..."
--------------------------------------------------
   3. [UNRELATED] 


>> Enter Headline Claim:  nasa


--- Phase 1: Retrieved 50 candidates via BM25.
   [Classifier Reference] Using closest training headline: 'NASA Raises Doubts About Reports of Nicaraguan Meteorite'

--- Phase 2: Hybrid Re-ranked Results (Sorted by LTR Score) ---
   Showing top 5 of 50 results:

   1. [UNRELATED] (LTR Score: 3.2557)
      Evidence: "We reported on Monday that a meteor, thought possibly to be a chunk of an Earth-passing asteroid, was the cause of a 40-foot crater outside the international airport in the Nicaraguan capital. But astronomers and NASA scientists are now casting..."
--------------------------------------------------
   2. [UNRELATED] (LTR Score: 3.1037)
      Evidence: "An explosion and a crater reported near the capital of Nicaragua raised suspicions on Monday that a meteorite had split off from an asteroid that passed by Earth this weekend and struck our planet. But NASA scientists have now cast..."
--------------------------------------------------
   3. [DISAGREE] (LTR Score: 2.9708)
   


>> Enter Headline Claim:  meteor landing


--- Phase 1: Retrieved 50 candidates via BM25.
   [Classifier Reference] Using closest training headline: 'Meteor Leaves 40-Foot Crater Near Managua's Airport'

--- Phase 2: Hybrid Re-ranked Results (Sorted by LTR Score) ---
   Showing top 5 of 50 results:

   1. [UNRELATED] (LTR Score: 3.1015)
      Evidence: "We reported on Monday that a meteor, thought possibly to be a chunk of an Earth-passing asteroid, was the cause of a 40-foot crater outside the international airport in the Nicaraguan capital. But astronomers and NASA scientists are now casting..."
--------------------------------------------------
   2. [UNRELATED] (LTR Score: 2.9497)
      Evidence: "Updated at 11:45 p.m. ET There was an unexpected crash landing near the international airport in the Nicaraguan capital over the weekend, but luckily no one was hurt: A small meteor, thought to have broken off from an Earth-passing asteroid,..."
--------------------------------------------------
   3. [UNRELATED] (LTR Score: 2.872


>> Enter Headline Claim:  exit


Exiting system. Goodbye!


In [79]:
# Run the interactive loop
two_phase_hybrid_search3()


  FINAL HYBRID SYSTEM: BM25 (Phase 1) + LTR/SVM (Phase 2)
  Results are ranked by LTR Score (Hybrid Relevance).
  Type 'exit' or 'quit' to stop.



>> Enter Headline Claim:  gaza


--- Phase 1: Retrieved 50 candidates via BM25.
   [Classifier Reference] Using closest training headline: 'Israel opens dams, floods Gaza'

--- Phase 2: Hybrid Re-ranked Results (Sorted by LTR Score) ---
   Showing top 5 of 50 results:

   1. [UNRELATED] (LTR Score: 2.5377)
      Evidence: "GAZA, Feb. 22 (Xinhua) -- A Palestinian minister lashed out at Israel on Sunday after it opened its dams near the border with the Gaza Strip, flooding the central area of the besieged enclave with huge amounts of water. Mufid..."
--------------------------------------------------
   2. [UNRELATED] (LTR Score: 2.2695)
      Evidence: "GAZA CITY (Ma'an) -- Hundreds of Palestinians were evacuated from their homes Sunday morning after Israeli authorities opened a number of dams near the border, flooding the Gaza Valley in the wake of a recent severe winter storm. The Gaza..."
--------------------------------------------------
   3. [UNRELATED] (LTR Score: 2.0983)
      Evidence: "Hundreds of Palestinian


>> Enter Headline Claim:  exit


Exiting system. Goodbye!


In [81]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, classification_report, accuracy_score, confusion_matrix
from collections import defaultdict

# --- 1. Re-split Data to Get Test Set ---
# We assume X_hybrid, data['Stance'], and data['Headline'] are loaded from your main script
# Use the same random_state=42 to guarantee the exact same split as training.
_, X_test_hybrid, _, y_test_stance, _, test_headlines = train_test_split(
    X_hybrid,
    data['Stance'],
    data['Headline'],
    test_size=0.2,
    random_state=42
)

# 2. Map Stance Labels to Relevance Grades (0-3)
relevance_map = {'agree': 3, 'disagree': 3, 'discuss': 2, 'unrelated': 0}
y_test_rel = np.array([relevance_map[label] for label in y_test_stance])


# --- 2. Predict Relevance Scores on Test Set (Phase 2) ---
print("\n--- Starting Evaluation: Predicting Scores on Test Set ---")

# Apply the trained SelectKBest feature selection
X_test_selected = selector.transform(X_test_hybrid)

# Use the LTR Ranker to predict the relevance score for every test pair
predicted_scores = ranker.predict(X_test_selected)


# --- 3. Group Metrics by Headline (Query) ---

# Re-create a DataFrame for the test set to facilitate grouping by headline
test_df = pd.DataFrame({
    'headline': test_headlines.reset_index(drop=True),
    'score': predicted_scores,
    'true_rel': y_test_rel
})

# Dictionary to hold the true relevance grades and predicted scores for each unique query (headline)
query_metrics = defaultdict(lambda: {'true': [], 'score': []})
for _, row in test_df.iterrows():
    query_metrics[row['headline']]['true'].append(row['true_rel'])
    query_metrics[row['headline']]['score'].append(row['score'])


# --- 4. Define Evaluation Metrics ---

def calculate_ndcg(true_grades, predicted_scores):
    """Calculates NDCG@k. We use k=None (all results) for simplicity."""
    
    # Sort true grades based on predicted scores
    relevance = np.array(true_grades)[np.argsort(predicted_scores)[::-1]]
    
    # Calculate DCG
    # DCG formula: sum( (2^rel - 1) / log2(i + 1) )
    discount = np.log2(np.arange(len(relevance)) + 2) # i starts at 0, discount starts at log2(2)
    dcg = np.sum((np.power(2, relevance) - 1) / discount)

    # Calculate IDCG (Ideal DCG)
    ideal_relevance = np.sort(relevance)[::-1]
    idcg = np.sum((np.power(2, ideal_relevance) - 1) / discount)
    
    return dcg / idcg if idcg > 0 else 0.0

def calculate_map(true_grades, predicted_scores):
    """Calculates Average Precision (AP) for a single query."""
    # MAP requires binary relevance (rel > 0 is relevant)
    true_binary = (np.array(true_grades) > 0).astype(int)
    
    # average_precision_score requires sorting by the score
    return average_precision_score(true_binary, predicted_scores)


# --- 5. Calculate Overall Metrics ---

map_scores = []
ndcg_scores = []
total_queries = len(query_metrics)

for query, metrics in query_metrics.items():
    # Only evaluate queries with at least one relevant document for non-zero AP
    if np.sum(metrics['true']) > 0:
        map_scores.append(calculate_map(metrics['true'], metrics['score']))
        ndcg_scores.append(calculate_ndcg(metrics['true'], metrics['score']))


final_map = np.mean(map_scores)
final_ndcg = np.mean(ndcg_scores)

# --- 5.5. Calculate SVM Classification Metrics ---

# 1. Predict Stance Labels on the Test Set
# We use the trained svm_clf and the same selected test features X_test_selected
predicted_stances = svm_clf.predict(X_test_selected)

# 2. Get True Stance Labels
# y_test_stance holds the true categorical labels ('agree', 'unrelated', etc.)
true_stances = y_test_stance.reset_index(drop=True)

# 3. Calculate Overall Accuracy
svm_accuracy = accuracy_score(true_stances, predicted_stances)

# 4. Generate Detailed Report (Precision, Recall, F1-score for each class)
# classification_report generates a comprehensive text summary.
svm_classification_report = classification_report(true_stances, predicted_stances, zero_division=0)

# 5. Generate Confusion Matrix (Optional, but highly informative)
# The labels list ensures the matrix is ordered consistently.
labels_order = ['agree', 'disagree', 'discuss', 'unrelated']
conf_matrix = confusion_matrix(true_stances, predicted_stances, labels=labels_order)

# Convert confusion matrix to a readable format (e.g., a DataFrame)
conf_matrix_df = pd.DataFrame(conf_matrix, index=labels_order, columns=[f'Pred {l}' for l in labels_order])


# --- 6. Display Results (Updated) ---
print("\n" + "="*50)
print("¬† ¬† ¬† ¬† ¬† ¬† ¬†üîç SYSTEM EVALUATION üìä")
print("="*50)
print(f"Total Test Queries Evaluated: {total_queries}")
print("-" * 50)

## LTR RANKING METRICS
print("--- LTR RANKING METRICS ---")
print(f"Mean Average Precision (MAP): {final_map:.4f}")
print(f"Normalized Discounted Cumulative Gain (NDCG): {final_ndcg:.4f}")
print("-" * 50)

## SVM CLASSIFICATION METRICS
print("--- SVM CLASSIFICATION METRICS ---")
print(f"Overall Accuracy: {svm_accuracy:.4f}\n")
print("Classification Report (Precision/Recall/F1-score):\n")
print(svm_classification_report)
print("\nConfusion Matrix (Rows=True Label, Columns=Predicted Label):\n")
print(conf_matrix_df)
print("="*50)


--- Starting Evaluation: Predicting Scores on Test Set ---

¬† ¬† ¬† ¬† ¬† ¬† ¬†üîç SYSTEM EVALUATION üìä
Total Test Queries Evaluated: 1572
--------------------------------------------------
--- LTR RANKING METRICS ---
Mean Average Precision (MAP): 0.9989
Normalized Discounted Cumulative Gain (NDCG): 0.9977
--------------------------------------------------
--- SVM CLASSIFICATION METRICS ---
Overall Accuracy: 0.9047

Classification Report (Precision/Recall/F1-score):

              precision    recall  f1-score   support

       agree       0.56      0.60      0.58       734
    disagree       0.29      0.83      0.43       170
     discuss       0.84      0.72      0.78      1763
   unrelated       1.00      0.98      0.99      7255

    accuracy                           0.90      9922
   macro avg       0.67      0.78      0.69      9922
weighted avg       0.93      0.90      0.91      9922


Confusion Matrix (Rows=True Label, Columns=Predicted Label):

           Pred agree  Pr

In [80]:
import gradio as gr
from typing import List, Dict, Any

# --- NOTE: REPLICATE THE run_hybrid_search FUNCTION FOR GRADIO ---
# We must use a dedicated function that returns a string (Markdown) for Gradio output.
def run_hybrid_search_for_gradio(user_query: str) -> str:
    """Adapts the two_phase_hybrid_search logic to return a formatted string for Gradio."""
    global bm25_model, w2v_model, selector, ranker, svm_clf, data, bodies_df, headline_vectors
    
    # Quick checks for initialization (useful if not all setup was run)
    if not all([bm25_model, w2v_model, selector, ranker, svm_clf]):
        return "System not fully initialized. Please ensure all training steps ran successfully."
    if not user_query.strip():
        return "Please enter a headline claim."

    q_tokens = preprocess(user_query)
    q_vec = get_doc_vector(q_tokens).astype('float32')

    # --- PHASE 1: LEXICAL RETRIEVAL (BM25) ---
    CANDIDATE_POOL_SIZE = 50 
    bm25_all_scores = bm25_model.get_scores(q_tokens)
    bm25_indices = np.argsort(bm25_all_scores)[::-1][:CANDIDATE_POOL_SIZE]

    if len(bm25_indices) == 0 or (len(bm25_all_scores) > 0 and bm25_all_scores[bm25_indices[0]] < 0.1):
        return "No strongly relevant documents found via BM25."
        
    # --- FEATURE ALIGNMENT FIX (Using closest training headline vector) ---
    headline_similarities = cosine_similarity(q_vec.reshape(1, -1), headline_vectors)[0]
    closest_headline_idx = np.argmax(headline_similarities)
    best_h_vec = headline_vectors[closest_headline_idx]
    best_h_text = data.iloc[closest_headline_idx]['Headline']

    # --- PHASE 2: HYBRID RANKING & CLASSIFICATION (LTR/SVM) ---
    candidate_features = []
    candidate_bodies = []

    for idx in bm25_indices:
        body_id = bodies_df.loc[idx, 'Body ID']
        # Locate the body's full row in the *merged* data frame for vector access
        body_row_full = data[data['Body ID'] == body_id].iloc[0] 

        b_vec = get_doc_vector(body_row_full['processed_body'])
        body_text = body_row_full['articleBody']
        
        # Feature vector generation
        cos_sim = cosine_similarity(best_h_vec.reshape(1, -1), b_vec.reshape(1, -1))[0][0]
        bm25_score = bm25_all_scores[idx] 
        feat = np.concatenate([best_h_vec, b_vec, [cos_sim], [bm25_score]])
        
        candidate_features.append(feat)
        candidate_bodies.append(body_text)

    candidate_features = np.array(candidate_features)
    
    # Prediction
    candidate_features_selected = selector.transform(candidate_features)
    predicted_stances = svm_clf.predict(candidate_features_selected)
    ranking_scores = ranker.predict(candidate_features_selected)
    
    # Combine and Sort Results
    results = zip(ranking_scores, predicted_stances, candidate_bodies)
    sorted_results = sorted(results, key=lambda x: x[0], reverse=True)

    # --- Format Output for Gradio (Markdown) ---
    output_markdown = f"## üîé Hybrid Search Results\n"
    output_markdown += f"--- Phase 1: Retrieved {len(bm25_indices)} candidates via BM25 ---\n\n"
    output_markdown += f"**Classifier Reference Headline:** '{best_h_text}'\n\n"
    output_markdown += f"--- Phase 2: Ranked by LTR Score (Top 5 of {len(sorted_results)}) ---\n\n"

    for i, (score, stance, body) in enumerate(sorted_results[:5]):
        snippet = " ".join(body.split()[:50]) + "..."
        stance_display = f"**[{stance.upper()}]**"
        
        output_markdown += f"### {i+1}. {stance_display} (LTR Score: {score:.4f})\n"
        output_markdown += f"> Evidence: \"{snippet}\"\n\n"
        output_markdown += f"---\n"
        
    return output_markdown.strip()


# Define the Gradio Interface
iface = gr.Interface(
    fn=run_hybrid_search_for_gradio, 
    
    inputs=gr.Textbox(
        lines=2, 
        placeholder="Enter a query of Headline claim ", 
        label="Headline Claim"
    ),
    
    outputs=gr.Markdown(label="Hybrid IR System Output"),
    
    title="News Retrieval system with stance detection",
    description="Query the trained two-phase system (BM25 Retrieval, LTR Ranking & SVM classification).",
    # theme="soft",
    # allow_flagging="never"
)

# Launch the interface
print("\nLaunching Gradio Interface...")
iface.launch()


Launching Gradio Interface...
* Running on local URL:  http://127.0.0.1:7865
* To create a public link, set `share=True` in `launch()`.


