In [1]:
import os
import sys
import pandas as pd
import torch
import spacy
import wikipediaapi
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from dotenv import load_dotenv

# --- 1. Environment Setup ---
# Dynamically find the project root (one level up from 'demo' folder)
current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
sys.path.append(project_root)

# Load environment variables
load_dotenv(os.path.join(project_root, '.env'))

# Define Paths
MODEL_PATH = os.path.join(project_root, "models", "rumor_model")
TEST_DATA_PATH = os.path.join(project_root, "data", "processed", "test_dataset.csv")

print(f"‚úÖ Environment Ready.\nüìÇ Project Root: {project_root}")

‚úÖ Environment Ready.
üìÇ Project Root: c:\Users\matin\Desktop\uni\nlp\project\rumor-detection-elkp


In [2]:
class RumorDetector:
    """
    Wraps the ELKP pipeline: Knowledge Injection + BERT Inference.
    """
    
    LABEL_MAP = {0: "NON-RUMOR (Real)", 1: "RUMOR (Fake)"}
    
    def __init__(self, model_path):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"‚è≥ Loading Model on {self.device.upper()}...")
        
        # 1. Load AI Model
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
            self.model.to(self.device)
            self.model.eval() # Set to evaluation mode
        except OSError:
            raise FileNotFoundError(f"‚ùå Model not found at {model_path}. Did you train it?")

        # 2. Load Knowledge Tools
        print("‚è≥ Loading Knowledge Base (SpaCy & Wiki)...")
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            os.system("python -m spacy download en_core_web_sm")
            self.nlp = spacy.load("en_core_web_sm")
            
        self.wiki = wikipediaapi.Wikipedia(
            language='en', 
            user_agent='RumorDemo/1.0 (student@uni.edu)'
        )
        print("‚úÖ Detector Initialized Successfully.")

    def predict(self, text: str) -> dict:
        """
        Full pipeline: Text -> Entity -> Wiki -> Knowledge Prompt -> BERT -> Result
        """
        # Step A: Knowledge Injection
        knowledge = self._fetch_knowledge(text)
        
        if knowledge:
            prompt = f"Knowledge: {knowledge} [SEP] Tweet: {text}"
        else:
            prompt = f"Tweet: {text}"

        # Step B: Model Inference
        inputs = self.tokenizer(
            prompt, 
            return_tensors="pt", 
            truncation=True, 
            max_length=128
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            pred_idx = torch.argmax(probs).item()
            confidence = probs[0][pred_idx].item()

        return {
            "original_text": text,
            "knowledge_found": knowledge,
            "prediction_label": self.LABEL_MAP[pred_idx],
            "prediction_code": pred_idx,
            "confidence": confidence
        }

    def _fetch_knowledge(self, text):
        """Helper to extract entity and query Wikipedia."""
        doc = self.nlp(text)
        if not doc.ents:
            return None
        
        # Taking the first major entity
        entity = doc.ents[0].text
        try:
            page = self.wiki.page(entity)
            if page.exists():
                return f"Entity: {entity} | Info: {page.summary[:200]}..."
        except:
            return None
        return None

# Instantiate the engine once
detector = RumorDetector(MODEL_PATH)

‚è≥ Loading Model on CPU...


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

‚è≥ Loading Knowledge Base (SpaCy & Wiki)...
‚úÖ Detector Initialized Successfully.


In [3]:
def visualize_result(result: dict, true_label: int = None):
    """Prints the prediction results in a clean, human-readable format."""
    print("\n" + "="*60)
    print(f"üìù INPUT TWEET:\n\"{result['original_text']}\"")
    print("-" * 60)
    
    # Show Knowledge
    if result['knowledge_found']:
        print(f"üß† KNOWLEDGE INJECTED:\n   {result['knowledge_found']}")
    else:
        print("‚ö™ No external knowledge found (Pure text analysis).")
    
    print("-" * 60)
    
    # Show Prediction
    print(f"ü§ñ AI PREDICTION:  {result['prediction_label']}")
    print(f"üìä CONFIDENCE:     {result['confidence']:.2%}")
    
    # Verification (if we know the truth)
    if true_label is not None:
        truth_str = detector.LABEL_MAP[true_label]
        is_correct = (result['prediction_code'] == true_label)
        icon = "‚úÖ CORRECT" if is_correct else "‚ùå INCORRECT"
        print(f"üéØ ACTUAL LABEL:   {truth_str} -> {icon}")
    
    print("="*60 + "\n")

In [4]:
# Load the unseen test dataset
if os.path.exists(TEST_DATA_PATH):
    test_df = pd.read_csv(TEST_DATA_PATH)
    
    # Pick 1 random sample
    sample = test_df.sample(1).iloc[0]


    raw_text = sample['text']
    if "[SEP] Tweet:" in raw_text:
        raw_text = raw_text.split("[SEP] Tweet:")[-1].strip()


    # Run the Pipeline
    result = detector.predict(raw_text)
    
    # Visualize
    visualize_result(result, true_label=sample['label'])
else:
    print("‚ùå Test data not found. Please run main.py first.")


üìù INPUT TWEET:
"Tweet: "At least 1 gunman" in #SydneySiege (image of suspect not verified) No injuries known - police http://t.co/XLklHFHCT3 http://t.co/Rxh3RH2RMS"
------------------------------------------------------------
‚ö™ No external knowledge found (Pure text analysis).
------------------------------------------------------------
ü§ñ AI PREDICTION:  RUMOR (Fake)
üìä CONFIDENCE:     98.77%
üéØ ACTUAL LABEL:   RUMOR (Fake) -> ‚úÖ CORRECT



In [13]:
# Load the unseen test dataset
if os.path.exists(TEST_DATA_PATH):
    test_df = pd.read_csv(TEST_DATA_PATH)
    
    # Pick 1 random sample
    sample = test_df.sample(1).iloc[0]


    raw_text = sample['text']
    if "[SEP] Tweet:" in raw_text:
        raw_text = raw_text.split("[SEP] Tweet:")[-1].strip()


    # Run the Pipeline
    result = detector.predict(raw_text)
    
    # Visualize
    visualize_result(result, true_label=sample['label'])
else:
    print("‚ùå Test data not found. Please run main.py first.")


üìù INPUT TWEET:
"The PM's office releases a statement about  #sydneysiege. http://t.co/7NdqPYhwcY http://t.co/jeYdlwywO7"
------------------------------------------------------------
üß† KNOWLEDGE INJECTED:
   Entity: about  # | Info: About may refer to:

About (surname)
About.com, an online source for original information and advice
about.me, a personal web hosting service
About URI scheme, an internal URI scheme
About box, a dial...
------------------------------------------------------------
ü§ñ AI PREDICTION:  NON-RUMOR (Real)
üìä CONFIDENCE:     98.44%
üéØ ACTUAL LABEL:   RUMOR (Fake) -> ‚ùå INCORRECT



In [5]:
# Example: A made-up rumor or real news
custom_text = "Breaking: The Eiffel Tower has been sold to a private company."

# Run Prediction
result = detector.predict(custom_text)
visualize_result(result)


üìù INPUT TWEET:
"Breaking: The Eiffel Tower has been sold to a private company."
------------------------------------------------------------
üß† KNOWLEDGE INJECTED:
   Entity: The Eiffel Tower | Info: The Eiffel Tower (  EYE-f…ôl; French: Tour Eiffel [tu Å …õf…õl] ) is a lattice tower on the Champ de Mars in Paris, France. It is named after the engineer Gustave Eiffel, whose company designed and built ...
------------------------------------------------------------
ü§ñ AI PREDICTION:  RUMOR (Fake)
üìä CONFIDENCE:     99.85%

