<a href="https://colab.research.google.com/github/muskancodebase/skills-copilot-codespaces-vscode/blob/main/Fake_News_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers requests beautifulsoup4



In [6]:
# --- All Necessary Imports ---
import requests
from transformers import pipeline
from urllib.parse import urlparse
!pip install trafilatura -q
import trafilatura

# --- 1. Define Authentic Sources ---
# A set of reputable news sources and official institutions for fast lookups.
AUTHENTIC_SOURCES = {
    'reuters.com',
    'apnews.com',
    'bbc.com',
    'aljazeera.com',
    'dawn.com',
    'tribune.com.pk',
    'geo.tv',
    'worldbank.org',  # Official institution
    'sbp.org.pk'      # State Bank of Pakistan
}

# --- 2. Web Search Function ---
def search_web(query):
    """
    Searches the web and returns the top 5 result URLs.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(f"https://html.duckduckgo.com/html/?q={query}", headers=headers)
        response.raise_for_status()
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', class_='result__a')]
        return links[:5]
    except requests.exceptions.RequestException as e:
        print(f"Error during web search: {e}")
        return []

# --- 3. Scrape and Summarize Function (with Trafilatura) ---
def scrape_and_summarize(url):
    """
    Scrapes content using trafilatura, chunks it, and summarizes it.
    This version is more robust and uses the CPU to avoid CUDA errors.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        content = trafilatura.extract(response.text)

        if not content or len(content.split()) < 50:
            return None

        # --- KEY CHANGE: Forcing the pipeline to use the CPU (device=-1) for stability ---
        summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=-1)

        tokenizer = summarizer.tokenizer
        # Truncate the initial tokenization to a max length to prevent overload
        tokens = tokenizer.encode(content, truncation=True, max_length=8192)

        # The model's absolute max input is 1024, so we chunk below that
        chunk_size = 1000

        text_chunks = [
            tokenizer.decode(tokens[i:i + chunk_size], skip_special_tokens=True)
            for i in range(0, len(tokens), chunk_size)
        ]

        summaries = []
        for chunk in text_chunks:
            # Added a try-except block here for extra safety on a per-chunk basis
            try:
                chunk_summary = summarizer(chunk, max_length=100, min_length=20, do_sample=False)
                summaries.append(chunk_summary[0]['summary_text'])
            except Exception as e:
                print(f"    - Could not summarize a chunk. Error: {e}")
                continue # Move to the next chunk

        if not summaries:
            return None

        combined_summary = " ".join(summaries)

        # Final summary of the combined summaries
        final_summary = summarizer(combined_summary, max_length=150, min_length=30, do_sample=False)
        return final_summary[0]['summary_text']

    except Exception as e:
        print(f"  - Could not process {url[:50]}... Error: {e}")
        return None


In [10]:
def detect_fake_news(article_text):
    """
    Detects fake news and returns a verdict along with URLs from authentic sources as proof.
    """
    print("--- Starting Intelligent Fake News Detection ---")

    # 1. Extract the main claim using a summarizer for accuracy
    print("🎯 Extracting main claim from article...")
    # --- KEY CHANGE: Forcing this pipeline to use the CPU (device=-1) for stability ---
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=-1)

    main_claim = summarizer(article_text, max_length=60, min_length=15, do_sample=False)[0]['summary_text']
    print(f"🎯 Main Claim Identified: \"{main_claim}\"")

    # 2. Search the web
    print("\n🌐 Searching the web for evidence...")
    search_urls = search_web(main_claim)
    if not search_urls:
        return {"verdict": "Uncertain (Web Search Failed)", "proof_urls": []}

    # 3. Scrape, summarize, and track successful sources
    print("\n✍️ Summarizing evidence from top sources...")
    summaries = []
    successful_urls = []
    for url in search_urls:
        summary = scrape_and_summarize(url)
        if summary:
            summaries.append(summary)
            successful_urls.append(url)

    if not summaries:
        return {"verdict": "❌ Likely Fake News (No usable evidence found online)", "proof_urls": []}

    evidence = " ".join(summaries)
    print(f'\n💡 Compiled Evidence: "{evidence[:400]}..."')

    # 4. Verify the claim against the evidence using an NLI model
    print("\n🧠 Verifying claim against evidence...")
    # We can also force the NLI model to the CPU for maximum stability
    nli_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=-1)

    result = nli_classifier(evidence, candidate_labels=[main_claim])
    top_score = result['scores'][0]
    print(f"NLI Result: The evidence score for the claim is {top_score:.2f}")

    # 5. Filter the successful URLs to find authentic ones
    proof_urls = []
    for url in successful_urls:
        domain = urlparse(url).netloc
        if domain.replace('www.', '') in AUTHENTIC_SOURCES:
            proof_urls.append(url)

    # 6. Make a final judgment and return it as a dictionary
    if top_score > 0.70:
        verdict = "✅ Likely Real News (Evidence supports the claim)"
    elif top_score < 0.30:
        verdict = "❌ Likely Fake News (Evidence does not support the claim)"
    else:
        verdict = "🤔 Uncertain (Evidence is inconclusive)"

    return {"verdict": verdict, "proof_urls": proof_urls}

In [11]:
# --- Test Case 1: A made-up, sensational claim ---
fake_news_article = "Body: Islamabad, Pakistan – In a shocking move that has stunned global financial markets, the State Bank of Pakistan (SBP) has secretly finalized plans to phase out the Pakistani Rupee and introduce a new gold-backed digital currency, tentatively named the PakCoin. Sources from within the Ministry of Finance claim the transition, set to begin next year, will completely replace the Rupee by 2026. An unnamed official allegedly stated, This is the future. Every PakCoin will be digitally tied to a physical gram of gold held in our national reserves. This will eliminate inflation overnight and make Pakistan a global economic powerhouse. The report suggests a massive, undisclosed gold purchase from Russia and China is already underway to support the new currency. Financial experts are reportedly baffled, as no official announcement has been made through proper channels."

print("🧪 Testing with a FAKE news article...")
result_data = detect_fake_news(fake_news_article)

print(f"\\n====================\\nFinal Verdict: The article is {result_data['verdict']}\\n====================")
if result_data['proof_urls']:
    print("\\nSupporting evidence from authentic sources found:")
    for url in result_data['proof_urls']:
        print(f"- {url}")
else:
    print("\\nNo supporting evidence from authentic sources was found.")


print("\\n\\n" + "="*50 + "\\n\\n")


# --- Test Case 2: A plausible, real-world topic ---
real_news_article = "Headline: Pakistan Secures World Bank Financing for Karachi Water and Sewerage Improvement Project Body: Islamabad, Pakistan – The Government of Pakistan has signed a financing agreement with the World Bank for the Karachi Water and Sewerage Services Improvement Project (KWSSIP) - Phase II. The project aims to improve access to safe water and sanitation services for residents of Karachi, Pakistan's largest city. The financing will support the Karachi Water and Sewerage Corporation (KWSC) in rehabilitating water distribution networks, upgrading pumping stations, and improving the city's wastewater treatment infrastructure. According to the Ministry of Economic Affairs, this initiative is crucial for enhancing climate resilience and public health in the sprawling metropolis. This project continues the efforts of Phase I, focusing on institutional reforms and infrastructure upgrades to create a more efficient and reliable water utility for the city's millions of residents."

print("🧪 Testing with a REAL news article...")
result_data = detect_fake_news(real_news_article)

print(f"\\n====================\\nFinal Verdict: The article is {result_data['verdict']}\\n====================")
if result_data['proof_urls']:
    print("\\nSupporting evidence from authentic sources found:")
    for url in result_data['proof_urls']:
        print(f"- {url}")
else:
    print("\\nNo supporting evidence from authentic sources was found.")

🧪 Testing with a FAKE news article...
--- Starting Intelligent Fake News Detection ---
🎯 Extracting main claim from article...


Device set to use cpu


🎯 Main Claim Identified: " The State Bank of Pakistan (SBP) has secretly finalized plans to phase out the Pakistani Rupee and introduce a new gold-backed digital currency, tentatively named the PakCoin . Sources from within the Ministry of Finance claim the transition, set to begin next year, will completely replace"

🌐 Searching the web for evidence...

✍️ Summarizing evidence from top sources...


Device set to use cpu
Your max_length is set to 150, but your input_length is only 100. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Device set to use cpu
Your max_length is set to 150, but your input_length is only 57. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Device set to use cpu
Your max_length is set to 150, but your input_length is only 70. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Device set to use cpu
Your max_length is set to 150, but your input_length is only 71. Since this is a summarization task, where outputs shorter than the input are typically wanted, you 


💡 Compiled Evidence: " More than 20% of Pakistan’s foreign direct investment in Pakistan;- 25% of the foreign capital in Pakistan . - Foreign direct investment by sector;- 50% of foreign loans to private sector business;- 70% of Pakistani Direct Direct Investment .  The State Bank of Pakistan (SBP) has officially announced that newly designed currency notes will begin circulating in Pakistan 2025 . The new currency not..."

🧠 Verifying claim against evidence...


Device set to use cpu


NLI Result: The evidence score for the claim is 0.02
\nSupporting evidence from authentic sources found:
- https://www.sbp.org.pk/index.html
- https://tribune.com.pk/story/2454809/sbp-all-set-to-replace-currency-notes-in-circulation-to-curb-corruption
🧪 Testing with a REAL news article...
--- Starting Intelligent Fake News Detection ---
🎯 Extracting main claim from article...


Device set to use cpu


🎯 Main Claim Identified: " Pakistan Secures World Bank Financing for Karachi Water and Sewerage Improvement Project . Project aims to improve access to safe water and sanitation services for residents of Karachi . Project continues efforts of Phase I, focusing on institutional reforms and infrastructure upgrades ."

🌐 Searching the web for evidence...

✍️ Summarizing evidence from top sources...


Device set to use cpu
Your max_length is set to 150, but your input_length is only 78. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)
Device set to use cpu
Your max_length is set to 150, but your input_length is only 57. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Device set to use cpu
Your max_length is set to 150, but your input_length is only 64. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)
Device set to use cpu
Your max_length is set to 150, but your input_length is only 61. Since this is a summarization task, where outputs shorter than the input are typically wanted, you m


💡 Compiled Evidence: " World Bank has approved $240 million in financing for the Second Karachi Water and Sewerage Services Improvement Project (KWSSIP-2) The project will provide safely managed water supply to nearly 16 million people and sanitation services to nearly 7.5 million people in Karachi by 2030 .  World Bank has approved $240 million financing for the second Karachi Water and Sewerage Services Improvement P..."

🧠 Verifying claim against evidence...


Device set to use cpu


NLI Result: The evidence score for the claim is 0.84
\nSupporting evidence from authentic sources found:
- https://www.worldbank.org/en/news/press-release/2024/12/11/pakistan-world-bank-approves-240-million-to-support-investments-in-water-sanitation-and-hygiene-services-in-karachi
- https://www.dawn.com/news/1878611
- https://tribune.com.pk/story/2515954/world-bank-approves-240m-to-improve-karachis-water
