In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Claim Verification with RAG and Gemini

## Problem Breakdown
**Goal:**
The goal of this project is to **fact-check a user-provided claim** by:

* Searching the web for relevant evidence,
* Using embeddings + vector search (**RAG**) to filter the most relevant evidence,
* Passing that evidence to a large language model (**Gemini 1.5 Pro**),
* Returning a structured **verdict** (true, false, or uncertain) along with an **explanation**, **supporting evidence**, and **source URLs**,
* Providing an intuitive **Gradio UI** to interact with the system.


## Tools & Infrastructure 


**Core AI Component:**
* Perform real-time Google searches through a programmable API (`langchain_google_community.GoogleSearchAPIWrapper`)
* Convert text documents into vector embeddings for similarity search (`langchain_huggingface.HuggingFaceEmbeddings`)
* Wrap raw search results as Document objects with metadata (`langchain.docstore.document.Document`)
* Store and retrieve documents using vector similarity (RAG retrieval) (`langchain.vectorstores.FAISS`)
* Run Gemini LLM to generate explanations and fact-check claims (`google.generativeai.GenerativeModel`)

**Deployment:**
* Build a web UI for interactive claim testing (`Gradio`)

# Code

## Install Dependencies

In [2]:
!pip install -q langchain sentence-transformers faiss-cpu langchain_huggingface gradio langchain_google_community

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.9/46.9 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.2/322.2 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.6/99.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.1/160.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m80.0 MB/s[0m eta [36m0:00

## Setup & Configuration
1. Get Google API Key
(⚠️ **Make sure the key is unrestricted, or that its restrictions allow access to the Custom Search API**)

2. Get Custom Search Engine (CSE) ID

3. Add the Google API key and the CSE ID as *secrets* in Kaggle: go to Add-ons > Secrets > Add Secret and add two secrets named `GOOGLE_API_KEY` and `GOOGLE_CSE_ID`.


In [3]:
from kaggle_secrets import UserSecretsClient

# Read both credentials from Kaggle's secret store through one shared client.
secrets_client = UserSecretsClient()
GOOGLE_API_KEY = secrets_client.get_secret("GOOGLE_API_KEY")
GOOGLE_CSE_ID = secrets_client.get_secret("GOOGLE_CSE_ID")

### Model Configuration
The first block (optional, left commented out) initializes `genai.Client` from the `google` package using the provided API key (`GOOGLE_API_KEY`). It then lists all available models that support the `generateContent` action and prints their names.

The second block configures a specific generative model, `gemini-1.5-pro`, using the `google.generativeai` package. The model is set up by calling `configure` with the API key, enabling access to the selected generative model for subsequent content generation tasks.

In [None]:
# ------- OPTIONAL ------
# from google import genai
# client = genai.Client(api_key=GOOGLE_API_KEY)

# for m in client.models.list():
#     if "generateContent" in m.supported_actions:
#         print(m.name)

In [4]:
from google.generativeai import configure, GenerativeModel

# Name of the Gemini model used for the fact-checking calls.
GEMINI_MODEL_NAME = "gemini-1.5-pro"

# Register the API key with the SDK before creating the model handle.
configure(api_key=GOOGLE_API_KEY)
model = GenerativeModel(GEMINI_MODEL_NAME)

### Search Configuration
Initialize the `GoogleSearchAPIWrapper` from the `langchain_google_community` package, which allows integration with Google Custom Search. It uses the provided Google API key (`GOOGLE_API_KEY`) and Custom Search Engine ID (`GOOGLE_CSE_ID`) to perform searches and retrieve relevant results for the given queries.

In [5]:
from langchain_google_community import GoogleSearchAPIWrapper

# Google Custom Search client, authenticated with the key and engine id loaded from secrets.
search = GoogleSearchAPIWrapper(
    google_api_key=GOOGLE_API_KEY,
    google_cse_id=GOOGLE_CSE_ID,
)

### Embedding Model Configuration 
Use the `HuggingFaceEmbeddings` class from `langchain_huggingface` to load a pre-trained embedding model. Set the `model_name` to `"sentence-transformers/all-MiniLM-L6-v2"` for efficient and high-quality sentence embeddings, which will be used to convert text into vector representations for retrieval and similarity comparison.

In [6]:
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-embedding model used to vectorize the search snippets.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Prompt Instruction
Define a fact-checking prompt for the language model to verify a claim using retrieved online evidence. The prompt asks the model to return a structured JSON response including:

* A clear **verdict**: "true", "false", or "uncertain"
* A short **explanation** supported by evidence
* A list of **key quotes** from the evidence
* A list of **source URLs** the quotes came from

This ensures transparency and traceability in the fact-checking result.

In [7]:
# Prompt template sent to the LLM. It has two `str.format` placeholders,
# `{claim}` and `{evidence_formatted}`, and asks for a JSON reply with the
# keys `verdict`, `explanation`, `evidence_used` and `sources`.
ANALYSIS_PROMPT = """
You are a fact-checking expert. Your task is to verify the truth of a claim based on online evidence.

Claim:
"{claim}"

Evidence (retrieved from web search):
{evidence_formatted}

Respond in JSON format with:
- verdict: "true", "false", or "uncertain"
- explanation: a concise explanation justifying the verdict using quotes from evidence
- evidence_used: list of key quotes that support your verdict
- sources: list of URLs the quotes came from
"""

### Vector Store Construction
Use `LangChain`'s `Document` and `FAISS` to build a vector store from Google search results:

* Wrap each result's **snippet** and **link** into a Document, storing the **link** in the metadata.
* Collect all documents and convert them into embeddings using the selected embedding model.
* Store the embeddings in a FAISS vectorstore, allowing efficient similarity search for relevant evidence later.

In [8]:
from langchain.docstore.document import Document
from langchain.vectorstores import FAISS

def build_vector_store(claim, search_results):
    """Index web search results in a FAISS vector store.

    Each usable result becomes one ``Document`` whose text is
    ``"<snippet> (Source: <link>)"`` and whose metadata stores the link
    under the ``"source"`` key.

    Args:
        claim: The claim being checked. Currently unused; kept so existing
            callers keep working.
        search_results: Iterable of result dicts, expected to carry
            ``"snippet"`` and ``"link"`` keys.

    Returns:
        The FAISS vector store built from the usable results with the
        module-level ``embedding_model``.

    Raises:
        ValueError: If no result has both a snippet and a link, so there is
            nothing to index.
    """
    documents = []

    for result in search_results:
        # NOTE(review): the search wrapper appears to omit "snippet" for some
        # hits and to return a placeholder entry when nothing is found --
        # confirm against its documentation. Such entries are skipped instead
        # of raising KeyError.
        snippet = result.get("snippet")
        link = result.get("link")
        if not snippet or not link:
            continue

        text = f"{snippet} (Source: {link})"
        documents.append(Document(page_content=text, metadata={"source": link}))

    if not documents:
        # Fail with a clear message rather than an opaque error from the index builder.
        raise ValueError("No usable search results (with snippet and link) to index.")

    return FAISS.from_documents(documents, embedding_model)

In [26]:
import time
import json

def _extract_json_payload(raw_text: str) -> str:
    """Return the JSON text embedded in an LLM reply.

    Prefers the span from the first ``{`` to the last ``}``, which drops
    Markdown code fences and any prose around the object. If no object is
    present, only a surrounding code fence is removed.
    """
    text = raw_text.strip()

    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        return text[start:end + 1]

    # No JSON object found: drop an opening fence line and a closing fence.
    # (str.strip("```json") would remove a *set of characters*, not a prefix.)
    if text.startswith("```"):
        newline = text.find("\n")
        text = text[newline + 1:] if newline != -1 else text[3:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def verify_claim(claim: str):
    """Fact-check ``claim`` with web search, vector retrieval and the LLM.

    Steps: search the web for the claim, index the results with
    ``build_vector_store``, keep the 5 most similar documents, fill
    ``ANALYSIS_PROMPT`` with them, call the LLM and parse its reply as JSON.
    Progress and failures are printed with elapsed time.

    Args:
        claim: The statement to verify.

    Returns:
        The parsed JSON reply (normally a dict with ``verdict``,
        ``explanation``, ``evidence_used`` and ``sources``), or ``None`` if
        any stage fails. No exception is propagated to the caller.
    """
    print(f"🧐 Testing claim: '{claim}'")
    start_time = time.time()

    try:
        print(f"[{time.time() - start_time:.1f}s] Starting search...")
        search_results_raw = search.results(claim, num_results=10)  # raw dicts
        print(f"✅ Search completed ({len(search_results_raw)} results, {time.time() - start_time:.1f}s)")
    except Exception as e:
        print(f"❌ Search failed: {type(e).__name__}: {str(e)}")
        return None

    # Retrieval is reported separately so indexing errors are not blamed on the LLM.
    try:
        print(f"[{time.time() - start_time:.1f}s] Building vector store...")
        vectorstore = build_vector_store(claim, search_results_raw)

        print(f"[{time.time() - start_time:.1f}s] Retrieving relevant chunks...")
        docs = vectorstore.similarity_search(claim, k=5)
    except Exception as e:
        print(f"❌ Retrieval failed: {type(e).__name__}: {str(e)}")
        return None

    try:
        # Format evidence as a quoted bullet list, one retrieved document per bullet.
        evidence_formatted = "\n\n".join(
            [f"- \"{doc.page_content}\"" for doc in docs]
        )
        prompt = ANALYSIS_PROMPT.format(claim=claim, evidence_formatted=evidence_formatted)

        print(f"[{time.time() - start_time:.1f}s] Calling LLM...")
        response = model.generate_content(contents=[{"parts": [{"text": prompt}]}])
        raw_text = response.text
        print(f"✅ LLM completed ({time.time() - start_time:.1f}s)")
    except Exception as e:
        print(f"❌ LLM failed: {type(e).__name__}: {str(e)}")
        return None

    try:
        return json.loads(_extract_json_payload(raw_text))
    except Exception as e:
        print(f"❌ Could not parse LLM response as JSON: {type(e).__name__}: {str(e)}")
        return None


In [25]:
# Smoke-test the pipeline on one sample claim and pretty-print the parsed verdict.
demo_claim = "Landing on moon was fake"
result = verify_claim(demo_claim)
print("📝 Result:", json.dumps(result, indent=2))

🧐 Testing claim: 'Landing on moon was fake'
[0.0s] Starting search...
✅ Search completed (10 results, 0.3s)
[0.3s] Building vector store...
[0.6s] Retrieving relevant chunks...
[0.6s] Calling LLM...
✅ LLM completed (7.0s)
📝 Result: {
  "verdict": "false",
  "explanation": "Multiple reliable sources debunk the claim that the moon landing was faked. The provided evidence includes articles that address and refute common conspiracy theories, such as concerns about the Van Allen belts, the lack of stars in photographs, and the footprint/boot discrepancy.  While some sources present the arguments of conspiracy theorists, they do so in the context of debunking them. For example, the IOP article explains how the Apollo missions navigated the Van Allen belts.  Furthermore, the Reddit comment is not a credible source.",
  "evidence_used": [
    "\"Perhaps the most convincing argument that the landings were faked has to do with something called the Van Allen belts...\" (This source goes on to exp

In [27]:
import gradio as gr

def _as_list(value):
    """Normalize an LLM-provided field to a list (a bare string becomes one item)."""
    if not value:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def explain_claim(claim):
    """Run ``verify_claim`` and render its result as Markdown for the UI.

    Args:
        claim: The statement entered by the user.

    Returns:
        A Markdown string with the verdict, the explanation and, when
        present, bullet lists of evidence quotes and source URLs. If
        ``verify_claim`` does not return a dict (``None`` on failure, or a
        JSON value of another type), an error message is returned instead.
    """
    result = verify_claim(claim)
    # The parsed JSON is not guaranteed to be an object; anything else has no .get().
    if not isinstance(result, dict):
        return "⚠️ Error: Could not generate a response. Please try again."

    verdict = result.get("verdict", "unknown")
    explanation = result.get("explanation", "")
    # A model may return a single string instead of a list; avoid iterating characters.
    evidence = _as_list(result.get("evidence_used"))
    sources = _as_list(result.get("sources"))

    parts = [
        f"### ✅ Verdict: `{verdict}`\n\n",
        f"**Explanation:** {explanation}\n\n",
    ]

    if evidence:
        parts.append("**Evidence:**\n")
        parts.extend(f"- {item}\n" for item in evidence)

    if sources:
        parts.append("\n**Sources:**\n")
        parts.extend(f"- {src}\n" for src in sources)

    return "".join(parts)

# Launch Gradio UI
# Build the interface first, then start the server in a separate step.
demo = gr.Interface(
    title="Claim Verifier with RAG and Gemini 🔍",
    description="Enter a claim and get a verdict based on search evidence.",
    fn=explain_claim,
    inputs=gr.Textbox(label="Enter a Claim", lines=2),
    outputs=gr.Markdown(label="Analysis"),
    examples=["5G causes COVID-19", "The moon landing was faked"],
)
demo.launch()


* Running on local URL:  http://127.0.0.1:7871
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://2acf38086d292e9ad9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




🧐 Testing claim: '5G causes COVID-19'
[0.0s] Starting search...
✅ Search completed (10 results, 0.2s)
[0.2s] Building vector store...
[0.5s] Retrieving relevant chunks...
[0.5s] Calling LLM...
✅ LLM completed (4.7s)
🧐 Testing claim: 'Can AI models like ChatGPT experience emotions?'
[0.0s] Starting search...
✅ Search completed (10 results, 0.4s)
[0.4s] Building vector store...
[0.6s] Retrieving relevant chunks...
[0.6s] Calling LLM...
✅ LLM completed (5.4s)
🧐 Testing claim: '5G causes COVID-19'
[0.0s] Starting search...
✅ Search completed (10 results, 0.4s)
[0.4s] Building vector store...
[0.7s] Retrieving relevant chunks...
[0.7s] Calling LLM...
✅ LLM completed (4.9s)
🧐 Testing claim: 'Can AI models like ChatGPT experience emotions?'
[0.0s] Starting search...
✅ Search completed (10 results, 0.2s)
[0.2s] Building vector store...
[0.4s] Retrieving relevant chunks...
[0.4s] Calling LLM...
✅ LLM completed (5.0s)
