In [1]:
import os

In [2]:
import wikipedia

# Wikipedia article titles to download (adjust to your needs).
titles = [
    "Water_supply_and_sanitation_in_India", "Water_supply", "Water_distribution_system", "Non-revenue_water", "Improved_water_source",
    "Water_scarcity", "Rainwater_harvesting", "Open_defecation",
    "Improved_sanitation", "Pit_latrine", "Sewage", "Wastewater_treatment",
    "Manual_scavenging", "Wastewater", "Sewage_treatment",
    "Community-led_total_sanitation", "Microcredit_for_water_supply_and_sanitation", "Swachh_Bharat_Mission",
    "Fecal_sludge_management", "Water_tariff"
]

# Download the content of every article as (title, content) pairs.
# auto_suggest=False requests the exact title; with the library default (True)
# the title is first passed through a search suggestion, which can silently
# load a different article than the one listed above.
documents = []
for title in titles:
    try:
        content = wikipedia.page(title, auto_suggest=False).content
        documents.append((title, content))
    except Exception as e:
        # Best-effort download: report the failure and keep going with the rest.
        print(f"Error loading {title}: {e}")

<class 'ModuleNotFoundError'>: No module named 'wikipedia'

In [None]:
# Show a compact (title, character count) summary rather than dumping the full
# text of every downloaded article into the cell output.
[(title, len(content)) for title, content in documents]

In [None]:
# Write all downloaded articles to a single UTF-8 text file. Each article is
# emitted as a "=== title ===" header line, its content, then a blank line.
with open("all_wikipedia_articles.txt", "w", encoding="utf-8") as f:
    for title, content in documents:
        f.write(f"=== {title} ===\n{content}\n\n")

print("All articles saved in 'all_wikipedia_articles.txt'")

In [None]:
def clean_wiki_text(text):
    """Cut article text at its first trailing boilerplate section heading.

    A heading is a line that consists solely of one of the unwanted section
    titles, either bare (``References``) or wrapped in ``=`` heading markers
    (``== References ==``). Titles are compared case-insensitively. Everything
    from the earliest such line onward is discarded.

    Matching whole lines (rather than a line prefix) keeps ordinary paragraphs
    that merely begin with words like "Sources" or "Notes" from truncating
    the article.

    Args:
        text: Plain-text article content.

    Returns:
        The text preceding the first unwanted heading, with leading/trailing
        whitespace stripped. If no heading matches, the stripped input.
    """
    import re  # local import: keeps this cell independent of other cells' imports

    # Section titles whose heading marks the start of content to drop.
    unwanted_sections = [
        "See also", "References", "Further reading", "External links", "Bibliography", "Notes", "Sources"
    ]

    titles_pattern = "|".join(re.escape(section) for section in unwanted_sections)
    # Optional "==" style markers on either side; the title must fill the line.
    heading_re = re.compile(
        rf"^[ \t]*(?:=+[ \t]*)?(?:{titles_pattern})(?:[ \t]*=+)?[ \t]*$",
        re.IGNORECASE | re.MULTILINE,
    )

    # search() returns the leftmost match, i.e. the earliest unwanted heading.
    match = heading_re.search(text)
    if match:
        text = text[:match.start()]

    return text.strip()

In [None]:
# Run the cleaner over every downloaded article, keeping each title paired
# with its cleaned text.
cleaned_documents = []
for title, content in documents:
    cleaned_documents.append((title, clean_wiki_text(content)))

In [None]:
# Write the cleaned articles to a single UTF-8 text file, using the same
# layout as the raw dump: "=== title ===" header, content, blank line.
with open("all_wikipedia_articles_cleaned.txt", "w", encoding="utf-8") as f:
    for title, content in cleaned_documents:
        f.write(f"=== {title} ===\n{content}\n\n")

print("Cleaned articles saved to 'all_wikipedia_articles_cleaned.txt'")

In [None]:
from typing import List
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def chunk_text(title: str, text: str, max_tokens=300, min_words=30) -> List[dict]:
    """Split article text into paragraph-based chunks tagged with the title.

    The text is split on newlines. Paragraphs with ``min_words`` or fewer
    whitespace-separated words are dropped. A paragraph longer than
    ``max_tokens`` words is cut into consecutive, roughly equal pieces of at
    most ``max_tokens`` words each, so no chunk exceeds the limit.

    Token counts are approximated by whitespace-separated words.

    Args:
        title: Article title stored on every chunk.
        text: Article text, one paragraph per line.
        max_tokens: Upper bound on words per chunk. ``None`` or a value below 1
            disables splitting.
        min_words: Paragraphs must have more than this many words to be kept.

    Returns:
        A list of ``{"title": ..., "text": ...}`` dicts in document order.
        Unsplit paragraphs keep their original (stripped) text; pieces of a
        split paragraph have their words re-joined with single spaces.
    """
    chunks = []
    for para in text.split('\n'):
        para = para.strip()
        words = para.split()
        if len(words) <= min_words:
            continue

        if not max_tokens or max_tokens < 1 or len(words) <= max_tokens:
            chunks.append({"title": title, "text": para})
            continue

        # Ceil-divide twice: first to get the number of pieces, then the piece
        # size, so pieces are balanced instead of leaving a tiny trailing one.
        n_parts = -(-len(words) // max_tokens)
        part_size = -(-len(words) // n_parts)
        for start in range(0, len(words), part_size):
            piece = " ".join(words[start:start + part_size])
            chunks.append({"title": title, "text": piece})
    return chunks

# Chunk every cleaned article and pool the results into one flat list,
# preserving article order.
all_chunks = [
    chunk
    for title, content in cleaned_documents
    for chunk in chunk_text(title, content)
]

print(f"Total chunks: {len(all_chunks)}")

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model; the same embedder is reused later for queries.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'  # Fast & lightweight
embedder = SentenceTransformer(model_name)

# Collect the chunk texts in order, then encode them in a single call and
# request the result as a NumPy array.
texts = []
for chunk in all_chunks:
    texts.append(chunk["text"])
embeddings = embedder.encode(texts, convert_to_numpy=True)

In [None]:
import faiss

# Build a flat L2 index sized from the embeddings' last axis and add every
# chunk vector to it; row i of the index corresponds to all_chunks[i].
dimension = embeddings.shape[-1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print("FAISS index ready.")

In [None]:
def search_similar_chunks(query, top_k=4):
    """Return the texts of the chunks closest to ``query``, separated by blank lines.

    The query is embedded with the module-level ``embedder`` and looked up in
    the module-level FAISS ``index``; hits are mapped back to ``all_chunks``
    by position.

    Args:
        query: Question text to search for.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        The retrieved chunk texts, in the order returned by the index, joined
        with a blank line between them. An empty string when the index holds
        no vectors or ``top_k`` is below 1.
    """
    # Never ask for more neighbours than the index holds, and avoid calling
    # search() with a non-positive k.
    k = min(top_k, index.ntotal)
    if k < 1:
        return ""

    query_vector = embedder.encode([query])
    _, indices = index.search(query_vector, k)

    # FAISS pads missing results with label -1; indexing all_chunks with -1
    # would silently return the last chunk, so skip negative labels.
    retrieved_texts = [all_chunks[i]['text'] for i in indices[0] if i >= 0]

    return "\n\n".join(retrieved_texts)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Open chat-tuned model used to generate answers.
model_id = "HuggingFaceH4/zephyr-7b-beta"  # or try mistralai/Mistral-7B-Instruct

# Tokenizer and weights come from the same checkpoint; device placement and
# dtype are left to the library ("auto").
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
)

# Text-generation pipeline wrapping the loaded model and tokenizer.
qa_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
def ask_chatbot(question):
    """Answer ``question`` using only retrieved context and return the answer text.

    Retrieves context with ``search_similar_chunks``, builds a prompt that
    instructs the model to answer from that context (or say "I don't know"),
    and runs it through the module-level ``qa_pipeline``.

    Args:
        question: The user's question.

    Returns:
        The generated answer with surrounding whitespace stripped.
    """
    context = search_similar_chunks(question)

    prompt = f"""
Answer the question only using the context below.
If the answer is not present, say "I don't know".

Context:
{context}

Question:
{question}
Answer:
"""

    # return_full_text=False makes the pipeline return only the newly generated
    # continuation. Splitting prompt+generation on "Answer:" and taking the
    # last piece would drop part of the reply whenever the model itself emits
    # "Answer:" in its output.
    response = qa_pipeline(
        prompt,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.3,
        return_full_text=False,
    )[0]['generated_text']

    return response.strip()

In [None]:
# Read one question from the user, then print the chatbot's answer.
question = input("Ask your sanitation-related question: ")
answer_text = ask_chatbot(question)
print(answer_text)

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output

# Text box for the question plus an output area that shows the bot's reply.
q_input = widgets.Text(placeholder="Ask a sanitation-related question...")
output = widgets.Output()


def on_submit(change):
    """Submit callback: clear the output area, then print the answer to the current input.

    The callback argument is unused; the question is read from ``q_input.value``.
    """
    output.clear_output()
    # Run the chatbot inside the output context so anything it prints is
    # captured in the output widget as well.
    with output:
        print(f"💬 Bot: {ask_chatbot(q_input.value)}")


q_input.on_submit(on_submit)
display(q_input, output)