# **Install required packages**

In [None]:
%%capture
!pip install wikipedia langchain langchain-community langchain-huggingface chromadb sentence-transformers transformers torch accelerate protobuf==4.25.3 google-api-core==2.19.0 -q

# **Import necessary libraries**

In [None]:
import wikipedia
import torch
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import login
from getpass import getpass
import re

# **Configure Wikipedia to use the Farsi language**

In [None]:
# Point the `wikipedia` client at the Persian ("fa") edition for every later lookup.
wikipedia.set_lang("fa")

# **Log in to Hugging Face to access the Gemma model**

In [None]:
# Optionally authenticate with the Hugging Face Hub before downloading the model.
# The token is read interactively with getpass (never hard-coded or echoed);
# submitting an empty value skips the login step.
try:
    # Strip stray whitespace/newlines that often come along with a pasted token.
    token = getpass("Enter your Hugging Face token (press Enter to skip): ").strip()
    if token:
        login(token=token)
        print("✅ Successfully logged in to Hugging Face!")
    else:
        # Previously a skipped login printed nothing, leaving the outcome ambiguous.
        print("➡️ Proceeding without token...")
except Exception as exc:
    # Catch Exception (not a bare `except:`) so Ctrl-C / kernel interrupts still
    # propagate, and surface the reason the login failed instead of hiding it.
    print(f"⚠️ Hugging Face login failed ({exc}).")
    print("➡️ Proceeding without token...")

✅ Successfully logged in to Hugging Face!


# **Load the Gemma 2B model and tokenizer**

In [None]:
# Hub identifier of the instruction-tuned Gemma 2B checkpoint used for generation.
model_name = "google/gemma-2b-it"

# Gemma is implemented inside the `transformers` library itself, so
# `trust_remote_code=True` is not required. It is omitted on purpose: that flag
# lets Python code shipped in a model repository run on this machine.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE: recent transformers releases warn that `torch_dtype` is deprecated in
# favour of `dtype`; the older spelling is kept so the cell also runs on
# versions that predate the rename.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,       # Half precision to reduce memory use
    device_map="cpu",                # Use CPU to avoid CUDA issues (set to "auto" for GPU)
    low_cpu_mem_usage=True,          # Optimize memory usage while loading
)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

# **Create a text-generation pipeline for LangChain**

In [None]:
# Wrap the loaded model/tokenizer in a transformers text-generation pipeline and
# expose it to LangChain through HuggingFacePipeline.
hf_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=150,              # Limit response length
    do_sample=True,                  # Required for temperature/top_p to take effect;
                                     # without it decoding is greedy and both are ignored
    temperature=0.3,                 # Low temperature for less randomness
    top_p=0.9,                       # Top-p (nucleus) sampling for focused outputs
    return_full_text=False           # Return only generated text (prompt is not echoed)
)
llm = HuggingFacePipeline(pipeline=hf_pipeline)

Device set to use cpu


# **Define a prompt template for RAG**

In [None]:
# RAG prompt: the retrieved context comes first, then the user's question, and the
# text ends with an open "Answer:" label for the model to complete.
prompt_template = (
    "Based on the following information, provide an answer:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "Answer:"
)

# Both placeholders are filled in at call time via prompt.format(context=..., question=...).
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# **RAG function to retrieve context from Wikipedia and generate answers**

In [None]:
def _extract_answer(generated_text, markers=("Answer:", "پاسخ:")):
    """Return the text that follows the last answer marker in ``generated_text``.

    The notebook's prompt ends with "Answer:", and the model may echo that label
    or its Persian equivalent "پاسخ:". When no marker occurs the whole text is
    returned. Surrounding whitespace is stripped in both cases.
    """
    cut = 0
    for marker in markers:
        position = generated_text.rfind(marker)
        if position != -1:
            cut = max(cut, position + len(marker))
    return generated_text[cut:].strip()


def _load_first_available_page(titles):
    """Return ``(title, page)`` for the first title that loads, else ``(None, None)``.

    Titles that resolve to a disambiguation page or to no page at all are skipped
    so that one bad search hit does not abort the whole lookup.
    """
    for title in titles:
        try:
            # auto_suggest=False: the titles come straight from wikipedia.search(),
            # so they should be looked up verbatim instead of being replaced by a
            # suggestion, which can point to a different article.
            return title, wikipedia.page(title, auto_suggest=False)
        except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError):
            continue
    return None, None


def rag_with_wikipedia(question, max_chars=2000):
    """
    Retrieves relevant Wikipedia article content and generates an answer using the language model.

    Args:
        question (str): The query to search on Wikipedia.
        max_chars (int): Maximum length of the context to retrieve (default: 2000).

    Returns:
        tuple: (response, context, page_title) - Generated answer, retrieved context, and
        Wikipedia article title. Failures never raise; they are reported in ``response``:
        a blank question, no usable article, or a Wikipedia error gives empty ``context``
        and ``page_title``, while a generation error keeps the retrieved ``context`` and
        ``page_title`` so the caller can still inspect them.
    """
    # Guard: avoid a pointless network round-trip for an empty query.
    if not question or not question.strip():
        return "Please provide a non-empty question.", "", ""

    # Steps 1-2: search Wikipedia and load the first result that is a concrete article.
    try:
        search_results = wikipedia.search(question)
        page_title, page = _load_first_available_page(search_results)
        if page is None:
            return "No relevant Wikipedia article found in Persian.", "", ""
        full_content = page.content
    except Exception as e:
        return f"Error accessing Wikipedia: {e}", "", ""

    # Step 3: truncate the article to max_chars so the prompt stays small.
    context = full_content[:max_chars] + "..." if len(full_content) > max_chars else full_content

    # Step 4: generate the answer; model failures are reported separately so they
    # are not mislabelled as Wikipedia errors.
    try:
        generated = llm.invoke(prompt.format(context=context, question=question))
    except Exception as e:
        return f"Error generating answer: {e}", context, page_title

    return _extract_answer(generated), context, page_title

# **Test the RAG system with sample questions**

In [None]:
# Sample Persian questions used to smoke-test the RAG function
# (English translations are given next to each entry).
queries = [
    "پایتون چیست؟",             # What is Python?
    "چرا پایتون محبوب است؟",   # Why is Python popular?
    "پایتون کجا استفاده می‌شود؟" # Where is Python used?
]

# Run every question through the pipeline and print a short four-line report;
# the trailing "\n" on the answer leaves a blank line between reports.
for query in queries:
    response, context, title = rag_with_wikipedia(query)
    print(
        f"Question: {query}",
        f"Wikipedia Article: {title}",
        f"Context (Summary): {context[:200]}...",  # only the first 200 chars of context
        f"Answer: {response}\n",
        sep="\n",
    )