In [8]:
import json
import os
import time
import requests
import overpy
from fpdf import FPDF

# Initialize Overpass API
# One shared client object; fetch_city_data() below sends every query through it.
api = overpy.Overpass()

# List of major cities by country
# Keys are the country labels attached to each collected record; values are the
# city names that the collection loop passes to fetch_city_data() and
# get_wikipedia_summary().
major_cities = {
    "India": ["Delhi", "Mumbai", "Bangalore", "Kolkata", "Chennai"],
    "USA": ["New York", "Los Angeles", "Chicago", "Houston", "San Francisco"],
    "UK": ["London", "Manchester", "Birmingham", "Glasgow", "Liverpool"],
    "Germany": ["Berlin", "Munich", "Hamburg", "Frankfurt", "Cologne"],
    "France": ["Paris", "Marseille", "Lyon", "Toulouse", "Nice"],
    "Japan": ["Tokyo", "Osaka", "Yokohama", "Nagoya", "Sapporo"],
    "Australia": ["Sydney", "Melbourne", "Brisbane", "Perth", "Adelaide"],
    "Canada": ["Toronto", "Vancouver", "Montreal", "Calgary", "Ottawa"],
    "Brazil": ["São Paulo", "Rio de Janeiro", "Brasília", "Salvador", "Fortaleza"],
    "China": ["Shanghai", "Beijing", "Guangzhou", "Shenzhen", "Chengdu"]
}

# Create folder for PDFs
# NOTE(review): nothing in the visible code writes into "city_data", and FPDF is
# imported but never used here — confirm whether this directory is still needed.
os.makedirs("city_data", exist_ok=True)

# Function to fetch city data from Overpass API
def fetch_city_data(city_name, country_name):
    """Look up a city's ``place`` node through the Overpass API.

    Args:
        city_name: Name matched exactly against the ``name``, ``name:en`` and
            ``alt_name`` tags of nodes whose ``place`` tag matches
            ``city|town|metropolitan_area``.
        country_name: Only used in the progress message; the query itself is
            not restricted by country.

    Returns:
        A dict with the keys ``city``, ``state``, ``population``, ``latitude``,
        ``longitude``, ``altitude``, ``timezone``, ``landmarks`` and
        ``climate``, or ``None`` when the query fails or nothing matches.
        ``population`` is an ``int`` when the tag is all digits (``0`` when the
        tag is absent) and the string ``"N/A"`` otherwise.
    """
    print(f"📍 Fetching data for {city_name}, {country_name}...")

    # Escape backslashes and double quotes so the name cannot break out of the
    # quoted Overpass QL string it is interpolated into.
    quoted_name = city_name.replace("\\", "\\\\").replace('"', '\\"')

    # Search by city name and alternative names
    query = f"""
    [out:json][timeout:100];
    (
      node["name"="{quoted_name}"]["place"~"city|town|metropolitan_area"];
      node["name:en"="{quoted_name}"]["place"~"city|town|metropolitan_area"];
      node["alt_name"="{quoted_name}"]["place"~"city|town|metropolitan_area"];
    );
    out body;
    """

    try:
        result = api.query(query)
    except (overpy.exception.OverPyException, OSError) as e:
        # Catch overpy's base exception class (not only OverpassRuntimeError)
        # plus OS-level network failures, so a rate-limit or timeout skips this
        # city instead of aborting the whole collection run.
        print(f"⚠️ Overpass API error for {city_name}: {e}")
        return None

    # No geographic fallback: a radius search around fixed coordinates returns
    # an unrelated place, which would then be stored under this city's country.
    if not result.nodes:
        print(f"⚠️ No data found for {city_name}. Skipping...")
        return None

    def _population(candidate):
        """Return the node's population as an int, or None when not all digits."""
        raw = str(candidate.tags.get("population", "0"))
        return int(raw) if raw.isdigit() else None

    # Several places can share a name; prefer the most populous match. max()
    # keeps the first node on ties, so the first result still wins when no
    # candidate carries a usable population tag.
    node = max(result.nodes, key=lambda candidate: _population(candidate) or 0)
    population = _population(node)

    return {
        "city": node.tags.get("name", "Unknown"),
        "state": node.tags.get("is_in:state", "N/A"),
        "population": population if population is not None else "N/A",
        "latitude": float(node.lat),
        "longitude": float(node.lon),
        "altitude": node.tags.get("ele", "N/A"),
        "timezone": node.tags.get("timezone", "N/A"),
        # Filled in by the caller.
        "landmarks": "",
        "climate": ""
    }

# Function to get Wikipedia summary for a city
def get_wikipedia_summary(city_name):
    """Fetch the English Wikipedia summary extract for a page titled like the city.

    Args:
        city_name: Page title to look up; spaces are replaced by underscores
            and the result is percent-encoded before being placed in the URL.

    Returns:
        The ``extract`` field of the REST summary response, or a short
        explanatory string when the page is missing, the body is unusable,
        or the request fails. Never raises for request errors.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    print(f"🌍 Fetching Wikipedia summary for {city_name}...")

    # safe='' also encodes '/', '?' and '#', which would otherwise be read as
    # URL structure rather than as part of the page title.
    title = quote(city_name.replace(' ', '_'), safe='')
    search_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{title}"
    try:
        response = requests.get(search_url, timeout=10)
        if response.status_code != 200:
            return "Wikipedia page not found"

        try:
            data = response.json()
        except ValueError:
            # A 200 response whose body is not valid JSON.
            return "No relevant information found"
        if not isinstance(data, dict):
            return "No relevant information found"
        return data.get("extract", "No relevant information found")
    except requests.exceptions.RequestException as e:
        return f"Error fetching Wikipedia: {e}"

# Build one record per configured city, tagging each with its country and
# storing the Wikipedia extract under the "landmarks" key.
all_cities = []
for country, cities in major_cities.items():
    for city in cities:
        city_data = fetch_city_data(city, country)
        if city_data:
            city_data.update(
                country=country,
                landmarks=get_wikipedia_summary(city),  # Get Wikipedia info
            )
            all_cities.append(city_data)
        # Pause after every lookup, successful or not, to stay under API rate limits.
        time.sleep(2)

# Persist the collected records as JSON
serialized = json.dumps(all_cities, indent=4)
with open("major_cities.json", "w", encoding="utf-8") as out_file:
    out_file.write(serialized)

print(f"✅ Collected data for {len(all_cities)} major cities. Saved as 'major_cities.json'.")

📍 Fetching data for Delhi, India...
🌍 Fetching Wikipedia summary for Delhi...
📍 Fetching data for Mumbai, India...
🌍 Fetching Wikipedia summary for Mumbai...
📍 Fetching data for Bangalore, India...
🌍 Fetching Wikipedia summary for Bangalore...
📍 Fetching data for Kolkata, India...
🌍 Fetching Wikipedia summary for Kolkata...
📍 Fetching data for Chennai, India...
🌍 Fetching Wikipedia summary for Chennai...
📍 Fetching data for New York, USA...
🌍 Fetching Wikipedia summary for New York...
📍 Fetching data for Los Angeles, USA...
🌍 Fetching Wikipedia summary for Los Angeles...
📍 Fetching data for Chicago, USA...
🌍 Fetching Wikipedia summary for Chicago...
📍 Fetching data for Houston, USA...
🌍 Fetching Wikipedia summary for Houston...
📍 Fetching data for San Francisco, USA...
🌍 Fetching Wikipedia summary for San Francisco...
📍 Fetching data for London, UK...
🌍 Fetching Wikipedia summary for London...
📍 Fetching data for Manchester, UK...
🌍 Fetching Wikipedia summary for Manchester...
📍 Fetchi

In [1]:
import json
import os
import re
from typing import Any, Dict, List, Optional

import faiss
import numpy as np
import requests
from sentence_transformers import SentenceTransformer

# Configuration
# The API key is read from the GEMINI_API_KEY environment variable so that no
# credential is stored in the notebook. Any key that was ever saved in this
# file should be treated as compromised and rotated.
# When the variable is unset the key is empty and requests built from
# GEMINI_URL carry no credential.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GEMINI_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro:generateContent?key={GEMINI_API_KEY}"

# Read the city records written by the collection step
with open("major_cities.json", "r", encoding="utf-8") as cities_file:
    city_data = json.loads(cities_file.read())

# Sentence-embedding model shared by indexing and querying
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# One text snippet per city, in the same order as city_data, so that a row
# number in the index maps straight back to a record.
texts = [
    f"""
    City: {entry['city']}
    Country: {entry['country']}
    Population: {entry['population']}
    Altitude: {entry['altitude']}
    Landmarks: {entry['landmarks']}
    Climate: {entry['climate']}
    """
    for entry in city_data
]

# Scale every row to unit length so that inner product equals cosine similarity
embeddings = model.encode(texts).astype(np.float32)
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / row_norms

# Inner-product FAISS index over the normalized rows
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

def generate_sub_questions(query: str) -> List[Dict[str, Any]]:
    """
    Task 1: Sub-question Generation
    Decomposes complex questions into sub-questions with source mapping and retrieval methods

    Sends ``query`` to the Gemini endpoint at ``GEMINI_URL`` and parses the
    model's reply as a JSON list of sub-question objects.

    Args:
        query: The user's question.

    Returns:
        A non-empty list of dicts, each guaranteed to contain ``question``
        (str), ``attributes`` (non-empty list of str), ``retrieval``
        (``"vector"`` or ``"summary"``) and ``source`` (str). When the request
        fails, the reply cannot be parsed, or no usable entry is found, a
        single fallback entry is returned that repeats ``query`` with all
        attributes and vector retrieval.
    """
    default_attributes = ["population", "altitude", "landmarks", "climate"]

    # Fallback: Simple question with all attributes and vector retrieval
    fallback = [{
        "question": query,
        "attributes": list(default_attributes),
        "retrieval": "vector",
        "source": "city_dataset"
    }]

    prompt = f"""
    Analyze this question and generate 2-3 sub-questions needed to answer it.
    For each, specify:
    1. The sub-question text
    2. Required data attributes
    3. Retrieval method (vector/summary)
    4. Data source ('city_dataset' for our data)
    
    Use this exact JSON format:
    {{
        "question": "sub-question text",
        "attributes": ["list", "of", "attributes"],
        "retrieval": "vector|summary",
        "source": "city_dataset"
    }}

    Question: {query}

    Return only a valid JSON list of these objects.
    """

    data = {
        "contents": [{"role": "user", "parts": [{"text": prompt}]}],
        "generationConfig": {"temperature": 0.1}
    }

    try:
        # A timeout keeps a stalled connection from hanging the pipeline.
        response = requests.post(GEMINI_URL, json=data, timeout=30)
        if response.status_code != 200:
            return fallback
        reply_text = response.json()["candidates"][0]["content"]["parts"][0]["text"]

        # Models frequently wrap JSON in a Markdown code fence; remove the
        # opening fence line (``` or ```json) and the closing fence.
        cleaned = reply_text.strip()
        if cleaned.startswith("```"):
            cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else ""
            cleaned = cleaned.rsplit("```", 1)[0]
        parsed = json.loads(cleaned)
    except (requests.exceptions.RequestException, ValueError, KeyError,
            IndexError, TypeError, AttributeError):
        # Network failure, malformed response envelope, or invalid JSON.
        return fallback

    # Tolerate a single object where a list was requested.
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        return fallback

    # Normalize every entry so downstream code can rely on the keys it reads.
    sub_questions = []
    for item in parsed:
        if not isinstance(item, dict):
            continue
        question = item.get("question")
        if not isinstance(question, str) or not question.strip():
            continue

        attributes = item.get("attributes")
        if isinstance(attributes, list):
            attributes = [attr for attr in attributes if isinstance(attr, str)]
        if not attributes:
            attributes = list(default_attributes)

        retrieval = str(item.get("retrieval", "vector")).strip().lower()
        if retrieval not in ("vector", "summary"):
            retrieval = "vector"

        source = item.get("source")
        if not isinstance(source, str) or not source:
            source = "city_dataset"

        sub_questions.append({
            **item,
            "question": question,
            "attributes": attributes,
            "retrieval": retrieval,
            "source": source,
        })

    return sub_questions or fallback

def vector_retrieval(query: str, attributes: List[str], top_k: int = 3) -> List[Dict[str, Any]]:
    """Vector retrieval using FAISS index

    Embeds ``query`` with the module-level ``model``, searches the
    module-level ``index`` and maps the hits back to ``city_data`` rows.

    Args:
        query: Text to embed and search for.
        attributes: Record keys to copy into each result when present.
        top_k: Maximum number of results to return.

    Returns:
        Up to ``top_k`` dicts sorted by descending ``score``, each holding
        ``city``, ``country``, ``score`` and the requested attributes that the
        record has. Empty when ``top_k`` is not positive or the index is empty.
    """
    if top_k <= 0 or index.ntotal == 0:
        return []

    query_embedding = model.encode([query]).astype(np.float32)
    query_norm = np.linalg.norm(query_embedding)
    if query_norm > 0:  # avoid turning an all-zero embedding into NaNs
        query_embedding /= query_norm

    # Never ask for more neighbours than the index holds: FAISS fills the
    # missing slots with label -1, which would silently address city_data[-1].
    candidate_count = min(top_k * 3, index.ntotal)
    scores, indices = index.search(query_embedding, candidate_count)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        if idx < 0:  # padding slot, not a real hit
            continue
        city = city_data[int(idx)]
        result = {"city": city["city"], "country": city["country"], "score": float(score)}
        for attr in attributes:
            if attr in city:
                result[attr] = city[attr]
        results.append(result)

    return sorted(results, key=lambda x: x["score"], reverse=True)[:top_k]

def summary_retrieval(query: str) -> List[Dict[str, Any]]:
    """Return the leading full records of ``city_data`` (at most three).

    ``query`` is accepted for interface symmetry but is not consulted; this is
    a placeholder fallback rather than a real summary search.
    """
    leading_records = [record for record in city_data[0:3]]
    return leading_records

def retrieve_city_data(sub_question: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Task 2: Vector/Summary Retrieval
    Handles both retrieval methods based on sub-question specification

    Args:
        sub_question: Dict with a required ``question`` key and optional
            ``retrieval`` and ``attributes`` keys. The dict may come straight
            from a language model, so the optional keys are read defensively.

    Returns:
        The list of records produced by ``vector_retrieval`` when the method
        is ``"vector"`` (case-insensitive, also the default when the key is
        missing), otherwise by ``summary_retrieval``.
    """
    method = str(sub_question.get("retrieval", "vector")).strip().lower()
    if method == "vector":
        attributes = sub_question.get("attributes")
        if not isinstance(attributes, list):
            attributes = []  # a missing or malformed list means "no extra fields"
        return vector_retrieval(sub_question["question"], attributes)
    return summary_retrieval(sub_question["question"])

def format_sub_answer(city: Dict[str, Any], attribute: str) -> Optional[str]:
    """Formats individual city data for sub-question responses

    Args:
        city: Record containing at least ``city``; the other fields are read
            only for the matching ``attribute``.
        attribute: One of ``"population"``, ``"altitude"``, ``"landmarks"`` or
            ``"climate"``; any other value yields ``"<city>, <country>"``.

    Returns:
        A ``"<city>: <value>"`` string, or ``None`` when the requested
        attribute is missing, empty, or the ``"N/A"`` placeholder.
    """
    if attribute == "population":
        population = city.get('population')
        if not isinstance(population, (int, float)) or population <= 0:
            return None
        return f"{city['city']}: {round(population / 1000000, 1)}M"

    if attribute == "altitude":
        altitude = city.get('altitude')
        if altitude is None:
            return None
        altitude_text = str(altitude).strip()
        if altitude_text in ("", "N/A"):
            return None
        # The stored value may already carry its unit ("216 m"); drop it so
        # the output never ends in "mm".
        if altitude_text.endswith("m"):
            altitude_text = altitude_text[:-1].rstrip()
        if not altitude_text:
            return None
        return f"{city['city']}: {altitude_text}m"

    if attribute in ("landmarks", "climate"):
        description = city.get(attribute)
        # "N/A" is a placeholder, not content.
        if not isinstance(description, str) or description.strip() in ("", "N/A"):
            return None
        # Keep only the text before the first period.
        return f"{city['city']}: {description.split('.')[0].strip()}"

    return f"{city['city']}, {city['country']}"

def _first_mentioned(candidates: List[str], text: str) -> Optional[str]:
    """Return the first candidate that occurs in ``text`` as a whole word or phrase."""
    for candidate in candidates:
        # Lookarounds instead of plain substring search, so that "uk" does not
        # match inside "ukraine" and "india" does not match inside "indiana".
        if candidate and re.search(rf"(?<!\w){re.escape(candidate)}(?!\w)", text):
            return candidate
    return None


def generate_final_response(query: str, sub_answers: List[Dict[str, Any]]) -> str:
    """
    Task 3: Response Aggregation
    Properly handles all question types and data fields from the city data

    The answer is computed by keyword rules over the module-level
    ``city_data``: the query is scanned for a known country, a known city and
    a topic (altitude, population, landmarks, climate — checked in that
    order), and the matching records are ranked or listed.

    Args:
        query: The user's original question.
        sub_answers: Accepted for pipeline compatibility; the current rules do
            not read it.

    Returns:
        A multi-line string: a header line, up to three quoted sub-question
        responses, and a quoted final response. When nothing matches, the
        final response is a fixed "couldn't find" sentence.
    """
    # Initialize default values
    final_answer = "Couldn't find specific information to answer this question."
    sub_responses = []

    # Extract country and city from the query if specified
    query_lower = query.lower()
    known_countries = ['india', 'usa', 'china', 'japan', 'uk', 'germany', 'france', 'australia', 'canada', 'brazil']
    target_country = _first_mentioned(known_countries, query_lower)
    target_city = _first_mentioned([c['city'].lower() for c in city_data], query_lower)

    # Prepare data containers
    altitude_data = []
    population_data = []
    landmark_data = []
    climate_data = []
    scope_label = None

    # Collect all relevant city data
    for city in city_data:
        # Skip if country filter exists and doesn't match
        if target_country and city['country'].lower() != target_country:
            continue

        # Remember the country exactly as the dataset spells it ("USA", not "Usa").
        if target_country and scope_label is None:
            scope_label = city['country']

        # Skip if city filter exists and doesn't match
        if target_city and city['city'].lower() != target_city:
            continue

        # Process altitude data
        raw_altitude = city.get('altitude')
        if raw_altitude and raw_altitude != 'N/A':
            try:
                altitude = float(str(raw_altitude).replace('m', '').strip())
            except (ValueError, TypeError):
                pass  # not a plain number (e.g. a different unit); leave it out
            else:
                altitude_data.append({
                    'city': city['city'],
                    'value': f"{altitude}m",
                    'numeric': altitude
                })

        # Process population data
        if city.get('population') and isinstance(city['population'], (int, float)):
            population_data.append({
                'city': city['city'],
                'value': f"{round(city['population']/1000000, 1)}M",
                'numeric': city['population']
            })

        # Process landmark data
        if city.get('landmarks') and city['landmarks'] not in ['N/A', '']:
            landmark_data.append({
                'city': city['city'],
                'value': city['landmarks'].split('.')[0].strip()
            })

        # Process climate data
        if city.get('climate') and city['climate'] not in ['N/A', '']:
            climate_data.append({
                'city': city['city'],
                'value': city['climate'].split('.')[0].strip()
            })

    if scope_label is None:
        scope_label = target_country.title() if target_country else 'the world'

    # Determine question type and format response
    # Altitude questions
    if any(keyword in query_lower for keyword in ['altitude', 'elevation', 'height']):
        if altitude_data:
            # Descending only when the highest is asked for; otherwise lowest first.
            altitude_data.sort(key=lambda x: x['numeric'], reverse='highest' in query_lower)
            top_entry = altitude_data[0]

            if 'highest' in query_lower:
                final_answer = f"{top_entry['city']} has the highest altitude in {scope_label} at {top_entry['value']}."
            elif 'lowest' in query_lower:
                final_answer = f"{top_entry['city']} has the lowest altitude in {scope_label} at {top_entry['value']}."
            else:
                final_answer = f"{top_entry['city']} has altitude {top_entry['value']}."

            sub_responses = [f"{x['city']}: {x['value']}" for x in altitude_data[:3]]

    # Population questions
    elif any(keyword in query_lower for keyword in ['population', 'populous']):
        if population_data:
            # Largest first by default; smallest first when the lowest is asked
            # for, so the entry named in the answer really is the lowest.
            population_data.sort(key=lambda x: x['numeric'], reverse='lowest' not in query_lower)
            top_entry = population_data[0]

            if 'highest' in query_lower:
                final_answer = f"{top_entry['city']} has the highest population in {scope_label} at {top_entry['value']}."
            elif 'lowest' in query_lower:
                final_answer = f"{top_entry['city']} has the lowest population in {scope_label} at {top_entry['value']}."
            else:
                final_answer = f"{top_entry['city']} has population {top_entry['value']}."

            sub_responses = [f"{x['city']}: {x['value']}" for x in population_data[:3]]

    # Landmark questions
    elif any(keyword in query_lower for keyword in ['landmark', 'place', 'attraction', 'tourist', 'visit', 'see']):
        if landmark_data:
            if target_city:
                landmarks = [x['value'] for x in landmark_data if x['city'].lower() == target_city]
                final_answer = f"Places to visit in {target_city.title()}: {', '.join(landmarks[:3])}."
            else:
                landmarks = [f"{x['city']}: {x['value']}" for x in landmark_data]
                final_answer = f"Top landmarks: {', '.join(landmarks[:3])}."

            sub_responses = [f"{x['city']}: {x['value']}" for x in landmark_data[:3]]

    # Climate questions
    elif any(keyword in query_lower for keyword in ['climate', 'weather', 'temperature']):
        if climate_data:
            if target_city:
                climates = [x['value'] for x in climate_data if x['city'].lower() == target_city]
                final_answer = f"Climate in {target_city.title()}: {', '.join(climates[:3])}."
            else:
                climates = [f"{x['city']}: {x['value']}" for x in climate_data]
                final_answer = f"Climate information: {', '.join(climates[:3])}."

            sub_responses = [f"{x['city']}: {x['value']}" for x in climate_data[:3]]

    # Format the output
    response_lines = ["○ Sub-question responses:"]
    response_lines.extend(f'■ "{resp}"' for resp in sub_responses[:3])
    response_lines.append(f'○ Final Response: "{final_answer}"')

    return '\n'.join(response_lines)

def answer_city_question(query: str) -> str:
    """Run the whole pipeline for one question and return the formatted answer.

    Steps: decompose the query into sub-questions, retrieve records for each,
    keep the first non-empty formatted attribute per record, then hand the
    collected sub-answers to the response aggregator.
    """
    sub_answers = []

    # Task 1 produces the sub-questions; Task 2 runs once per sub-question.
    for sub_q in generate_sub_questions(query):
        answers = []
        for city in retrieve_city_data(sub_q):
            # Lazily format attributes in the requested order and stop at the
            # first one that yields a usable answer.
            formatted = (format_sub_answer(city, attr) for attr in sub_q["attributes"])
            first_valid = next((text for text in formatted if text), None)
            if first_valid:
                answers.append(first_valid)

        sub_answers.append(
            dict(
                question=sub_q["question"],
                answers=answers,
                source=sub_q.get("source", "city_dataset"),
            )
        )

    # Task 3: aggregate into the final response
    return generate_final_response(query, sub_answers)

# Example usage
if __name__ == "__main__":
    # Demo questions; each is answered and followed by an 80-character rule.
    test_queries = ["Which city has the highest population in India?"]
    separator = "-" * 80

    for query in test_queries:
        print(f"\nQuestion: {query}")
        print(answer_city_question(query))
        print(separator)



Question: Which city has the highest population in India?
○ Sub-question responses:
■ "Mumbai: 12.4M"
■ "Bengaluru: 10.8M"
■ "Chennai: 4.7M"
○ Final Response: "Mumbai has the highest population in India at 12.4M."
--------------------------------------------------------------------------------
