# Import libraries

In [None]:
import os
import requests
import math
import shutil
from bs4 import BeautifulSoup
from collections import defaultdict
from urllib.parse import urljoin

# Create folders for images and metadata

In [None]:
# Ensure the output folders exist; exist_ok=True makes re-running this cell a no-op.
for _folder in ("images", "metadata"):
    os.makedirs(_folder, exist_ok=True)

# Web Crawler to Collect Images and Metadata

In [None]:
def _download_image(img_url, img_path):
    """Stream *img_url* to *img_path*; return True on success, False otherwise."""
    try:
        with requests.get(img_url, stream=True, timeout=5) as img_response:
            # Do not save error pages (404, 403, ...) as if they were images.
            img_response.raise_for_status()
            with open(img_path, "wb") as f:
                # iter_content applies content-encoding decoding (e.g. gzip),
                # which reading from the raw stream does not.
                for chunk in img_response.iter_content(chunk_size=8192):
                    f.write(chunk)
    except (requests.RequestException, OSError):
        # Best effort: drop any partially written file and report failure.
        try:
            os.remove(img_path)
        except OSError:
            pass
        return False
    return True


def crawl_images(base_url, max_images=1000):
    """Crawl pages breadth-first from *base_url*, saving images and their alt text.

    Only links whose absolute URL starts with *base_url* are followed.  Each
    successfully downloaded image is written to ``images/image_<n>.jpg`` (the
    ``.jpg`` name is used regardless of the actual image format).

    Args:
        base_url: Starting page; also the prefix that followed links must match.
        max_images: Stop once this many images have been saved.

    Returns:
        A dict mapping saved image file names to the image's ``alt`` text
        (``"No caption"`` when the tag has no ``alt`` attribute).

    Network and file errors are treated as "skip this page/image"; they are
    never raised to the caller.
    """
    # Function-scope imports keep this block self-contained.
    from collections import deque
    from urllib.parse import urldefrag

    image_data = {}
    seen_images = set()

    # Fragments (#section) address the same document, so strip them to avoid
    # fetching one page many times.
    start_url = urldefrag(base_url)[0]
    visited = {start_url}  # pages already queued or fetched
    queue = deque([start_url])

    while queue and len(image_data) < max_images:
        url = queue.popleft()

        try:
            with requests.get(url, timeout=5) as response:
                if response.status_code != 200:
                    continue
                html = response.text
        except requests.RequestException:
            continue

        soup = BeautifulSoup(html, "html.parser")

        for img in soup.find_all("img"):
            if len(image_data) >= max_images:
                break

            src = img.get("src")
            if not src:
                # urljoin(url, "") would resolve to the page itself.
                continue
            try:
                img_url = urljoin(url, src)
            except ValueError:
                continue
            if img_url in seen_images:
                continue
            seen_images.add(img_url)

            alt_text = img.get("alt", "No caption").strip()
            img_name = f"image_{len(image_data)}.jpg"
            img_path = os.path.join("images", img_name)

            if _download_image(img_url, img_path):
                image_data[img_name] = alt_text

        for link in soup.find_all("a", href=True):
            try:
                new_url = urldefrag(urljoin(url, link["href"]))[0]
            except ValueError:
                continue
            if new_url.startswith(start_url) and new_url not in visited:
                visited.add(new_url)
                queue.append(new_url)

    return image_data

# Preprocessing Function

In [None]:
def preprocess(text):
    """Lowercase *text* and split it into alphanumeric tokens.

    Every non-alphanumeric character acts as a separator, so words next to
    punctuation ("robot," or "walking.") are kept rather than discarded.

    Args:
        text: The raw caption or query string.

    Returns:
        A list of lowercase alphanumeric tokens, in order of appearance.
    """
    lowered = text.lower()
    # Replace anything that is not a letter or digit with a space, then split.
    cleaned = "".join(ch if ch.isalnum() else " " for ch in lowered)
    return cleaned.split()

# Build Inverted Index

In [None]:
def build_inverted_index(metadata):
    """Build a term -> {image name -> term frequency} index from image captions.

    Args:
        metadata: Mapping of image name to its caption text.

    Returns:
        A ``(inverted_index, doc_lengths)`` tuple, where ``inverted_index``
        maps each term to a dict of per-image occurrence counts and
        ``doc_lengths`` maps each image name to its number of tokens.
    """
    postings = defaultdict(dict)
    lengths = {}

    for name, caption in metadata.items():
        words = preprocess(caption)
        lengths[name] = len(words)

        # Count occurrences of each word within this caption.
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1

        for word, count in counts.items():
            postings[word][name] = count

    return postings, lengths

# TF-IDF Retrieval

In [None]:
def compute_tfidf_scores(query, inverted_index, doc_lengths, total_docs):
    """Rank images against *query* using TF-IDF.

    Term frequency is the term's count in the caption divided by the caption
    length.  Inverse document frequency uses the smoothed form
    ``log((1 + N) / (1 + df)) + 1``, which stays positive even when a term
    occurs in every document, so common terms never produce zero or negative
    scores that would invert the ranking.

    Args:
        query: The raw query string.
        inverted_index: Mapping of term to ``{image name: term frequency}``.
        doc_lengths: Mapping of image name to its token count.
        total_docs: Number of documents in the collection.

    Returns:
        A list of ``(image name, score)`` tuples sorted by descending score.
        Images matching no query term are omitted.
    """
    scores = defaultdict(float)

    for term in preprocess(query):
        postings = inverted_index.get(term)
        if not postings:
            continue

        idf = math.log((1 + total_docs) / (1 + len(postings))) + 1.0

        for img_name, term_freq in postings.items():
            length = doc_lengths.get(img_name)
            if not length:
                # Missing or zero length: skip rather than divide by zero.
                continue
            scores[img_name] += (term_freq / length) * idf

    return sorted(scores.items(), key=lambda item: item[1], reverse=True)

# Search Engine Execution

In [None]:
# End-to-end run: crawl, index the collected alt texts, then answer one query.
if __name__ == "__main__":
    # Start page for the crawl; crawl_images only follows links that begin
    # with this URL.
    base_url = "https://en.wikipedia.org/wiki/Artificial_intelligence"
    image_metadata = crawl_images(base_url, max_images=1000)

    # Index the alt texts; the collection size is the number of saved images.
    inverted_index, doc_lengths = build_inverted_index(image_metadata)
    total_docs = len(image_metadata)

    # Example query, scored against the index built above
    query = "robotics"
    results = compute_tfidf_scores(query, inverted_index, doc_lengths, total_docs)

    # Show the ten highest-scoring images.
    print("Search Results for query:", query)
    for img, score in results[:10]:
        print(f"{img}: {score}")