In [None]:
import math
import re
import warnings
from collections import defaultdict

import requests

# Wikipedia article titles that make up the demo corpus. The list order
# determines the 1-based document ids assigned by build_documents.
TITLES = [
    "Artificial intelligence",
    "Machine learning",
    "Natural language processing",
    "Deep learning",
    "Neural network",
    "Computer vision",
    "Data mining",
]


In [None]:
def fetch_wiki_plain(title: str, timeout=20):
    """Fetch the plain-text extract of an English Wikipedia article.

    The text is requested from the MediaWiki Action API
    (``action=query&prop=extracts&explaintext``). The title is passed as a
    query parameter so that ``requests`` URL-encodes it.

    Args:
        title: Article title, e.g. ``"Machine learning"``.
        timeout: Timeout in seconds forwarded to ``requests.get``.

    Returns:
        The article text as a string, or ``""`` when the request fails, the
        response is not the expected JSON, or no extract is available. A
        warning is issued in each of those cases so an empty corpus does not
        go unnoticed.
    """
    params = {
        "action": "query",
        "format": "json",
        "formatversion": "2",  # "pages" is returned as a list, not a dict keyed by page id
        "prop": "extracts",
        "explaintext": "1",    # plain text instead of HTML
        "redirects": "1",      # follow redirects to the target article
        "titles": title,
    }
    try:
        resp = requests.get(
            "https://en.wikipedia.org/w/api.php",
            params=params,
            timeout=timeout,
            headers={"User-Agent": "IR-Demo/1.0"},
        )
    except requests.RequestException as exc:
        warnings.warn(f"Request for {title!r} failed: {exc}")
        return ""

    if resp.status_code != 200:
        warnings.warn(f"Request for {title!r} returned HTTP {resp.status_code}")
        return ""

    try:
        pages = resp.json()["query"]["pages"]
        text = pages[0].get("extract", "") if pages else ""
    except (ValueError, KeyError, TypeError, IndexError, AttributeError):
        # ValueError covers a body that is not valid JSON; the others cover a
        # JSON document that does not have the expected query/pages layout.
        warnings.warn(f"Unexpected response format for {title!r}")
        return ""

    if not text:
        warnings.warn(f"No text extract available for {title!r}")
    return text or ""


In [None]:
def build_documents(titles):
    """Fetch every title and return the corpus keyed by 1-based document id.

    Each value is a dict with the keys ``"title"`` (the title as given) and
    ``"body"`` (whatever ``fetch_wiki_plain`` returns for that title).
    """
    return {
        doc_id: {"title": name, "body": fetch_wiki_plain(name)}
        for doc_id, name in enumerate(titles, start=1)
    }

# Build the corpus when this cell runs; this calls fetch_wiki_plain once per title.
DOCUMENTS = build_documents(TITLES)

In [None]:
def tokenize(text: str):
    """Lower-case ``text`` and split it into runs of word characters.

    Returns a list of tokens in order of appearance; falsy input (``""`` or
    ``None``) yields an empty list.
    """
    if not text:
        return []
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\w+", lowered)]

In [None]:
def build_inverted_index(documents):
    """Build a positional inverted index over document titles and bodies.

    Args:
        documents: Mapping of doc id to a dict with ``"title"`` and ``"body"``
            strings.

    Returns:
        A nested ``defaultdict``: ``index[term][doc_id]`` is a posting dict with
        ``"count"`` (occurrences across both fields), ``"positions"`` (token
        offsets per field) and ``"field"`` (occurrence count per field).
    """
    def _new_posting():
        # Fresh posting per (term, doc) so the lists are never shared.
        return {
            "count": 0,
            "positions": {"title": [], "body": []},
            "field": {"title": 0, "body": 0},
        }

    index = defaultdict(lambda: defaultdict(_new_posting))
    for doc_id, doc in documents.items():
        # Title is processed before body, matching the field order of a posting.
        for field_name in ("title", "body"):
            for position, token in enumerate(tokenize(doc[field_name])):
                posting = index[token][doc_id]
                posting["count"] += 1
                posting["positions"][field_name].append(position)
                posting["field"][field_name] += 1
    return index

# Index over the fetched corpus: term -> doc_id -> posting.
INVERTED_INDEX = build_inverted_index(DOCUMENTS)

In [None]:
# Corpus size; passed to compute_idf as N_docs.
N = len(DOCUMENTS)

def compute_idf(inverted_index, N_docs):
    """Compute a smoothed inverse document frequency for every indexed term.

    For a term appearing in ``df`` documents the weight is
    ``log(N_docs / (1 + df)) + 1``.

    Args:
        inverted_index: Mapping of term to a mapping of doc id to posting; only
            the number of doc ids per term is used.
        N_docs: Total number of documents in the corpus.

    Returns:
        A plain dict mapping each term to its IDF weight.
    """
    return {
        term: math.log(N_docs / (1 + len(doc_postings))) + 1
        for term, doc_postings in inverted_index.items()
    }

# Per-term IDF weights for the whole corpus.
IDF = compute_idf(INVERTED_INDEX, N)

In [None]:
def score_documents(query, inverted_index, idf, title_boost=0.5):
    """Score documents against a free-text query using TF-IDF plus a title bonus.

    Args:
        query: Raw query string; it is tokenized with ``tokenize``.
        inverted_index: Mapping of term to doc id to posting, where a posting
            has ``"count"`` and ``"field"`` entries.
        idf: Mapping of term to IDF weight; missing terms weigh ``0.0``.
        title_boost: Extra score added per title occurrence of a query term.

    Returns:
        A ``defaultdict(float)`` mapping doc id to accumulated score. Only
        documents containing at least one query term have an entry.
    """
    totals = defaultdict(float)
    for token in tokenize(query):
        if token in inverted_index:
            weight = idf.get(token, 0.0)
            for doc_id, posting in inverted_index[token].items():
                totals[doc_id] += posting["count"] * weight
                title_hits = posting["field"]["title"]
                if title_hits > 0:
                    totals[doc_id] += title_boost * title_hits
    return totals

In [None]:
def search_all(query):
    """Rank every document in the corpus against ``query``.

    Returns a list of ``(doc_id, score, title)`` tuples sorted by score in
    descending order. Documents that match no query term are kept with a score
    of ``0.0``.
    """
    doc_scores = score_documents(query, INVERTED_INDEX, IDF)
    rows = [
        (doc_id, doc_scores.get(doc_id, 0.0), entry["title"])
        for doc_id, entry in DOCUMENTS.items()
    ]
    rows.sort(key=lambda row: row[1], reverse=True)
    return rows

In [None]:
# Read a query from the user, rank every document against it, and print the
# ranking. The variable must be named user_query because the lines below read
# it under that name.
user_query = input("Enter query: ")

results = search_all(user_query)

print(f"\nQuery: {user_query}")
print("Ranking results:")
for rank, (doc_id, score, title) in enumerate(results, start=1):
    print(f"{rank}. Doc {doc_id} | {title} | score={score:.2f}")

Enter query: clustering

Query: clustering
Ranking results:
1. Doc 1 | Artificial intelligence | score=0.00
2. Doc 2 | Machine learning | score=0.00
3. Doc 3 | Natural language processing | score=0.00
4. Doc 4 | Deep learning | score=0.00
5. Doc 5 | Neural network | score=0.00
6. Doc 6 | Computer vision | score=0.00
7. Doc 7 | Data mining | score=0.00
