In [6]:
# =====================================================
# INFORMATION RETRIEVAL SYSTEM (Boolean + TF-IDF + BM25)
# =====================================================

import pandas as pd
import numpy as np
import nltk
import re
import string

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi

# Make sure the NLTK stop-word corpus is available.  Only call the downloader
# when the corpus cannot be found locally, so repeated (or offline) runs do
# not invoke it needlessly.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download('stopwords')

# =====================================================
# 1. LOAD DATASET (Fix UnicodeDecodeError)
# =====================================================

csv_path = r"D:\MSCS24047\3rd Semester\IR&TM\HW3\Articles.csv"

# Encodings to attempt, in order.  "latin1" maps every possible byte to a
# character, so it never raises a decode error and acts as the final fallback.
# ("ISO-8859-1" is another name for the same codec, so it is not listed twice.)
encodings_to_try = ["utf-8", "latin1"]

df = None
for enc in encodings_to_try:
    try:
        # NOTE: on_bad_lines='skip' silently drops rows the parser cannot read.
        df = pd.read_csv(csv_path, encoding=enc, on_bad_lines='skip')
    except UnicodeDecodeError as e:
        # Only a decoding failure justifies retrying with another encoding;
        # a missing file or a parser error is left to propagate immediately.
        print(f"Failed with encoding {enc}: {e}")
    else:
        print(f"Loaded using encoding: {enc}")
        break

if df is None:
    raise RuntimeError("Could not read file with any encoding.")

print("Dataset Loaded Successfully!")
print("Total documents:", len(df))
print("Columns:", df.columns.tolist())


# =====================================================
# 2. SELECT TEXT COLUMN (Automatic Detection)
# =====================================================

possible_cols = ["text", "article", "content", "body", "description"]

# Pick the first column whose lower-cased name is a known text-column name.
# str() keeps the comparison safe if a column label is not a string.
text_col = next(
    (col for col in df.columns if str(col).lower() in possible_cols),
    None,
)

if text_col is None:
    # If nothing matches, select first column
    text_col = df.columns[0]

print(f"Using TEXT COLUMN: {text_col}")

# Clean missing values.  The fill must happen BEFORE the cast: astype(str)
# turns NaN into the literal string "nan", after which fillna("") has nothing
# left to replace and "nan" would be indexed as a real word.
df[text_col] = df[text_col].fillna("").astype(str)


# =====================================================
# 3. TEXT PREPROCESSING
# =====================================================

# English stop words from NLTK, held in a set so the per-token membership
# test in tokenize() is a hash lookup rather than a list scan.
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalise a raw string for indexing.

    The text is lower-cased, every character that is not an ASCII letter,
    a digit or whitespace is replaced by a single space, and leading and
    trailing whitespace is removed.  Runs of interior spaces are kept.
    """
    lowered = text.lower()
    return re.sub(r"[^a-zA-Z0-9\s]", " ", lowered).strip()

def tokenize(text):
    """Split text into cleaned, whitespace-delimited tokens without stop words.

    The input is normalised with ``clean_text`` first; any token found in
    ``stop_words`` is dropped.  Returns a list of strings in original order.
    """
    return [word for word in clean_text(text).split() if word not in stop_words]

# Store the normalised text (input to TF-IDF) and the stop-word-free token
# list (input to Boolean search and BM25) alongside each document.
df["clean_text"] = df[text_col].map(clean_text)
df["tokens"] = df["clean_text"].map(tokenize)


# =====================================================
# 4. BOOLEAN RETRIEVAL
# =====================================================

def boolean_search(query, top_k=5):
    """Return documents that contain every query term (Boolean AND).

    Args:
        query: Free-text query; it is tokenised with ``tokenize``.
        top_k: Maximum number of rows to return (default 5).

    Returns:
        A DataFrame holding at most ``top_k`` matching rows of ``df`` in
        corpus order (the matches are not ranked).  The frame is empty when
        the query has no terms left after stop-word removal.
    """
    required = set(tokenize(query))
    if not required:
        # With no terms, an "all terms present" test is vacuously true and
        # would report every document as a match.
        return df.iloc[0:0]

    # Subset test against each document's tokens instead of one linear list
    # scan per query term.
    mask = df["tokens"].apply(lambda doc_tokens: required.issubset(doc_tokens))
    return df[mask].head(top_k)


# =====================================================
# 5. TF-IDF RETRIEVAL
# =====================================================

# Fit the TF-IDF vocabulary/weights on the cleaned corpus and keep the
# resulting document-term matrix for query-time scoring.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["clean_text"])

def retrieve_tfidf(query, top_k=5):
    """Rank documents by cosine similarity between TF-IDF vectors.

    Args:
        query: Free-text query.
        top_k: Maximum number of rows to return (default 5).

    Returns:
        A DataFrame with at most ``top_k`` rows of ``df``, best match first.
        Documents whose similarity to the query is zero are never returned,
        so the frame is empty when no query term occurs in the corpus.
    """
    # Normalise the query the same way the indexed documents were normalised,
    # so both sides of the comparison are tokenised consistently.
    q_vec = vectorizer.transform([clean_text(query)])
    scores = cosine_similarity(q_vec, tfidf_matrix).flatten()

    # Rank only documents with a non-zero score; otherwise a query with no
    # matching terms would return arbitrary documents.
    matched = np.flatnonzero(scores)
    best = matched[np.argsort(scores[matched])[::-1][:top_k]]
    return df.iloc[best]


# =====================================================
# 6. BM25 RETRIEVAL
# =====================================================

# Build the BM25 index from the per-document token lists (passed as a plain
# list of lists rather than a pandas Series).
bm25 = BM25Okapi(df["tokens"].tolist())

def retrieve_bm25(query, top_k=5):
    """Rank documents with BM25 (Okapi) for the given query.

    Args:
        query: Free-text query; it is tokenised with ``tokenize``.
        top_k: Maximum number of rows to return (default 5).

    Returns:
        A DataFrame with at most ``top_k`` rows of ``df``, highest score
        first.  Documents with a score of exactly zero are never returned,
        so the frame is empty when nothing in the corpus matches.
    """
    q_tokens = tokenize(query)
    scores = np.asarray(bm25.get_scores(q_tokens))

    # Rank only documents with a non-zero score; otherwise an unmatched or
    # stop-word-only query would return arbitrary documents.
    matched = np.flatnonzero(scores)
    best = matched[np.argsort(scores[matched])[::-1][:top_k]]
    return df.iloc[best]


# =====================================================
# 7. RUN A SAMPLE QUERY
# =====================================================

query = "economic policy inflation government"

# Run the same query through each retrieval model and print the text column
# of whatever it returns, under a banner naming the model.
for banner, search in (
    ("\n===== BOOLEAN RESULTS =====", boolean_search),
    ("\n===== TF-IDF RESULTS =====", retrieve_tfidf),
    ("\n===== BM25 RESULTS =====", retrieve_bm25),
):
    print(banner)
    print(search(query)[text_col])


[nltk_data] Downloading package stopwords to C:\Users\SYED ZAIN UL
[nltk_data]     AIDEEN/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Failed with encoding utf-8: 'utf-8' codec can't decode byte 0xb4 in position 799: invalid start byte
Loaded using encoding: latin1
Dataset Loaded Successfully!
Total documents: 2692
Columns: ['Article', 'Date', 'Heading', 'NewsType']
Using TEXT COLUMN: Article

===== BOOLEAN RESULTS =====
170    ISLAMABAD: Finance Minister Ishaq Dar presente...
275    Singapore: Oil prices eased in Asian trade Mon...
284    Hong Kong: A mixed reading on Chinese inflatio...
354    Hong Kong: Asian stock markets tumbled and the...
359    KARACHI:  State Bank of Pakistan (SBP) on Satu...
Name: Article, dtype: object

===== TF-IDF RESULTS =====
2521    strong>KARACHI: The State Bank of Pakistan ann...
1001    strong>KARACHI: State Bank of Pakistan (SBP) o...
556     strong>KARACHI: The Monetary Policy Committee ...
453     strong>ISLAMABAD: Pakistan´s annual consumer i...
81      KARACHI: The announcement of Monetary Policy f...
Name: Article, dtype: object

===== BM25 RESULTS =====
81      KARACHI: The an