<a href="https://colab.research.google.com/github/mscs24037/IR-assignment_03/blob/main/IR_task_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
!pip install nltk pandas numpy matplotlib scikit-learn
!pip install nltk
import os
import json
import re
import time
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# NLTK data download
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
print("Setup complete!")

Setup complete!


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [46]:
#Document Loading
csv_path = '/content/Document/Articles.csv'

encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']

df = None
for encoding in encodings:
    try:
        print(f"Trying encoding: {encoding}")
        df = pd.read_csv(csv_path, encoding=encoding)
        print(f"✓ CSV loaded successfully with {encoding}!")
        break
    except:
        continue

if df is None:
    print(" Could not load CSV with any encoding")
else:
    print(f"Total rows: {len(df)}")
    print(f"Columns: {list(df.columns)}")

    # Find text column
    text_col = None
    max_length = 0

    for col in df.columns:
        try:
            avg_len = df[col].astype(str).str.len().mean()
            if avg_len > max_length:
                max_length = avg_len
                text_col = col
        except:
            continue

    print(f"\n✓ Text column detected: '{text_col}'")

    # Extract documents and generate doc_ids
    documents = df[text_col].astype(str).tolist()
    doc_ids = [f'doc_{i}' for i in df.index]

Trying encoding: utf-8
Trying encoding: latin-1
✓ CSV loaded successfully with latin-1!
Total rows: 2692
Columns: ['Article', 'Date', 'Heading', 'NewsType']

✓ Text column detected: 'Article'


In [47]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [48]:
#Text Preprocessing
class SimplePreprocessor:

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        # Lowercase
        text = text.lower()
        # Numbers aur special characters remove
        text = re.sub(r'[^a-z\s]', ' ', text)
        # Extra spaces remove
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def tokenize(self, text):
        """Tokenization"""
        return text.split()

    def remove_stopwords(self, tokens):
        """Stopwords remove"""
        return [token for token in tokens if token not in self.stop_words and len(token) > 2]

    def stem_tokens(self, tokens):
        """Stemming"""
        return [self.stemmer.stem(token) for token in tokens]

    def preprocess(self, text):
        """Complete preprocessing"""
        text = self.clean_text(text)
        tokens = self.tokenize(text)
        tokens = self.remove_stopwords(tokens)
        tokens = self.stem_tokens(tokens)
        return ' '.join(tokens)
preprocessor = SimplePreprocessor()
print("Processing documents...")
processed_docs = []
for i, doc in enumerate(documents):
    processed_doc = preprocessor.preprocess(doc)
    processed_docs.append(processed_doc)
    if (i + 1) % 100 == 0:
        print(f"Processed {i + 1}/{len(documents)} documents")

print(f"Preprocessing complete! Total: {len(processed_docs)} documents")

Processing documents...
Processed 100/2692 documents
Processed 200/2692 documents
Processed 300/2692 documents
Processed 400/2692 documents
Processed 500/2692 documents
Processed 600/2692 documents
Processed 700/2692 documents
Processed 800/2692 documents
Processed 900/2692 documents
Processed 1000/2692 documents
Processed 1100/2692 documents
Processed 1200/2692 documents
Processed 1300/2692 documents
Processed 1400/2692 documents
Processed 1500/2692 documents
Processed 1600/2692 documents
Processed 1700/2692 documents
Processed 1800/2692 documents
Processed 1900/2692 documents
Processed 2000/2692 documents
Processed 2100/2692 documents
Processed 2200/2692 documents
Processed 2300/2692 documents
Processed 2400/2692 documents
Processed 2500/2692 documents
Processed 2600/2692 documents
Preprocessing complete! Total: 2692 documents


In [49]:
 #TF-IDF Retrieval System
class TFIDFRetriever:
    def __init__(self, documents, doc_ids):
        self.documents = documents
        self.doc_ids = doc_ids

        self.vectorizer = TfidfVectorizer(
            max_features=5000,  # Top 5000 features
            min_df=2,           # Minimum should be in 2 documents
            max_df=0.8,         # should be availabe more than 80%
            ngram_range=(1, 2)  # Unigrams and bigrams
        )
        #make document vector
        self.doc_vectors = self.vectorizer.fit_transform(documents)
        print(f"Index built! Shape: {self.doc_vectors.shape}")
        print(f"Vocabulary size: {len(self.vectorizer.vocabulary_)}")

    def search(self, query, top_k=10):

        # make Query vector
        query_vector = self.vectorizer.transform([query])

        # calculate Cosine similarity
        similarities = cosine_similarity(query_vector, self.doc_vectors)[0]

        # find Top-k indices
        top_indices = np.argsort(similarities)[::-1][:top_k]

        # Results prepare
        results = []
        for rank, idx in enumerate(top_indices, 1):
            if similarities[idx] > 0:  # Only non-zero scores
                results.append({
                    'rank': rank,
                    'doc_id': self.doc_ids[idx],
                    'score': float(similarities[idx])
                })

        return results

    def get_vocabulary_stats(self):
        """Vocabulary statistics"""
        vocab = self.vectorizer.vocabulary_
        feature_names = self.vectorizer.get_feature_names_out()

        print(f"\nVocabulary Statistics:")
        print(f"Total terms: {len(vocab)}")
        print(f"Sample terms: {list(feature_names[:10])}")

    # Step 2: Create doc_ids
    print("\nCreating document IDs...")
    doc_ids = [f"doc_{i+1}" for i in range(len(processed_docs))]
    print(f"Created {len(doc_ids)} document IDs")

    # Step 3: Now create the retriever
    print("\nInitializing TF-IDF retriever...")
    retriever = TFIDFRetriever(processed_docs, doc_ids)
    retriever.get_vocabulary_stats()




Creating document IDs...
Created 2692 document IDs

Initializing TF-IDF retriever...
Index built! Shape: (2692, 5000)
Vocabulary size: 5000

Vocabulary Statistics:
Total terms: 5000
Sample terms: ['aaron', 'aaron finch', 'abandon', 'abbott', 'abdul', 'abdullah', 'abil', 'abl', 'abroad', 'absenc']


In [50]:
#Search Function
def search_documents(query, top_k=10, show_preview=True):

    # Query preprocess
    processed_query = preprocessor.preprocess(query)

    # Search
    start_time = time.time()
    results = retriever.search(processed_query, top_k)
    end_time = time.time()

    print(f"Search completed in {end_time - start_time:.4f} seconds")
    print(f"Found {len(results)} relevant documents\n")

    if not results:
        print("No relevant documents found!")
        return results

    # Results display
    for result in results:
        print(f"Rank {result['rank']}: {result['doc_id']}")
        print(f"Relevance Score: {result['score']:.4f}")

results1 = search_documents("honk kong", top_k=5, show_preview=False)

Search completed in 0.0128 seconds
Found 5 relevant documents

Rank 1: doc_12
Relevance Score: 0.3552
Rank 2: doc_3
Relevance Score: 0.3470
Rank 3: doc_388
Relevance Score: 0.3426
Rank 4: doc_2460
Relevance Score: 0.3224
Rank 5: doc_68
Relevance Score: 0.3203


In [51]:
# EVALUATION
class SimpleEvaluator:
    def __init__(self, retriever, preprocessor):
        self.retriever = retriever
        self.preprocessor = preprocessor

    def precision_at_k(self, results, relevant_docs, k):
        top_k = results[:k]
        relevant_retrieved = sum(1 for r in top_k if r['doc_id'] in relevant_docs)
        return relevant_retrieved / k if k > 0 else 0

    def recall_at_k(self, results, relevant_docs, k):
        top_k = results[:k]
        relevant_retrieved = sum(1 for r in top_k if r['doc_id'] in relevant_docs)
        total_relevant = len(relevant_docs)
        return relevant_retrieved / total_relevant if total_relevant > 0 else 0

    def mean_reciprocal_rank(self, results, relevant_docs):
        for rank, result in enumerate(results, 1):
            if result['doc_id'] in relevant_docs:
                return 1.0 / rank
        return 0.0

# Initialize evaluator
evaluator = SimpleEvaluator(retriever, preprocessor)

# Test query
query = 'HONG KONG'
relevant = ['doc_5']

# Process query
processed_query = preprocessor.preprocess(query)
results = retriever.search(processed_query, top_k=10)

# Calculate metrics
p_at_5 = evaluator.precision_at_k(results, relevant, 5)
r_at_5 = evaluator.recall_at_k(results, relevant, 5)
mrr = evaluator.mean_reciprocal_rank(results, relevant)

# Print results
print(f"\nQuery: {query}")
print(f"Relevant docs: {relevant}")
print(f"\nTop 10 Results:")
for r in results:
    marker = "✓" if r['doc_id'] in relevant else " "
    print(f"  {marker} Rank {r['rank']}: {r['doc_id']} (Score: {r['score']:.4f})")

print(f"\nMetrics:")
print(f"  Precision@5: {p_at_5:.3f}")
print(f"  Recall@5: {r_at_5:.3f}")
print(f"  MRR: {mrr:.3f}")


Query: HONG KONG
Relevant docs: ['doc_5']

Top 10 Results:
    Rank 1: doc_12 (Score: 0.6156)
    Rank 2: doc_3 (Score: 0.6015)
    Rank 3: doc_388 (Score: 0.5939)
    Rank 4: doc_2460 (Score: 0.5587)
    Rank 5: doc_68 (Score: 0.5552)
    Rank 6: doc_1408 (Score: 0.5458)
    Rank 7: doc_113 (Score: 0.4813)
    Rank 8: doc_1406 (Score: 0.4670)
    Rank 9: doc_567 (Score: 0.4388)
    Rank 10: doc_2590 (Score: 0.4348)

Metrics:
  Precision@5: 0.000
  Recall@5: 0.000
  MRR: 0.000


In [52]:
#System Statistics
def display_system_statistics():

    print(f"\n Document Statistics:")
    print(f"  Total documents: {len(documents)}")

    doc_lengths = [len(doc) for doc in documents]
    print(f"  Avg characters per doc: {np.mean(doc_lengths):.0f}")
    print(f"  Min characters: {np.min(doc_lengths)}")
    print(f"  Max characters: {np.max(doc_lengths)}")

    # Processed documents
    processed_lengths = [len(doc.split()) for doc in processed_docs]
    print(f"\n  After preprocessing:")
    print(f"  Avg tokens per doc: {np.mean(processed_lengths):.0f}")
    print(f"  Min tokens: {np.min(processed_lengths)}")
    print(f"  Max tokens: {np.max(processed_lengths)}")

    # TF-IDF statistics
    print(f"\n TF-IDF Index Statistics:")
    print(f"  Vocabulary size: {len(retriever.vectorizer.vocabulary_)}")
    print(f"  Matrix shape: {retriever.doc_vectors.shape}")
    print(f"  Matrix density: {retriever.doc_vectors.nnz / (retriever.doc_vectors.shape[0] * retriever.doc_vectors.shape[1]):.4f}")

display_system_statistics()


 Document Statistics:
  Total documents: 2692
  Avg characters per doc: 1810
  Min characters: 216
  Max characters: 19199

  After preprocessing:
  Avg tokens per doc: 178
  Min tokens: 23
  Max tokens: 1784

 TF-IDF Index Statistics:
  Vocabulary size: 5000
  Matrix shape: (2692, 5000)
  Matrix density: 0.0255


In [53]:
#Complete Test Run
def search_documents(query, top_k=5, show_preview=False):

    if 'retriever' not in globals():
        print("ERROR: TFIDFRetriever not initialized. Run setup first.")
        return []
    results = retriever.search(query, top_k=top_k)

# Multiple test queries
test_queries = [
    "artificial intelligence",
    "computer networks"
]

print("\nRunning test queries...\n")

for i, query in enumerate(test_queries, 1):
    print(f"\nTest {i}/{len(test_queries)}")
    print(f"Query: '{query}'")

    results = search_documents(query, top_k=3, show_preview=False)

    if results:
        print(f"Found {len(results)} results:")
        for result in results:
            print(f"  Rank {result['rank']}: {result['doc_id']} (Score: {result['score']:.4f})")
    else:
        print("  No results found")


Running test queries...


Test 1/2
Query: 'artificial intelligence'
  No results found

Test 2/2
Query: 'computer networks'
  No results found
