In [88]:
# !pip install PyPDF2 beautifulsoup4 nltk scikit-learn matplotlib

In [89]:
import os
import re
import math
import string
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from difflib import get_close_matches

# Download required NLTK data. Re-running is cheap: packages that are already
# installed are reported as up-to-date.
#   punkt / punkt_tab - tokenizer data for word_tokenize (newer NLTK releases
#                       look for punkt_tab; NOTE(review): confirm which one
#                       the installed NLTK version actually needs)
#   stopwords         - word lists behind stopwords.words('english')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/rajitroy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rajitroy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/rajitroy/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# 1. DATA LOADING AND PREPROCESSING
# ================================

In [90]:
def extract_text_from_file(file_path):
    """
    Extract the text content of an HTML, PDF, or plain-text file.

    The handler is chosen from the file extension, compared
    case-insensitively: ``.html`` is parsed with BeautifulSoup, ``.pdf`` is
    read page by page with PyPDF2, and every other extension is read as
    UTF-8 text. Bytes that are not valid UTF-8 are dropped in both the HTML
    and the text branch, so a single bad byte no longer discards a whole
    HTML document. Runs of whitespace are collapsed to single spaces.

    Args:
        file_path (str): Path to the file

    Returns:
        str: Extracted text content, or "" when extraction fails (the error
        is printed, not raised)
    """
    print("Extracting text from file")
    try:
        # Compare extensions case-insensitively so 'REPORT.PDF' is not
        # mistaken for a plain-text file.
        lowered_path = file_path.lower()

        if lowered_path.endswith('.html'):
            # errors='ignore' mirrors the plain-text branch below.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()

            # Parse HTML with BeautifulSoup
            soup = BeautifulSoup(content, 'html.parser')
            text = soup.get_text(separator=' ', strip=True)

        elif lowered_path.endswith('.pdf'):
            # PyPDF2 is imported lazily so it is only required when a PDF
            # is actually processed.
            import PyPDF2

            page_texts = []
            with open(file_path, 'rb') as file:  # PDFs must be opened in binary mode
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    if page_text:  # Some pages might not have extractable text
                        page_texts.append(page_text)
            text = " ".join(page_texts)

        else:  # For text files
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                text = file.read()

        # Remove extra whitespace
        return re.sub(r'\s+', ' ', text).strip()
    except Exception as e:
        # Best effort: one unreadable file must not abort loading the corpus.
        print(f"Error extracting text from {file_path}: {e}")
        return ""

In [91]:
def load_documents(base_dir):
    """
    Load all supported documents found under a directory tree.

    Every file ending in .html, .txt, .csv or .pdf is passed to
    extract_text_from_file; files that yield no text are skipped. The name
    of the directory that directly contains a file is used as its category.
    Directories and files are visited in sorted order so that document IDs
    are the same on every run and on every platform.

    Args:
        base_dir (str): Base directory containing subdirectories for categories

    Returns:
        tuple: (documents, doc_paths) where
            documents (dict): document ID -> info dict with the keys 'id',
                'name', 'category', 'path', 'text', 'tokens' and 'term_freq'
                (the last two are None until later pipeline stages fill them)
            doc_paths (dict): document ID -> file path
    """
    supported_extensions = ('.html', '.txt', '.csv', '.pdf')
    documents = {}
    doc_paths = {}
    doc_id = 0

    for root, dirs, files in os.walk(base_dir):
        # os.walk lists entries in arbitrary OS order; sorting the directory
        # list in place fixes the traversal order of the remaining walk.
        dirs.sort()
        category = os.path.basename(root)

        for file in sorted(files):
            if not file.endswith(supported_extensions):
                continue

            file_path = os.path.join(root, file)
            text = extract_text_from_file(file_path)

            # Skip if no text was extracted
            if not text:
                continue

            documents[doc_id] = {
                'id': doc_id,
                'name': f"{category}_{file}",
                'category': category,
                'path': file_path,
                'text': text,
                'tokens': None,  # Will be populated during preprocessing
                'term_freq': None,  # Will be populated during TF-IDF calculation
            }
            doc_paths[doc_id] = file_path
            doc_id += 1

    print(f"Loaded {len(documents)} documents from {base_dir}")
    return documents, doc_paths

In [92]:
def preprocess_text(text):
    """
    Turn raw text into a list of normalized index terms.

    The text is lowercased and tokenized with word_tokenize; tokens that are
    not purely alphabetic are discarded, English stopwords are removed, and
    the surviving tokens are reduced to their Porter stems.

    Args:
        text (str): Input text

    Returns:
        list: Stemmed tokens, in their original order
    """
    candidate_words = word_tokenize(text.lower())

    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()

    # One pass: keep alphabetic non-stopwords and stem them on the way out.
    return [
        porter.stem(word)
        for word in candidate_words
        if word.isalpha() and word not in english_stopwords
    ]


In [93]:
def preprocess_documents(documents):
    """
    Tokenize every document in the collection.

    Each document's 'text' is run through preprocess_text and the result is
    stored under its 'tokens' key. The dictionary is modified in place.

    Args:
        documents (dict): Dictionary of documents

    Returns:
        dict: The same dictionary, now carrying preprocessed tokens
    """
    for record in documents.values():
        record['tokens'] = preprocess_text(record['text'])

    return documents


# 2. INVERTED INDEX AND TF-IDF
# ============================


In [94]:
def build_inverted_index(documents):
    """
    Map every term to the list of documents that contain it.

    A document appears at most once in a term's posting list, and posting
    lists follow the iteration order of ``documents``.

    Args:
        documents (dict): Dictionary of documents (each with a 'tokens' list)

    Returns:
        dict: Inverted index mapping terms to document IDs
    """
    postings = {}

    for doc_id, record in documents.items():
        # set() collapses repeated tokens so each document is listed once
        for term in set(record['tokens']):
            postings.setdefault(term, []).append(doc_id)

    return postings


In [95]:
def calculate_term_frequencies(documents):
    """
    Count how often each token occurs in each document.

    The counts are stored as a Counter under each document's 'term_freq'
    key. The dictionary is modified in place.

    Args:
        documents (dict): Dictionary of documents (each with a 'tokens' list)

    Returns:
        dict: The same dictionary, now carrying term frequencies
    """
    for record in documents.values():
        record['term_freq'] = Counter(record['tokens'])

    return documents


In [96]:
def calculate_tfidf(documents, inverted_index):
    """
    Compute TF-IDF weights for every term of every document.

    IDF is log10(N / document frequency), with N the number of documents.
    TF is the term count divided by the document's token count. A term that
    is missing from the inverted index gets an IDF of 0.

    Args:
        documents (dict): Dictionary of documents (with 'tokens' and 'term_freq')
        inverted_index (dict): Inverted index mapping terms to document IDs

    Returns:
        dict: TF-IDF scores, document ID -> {term: score}
        dict: Document vectors for similarity calculations; same contents as
            the scores, held in separate dict objects
    """
    corpus_size = len(documents)

    idf = {
        term: math.log10(corpus_size / len(postings))
        for term, postings in inverted_index.items()
    }

    tfidf = {}
    doc_vectors = {}

    for doc_id, record in documents.items():
        token_total = len(record['tokens'])

        weights = {
            term: (count / token_total) * idf.get(term, 0)
            for term, count in record['term_freq'].items()
        }

        tfidf[doc_id] = weights
        # Independent copy, so changing one mapping never affects the other
        doc_vectors[doc_id] = dict(weights)

    return tfidf, doc_vectors


In [97]:
def display_inverted_index(inverted_index, top_n=10):
    """
    Print the terms that occur in the most documents.

    Terms are ordered by posting-list length, longest first. For each term
    the table shows the document count and up to five document IDs; longer
    posting lists are cut off and marked with "...".

    Args:
        inverted_index (dict): Inverted index mapping terms to document IDs
        top_n (int): Number of terms to display

    Returns:
        None
    """
    row_layout = "{:<20} {:<10} {:<20}"
    by_doc_count = sorted(
        inverted_index.items(),
        key=lambda entry: len(entry[1]),
        reverse=True,
    )

    print(f"Top {top_n} terms in the inverted index:")
    print("=" * 50)
    print(row_layout.format("Term", "Doc Count", "Documents"))
    print("-" * 50)

    for term, postings in by_doc_count[:top_n]:
        if len(postings) > 5:
            preview = str(postings[:5]) + "..."
        else:
            preview = str(postings)
        print(row_layout.format(term, len(postings), preview))


# 3. SIMILARITY CALCULATION
# =========================


In [98]:
def calculate_similarity_matrix(doc_vectors):
    """
    Calculate the cosine similarity between every pair of documents.

    A document's similarity to itself is fixed at 1.0. If either vector of
    a pair has zero magnitude (for example an empty document), the
    similarity is 0.0.

    Args:
        doc_vectors (dict): Document vectors, document ID -> {term: weight}

    Returns:
        dict: Similarity matrix, document ID -> {document ID: similarity}
    """
    # A vector's magnitude depends on that document alone, so compute it
    # once per document instead of once per pair.
    magnitudes = {
        doc_id: math.sqrt(sum(weight ** 2 for weight in vector.values()))
        for doc_id, vector in doc_vectors.items()
    }

    similarity_matrix = {}

    for doc_id1, vec1 in doc_vectors.items():
        row = {}

        for doc_id2, vec2 in doc_vectors.items():
            if doc_id1 == doc_id2:
                row[doc_id2] = 1.0
                continue

            denominator = magnitudes[doc_id1] * magnitudes[doc_id2]
            if denominator == 0:
                # Nothing to compare against; also avoids dividing by zero
                row[doc_id2] = 0.0
                continue

            # Only terms present in both vectors contribute to the dot product
            dot_product = 0
            for term, weight in vec1.items():
                if term in vec2:
                    dot_product += weight * vec2[term]

            row[doc_id2] = dot_product / denominator

        similarity_matrix[doc_id1] = row

    return similarity_matrix


In [99]:
def get_most_similar_documents(similarity_matrix, doc_id, top_n=5):
    """
    Return the documents most similar to a given document.

    The document itself is left out of the result. Documents with equal
    scores keep the order they have in the similarity matrix row.

    Args:
        similarity_matrix (dict): Similarity matrix for document pairs
        doc_id (int): Document ID
        top_n (int): Number of similar documents to return

    Returns:
        list: Up to top_n (document ID, score) pairs, best score first
    """
    neighbours = [
        (other_id, score)
        for other_id, score in similarity_matrix[doc_id].items()
        if other_id != doc_id
    ]

    # list.sort is stable, so equal scores keep their original order
    neighbours.sort(key=lambda pair: pair[1], reverse=True)

    return neighbours[:top_n]


# 4. RECOMMENDER SYSTEM
# =====================


In [100]:
def search(query, documents, inverted_index, doc_vectors, tolerance=0.8):
    """
    Search for documents matching a query and rank them by cosine similarity.

    Each query token is looked up in the inverted index. A token that is not
    in the index is replaced by its closest index terms (difflib's
    get_close_matches, at most three, similarity >= tolerance). The
    replacement terms also go into the query vector, so documents found
    through fuzzy matching receive a real score instead of 0. The token's
    weight is split evenly between its replacements. A token with no match
    at all stays in the query vector and only affects the query magnitude.

    The query vector holds relative term frequencies (no IDF weighting); it
    is compared against the TF-IDF document vectors.

    Args:
        query (str): Search query
        documents (dict): Dictionary of documents (not used by the ranking;
            kept so existing callers keep working)
        inverted_index (dict): Inverted index mapping terms to document IDs
        doc_vectors (dict): Document vectors for similarity calculations
        tolerance (float): Tolerance threshold for fuzzy matching

    Returns:
        list: (document ID, similarity) pairs sorted by similarity,
        highest first; empty when nothing matches
    """
    # Preprocess the query
    query_tokens = preprocess_text(query)

    # If no valid tokens after preprocessing, return empty result
    if not query_tokens:
        return []

    matching_docs = set()
    query_vector = {}
    vocabulary = None  # built lazily: only needed when a token needs fuzzy matching

    for query_term in query_tokens:
        if query_term in inverted_index:
            resolved_terms = [query_term]
        else:
            if vocabulary is None:
                vocabulary = list(inverted_index.keys())
            resolved_terms = get_close_matches(
                query_term, vocabulary, n=3, cutoff=tolerance
            )

        if not resolved_terms:
            # Unknown term: it matches no document but still counts toward
            # the query magnitude.
            query_vector[query_term] = query_vector.get(query_term, 0) + 1
            continue

        # One query token carries a total weight of 1, shared by the index
        # terms it resolved to.
        share = 1 / len(resolved_terms)
        for term in resolved_terms:
            matching_docs.update(inverted_index[term])
            query_vector[term] = query_vector.get(term, 0) + share

    # If no matching documents found, return empty result
    if not matching_docs:
        return []

    # Normalize query vector by the number of query tokens
    query_length = len(query_tokens)
    for term in query_vector:
        query_vector[term] /= query_length

    # The query magnitude is the same for every document
    query_mag = math.sqrt(sum(w ** 2 for w in query_vector.values()))

    similarities = []

    for doc_id in matching_docs:
        doc_vector = doc_vectors[doc_id]

        dot_product = 0
        for term, weight in query_vector.items():
            if term in doc_vector:
                dot_product += weight * doc_vector[term]

        doc_mag = math.sqrt(sum(w ** 2 for w in doc_vector.values()))

        denominator = query_mag * doc_mag
        similarity = dot_product / denominator if denominator else 0

        similarities.append((doc_id, similarity))

    # Sort by similarity score (descending)
    return sorted(similarities, key=lambda x: x[1], reverse=True)


In [101]:
def display_search_results(results, documents, top_n=5):
    """
    Print the best-ranked search results.

    For each of the first top_n results the rank, document name, category,
    similarity score and a snippet are printed. The snippet is the whole
    text when it has at most 150 characters; otherwise it is the first 150
    characters, stripped, followed by "...".

    Args:
        results (list): Ranked list of (document ID, score) pairs
        documents (dict): Dictionary of documents
        top_n (int): Number of results to display

    Returns:
        None
    """
    if not results:
        print("No matching documents found.")
        return

    separator = "-" * 80

    print(f"Found {len(results)} matching documents.")
    print("=" * 80)

    for rank, (doc_id, score) in enumerate(results[:top_n], start=1):
        record = documents[doc_id]
        name = record['name']
        category = record['category']
        body = record['text']

        if len(body) > 150:
            snippet = body[:150].strip() + "..."
        else:
            snippet = body

        print(f"Rank {rank}: {name} [Category: {category}]")
        print(f"Similarity Score: {score:.4f}")
        print(f"Snippet: {snippet}")
        print(separator)


# 5. PERFORMANCE EVALUATION
# ========================


In [102]:
def evaluate_search(test_queries, documents, inverted_index, doc_vectors, top_k=10):
    """
    Evaluate search performance using test queries with relevance judgments.

    Every test query is run through search(). Precision, recall and F1 are
    measured on the top_k ranked results; average precision (AP) is measured
    over the complete ranking. Queries without any relevant document are
    skipped, because recall and AP are undefined for them.

    Args:
        test_queries (dict or iterable): Query specifications, each a dict
            with 'query' (str) and 'relevant_docs' (iterable of document
            IDs). When a dict is passed, only its values are used.
        documents (dict): Dictionary of documents
        inverted_index (dict): Inverted index mapping terms to document IDs
        doc_vectors (dict): Document vectors for similarity calculations
        top_k (int or None): Ranking cutoff for precision/recall/F1;
            None evaluates every returned document

    Returns:
        dict: Performance metrics with the keys
            'precision', 'recall', 'f1_score': per-query lists
            'per_query_avg_precision': per-query AP list
            'avg_precision': mean of the per-query precision values
            'avg_recall', 'avg_f1_score': means of the per-query values
            'avg_avg_precision': mean average precision (MAP)
        All averages are 0 when no query could be evaluated.
    """
    metrics = {
        'precision': [],
        'recall': [],
        'f1_score': [],
        'avg_precision': []
    }

    if isinstance(test_queries, dict):
        query_specs = test_queries.values()
    else:
        query_specs = test_queries

    for spec in query_specs:
        relevant = set(spec['relevant_docs'])
        if not relevant:
            continue

        ranking = search(spec['query'], documents, inverted_index, doc_vectors)
        ranked_ids = [doc_id for doc_id, _ in ranking]
        retrieved = ranked_ids[:top_k]  # a None cutoff keeps the whole ranking

        hits = sum(1 for doc_id in retrieved if doc_id in relevant)
        precision = hits / len(retrieved) if retrieved else 0.0
        recall = hits / len(relevant)
        if precision + recall:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0

        # AP: mean of precision@rank taken at the rank of each relevant hit
        hits_so_far = 0
        precision_sum = 0.0
        for rank, doc_id in enumerate(ranked_ids, start=1):
            if doc_id in relevant:
                hits_so_far += 1
                precision_sum += hits_so_far / rank
        average_precision = precision_sum / len(relevant)

        metrics['precision'].append(precision)
        metrics['recall'].append(recall)
        metrics['f1_score'].append(f1)
        metrics['avg_precision'].append(average_precision)

    # Calculate average metrics
    avg_metrics = {}
    for metric_name, values in metrics.items():
        avg_metrics[f'avg_{metric_name}'] = sum(values) / len(values) if values else 0

    # 'avg_' + 'precision' is the same key as the per-query AP list, so the
    # update below replaces that list with the mean precision (the value the
    # report prints as "Average Precision"). Keep the list under its own key.
    per_query_ap = metrics['avg_precision']
    metrics.update(avg_metrics)
    metrics['per_query_avg_precision'] = per_query_ap

    return metrics

In [103]:
def display_evaluation_results(metrics):
    """
    Print the averaged evaluation metrics, four decimals each.

    Args:
        metrics (dict): Performance metrics; must contain 'avg_precision',
            'avg_recall', 'avg_f1_score' and 'avg_avg_precision'

    Returns:
        None
    """
    report_rows = (
        ("Average Precision", 'avg_precision'),
        ("Average Recall", 'avg_recall'),
        ("Average F1 Score", 'avg_f1_score'),
        ("Mean Average Precision (MAP)", 'avg_avg_precision'),
    )

    print("Search System Performance Evaluation")
    print("=" * 50)
    for label, key in report_rows:
        print(f"{label}: {metrics[key]:.4f}")


# Main Execution
# =============


In [106]:
def main(base_dir='bbc_articles'):
    """
    Run the whole pipeline: load, index, compare, search and evaluate.

    Args:
        base_dir (str): Directory holding one sub-directory per category.
            Defaults to 'bbc_articles'.

    Returns:
        None
    """
    # 1. Load and preprocess documents
    print("Loading and preprocessing documents...")
    documents, _doc_paths = load_documents(base_dir)
    if not documents:
        # Without documents every later stage would only print empty results
        print(f"No documents could be loaded from {base_dir}; nothing to index.")
        return
    documents = preprocess_documents(documents)

    # 2. Build inverted index and calculate TF-IDF
    print("\nBuilding inverted index and calculating TF-IDF scores...")
    documents = calculate_term_frequencies(documents)
    inverted_index = build_inverted_index(documents)
    _tfidf, doc_vectors = calculate_tfidf(documents, inverted_index)

    # Display inverted index
    display_inverted_index(inverted_index)

    # 3. Calculate similarity matrix
    print("\nCalculating document similarity matrix...")
    similarity_matrix = calculate_similarity_matrix(doc_vectors)

    # Display most similar documents for a sample document (the first one loaded)
    sample_doc_id = next(iter(documents))
    print(f"\nMost similar documents to {documents[sample_doc_id]['name']}:")
    similar_docs = get_most_similar_documents(similarity_matrix, sample_doc_id)
    for rank, (doc_id, score) in enumerate(similar_docs, start=1):
        print(f"{rank}. {documents[doc_id]['name']} (Similarity: {score:.4f})")

    # 4. Test search functionality
    print("\nTesting search functionality...")
    test_queries = [
        "technology and artificial intelligence",
        "business news and economy",
        "travel destinations in Europe",
        "art exhibitions and culture"
    ]

    for query in test_queries:
        print(f"\nSearch Query: '{query}'")
        results = search(query, documents, inverted_index, doc_vectors)
        display_search_results(results, documents)

    # 5. Evaluate search performance
    print("\nEvaluating search performance...")
    # Relevance judgments are approximated by category membership.
    # In a real scenario, you would have a gold standard set of relevance judgments.
    evaluation_specs = [
        ("technology AI artificial intelligence", 'technology'),
        ("business economy finance", 'business'),
        ("travel destination Europe", 'travel'),
    ]
    test_queries_eval = {
        query_id: {
            'query': query_text,
            'relevant_docs': [
                doc_id for doc_id, doc in documents.items()
                if doc['category'] == category
            ],
        }
        for query_id, (query_text, category) in enumerate(evaluation_specs)
    }

    metrics = evaluate_search(test_queries_eval, documents, inverted_index, doc_vectors)
    display_evaluation_results(metrics)

# Entry point: run the whole pipeline when this code is executed directly
# rather than imported.
if __name__ == "__main__":
    main()

Loading and preprocessing documents...
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file
Extracting text from file

In [105]:
# import os
#
# def debug_pdf_file():
#     """Debug reading a specific PDF file using PyPDF2"""
#     file_path = "bbc_articles/innovation/innovation_news_10.pdf"
#
#     # Check if file exists
#     if not os.path.exists(file_path):
#         print(f"File not found: {file_path}")
#         return
#
#     print(f"File exists: {file_path}")
#     print(f"File size: {os.path.getsize(file_path)} bytes")
#
#     # Try binary mode first to confirm we can read it
#     try:
#         with open(file_path, 'rb') as file:
#             binary_content = file.read(100)
#             print("First 100 bytes (binary mode):", binary_content)
#     except Exception as e:
#         print(f"Error in binary mode: {e}")
#         return
#
#     # Now try to use PyPDF2 to extract text
#     try:
#         import PyPDF2
#
#         with open(file_path, 'rb') as file:
#             print("Opening PDF with PyPDF2...")
#             pdf_reader = PyPDF2.PdfReader(file)
#             print(f"PDF has {len(pdf_reader.pages)} pages")
#
#             # Extract text from the first page as a test
#             print("Extracting text from the first page...")
#             first_page_text = pdf_reader.pages[0].extract_text()
#
#             # Print first 200 characters of the extracted text
#             print("First 200 characters of extracted text:")
#             print(first_page_text[:200] if first_page_text else "No text extracted")
#
#             # Try to extract text from all pages
#             print("Extracting text from all pages...")
#             all_text = ""
#             for i, page in enumerate(pdf_reader.pages):
#                 page_text = page.extract_text()
#                 all_text += page_text + " "
#                 print(f"Page {i+1}: Extracted {len(page_text)} characters")
#
#             print(f"Total extracted text length: {len(all_text)} characters")
#
#     except ImportError:
#         print("PyPDF2 is not installed. Please install it with: pip install PyPDF2")
#
#     except Exception as e:
#         print(f"Error using PyPDF2: {e}")
#
# # Run the debug function
# debug_pdf_file()