In [64]:
# !pip install PyPDF2 beautifulsoup4 nltk scikit-learn matplotlib

In [65]:
import os
import re
import math
import string
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from difflib import get_close_matches

# Download required NLTK data.
# Each call fetches the named resource into the local nltk_data directory and
# reports "already up-to-date" when it is present (see the cell output below).
nltk.download('punkt')      # tokenizer models; presumably what word_tokenize relies on
nltk.download('stopwords')  # word lists read later via stopwords.words('english')
nltk.download('punkt_tab')  # NOTE(review): likely required by newer NLTK releases for word_tokenize; confirm

[nltk_data] Downloading package punkt to /Users/rajitroy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rajitroy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/rajitroy/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# 1. DATA LOADING AND PREPROCESSING
# ================================

In [66]:
def extract_text_from_file(file_path):
    """
    Extract text content from various file types (HTML, PDF, TXT)

    The handler is chosen from the file extension, compared
    case-insensitively so that e.g. ``REPORT.PDF`` is parsed as a PDF instead
    of being read as raw text. Any other extension is read as plain text.
    Undecodable bytes are dropped for both HTML and plain-text files, so one
    bad byte does not discard an entire document.

    Args:
        file_path (str): Path to the file

    Returns:
        str: Extracted text with every run of whitespace collapsed to a single
            space, or "" if the file could not be read or parsed (the error
            is printed, not raised).
    """
    print("Extracting text from file: ", file_path)
    try:
        lowered_path = file_path.lower()

        if lowered_path.endswith(('.html', '.htm')):
            # errors='ignore' mirrors the plain-text branch below
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()

            # Parse HTML with BeautifulSoup and keep only the visible text
            soup = BeautifulSoup(content, 'html.parser')
            text = soup.get_text(separator=' ', strip=True)

        elif lowered_path.endswith('.pdf'):
            # Imported lazily so PyPDF2 is only required when a PDF is processed
            import PyPDF2

            with open(file_path, 'rb') as file:  # PDFs must be opened in binary mode
                pdf_reader = PyPDF2.PdfReader(file)
                page_texts = []
                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    if page_text:  # Some pages might not have extractable text
                        page_texts.append(page_text)
                text = " ".join(page_texts)

        else:  # For text files (and any other extension)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                text = file.read()

        # Remove extra whitespace
        return re.sub(r'\s+', ' ', text).strip()
    except Exception as e:
        # Deliberately best-effort: a single unreadable file must not abort a corpus load
        print(f"Error extracting text from {file_path}: {e}")
        return ""

In [67]:
def load_documents(base_dir):
    """
    Walk a directory tree and load every supported file as a document

    Files ending in .html, .txt, .csv or .pdf are passed to
    extract_text_from_file; files that yield no text are skipped. Document
    IDs are consecutive integers starting at 0, assigned in the order the
    files are encountered by os.walk. The name of the directory containing a
    file is used as its category.

    Args:
        base_dir (str): Base directory containing subdirectories for categories

    Returns:
        tuple: (documents, doc_paths) where documents maps each document ID to
            its info dict and doc_paths maps each document ID to its file path
    """
    supported_suffixes = ('.html', '.txt', '.csv', '.pdf')
    documents, doc_paths = {}, {}

    for folder, _, filenames in os.walk(base_dir):
        category = os.path.basename(folder)

        for filename in filenames:
            # Ignore anything that is not HTML, plain text, CSV or PDF
            if not filename.endswith(supported_suffixes):
                continue

            file_path = os.path.join(folder, filename)
            text = extract_text_from_file(file_path)

            # Nothing usable came out of this file
            if not text:
                continue

            # IDs are dense, so the next ID is the number of documents so far
            doc_id = len(documents)
            documents[doc_id] = {
                'id': doc_id,
                'name': f"{category}/{filename}",
                'category': category,
                'path': file_path,
                'text': text,
                'tokens': None,  # Filled in by preprocessing
                'term_freq': None,  # Filled in by the term-frequency step
            }
            doc_paths[doc_id] = file_path

    print(f"Loaded {len(documents)} documents from {base_dir}")
    return documents, doc_paths

In [68]:
def preprocess_text(text):
    """
    Turn raw text into a list of normalized, stemmed tokens

    Steps, in order: lowercase, tokenize with word_tokenize, keep only purely
    alphabetic tokens, drop English stopwords, apply Porter stemming.

    Args:
        text (str): Input text

    Returns:
        list: List of preprocessed tokens
    """
    # Lowercase and tokenize in one go
    words = word_tokenize(text.lower())

    english_stopwords = set(stopwords.words('english'))
    porter = PorterStemmer()

    # isalpha() discards punctuation, numbers and mixed tokens
    return [
        porter.stem(word)
        for word in words
        if word.isalpha() and word not in english_stopwords
    ]


In [69]:
def preprocess_documents(documents):
    """
    Fill in the 'tokens' field of every document

    Each document's 'text' is run through preprocess_text and the result is
    stored in place under 'tokens'.

    Args:
        documents (dict): Dictionary of documents

    Returns:
        dict: The same dictionary, with preprocessed tokens populated
    """
    for record in documents.values():
        record['tokens'] = preprocess_text(record['text'])

    return documents


# 2. INVERTED INDEX AND TF-IDF
# ============================


In [70]:
def build_inverted_index(documents):
    """
    Map every term to the list of documents that contain it

    A document appears at most once in a term's posting list, and posting
    lists follow the iteration order of ``documents``.

    Args:
        documents (dict): Dictionary of documents (each with a 'tokens' list)

    Returns:
        dict: Inverted index mapping terms to document IDs
    """
    postings = {}

    for doc_id, doc in documents.items():
        # De-duplicate so each document is recorded once per term
        for term in set(doc['tokens']):
            postings.setdefault(term, []).append(doc_id)

    return postings


In [71]:
def calculate_term_frequencies(documents):
    """
    Fill in the 'term_freq' field of every document

    The raw occurrence count of each token is stored in place as a Counter
    under 'term_freq'.

    Args:
        documents (dict): Dictionary of documents (each with a 'tokens' list)

    Returns:
        dict: The same dictionary, with term frequencies populated
    """
    for record in documents.values():
        record['term_freq'] = Counter(record['tokens'])

    return documents


In [72]:
def calculate_tfidf(documents, inverted_index):
    """
    Compute TF-IDF weights for every term of every document

    IDF is log10(N / document frequency) and TF is the term count divided by
    the document's token count. Terms missing from the inverted index get an
    IDF of 0.

    Args:
        documents (dict): Dictionary of documents (with 'tokens' and 'term_freq')
        inverted_index (dict): Inverted index mapping terms to document IDs

    Returns:
        dict: TF-IDF scores for all terms in all documents
        dict: Document vectors for similarity calculations (same values,
            held in separate dict objects)
    """
    total_docs = len(documents)

    # Inverse document frequency per term
    idf = {
        term: math.log10(total_docs / len(postings))
        for term, postings in inverted_index.items()
    }

    tfidf = {}
    doc_vectors = {}

    for doc_id, doc in documents.items():
        token_count = len(doc['tokens'])

        # Length-normalized TF multiplied by IDF
        scores = {
            term: (count / token_count) * idf.get(term, 0)
            for term, count in doc['term_freq'].items()
        }

        tfidf[doc_id] = scores
        # The vector is an independent copy, not an alias of the score dict
        doc_vectors[doc_id] = dict(scores)

    return tfidf, doc_vectors


In [73]:
def display_inverted_index(inverted_index, top_n=10):
    """
    Print the terms that occur in the most documents

    Terms are ordered by posting-list length (descending). For each term the
    document count and up to five document IDs are shown; longer posting
    lists are truncated with "...".

    Args:
        inverted_index (dict): Inverted index mapping terms to document IDs
        top_n (int): Number of terms to display

    Returns:
        None
    """
    row_format = "{:<20} {:<10} {:<20}"

    # Most widespread terms first
    ranked_terms = sorted(inverted_index.items(),
                          key=lambda entry: len(entry[1]),
                          reverse=True)

    print(f"Top {top_n} terms in the inverted index:")
    print("=" * 50)
    print(row_format.format("Term", "Doc Count", "Documents"))
    print("-" * 50)

    for term, doc_ids in ranked_terms[:top_n]:
        if len(doc_ids) > 5:
            preview = str(doc_ids[:5]) + "..."
        else:
            preview = str(doc_ids)
        print(row_format.format(term, len(doc_ids), preview))


# 3. SIMILARITY CALCULATION
# =========================


In [74]:
def calculate_similarity_matrix(doc_vectors):
    """
    Calculate cosine similarity matrix for document pairs

    Each vector's magnitude is computed once up front instead of once per
    pair. The diagonal is fixed at 1.0, and any pair involving a zero-length
    vector gets a similarity of 0.0.

    Args:
        doc_vectors (dict): Document vectors (doc ID -> {term: weight})

    Returns:
        dict: Nested dict where result[id1][id2] is the cosine similarity of
            the two documents
    """
    doc_ids = list(doc_vectors.keys())

    # Magnitudes depend on one vector only, so hoist them out of the pair loop
    magnitudes = {
        doc_id: math.sqrt(sum(weight ** 2 for weight in doc_vectors[doc_id].values()))
        for doc_id in doc_ids
    }

    similarity_matrix = {}

    for doc_id1 in doc_ids:
        vec1 = doc_vectors[doc_id1]
        row = {}

        for doc_id2 in doc_ids:
            if doc_id1 == doc_id2:
                row[doc_id2] = 1.0
                continue

            vec2 = doc_vectors[doc_id2]

            # Dot product over the terms the two documents share
            dot_product = 0
            for term, weight in vec1.items():
                if term in vec2:
                    dot_product += weight * vec2[term]

            norm_product = magnitudes[doc_id1] * magnitudes[doc_id2]

            # Guard against division by zero for empty / all-zero vectors
            if norm_product == 0:
                row[doc_id2] = 0.0
            else:
                row[doc_id2] = dot_product / norm_product

        similarity_matrix[doc_id1] = row

    return similarity_matrix


In [75]:
def get_most_similar_documents(similarity_matrix, doc_id, top_n=5):
    """
    Return the documents most similar to a given document

    The document itself is never included in the result.

    Args:
        similarity_matrix (dict): Similarity matrix for document pairs
        doc_id (int): Document ID
        top_n (int): Number of similar documents to return

    Returns:
        list: Up to top_n (doc_id, score) tuples, highest score first
    """
    # Drop the self-match first, then rank what is left
    candidates = [
        (other_id, score)
        for other_id, score in similarity_matrix[doc_id].items()
        if other_id != doc_id
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return candidates[:top_n]


# 4. RECOMMENDER SYSTEM
# =====================


In [76]:
def search(query, documents, inverted_index, doc_vectors, user_id=None, user_profiles=None, tolerance=0.8):
    """
    Search for documents matching a query, with optional personalization

    Each query token is looked up in the inverted index. A token that is not
    in the index is replaced by up to three close vocabulary terms
    (difflib.get_close_matches with ``cutoff=tolerance``). The terms that
    actually matched are the ones placed in the query vector, so documents
    found through fuzzy matching are scored on the matched term instead of
    always contributing 0 for it. Tokens with no exact or close match stay in
    the query vector under their own name and only lower the cosine score.

    Args:
        query (str): Search query
        documents (dict): Dictionary of documents
        inverted_index (dict): Inverted index mapping terms to document IDs
        doc_vectors (dict): Document vectors for similarity calculations
        user_id (str, optional): User ID for personalized results
        user_profiles (dict, optional): Dictionary of user profiles
        tolerance (float): Tolerance threshold for fuzzy matching

    Returns:
        list: (doc_id, score) tuples sorted by descending score; empty when
            the query has no usable tokens or nothing matches. When a known
            user_id is supplied the scores are the re-ranked ones returned by
            personalized_ranking.
    """
    # Preprocess the query
    query_tokens = preprocess_text(query)

    # If no valid tokens after preprocessing, return empty result
    if not query_tokens:
        return []

    matching_docs = set()
    query_vector = {}
    vocabulary = None  # Built lazily, only if some token needs fuzzy matching

    for query_term in query_tokens:
        if query_term in inverted_index:
            # Exact match
            effective_terms = [query_term]
        else:
            # Fuzzy match against the indexed vocabulary
            if vocabulary is None:
                vocabulary = list(inverted_index.keys())
            close_matches = get_close_matches(query_term, vocabulary, n=3, cutoff=tolerance)
            # Keep the raw token when nothing is close enough
            effective_terms = close_matches or [query_term]

        for term in effective_terms:
            matching_docs.update(inverted_index.get(term, ()))
            # Raw term count; cosine similarity is scale-invariant
            query_vector[term] = query_vector.get(term, 0) + 1

    # If no matching documents found, return empty result
    if not matching_docs:
        return []

    # Normalize query vector by the number of query tokens
    query_length = len(query_tokens)
    for term in query_vector:
        query_vector[term] /= query_length

    # The query magnitude is the same for every candidate document
    query_mag = math.sqrt(sum(w**2 for w in query_vector.values()))

    # Calculate similarity to query for each matching document
    similarities = []

    for doc_id in matching_docs:
        doc_vector = doc_vectors[doc_id]

        # Calculate dot product over shared terms
        dot_product = 0
        for term, weight in query_vector.items():
            if term in doc_vector:
                dot_product += weight * doc_vector[term]

        doc_mag = math.sqrt(sum(w**2 for w in doc_vector.values()))

        # Calculate cosine similarity (0 for zero-length vectors)
        if query_mag * doc_mag == 0:
            similarity = 0
        else:
            similarity = dot_product / (query_mag * doc_mag)

        similarities.append((doc_id, similarity))

    # Sort by similarity score (descending)
    ranked_results = sorted(similarities, key=lambda x: x[1], reverse=True)

    # Apply personalized ranking if user_id is provided
    if user_id and user_profiles and user_id in user_profiles:
        ranked_results = personalized_ranking(ranked_results, user_id, documents, user_profiles)

    return ranked_results

In [77]:
def display_search_results(results, documents, user_id=None, user_profiles=None, top_n=5):
    """
    Print the top search results, flagging documents that match a user's interests

    For each of the first top_n results the rank, name, category, score and a
    snippet (the first 150 characters of the text) are printed. When a known
    user_id is given, a document containing more than three of the user's
    interest terms is tagged "[Matches your interests]".

    Args:
        results (list): Ranked list of matching documents
        documents (dict): Dictionary of documents
        user_id (str, optional): User ID for personalized results
        user_profiles (dict, optional): Dictionary of user profiles
        top_n (int): Number of results to display

    Returns:
        None
    """
    if not results:
        print("No matching documents found.")
        return

    # Personalization applies only for a user that exists in the profiles
    personalized = bool(user_id and user_profiles and user_id in user_profiles)

    if personalized:
        user_name = user_profiles[user_id]['name']
        print(f"Personalized results for {user_name} (user_id: {user_id})")

    print(f"Found {len(results)} matching documents.")
    print("=" * 80)

    for rank, (doc_id, score) in enumerate(results[:top_n], start=1):
        doc = documents[doc_id]
        title = doc['name']
        category = doc['category']

        personalized_indicator = ""
        if personalized:
            # Count the user's interest terms that occur in this document
            user_interests = user_profiles[user_id]['interest_vector']
            matches = len([term for term in user_interests if term in doc['tokens']])
            if matches > 3:  # Arbitrary threshold for demonstration
                personalized_indicator = " [Matches your interests]"

        print(f"Rank {rank}: {title} [Category: {category}]{personalized_indicator}")
        print(f"Similarity Score: {score:.4f}")

        # Snippet: first 150 characters, with an ellipsis only when truncated
        full_text = doc['text']
        if len(full_text) > 150:
            snippet = full_text[:150].strip() + "..."
        else:
            snippet = full_text
        print(f"Snippet: {snippet}")
        print("-" * 80)

# 5. USER PROFILES
# ========================

In [78]:
def initialize_user_profiles():
    """
    Create the two built-in demo user profiles

    Every profile holds a display name, a list of past search queries and an
    empty interest vector that is filled in later.

    Returns:
        dict: Dictionary of user profiles with search history, keyed by user ID
    """
    tech_history = [
        'apple new products',
        'artificial intelligence applications',
        'latest smartphone reviews',
        'technology news today',
        'AI machine learning advances',
        'tech company stock prices',
        'new programming languages',
        'software development tools',
        'coding best practices',
        'tech startup funding',
    ]
    arts_travel_history = [
        'art exhibitions near me',
        'famous museums in europe',
        'travel destinations 2025',
        'cultural heritage sites',
        'best places to visit this summer',
        'art and fashion trends',
        'contemporary artists to watch',
        'historical landmarks',
        'tourism industry news',
        'sustainable travel options',
    ]

    # (user ID, display name, search history)
    seeds = (
        ('user01', 'Tech User', tech_history),
        ('user02', 'Arts & Travel Enthusiast', arts_travel_history),
    )

    return {
        profile_id: {
            'name': display_name,
            'search_history': history,
            'interest_vector': {},  # Populated later from the search history
        }
        for profile_id, display_name, history in seeds
    }

In [79]:
def build_user_interest_vectors(user_profiles, inverted_index, documents):
    """
    Derive each user's interest vector from their search history

    Every query in the history is preprocessed and its tokens are counted;
    the counts are then divided by their total so the weights sum to 1. The
    result replaces the profile's 'interest_vector' in place.

    Args:
        user_profiles (dict): Dictionary of user profiles
        inverted_index (dict): Inverted index mapping terms to document IDs
            (not used by this function)
        documents (dict): Dictionary of documents (not used by this function)

    Returns:
        dict: The same user profiles, with interest vectors populated
    """
    for profile in user_profiles.values():
        term_counts = Counter()

        # Accumulate token counts across the whole search history
        for past_query in profile['search_history']:
            for token in preprocess_text(past_query):
                term_counts[token] += 1

        if term_counts:
            # Turn counts into relative weights
            total_weight = sum(term_counts.values())
            weights = {term: count / total_weight for term, count in term_counts.items()}
        else:
            weights = {}

        profile['interest_vector'] = weights

    return user_profiles

In [80]:
def calculate_profile_similarity(user_id, doc_id, documents, user_profiles):
    """
    Calculate similarity between a user profile and a document

    The score is the sum of the user's interest weights for the terms that
    occur in the document, divided by the square root of the document's token
    count. The document's tokens are converted to a set once so that each
    interest-term lookup is constant time rather than a scan of the token
    list.

    Args:
        user_id (str): User ID
        doc_id (int): Document ID
        documents (dict): Dictionary of documents
        user_profiles (dict): Dictionary of user profiles

    Returns:
        float: Similarity score between user profile and document; 0.5 when
            the user is unknown or has an empty interest vector, 0 when the
            document has no tokens
    """
    # If user_id is not provided or invalid, return neutral score
    if user_id not in user_profiles:
        return 0.5

    interest_vector = user_profiles[user_id]['interest_vector']
    document = documents[doc_id]

    # If interest vector is empty, return neutral score
    if not interest_vector:
        return 0.5

    tokens = document['tokens']
    doc_length = len(tokens)
    if doc_length == 0:
        return 0

    # Set membership avoids rescanning the token list for every interest term
    doc_terms = set(tokens)

    # Presence-based dot product: a term counts once however often it occurs
    dot_product = 0
    for term, weight in interest_vector.items():
        if term in doc_terms:
            dot_product += weight

    # Normalize by document length for fairness
    return dot_product / math.sqrt(doc_length)

In [81]:
def personalized_ranking(results, user_id, documents, user_profiles):
    """
    Re-order search results using the user's profile

    Each result's score becomes 0.7 * query similarity + 0.3 * profile
    similarity, and the list is sorted by that combined score.

    Args:
        results (list): Original ranked list of (doc_id, similarity) tuples
        user_id (str): User ID
        documents (dict): Dictionary of documents
        user_profiles (dict): Dictionary of user profiles

    Returns:
        list: Re-ranked list of (doc_id, combined_score) tuples; the input
            list itself when user_id is missing or unknown
    """
    # Nothing to personalize without a known user
    if not (user_id and user_id in user_profiles):
        return results

    blended = []
    for doc_id, query_similarity in results:
        profile_similarity = calculate_profile_similarity(user_id, doc_id, documents, user_profiles)

        # Weighted blend: 70% query relevance, 30% profile similarity
        combined_score = (0.7 * query_similarity) + (0.3 * profile_similarity)
        blended.append((doc_id, combined_score))

    # Highest combined score first
    blended.sort(key=lambda pair: pair[1], reverse=True)

    return blended

# 6. PERFORMANCE EVALUATION
# ========================


In [82]:
def evaluate_search(test_queries, documents, inverted_index, doc_vectors, user_id=None, user_profiles=None):
    """
    Evaluate search performance using test queries with detailed logging

    For every test query the function runs search(), then computes precision,
    recall, F1 and average precision against the query's relevance
    judgments, printing each step.

    Args:
        test_queries (dict): Dictionary of test queries with relevance
            judgments; each value has a 'query' string and a 'relevant_docs'
            collection of document IDs
        documents (dict): Dictionary of documents
        inverted_index (dict): Inverted index mapping terms to document IDs
        doc_vectors (dict): Document vectors for similarity calculations
        user_id (str, optional): User ID for personalized evaluation
        user_profiles (dict, optional): Dictionary of user profiles

    Returns:
        dict: Performance metrics with these keys:
            'query_id', 'query', 'precision', 'recall', 'f1_score': per-query lists
            'avg_precision': mean of the per-query precision values (float)
            'avg_recall', 'avg_f1_score': means of the respective lists
            'avg_avg_precision': mean average precision (MAP)
            'avg_precision_per_query': per-query average-precision list
    """
    metrics = {
        'query_id': [],
        'query': [],
        'precision': [],
        'recall': [],
        'f1_score': [],
        'avg_precision': []
    }

    print("\nDETAILED EVALUATION LOGS:")
    print("=" * 70)
    print(f"Number of test queries: {len(test_queries)}")

    # Add user info if personalization is being evaluated
    if user_id and user_profiles and user_id in user_profiles:
        user_name = user_profiles[user_id]['name']
        print(f"Evaluation for user: {user_name} (user_id: {user_id})")

    for query_id, query_info in test_queries.items():
        query = query_info['query']
        relevant_docs = set(query_info['relevant_docs'])

        print(f"\nQuery {query_id}: '{query}'")
        print(f"Number of relevant documents defined: {len(relevant_docs)}")
        if len(relevant_docs) == 0:
            print("WARNING: No relevant documents defined for this query!")

        # Add query information to metrics
        metrics['query_id'].append(query_id)
        metrics['query'].append(query)

        # Get search results (with personalization if user_id provided)
        results = search(query, documents, inverted_index, doc_vectors, user_id, user_profiles)
        retrieved_docs = set([doc_id for doc_id, _ in results])

        print(f"Number of documents retrieved: {len(retrieved_docs)}")

        # Relevant documents that were actually retrieved (shared by precision and recall)
        intersection = relevant_docs.intersection(retrieved_docs)

        # Precision: share of retrieved documents that are relevant
        if retrieved_docs:
            precision = len(intersection) / len(retrieved_docs)
            print(f"Intersection size: {len(intersection)}")
            print(f"Precision: {precision:.4f}")
        else:
            precision = 0
            print("No documents retrieved!")
        metrics['precision'].append(precision)

        # Recall: share of relevant documents that were retrieved
        if relevant_docs:
            recall = len(intersection) / len(relevant_docs)
            print(f"Recall: {recall:.4f}")
        else:
            recall = 0
            print("No relevant documents defined!")
        metrics['recall'].append(recall)

        # Calculate F1 score (harmonic mean of precision and recall)
        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
            print(f"F1 Score: {f1:.4f}")
            metrics['f1_score'].append(f1)
        else:
            print("F1 Score: 0.0000 (precision and recall are both 0)")
            metrics['f1_score'].append(0)

        # Calculate average precision: sum of precision@k at each relevant hit
        avg_precision = 0
        correct_count = 0

        print("\nPrecision at rank calculation:")
        for i, (doc_id, _) in enumerate(results):
            rank = i + 1
            if doc_id in relevant_docs:
                correct_count += 1
                precision_at_k = correct_count / rank
                avg_precision += precision_at_k
                print(f"  Rank {rank}: Document {doc_id} is relevant, precision at {rank} = {precision_at_k:.4f}")
            else:
                print(f"  Rank {rank}: Document {doc_id} is not relevant")

        if correct_count > 0 and len(relevant_docs) > 0:
            # Divide by all relevant documents so missed ones lower the score
            avg_precision /= len(relevant_docs)
            print(f"Average Precision: {avg_precision:.4f}")
        else:
            avg_precision = 0
            print("Average Precision: 0.0000 (no relevant documents retrieved)")

        metrics['avg_precision'].append(avg_precision)

    # The 'avg_precision' key is about to be reused for the mean precision,
    # so keep a copy of the per-query average-precision values first.
    per_query_avg_precision = list(metrics['avg_precision'])

    # Calculate average metrics
    avg_metrics = {}
    for metric_name in ('precision', 'recall', 'f1_score', 'avg_precision'):
        values = metrics[metric_name]
        avg_value = sum(values) / len(values) if values else 0
        avg_metrics[f'avg_{metric_name}'] = avg_value
        print(f"\nAverage {metric_name}: {avg_value:.4f}")

    # Add average metrics to the original metrics dictionary.
    # 'avg_precision' now holds the mean precision (existing behavior).
    metrics.update(avg_metrics)
    metrics['avg_precision_per_query'] = per_query_avg_precision

    return metrics

In [83]:
def display_evaluation_results(metrics):
    """
    Print the aggregate evaluation figures

    Reads the 'avg_precision', 'avg_recall', 'avg_f1_score' and
    'avg_avg_precision' entries and prints each to four decimal places.

    Args:
        metrics (dict): Performance metrics

    Returns:
        None
    """
    # (label shown to the reader, key in the metrics dict)
    summary_rows = (
        ("Average Precision", 'avg_precision'),
        ("Average Recall", 'avg_recall'),
        ("Average F1 Score", 'avg_f1_score'),
        ("Mean Average Precision (MAP)", 'avg_avg_precision'),
    )

    print("Search System Performance Evaluation")
    print("=" * 50)
    for label, key in summary_rows:
        print(f"{label}: {metrics[key]:.4f}")


# Main Execution
# =============


In [84]:
def _docs_in_categories(documents, categories):
    """Return the ids of all documents whose 'category' is one of `categories`."""
    return [doc_id for doc_id, doc in documents.items() if doc['category'] in categories]


def _docs_matching_keywords(documents, required, any_of):
    """Return ids of documents whose lowercased text contains `required` and at least one of `any_of`."""
    matches = []
    for doc_id, doc in documents.items():
        text = doc['text'].lower()  # lowercase once per document instead of once per keyword
        if required in text and any(keyword in text for keyword in any_of):
            matches.append(doc_id)
    return matches


def _build_eval_queries(documents, python_relevant_docs):
    """Build the three evaluation queries; only the relevance list of the 'python' query varies."""
    tech_docs = _docs_in_categories(documents, ['technology'])
    innovation_docs = _docs_in_categories(documents, ['innovation'])
    return {
        0: {
            'query': "artificial intelligence",
            'relevant_docs': tech_docs + innovation_docs
        },
        1: {
            'query': "museums cultural exhibition",
            'relevant_docs': _docs_in_categories(documents, ['arts', 'travel'])
        },
        2: {
            'query': "python",
            'relevant_docs': list(python_relevant_docs)  # private copy per query set
        }
    }


def _run_search_comparison(query, documents, inverted_index, doc_vectors, user_profiles, user_ids):
    """Run `query` without personalization and then once per user, displaying each result list."""
    print("\nRegular Search Results (no personalization):")
    results = search(query, documents, inverted_index, doc_vectors)
    display_search_results(results, documents)

    for user_id in user_ids:
        print(f"\nPersonalized Results for {user_profiles[user_id]['name']} ({user_id}):")
        personalized = search(query, documents, inverted_index, doc_vectors, user_id, user_profiles)
        display_search_results(personalized, documents, user_id, user_profiles)


def main(base_dir='bbc_articles'):
    """
    Run the complete demo: load documents, index them, build user profiles,
    compare generic and personalized search, and evaluate the results.

    Args:
        base_dir (str): Directory that holds the article files. Defaults to
            'bbc_articles'; pass another path to point at your own data.

    Returns:
        None
    """
    # 1. Load and preprocess documents
    print("Loading and preprocessing documents...")
    documents, _doc_paths = load_documents(base_dir)  # the paths are not needed here
    documents = preprocess_documents(documents)

    # 2. Build inverted index and calculate TF-IDF
    print("\nBuilding inverted index and calculating TF-IDF scores...")
    documents = calculate_term_frequencies(documents)
    inverted_index = build_inverted_index(documents)
    tfidf, doc_vectors = calculate_tfidf(documents, inverted_index)

    # Display inverted index
    display_inverted_index(inverted_index)

    # 3. Initialize user profiles and build interest vectors
    print("\nInitializing user profiles...")
    user_profiles = {
        'user01': {
            'name': 'Tech & Programming Enthusiast',
            'search_history': [
                'apple new products',
                'artificial intelligence applications',
                'latest smartphone reviews',
                'python programming language tutorials',
                'AI machine learning advances',
                'python libraries for data science',
                'new programming languages',
                'software development tools',
                'python web frameworks comparison',
                'tech startup funding'
            ],
            'interest_vector': {}  # Will be populated during processing
        },
        'user02': {
            'name': 'Nature & Wildlife Enthusiast',
            'search_history': [
                'art exhibitions near me',
                'exotic animals in rainforests',
                'wildlife photography tips',
                'most dangerous snakes in the world',
                'python snake habitat and behavior',
                'art and nature intersection',
                'animal conservation efforts',
                'natural history museums',
                'endangered reptile species',
                'sustainable wildlife tourism'
            ],
            'interest_vector': {}  # Will be populated during processing
        }
    }
    # Fixed order in which the personalized runs are shown and evaluated
    user_ids = ('user01', 'user02')

    user_profiles = build_user_interest_vectors(user_profiles, inverted_index, documents)

    print(f"Created {len(user_profiles)} user profiles")

    # Display sample of user interest vectors for verification
    for user_id, profile in user_profiles.items():
        print(f"\nTop interests for {profile['name']} ({user_id}):")
        interests = sorted(profile['interest_vector'].items(), key=lambda x: x[1], reverse=True)
        for term, weight in interests[:10]:  # Show top 10 interests
            print(f"  {term}: {weight:.4f}")

    # 4. Test search functionality with user personalization
    print("\nTesting search functionality with personalization...")

    # Test the Python query to demonstrate different interpretations
    python_query = "python"
    print(f"\nSearch Query: '{python_query}' (should show different results for each user)")
    _run_search_comparison(python_query, documents, inverted_index, doc_vectors, user_profiles, user_ids)

    # Add other standard test queries
    standard_test_queries = [
        "technology and artificial intelligence",
        "travel destinations in Europe",
        "art exhibitions and culture"
    ]

    # Test each standard query for each user
    for query in standard_test_queries:
        print(f"\nSearch Query: '{query}'")
        _run_search_comparison(query, documents, inverted_index, doc_vectors, user_profiles, user_ids)

    # 5. Evaluate search performance
    print("\nEvaluating search performance...")

    # For user01 (Tech & Programming), python should mean programming language
    # For user02 (Nature & Wildlife), python should mean snake
    python_tech_docs = _docs_matching_keywords(documents, 'python', ('program', 'code', 'develop'))
    python_snake_docs = _docs_matching_keywords(documents, 'python', ('snake', 'reptile', 'animal'))

    # All python docs are relevant for generic search. A document can match
    # both keyword groups, so de-duplicate (keeping order): a repeated id would
    # inflate len(relevant_docs) and understate the evaluation metrics.
    python_all_docs = list(dict.fromkeys(python_tech_docs + python_snake_docs))

    python_relevant_by_user = {
        'user01': python_tech_docs,   # Only programming python docs are relevant for tech user
        'user02': python_snake_docs,  # Only snake python docs are relevant for nature user
    }

    # Evaluate the generic search first, then each personalized user
    print("\nPerformance for generic search (no personalization):")
    metrics_generic = evaluate_search(
        _build_eval_queries(documents, python_all_docs),
        documents, inverted_index, doc_vectors)
    display_evaluation_results(metrics_generic)

    for user_id in user_ids:
        print(f"\nPerformance for {user_profiles[user_id]['name']} ({user_id}):")
        user_metrics = evaluate_search(
            _build_eval_queries(documents, python_relevant_by_user[user_id]),
            documents, inverted_index, doc_vectors, user_id, user_profiles)
        display_evaluation_results(user_metrics)

    print("\nRecommender System complete.")

# Entry point: run the whole pipeline only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading and preprocessing documents...
Extracting text from file:  bbc_articles/innovation/innovation_news_10.pdf
Extracting text from file:  bbc_articles/innovation/innovation_news_8.pdf
Extracting text from file:  bbc_articles/innovation/innovation_news_9.pdf
Extracting text from file:  bbc_articles/innovation/innovation_news_4.pdf
Extracting text from file:  bbc_articles/innovation/innovation_news_5.pdf
Extracting text from file:  bbc_articles/innovation/innovation_news_7.pdf
Extracting text from file:  bbc_articles/innovation/innovation_news_6.pdf
Extracting text from file:  bbc_articles/innovation/innovation_news_2.pdf
Extracting text from file:  bbc_articles/innovation/innovation_news_3.pdf
Extracting text from file:  bbc_articles/innovation/innovation_news_1.pdf
Extracting text from file:  bbc_articles/innovation/innovation_news_0.pdf
Extracting text from file:  bbc_articles/arts/arts_news_0.pdf
Extracting text from file:  bbc_articles/arts/arts_news_1.pdf
Extracting text from f