In [2]:
# Colab-only: mount Google Drive at /content/drive so the book corpus stored
# under MyDrive can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load and Prepare Data

In [10]:
import os
from collections import namedtuple
import math
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# A single book: its title (filename without extension), its full text, and
# its TF-IDF vector (None until the vector space model has been built).
Book = namedtuple('Book', ['title', 'content', 'vector'])

# Default location of the plain-text corpus on the mounted Google Drive.
DEFAULT_BOOKS_DIR = "/content/drive/MyDrive/BookRetrieval(IR)/Book Retrieval Project"


def load_books(books_dir=DEFAULT_BOOKS_DIR):
    """Load every ``.txt`` file found directly inside ``books_dir``.

    Args:
        books_dir: Directory containing one ``.txt`` file per book. Defaults
            to the project folder on the mounted Google Drive.

    Returns:
        A list of ``Book`` tuples sorted by filename. ``title`` is the
        filename without its extension, ``content`` is the decoded text and
        ``vector`` is ``None``.

    Raises:
        OSError: If ``books_dir`` cannot be listed or a file cannot be read.
    """
    books = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the book
    # order (and therefore tie-breaking in ranked results) reproducible.
    for filename in sorted(os.listdir(books_dir)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(books_dir, filename)
        if not os.path.isfile(file_path):
            # A directory that happens to be named "*.txt" is not a book.
            continue
        try:
            # First, try to read the file with UTF-8 encoding
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except UnicodeDecodeError:
            # ISO-8859-1 maps every byte to a character, so this fallback
            # cannot fail on decoding.
            with open(file_path, 'r', encoding='iso-8859-1') as file:
                content = file.read()
        title = os.path.splitext(filename)[0]
        books.append(Book(title=title, content=content, vector=None))
    return books


# Vector Space Model

In [13]:
def create_vector_space_model(books):
    """Fit a TF-IDF model on the book texts and attach a vector to each book.

    Args:
        books: Sequence of ``Book`` tuples whose ``content`` holds the text.

    Returns:
        A ``(vectorizer, vectorized_books)`` pair: the fitted
        ``TfidfVectorizer`` and a new list of ``Book`` tuples, in the same
        order as ``books``, whose ``vector`` field is the dense TF-IDF row
        for that book.
    """
    tfidf = TfidfVectorizer()
    doc_term_matrix = tfidf.fit_transform([entry.content for entry in books])

    def _with_vector(row_index, entry):
        # Densify only this book's row of the document-term matrix.
        dense_row = doc_term_matrix[row_index].toarray()[0]
        return Book(title=entry.title, content=entry.content, vector=dense_row)

    vectorized = [_with_vector(idx, entry) for idx, entry in enumerate(books)]
    return tfidf, vectorized

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two numeric vectors.

    Args:
        vec1: Iterable of numbers.
        vec2: Iterable of numbers; paired element-wise with ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either
        vector has zero magnitude (including empty vectors).
    """
    dot_product = sum(a * b for a, b in zip(vec1, vec2))
    magnitude1 = math.sqrt(sum(a * a for a in vec1))
    magnitude2 = math.sqrt(sum(b * b for b in vec2))
    denominator = magnitude1 * magnitude2
    if denominator == 0:
        # An all-zero vector (e.g. a query with no in-vocabulary terms) has no
        # direction. Report "no similarity" instead of dividing by zero,
        # which would raise for plain numbers or yield NaN for numpy floats.
        return 0.0
    return dot_product / denominator

def search_books(query, vectorizer, books, top_k=5):
    """Rank ``books`` against ``query`` by cosine similarity.

    Args:
        query: Free-text query string.
        vectorizer: Fitted vectorizer; ``vectorizer.transform([query])`` must
            return a matrix exposing ``toarray()``.
        books: Iterable of objects with a ``vector`` attribute compatible
            with the vectorizer's output.
        top_k: Maximum number of results to return (default 5). ``None``
            returns the full ranking.

    Returns:
        A list of ``(book, similarity)`` tuples sorted by descending
        similarity, truncated to ``top_k`` entries. Ties keep the order in
        which the books were supplied.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be a non-negative integer or None")
    query_vector = vectorizer.transform([query]).toarray()[0]
    results = [
        (book, cosine_similarity(query_vector, book.vector))
        for book in books
    ]
    # list.sort is stable, so equal scores preserve input order.
    results.sort(key=lambda pair: pair[1], reverse=True)
    return results[:top_k]

# Evaluation

In [19]:
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Example ground truth: maps each evaluation query to the titles of the books
# judged relevant for it. evaluate_search compares these strings against
# Book.title, so every entry must equal a corpus filename without its ".txt"
# extension, character for character.
ground_truth = {
    'python': ['Flask Web Development Developing Web Applications With Python (Miguel Grinberg) (Z-Library)',
               'Building The Data Lakehouse (Bill Inmon, Mary Levins, Ranjeet Srivastava) (Z-Library)',
               'Fundamentals of Data Engineering Plan and Build Robust Data Systems (Joe Reis, Matt Housley) (Z-Library)',
               'The Hundred-Page Machine Learning Book (Andriy Burkov) (Z-Library)',
               'Designing Cloud Data Platforms (Danil Zburivsky, Lynda Partner) (Z-Library)'],
    'cloud': ['Designing Cloud Data Platforms (Danil Zburivsky, Lynda Partner) (Z-Library)',
              'Fundamentals of Data Engineering Plan and Build Robust Data Systems (Joe Reis, Matt Housley) (Z-Library)',
              'Spring in Action (Craig Walls) (Z-Library)',
              'Building The Data Lakehouse (Bill Inmon, Mary Levins, Ranjeet Srivastava) (Z-Library)',
              'Continuous Delivery Reliable Software Releases Through Build, Test, and Deployment Automation (Humble, Jez Farley, David [Humble etc.) (Z-Library)'],
}

# Function to evaluate the search results
def evaluate_search(queries, vectorizer, books, ground_truth):
    """Print precision, recall and accuracy of the search for each query.

    For every query, each book in ``books`` is labelled twice: relevant or
    not (its title appears in ``ground_truth[query]``) and retrieved or not
    (its title appears in the results of ``search_books``). The two binary
    label lists are then scored with scikit-learn's metrics.

    Args:
        queries: Iterable of query strings.
        vectorizer: Fitted vectorizer passed through to ``search_books``.
        books: Vectorized books to search over.
        ground_truth: Mapping of query -> list of relevant book titles.
            Queries missing from the mapping are treated as having no
            relevant books.

    Returns:
        None. The metrics are written to standard output.
    """
    for query in queries:
        relevant_titles = ground_truth.get(query, [])
        hits = search_books(query, vectorizer, books)
        retrieved_titles = [hit.title for hit, _score in hits]

        # One binary label per book, in corpus order, for both views.
        corpus_titles = [candidate.title for candidate in books]
        y_true = [int(name in relevant_titles) for name in corpus_titles]
        y_pred = [int(name in retrieved_titles) for name in corpus_titles]

        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        accuracy = accuracy_score(y_true, y_pred)

        report_lines = (
            f"Query: '{query}'",
            f"  Precision: {precision:.2f}",
            f"  Recall: {recall:.2f}",
            f"  Accuracy: {accuracy:.2f}\n",
        )
        for line in report_lines:
            print(line)

# Load the corpus from Drive and build the TF-IDF model: `vectorizer` is the
# fitted model, `vectorized_books` are the same books with their vectors set.
books = load_books()
vectorizer, vectorized_books = create_vector_space_model(books)

# Queries to evaluate; each one is a key of `ground_truth`.
queries = ['python', 'cloud']
# Print precision, recall and accuracy of the search results for every query.
evaluate_search(queries, vectorizer, vectorized_books, ground_truth)


Query: 'python'
  Precision: 1.00
  Recall: 1.00
  Accuracy: 1.00

Query: 'cloud'
  Precision: 0.80
  Recall: 1.00
  Accuracy: 0.95

