In [1]:
# Enhanced Vector Space Model for Information Retrieval

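# This notebook implements an enhanced Vector Space Model (VSM) for information retrieval using the lnc.ltc weighting scheme:
# documents are weighted with logarithmic term frequency, no idf, and cosine normalization (lnc), while queries are
# weighted with logarithmic term frequency, idf, and cosine normalization (ltc).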

In [2]:
import os
import math
import json
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk

# Download the NLTK resources used by this notebook:
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize. Newer NLTK
#     releases load 'punkt_tab' instead of the legacy 'punkt' pickle, so both are
#     requested to keep the notebook working across NLTK versions.
#   - 'stopwords': the English stop-word list.
#   - 'wordnet': the lexical database backing WordNetLemmatizer.
# nltk.download() reports failure via its return value rather than raising, so an
# identifier that is unknown to a given NLTK version is simply skipped.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource, quiet=True)

True

In [3]:
## EnhancedVSM Class

# This class implements the core functionality of our Vector Space Model: text preprocessing, index construction, saving/loading the index, and query execution.

In [4]:
class EnhancedVSM:
    """Vector Space Model search engine using the lnc.ltc weighting scheme.

    Documents are weighted with logarithmic term frequency, no idf and cosine
    normalisation (lnc); queries are weighted with logarithmic term frequency,
    idf and cosine normalisation (ltc).

    Attributes:
        corpus_path: Directory containing the ``.txt`` documents to index.
        index_dir: Directory where the index files are written / read.
        term_index: Maps term -> {"doc_freq": int, "occurrences": {doc_id_str: raw_tf}}.
        doc_magnitudes: Maps integer doc id -> Euclidean norm of the document's
            log-tf vector.
        corpus_size: Number of ``.txt`` files seen while indexing.
        doc_id_lookup: Maps doc id (as string) -> filename.
    """

    def __init__(self, corpus_path, index_dir):
        """Create an empty model bound to a corpus directory and an index directory."""
        self.corpus_path = corpus_path
        self.index_dir = index_dir
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self._reset_index()

    def _reset_index(self):
        """Discard any in-memory index so a fresh build starts from a clean state."""
        self.term_index = defaultdict(lambda: {"doc_freq": 0, "occurrences": {}})
        self.doc_magnitudes = {}
        self.corpus_size = 0
        self.doc_id_lookup = {}

    def _index_file(self, filename):
        """Return the path of an index artefact inside ``index_dir``."""
        return os.path.join(self.index_dir, filename)

    def preprocess_text(self, text):
        """Turn raw text into a list of normalised terms.

        The text is lower-cased, every character that is neither a word
        character nor whitespace is removed, and the result is tokenised.
        Tokens that are not alphanumeric or that are stop words are dropped;
        the remaining tokens are lemmatised.
        """
        text = re.sub(r'[^\w\s]', '', text.lower())
        tokens = word_tokenize(text)
        return [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token.isalnum() and token not in self.stop_words
        ]

    def construct_index(self):
        """Build the inverted index from the ``.txt`` files in ``corpus_path``.

        Any previously built or loaded index is discarded first, so calling
        this method repeatedly yields the same index instead of inflating
        ``corpus_size``.

        Document ids come from the 1-based position of each entry in the
        sorted directory listing, so they may be non-contiguous when the
        directory also holds non-``.txt`` entries. Documents that produce no
        terms still count towards ``corpus_size`` and appear in
        ``doc_id_lookup``, but get no postings and no magnitude.
        """
        self._reset_index()

        for doc_id, filename in enumerate(sorted(os.listdir(self.corpus_path)), start=1):
            if not filename.endswith('.txt'):
                continue

            doc_key = str(doc_id)  # postings use string keys so they survive a JSON round trip
            self.corpus_size += 1
            self.doc_id_lookup[doc_key] = filename

            filepath = os.path.join(self.corpus_path, filename)
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()

            tokens = self.preprocess_text(content)
            if not tokens:
                continue  # Skip empty documents

            term_freq = defaultdict(int)
            for token in tokens:
                term_freq[token] += 1

            squared_norm = 0.0
            for term, freq in term_freq.items():
                posting = self.term_index[term]
                if doc_key not in posting["occurrences"]:
                    posting["doc_freq"] += 1
                posting["occurrences"][doc_key] = freq

                log_tf = 1 + math.log10(freq)
                squared_norm += log_tf ** 2

            self.doc_magnitudes[doc_id] = math.sqrt(squared_norm)

    def persist_index(self):
        """Write the in-memory index to ``index_dir`` (created if missing).

        Produces ``term_index.json``, ``doc_magnitudes.json``,
        ``corpus_size.txt`` and ``doc_id_lookup.json``.
        """
        os.makedirs(self.index_dir, exist_ok=True)

        with open(self._index_file('term_index.json'), 'w', encoding='utf-8') as f:
            json.dump(self.term_index, f)

        with open(self._index_file('doc_magnitudes.json'), 'w', encoding='utf-8') as f:
            json.dump({str(k): v for k, v in self.doc_magnitudes.items()}, f)

        with open(self._index_file('corpus_size.txt'), 'w', encoding='utf-8') as f:
            f.write(str(self.corpus_size))

        with open(self._index_file('doc_id_lookup.json'), 'w', encoding='utf-8') as f:
            json.dump(self.doc_id_lookup, f)

    def load_index(self):
        """Replace the in-memory index with the one stored in ``index_dir``.

        Raises whatever ``open`` / ``json.load`` / ``int`` raise when a file is
        missing or malformed. After loading, ``term_index`` is a plain dict.
        """
        with open(self._index_file('term_index.json'), 'r', encoding='utf-8') as f:
            self.term_index = json.load(f)

        with open(self._index_file('doc_magnitudes.json'), 'r', encoding='utf-8') as f:
            # JSON object keys are always strings; restore the integer doc ids.
            self.doc_magnitudes = {int(k): v for k, v in json.load(f).items()}

        with open(self._index_file('corpus_size.txt'), 'r', encoding='utf-8') as f:
            self.corpus_size = int(f.read().strip())

        with open(self._index_file('doc_id_lookup.json'), 'r', encoding='utf-8') as f:
            self.doc_id_lookup = json.load(f)

    def execute_query(self, query, top_k=10):
        """Rank indexed documents against ``query`` by cosine similarity.

        Args:
            query: Free-text query string.
            top_k: Maximum number of results to return (default 10). ``None``
                returns every matching document.

        Returns:
            A list of ``(filename, score)`` tuples sorted by descending score,
            ties broken by ascending document id. The list is empty when no
            query term carries weight (unknown terms, or terms whose idf is 0).

        Raises:
            ValueError: If ``top_k`` is negative.
        """
        if top_k is not None and top_k < 0:
            raise ValueError("top_k must be non-negative or None")

        # Count each query term once (linear) instead of list.count() per term.
        query_counts = defaultdict(int)
        for term in self.preprocess_text(query):
            query_counts[term] += 1

        # ltc query weights: (1 + log10 tf) * log10(N / df).
        query_vector = {}
        for term, count in query_counts.items():
            if term in self.term_index:
                tf = 1 + math.log10(count)
                idf = math.log10(self.corpus_size / self.term_index[term]["doc_freq"])
                query_vector[term] = tf * idf

        query_magnitude = math.sqrt(sum(weight ** 2 for weight in query_vector.values()))
        if query_magnitude == 0:
            return []  # No valid terms in query

        # Accumulate the dot product of the normalised query and document vectors.
        similarity_scores = defaultdict(float)
        for term, weight in query_vector.items():
            query_weight = weight / query_magnitude
            for doc_id_str, freq in self.term_index[term]["occurrences"].items():
                doc_id = int(doc_id_str)
                magnitude = self.doc_magnitudes.get(doc_id)
                if not magnitude:
                    continue  # no stored norm (or a zero norm): cannot normalise this document
                doc_weight = (1 + math.log10(freq)) / magnitude
                similarity_scores[doc_id] += query_weight * doc_weight

        ranked_docs = sorted(similarity_scores.items(), key=lambda item: (-item[1], item[0]))

        return [
            (self.doc_id_lookup.get(str(doc_id), f"Unknown-{doc_id}"), score)
            for doc_id, score in ranked_docs[:top_k]
        ]

In [5]:
## Building the Index

# Run this cell to build the index for your corpus. Make sure your corpus is in a folder named 'corpus' in the same directory as this notebook; only files ending in '.txt' are indexed, and the index is saved to a folder named 'index'.

In [6]:
# Build the index from the documents in 'corpus' and save it under 'index'.
vsm = EnhancedVSM(corpus_path='corpus', index_dir='index')

print("Constructing index...")
vsm.construct_index()  # scan the corpus and fill the in-memory index
vsm.persist_index()    # write the index files to disk
print("Index constructed and saved successfully.")

Constructing index...
Index constructed and saved successfully.


In [7]:
## Loading the Index

# If you've already built the index, you can skip the build cell above and load the saved index from disk using this cell.

In [8]:
# Create a fresh model and restore the index previously saved under 'index'.
vsm = EnhancedVSM(corpus_path='corpus', index_dir='index')

print("Loading index...")
vsm.load_index()  # replaces the empty in-memory index with the saved one
print("Index loaded successfully.")

Loading index...
Index loaded successfully.


In [9]:
## Performing Queries

# Use this cell to run a fixed list of queries against your corpus. You can modify the queries or add more as needed.

In [10]:
# Sample queries; edit or extend this list as needed.
queries = [
    "Developing your Zomato business account and profile is a great way to boost your restaurant's online reputation",
    "Warwickshire, came from an ancient family and was the heiress to some land",
]

for query in queries:
    print(f"\nQuery: '{query}'")
    results = vsm.execute_query(query)

    # Guard clause: nothing matched, report it and move on to the next query.
    if not results:
        print("No relevant documents found.")
        continue

    print("Top 10 most relevant documents:")
    for rank, (filename, score) in enumerate(results, start=1):
        print(f"{rank}. {filename} (Similarity: {score:.4f})")


Query: 'Developing your Zomato business account and profile is a great way to boost your restaurant's online reputation'
Top 10 most relevant documents:
1. zomato.txt (Similarity: 0.2036)
2. swiggy.txt (Similarity: 0.1213)
3. instagram.txt (Similarity: 0.0564)
4. messenger.txt (Similarity: 0.0556)
5. youtube.txt (Similarity: 0.0454)
6. reddit.txt (Similarity: 0.0440)
7. bing.txt (Similarity: 0.0415)
8. flipkart.txt (Similarity: 0.0396)
9. paypal.txt (Similarity: 0.0389)
10. HP.txt (Similarity: 0.0389)

Query: 'Warwickshire, came from an ancient family and was the heiress to some land'
Top 10 most relevant documents:
1. shakespeare.txt (Similarity: 0.1202)
2. levis.txt (Similarity: 0.0241)
3. nike.txt (Similarity: 0.0183)
4. Adobe.txt (Similarity: 0.0158)
5. zomato.txt (Similarity: 0.0149)
6. huawei.txt (Similarity: 0.0136)
7. skype.txt (Similarity: 0.0121)
8. blackberry.txt (Similarity: 0.0114)
9. reliance.txt (Similarity: 0.0105)
10. Dell.txt (Similarity: 0.0104)


In [11]:
## Interactive Search

# Use this cell to perform interactive searches on your corpus. Type 'exit' at the prompt to stop.

In [12]:
# Interactive search loop: keeps prompting until the user types 'exit'.
# NOTE: input() blocks, so this cell pauses execution until 'exit' is entered.
while True:
    # Strip surrounding whitespace so inputs such as 'exit ' or ' EXIT' still
    # end the loop instead of being run as a search query.
    query = input("Enter your search query (or 'exit' to quit): ").strip()
    if query.lower() == 'exit':
        break

    results = vsm.execute_query(query)
    if results:
        print("\nTop 10 most relevant documents:")
        for i, (filename, score) in enumerate(results, 1):
            print(f"{i}. {filename} (Similarity: {score:.4f})")
    else:
        print("No relevant documents found.")
    print()


Top 10 most relevant documents:
1. shakespeare.txt (Similarity: 0.1202)
2. levis.txt (Similarity: 0.0241)
3. nike.txt (Similarity: 0.0183)
4. Adobe.txt (Similarity: 0.0158)
5. zomato.txt (Similarity: 0.0149)
6. huawei.txt (Similarity: 0.0136)
7. skype.txt (Similarity: 0.0121)
8. blackberry.txt (Similarity: 0.0114)
9. reliance.txt (Similarity: 0.0105)
10. Dell.txt (Similarity: 0.0104)

No relevant documents found.

No relevant documents found.

