Loading Libraries

In [43]:
import os
import re
import math
import chardet
import warnings
import pandas as pd
import numpy as np
import tkinter as tk
import pickle
from tkinter import scrolledtext
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

warnings.filterwarnings('ignore')

Function to read documents from a directory

In [44]:
def read_documents(directory):
    """Read every ``.txt`` file found directly inside ``directory``.

    Each file is decoded as ISO-8859-1 and stored under its file name with
    the extension stripped (``"12.txt"`` -> ``"12"``). Entries that do not
    end in ``.txt`` are ignored.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        tuple: ``(documents, universal_set)`` where ``documents`` maps
        document id -> full file text and ``universal_set`` is the set of
        all document ids that were read.
    """
    documents = {}
    for entry in os.listdir(directory):
        if not entry.endswith('.txt'):
            continue
        doc_id, _extension = os.path.splitext(entry)
        full_path = os.path.join(directory, entry)
        with open(full_path, 'r', encoding='ISO-8859-1') as handle:
            documents[doc_id] = handle.read()
    # The universal set is exactly the set of ids that ended up in the dict.
    return documents, set(documents)

Function to preprocess the content (lowercasing, tokenization, stemming)

In [45]:
def preprocess(content):
    """Turn raw text into a list of stemmed, lowercase word tokens.

    The text is lowercased, split into runs of word characters, and each
    token is passed through a Porter stemmer.

    Args:
        content: The text to tokenize.

    Returns:
        list: Stemmed tokens in their order of appearance (duplicates kept).
    """
    stemmer = PorterStemmer()
    words = re.findall(r'\b\w+\b', content.lower())
    return list(map(stemmer.stem, words))

Function to build the inverted index and compute term frequencies (tf)

In [46]:
def build_index_docs(documents):
    """Build an inverted index holding raw term frequencies.

    Every document's text is run through ``preprocess`` and each resulting
    term occurrence is counted.

    Args:
        documents: Mapping of document id -> document text.

    Returns:
        dict: ``term -> {doc_id: count}``, where ``count`` is how many times
        ``term`` occurs in that document. Documents that do not contain a
        term have no entry under it.
    """
    index = {}
    for doc_id, text in documents.items():
        for term in preprocess(text):
            postings = index.setdefault(term, {})
            postings[doc_id] = postings.get(doc_id, 0) + 1
    return index

In [47]:
def build_index_q(query):
    """Count how often each preprocessed term occurs in the query.

    Args:
        query: The raw query string.

    Returns:
        dict: ``term -> count`` for the terms produced by ``preprocess``.
    """
    term_counts = {}
    for term in preprocess(query):
        term_counts[term] = term_counts.get(term, 0) + 1
    return term_counts

Function to compute IDF

In [48]:
def compute_idf(token, index, total_docs):
    """Return the inverse document frequency of ``token``.

    The document frequency is the number of entries stored under ``token``
    in ``index``; the result is ``log(total_docs / df)`` (natural log).

    Args:
        token: The term to look up.
        index: Mapping of term -> per-document postings.
        total_docs: Number of documents in the collection.

    Returns:
        The IDF value, or ``0`` when the token is not in the index or has
        no postings.
    """
    if token not in index:
        return 0
    doc_freq = len(index[token])
    if not doc_freq:
        return 0
    return math.log(total_docs / doc_freq)

Function to build document vectors

In [49]:
def build_document_vectors(documents, index, total_docs):
    """Build a dense TF-IDF vector for every document.

    Each vector is a dict with one entry per term in ``index`` (in the
    index's iteration order), so all vectors share the same keys and order.
    A term that does not occur in a document gets weight ``tf = 0``.

    Args:
        documents: Mapping of document id -> text. Only the ids are used.
        index: Inverted index, ``term -> {doc_id: term frequency}``.
        total_docs: Number of documents in the collection (passed to
            ``compute_idf``).

    Returns:
        dict: ``doc_id -> {term: tf * idf}``.
    """
    # Nothing to weight; returning early also avoids IDF lookups for an
    # empty collection.
    if not documents:
        return {}

    # IDF depends only on the term, so compute it once per term rather than
    # once per (document, term) pair.
    idf_by_term = {
        token: compute_idf(token, index, total_docs) for token in index
    }

    document_vectors = {}
    for doc_id in documents:
        document_vectors[doc_id] = {
            token: index[token].get(doc_id, 0) * idf
            for token, idf in idf_by_term.items()
        }
    return document_vectors

Function to build query vectors

In [50]:
def build_query_vectors(index, query, q_index, total_docs):
    """Build the query's TF-IDF vector over the document vocabulary.

    The vector has one entry per term in ``index`` (in the index's
    iteration order). Terms that appear in the query get
    ``query tf * idf``; all other terms get ``0``. Query terms that are not
    in ``index`` are dropped.

    Args:
        index: Inverted index of the document collection.
        query: The raw query string (not used by this function).
        q_index: Mapping of query term -> term frequency in the query.
        total_docs: Number of documents in the collection.

    Returns:
        dict: ``term -> weight``.
    """
    return {
        token: (
            q_index[token] * compute_idf(token, index, total_docs)
            if token in q_index
            else 0
        )
        for token in index
    }

Function to compute cosine similarity

In [59]:
def compute_cosine_similarity(query_vector, doc_vector):
    """Return the cosine similarity between a query and a document vector.

    Either argument may be a dict (its values are used, in iteration order)
    or an array-like. Both vectors must have the same length and the same
    term order for the result to be meaningful.

    Args:
        query_vector: Query weights as a dict or array-like.
        doc_vector: Document weights as a dict or array-like.

    Returns:
        ``dot(doc, query) / (|doc| * |query|)``, or ``0.0`` when either
        vector has zero norm (for example a query containing no indexed
        terms, or an empty document).
    """
    if isinstance(doc_vector, dict):
        doc_vector = np.array(list(doc_vector.values()))
    if isinstance(query_vector, dict):
        query_vector = np.array(list(query_vector.values()))

    denominator = np.linalg.norm(doc_vector) * np.linalg.norm(query_vector)
    # A zero-norm vector would make this 0/0 = NaN; treat it as "no
    # similarity" so callers can compare and sort the score safely.
    if denominator == 0:
        return 0.0

    return np.dot(doc_vector, query_vector) / denominator

Function to process queries

In [60]:
def process_query(index, docs_vectors, query, universal_set, threshold=0.05):
    """Rank documents against ``query`` by cosine similarity.

    Args:
        index: Inverted index of the document collection.
        docs_vectors: Mapping of document id -> document vector.
        query: The raw query string.
        universal_set: Set of all document ids; only its size is used.
        threshold: Minimum similarity a document needs to be returned.
            Defaults to ``0.05``.

    Returns:
        list: Document ids converted with ``int()``, ordered from most to
        least similar, restricted to scores ``>= threshold``.

    Raises:
        ValueError: If a document id cannot be converted with ``int()``.
    """
    q_index = build_index_q(query)
    query_vector = build_query_vectors(index, query, q_index, len(universal_set))

    # Convert the query to an array once; otherwise the dict -> array
    # conversion would be repeated for every document.
    if isinstance(query_vector, dict):
        query_vector = np.array(list(query_vector.values()))

    sim = {}
    for doc_id, doc_vector in docs_vectors.items():
        sim[int(doc_id)] = compute_cosine_similarity(query_vector, doc_vector)

    # Filter BEFORE sorting: a NaN score (zero-norm vector) fails the
    # comparison and is dropped here, so it can never reach the sort and
    # scramble the order of the valid scores.
    relevant = [(doc_id, score) for doc_id, score in sim.items() if score >= threshold]
    relevant.sort(key=lambda pair: pair[1], reverse=True)

    return [doc_id for doc_id, _score in relevant]

Function to save the document vectors, document index, and universal set to pickle files

In [61]:
def save_indexes(document_vectors, document_index, universal_set):
    """Pickle the three search structures into the current working directory.

    Writes ``document_vectors.pkl``, ``document_index.pkl`` and
    ``universal_set.pkl`` (in that order), overwriting existing files.

    Args:
        document_vectors: Object stored in ``document_vectors.pkl``.
        document_index: Object stored in ``document_index.pkl``.
        universal_set: Object stored in ``universal_set.pkl``.
    """
    outputs = (
        ('document_vectors.pkl', document_vectors),
        ('document_index.pkl', document_index),
        ('universal_set.pkl', universal_set),
    )
    for file_name, payload in outputs:
        with open(file_name, 'wb') as file:
            pickle.dump(payload, file)

Function to load the document vectors, document index, and universal set from pickle files if they exist

In [62]:
def load_indexes(files_directory='C:/Users/UBL-HO.DESKTOP-7ET3E40/Desktop/BAI-6A/IR/a/ResearchPapers'):
    """Return the search structures, loading them from cache when possible.

    If all three pickle files (``document_vectors.pkl``,
    ``document_index.pkl``, ``universal_set.pkl``) exist in the current
    working directory they are loaded and returned. Otherwise the corpus in
    ``files_directory`` is read, the index and document vectors are rebuilt,
    and the results are saved with ``save_indexes`` before being returned.

    Args:
        files_directory: Folder containing the corpus ``.txt`` files. Only
            used when the cache is missing or incomplete.

    Returns:
        tuple: ``(document_vectors, document_index, universal_set)``.
    """
    document_vectors_file = 'document_vectors.pkl'
    document_index_file = 'document_index.pkl'
    universal_set_file = 'universal_set.pkl'
    cache_files = (document_vectors_file, document_index_file, universal_set_file)

    # All three files are read below, so all three must exist; a partial
    # cache falls through to a rebuild instead of raising FileNotFoundError.
    if all(os.path.exists(path) for path in cache_files):
        loaded = []
        for path in cache_files:
            # NOTE(security): pickle.load can execute arbitrary code. Only
            # load cache files this notebook wrote itself.
            with open(path, 'rb') as file:
                loaded.append(pickle.load(file))
        document_vectors, document_index, universal_set = loaded
        return document_vectors, document_index, universal_set

    documents, universal_set = read_documents(files_directory)

    index = build_index_docs(documents)
    docs_vectors = build_document_vectors(documents, index, len(universal_set))

    save_indexes(docs_vectors, index, universal_set)

    return docs_vectors, index, universal_set

Function to retrieve results based on the entered query

In [63]:
def retrieve_results():
    """Search-button callback: run the typed query and display the matches.

    Reads the query from the module-level ``query_entry`` widget, ranks the
    documents with ``process_query`` using the module-level ``index``,
    ``docs_vectors`` and ``universal_set``, and writes the resulting id list
    into the module-level ``result_text`` widget.
    """
    user_query = query_entry.get()

    # The result box is kept read-only; unlock it and clear the previous
    # output before writing.
    result_text.config(state='normal')
    result_text.delete('1.0', tk.END)

    matches = process_query(index, docs_vectors, user_query, universal_set)
    message = f"\nRetrieved Documents: {matches}\n"
    result_text.insert(tk.END, message, 'result')

    # Lock the box again so the user cannot edit the output.
    result_text.config(state='disabled')

Main part of the code

In [64]:
# Load (or build on first run) the structures that the search callback
# reads as module-level globals.
docs_vectors, index, universal_set = load_indexes()

# --- Main window ---
window = tk.Tk()
window.configure(background='aliceblue')
window.geometry("800x600")
window.title("Information Retrieval System")

# --- Widgets, created top to bottom in display order ---
query_label = tk.Label(
    window,
    text="Enter your query:",
    bg='aliceblue',
    font=("Helvetica", 14),
)
query_entry = tk.Entry(
    window,
    width=50,
    bg='gray95',
    font=("Helvetica", 12),
)
search_button = tk.Button(
    window,
    text="Search",
    bg='gray90',
    font=("Helvetica", 12),
    command=retrieve_results,
)
# The result box starts read-only; retrieve_results unlocks it while writing.
result_text = scrolledtext.ScrolledText(
    window,
    width=100,
    height=10,
    bg='gray95',
    font=("Helvetica", 12),
    state='disabled',
)
result_text.tag_configure('result', foreground='green')

# Pack in the same order the widgets should appear, with identical padding.
for widget in (query_label, query_entry, search_button, result_text):
    widget.pack(pady=10)

# Blocks until the window is closed.
window.mainloop()