In [26]:
import numpy as np
import pandas as pd
import string
import re
import nltk
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import PyPDF2

# Fetch the NLTK corpora that LSI_Model.preprocess relies on: the English
# stop-word list (stopwords.words('english')) and WordNet, which backs
# WordNetLemmatizer. quiet=True is passed to keep the download from
# printing progress messages into the cell output.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

True

In [27]:
class LSI_Model:
    """Latent Semantic Indexing (LSI) retrieval model built on TF-IDF + SVD.

    Documents are cleaned with :meth:`preprocess`, vectorised with TF-IDF and
    projected onto the leading ``k`` singular directions of the document-term
    matrix.  Queries are folded into the same latent space and ranked against
    the documents by cosine similarity.

    Attributes:
        k: Requested number of latent dimensions.
        vectorizer: The fitted ``TfidfVectorizer`` (``None`` before ``fit``).
        VT: Leading right-singular vectors, one row per latent dimension.
        S: Leading singular values.
        doc_vectors: Row-normalised latent document vectors (``U_k * S_k``).
        documents: The raw documents given to the last ``fit`` call.
    """

    # Translation table that deletes every ASCII punctuation character.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, k=3):
        """Create an unfitted model that keeps at most ``k`` latent dimensions."""
        self.k = k
        self.vectorizer = None
        self.VT = None
        self.S = None
        self.doc_vectors = None
        self.documents = []
        # Built on first use so constructing a model never touches NLTK data.
        self._stop_words = None
        self._lemmatizer = None

    def preprocess(self, text):
        """Normalise ``text`` for vectorisation.

        Lower-cases the text, strips ASCII punctuation, drops English stop
        words and lemmatises the remaining tokens.

        Args:
            text: Raw document or query string.

        Returns:
            The surviving lemmatised tokens joined by single spaces.
        """
        # The stop-word set and lemmatizer are cached: rebuilding them for
        # every document re-reads the NLTK corpus each time.  getattr keeps
        # this working on instances created without running __init__.
        if getattr(self, '_stop_words', None) is None:
            self._stop_words = set(stopwords.words('english'))
        if getattr(self, '_lemmatizer', None) is None:
            self._lemmatizer = WordNetLemmatizer()

        cleaned = text.lower().translate(self._PUNCT_TABLE)
        tokens = [
            self._lemmatizer.lemmatize(token)
            for token in cleaned.split()
            if token not in self._stop_words
        ]
        return ' '.join(tokens)

    def fit(self, documents):
        """Build the latent space from ``documents``.

        Args:
            documents: Iterable of raw document strings.

        Raises:
            ValueError: If ``k`` is smaller than 1.  The vectorizer may also
                raise ``ValueError`` when no usable vocabulary remains after
                preprocessing.
        """
        if self.k < 1:
            raise ValueError("k must be at least 1.")

        self.documents = list(documents)
        processed_docs = [self.preprocess(doc) for doc in self.documents]

        self.vectorizer = TfidfVectorizer(norm='l2', sublinear_tf=True)
        tfidf_matrix = self.vectorizer.fit_transform(processed_docs).toarray()

        # Thin SVD: rows of U correspond to documents, rows of VT to latent
        # dimensions expressed over the vocabulary.
        U, S, VT = np.linalg.svd(tfidf_matrix, full_matrices=False)

        # Slicing silently caps k at the number of available singular values.
        self.VT = VT[:self.k, :]
        self.S = S[:self.k]

        # U_k * S_k equals tfidf_matrix @ VT_k.T, i.e. each document projected
        # onto the latent axes; query() applies the same projection.
        self.doc_vectors = normalize(U[:, :self.k] * self.S)

    def query(self, query_text):
        """Rank the fitted documents against ``query_text``.

        Args:
            query_text: Raw query string.

        Returns:
            A list of ``(document_index, score)`` pairs sorted by descending
            score.  Scores are cosine similarities clamped to be non-negative.

        Raises:
            RuntimeError: If called before :meth:`fit`.
        """
        if self.vectorizer is None or self.VT is None or self.doc_vectors is None:
            raise RuntimeError("fit() must be called before query().")

        processed_query = self.preprocess(query_text)
        query_vec = self.vectorizer.transform([processed_query]).toarray()

        # Fold the query in with the same projection used for the documents
        # (x @ VT_k.T).  Multiplying by S^-1 here would place the query in a
        # differently scaled space than doc_vectors and would divide by zero
        # whenever a retained singular value is 0.
        query_latent = normalize(query_vec @ self.VT.T)[0]

        similarities = np.maximum(self.doc_vectors @ query_latent, 0)
        return sorted(enumerate(similarities), key=lambda pair: pair[1], reverse=True)

In [28]:
class LSI_GUI:
    """Tkinter front end for LSI semantic search over up to three files.

    Constructing the class builds the window and immediately enters the Tk
    main loop, so the constructor only returns once the window is closed.

    Attributes:
        model_class: Callable returning a model with ``fit(documents)`` and
            ``query(text)``; a fresh model is created for every search.
        documents1, documents2, documents3: Documents loaded into each slot.
        all_documents: Documents of all slots, rebuilt on every search.
        doc_sources: ``(source_label, document)`` pairs parallel to
            ``all_documents``.
        theme: ``"dark"`` or anything else (treated as the light theme).
    """

    _PLACEHOLDER = "Enter your query here..."
    _MAX_RESULTS = 5

    # Colour palettes keyed by theme name; unknown names use "light".
    _THEMES = {
        "dark": {
            "bg": "#0d1b2a", "fg": "#89c2d9",
            "btn_bg": "white", "btn_fg": "black",
            "entry_bg": "white", "entry_fg": "black",
            "result_bg": "white", "result_fg": "black",
        },
        "light": {
            "bg": "#dbeeff", "fg": "#003f5c",
            "btn_bg": "#000000", "btn_fg": "white",
            "entry_bg": "#f5f5f5", "entry_fg": "black",
            "result_bg": "#f9f9f9", "result_fg": "black",
        },
    }

    def __init__(self, model_class):
        """Build the window and run the Tk main loop (blocks until closed)."""
        self.model_class = model_class
        self.model = self.model_class()
        self.documents1 = []
        self.documents2 = []
        self.documents3 = []
        self.all_documents = []
        self.doc_sources = []
        self.theme = "dark"

        self.root = tk.Tk()
        self.root.title("🔍 LSI Semantic Search")
        self.root.geometry("1000x620")
        self.root.minsize(800, 550)

        self.init_styles()
        self.create_widgets()
        self.apply_theme()
        self.root.mainloop()

    def init_styles(self):
        """Select the base ttk theme."""
        self.style = ttk.Style()
        self.style.theme_use("default")

    def apply_theme(self):
        """Recolour every widget according to ``self.theme``."""
        font_main = ("Segoe UI", 11)
        palette = self._THEMES.get(self.theme, self._THEMES["light"])
        bg_color = palette["bg"]

        self.root.configure(bg=bg_color)
        self.top_frame.configure(bg=bg_color)
        # The button row's frame must follow the window background too,
        # otherwise it stays in the default widget colour.
        self.action_frame.configure(bg=bg_color)
        self.title_label.configure(bg=bg_color, fg=palette["fg"], font=("Segoe UI", 20, "bold"))

        for btn in [self.search_btn, self.save_btn, self.reset_btn,
                    self.settings_btn, self.load_button1, self.load_button2, self.load_button3]:
            btn.configure(bg=palette["btn_bg"], fg=palette["btn_fg"], font=font_main,
                          relief="flat", activebackground="#d0d0d0")

        self.query_entry.configure(bg=palette["entry_bg"], fg=palette["entry_fg"],
                                   font=("Segoe UI", 12), relief="flat")
        self.results_text.configure(bg=palette["result_bg"], fg=palette["result_fg"],
                                    font=("Segoe UI", 11), relief="flat")

    def create_widgets(self):
        """Create and lay out all widgets (colours are set by apply_theme)."""
        self.top_frame = tk.Frame(self.root)
        self.top_frame.pack(fill="x", pady=5)

        self.settings_btn = tk.Menubutton(self.top_frame, text="⚙ Settings", relief="flat")
        self.settings_btn.pack(side="right", padx=10)

        menu = tk.Menu(self.settings_btn, tearoff=0)
        theme_menu = tk.Menu(menu, tearoff=0)
        theme_menu.add_command(label="Dark", command=lambda: self.set_theme("dark"))
        theme_menu.add_command(label="Light", command=lambda: self.set_theme("light"))
        menu.add_cascade(label="Theme", menu=theme_menu)
        self.settings_btn.configure(menu=menu)

        self.title_label = tk.Label(self.root, text="🔍 LSI Semantic Search")
        self.title_label.pack(pady=10)

        self.load_button1 = tk.Button(self.root, text="📂 Load File 1", command=lambda: self.load_file(1))
        self.load_button2 = tk.Button(self.root, text="📂 Load File 2", command=lambda: self.load_file(2))
        self.load_button3 = tk.Button(self.root, text="📂 Load File 3", command=lambda: self.load_file(3))

        self.load_button1.pack(pady=3)
        self.load_button2.pack(pady=3)
        self.load_button3.pack(pady=3)

        self.query_entry = tk.Entry(self.root)
        self.query_entry.insert(0, self._PLACEHOLDER)
        self.query_entry.bind("<FocusIn>", self.clear_placeholder)
        self.query_entry.bind("<FocusOut>", self.add_placeholder)
        self.query_entry.pack(pady=10, ipadx=8, ipady=10, fill='x', padx=100)

        # Kept as an attribute so apply_theme can recolour it.
        self.action_frame = tk.Frame(self.root)
        self.action_frame.pack(pady=5)

        self.search_btn = tk.Button(self.action_frame, text="🔍 Search", command=self.run_query)
        self.save_btn = tk.Button(self.action_frame, text="💾 Save Result", command=self.save_results)
        self.reset_btn = tk.Button(self.action_frame, text="🔄 Reset", command=self.reset_ui)

        self.search_btn.grid(row=0, column=0, padx=10)
        self.save_btn.grid(row=0, column=1, padx=10)
        self.reset_btn.grid(row=0, column=2, padx=10)

        self.results_text = tk.Text(self.root, wrap='word', height=10, width=80)
        self.results_text.pack(padx=100, pady=10, fill='x')

    def clear_placeholder(self, event):
        """Remove the placeholder text when the query box gains focus."""
        if self.query_entry.get() == self._PLACEHOLDER:
            self.query_entry.delete(0, tk.END)

    def add_placeholder(self, event):
        """Restore the placeholder text when the query box is left empty."""
        if not self.query_entry.get():
            self.query_entry.insert(0, self._PLACEHOLDER)

    def set_theme(self, theme):
        """Switch to ``theme`` and recolour the widgets."""
        self.theme = theme
        self.apply_theme()

    def save_results(self):
        """Write the current contents of the results box to a chosen text file."""
        results = self.results_text.get("1.0", tk.END).strip()
        if not results:
            messagebox.showwarning("Warning", "No results to save.")
            return
        file_path = filedialog.asksaveasfilename(defaultextension=".txt",
                                                 filetypes=[("Text Files", "*.txt")])
        if file_path:
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(results)
            messagebox.showinfo("Saved", "Results saved successfully.")

    def reset_ui(self):
        """Clear the query box and results; loaded documents are kept."""
        self.query_entry.delete(0, tk.END)
        self.results_text.delete("1.0", tk.END)
        self.query_entry.insert(0, self._PLACEHOLDER)

    @staticmethod
    def _read_documents(file_path):
        """Read ``file_path`` into a list of document strings.

        * ``.csv``: the non-null values of the first column (the first row is
          consumed as the header by ``pd.read_csv``).
        * ``.txt``: every non-blank line, stripped.
        * ``.pdf``: the stripped text of every page that has any text.

        The extension is matched case-insensitively.

        Returns:
            The list of documents, or ``None`` for an unsupported extension.
        """
        lowered = file_path.lower()

        if lowered.endswith('.csv'):
            # read_csv has no ``errors`` keyword; decode through a text handle
            # so undecodable bytes are dropped instead of raising.
            with open(file_path, 'r', encoding='utf-8', errors='ignore', newline='') as f:
                df = pd.read_csv(f)
            return df.iloc[:, 0].dropna().astype(str).tolist()

        if lowered.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
            return [line.strip() for line in content.splitlines() if line.strip()]

        if lowered.endswith('.pdf'):
            with open(file_path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                texts = [page.extract_text() for page in reader.pages]
            return [t.strip() for t in texts if t and t.strip()]

        return None

    def load_file(self, file_number):
        """Ask for a file and load its documents into slot ``file_number``.

        Args:
            file_number: Slot to fill; must be 1, 2 or 3.

        Raises:
            ValueError: If ``file_number`` is not 1, 2 or 3.
        """
        if file_number not in (1, 2, 3):
            raise ValueError(f"file_number must be 1, 2 or 3, got {file_number!r}")

        file_path = filedialog.askopenfilename(filetypes=[
            ("Supported files", "*.csv *.txt *.pdf"),
            ("CSV Files", "*.csv"),
            ("Text Files", "*.txt"),
            ("PDF Files", "*.pdf")
        ])
        if not file_path:
            return

        try:
            docs = self._read_documents(file_path)
        except Exception as e:
            messagebox.showerror("Error", f"Failed to load file:\n{e}")
            return

        if docs is None:
            messagebox.showerror("Unsupported Format", "Please select a .csv, .txt, or .pdf file.")
            return

        if not docs:
            messagebox.showwarning("Empty File", "No valid content found in the file.")
            return

        setattr(self, f"documents{file_number}", docs)
        getattr(self, f"load_button{file_number}").config(text=f"✅ File {file_number} Loaded")
        messagebox.showinfo("Success", f"File {file_number} loaded successfully!")

    @staticmethod
    def _format_results(query, results, doc_sources, limit=5):
        """Render ranked ``(index, score)`` pairs as the text shown to the user.

        Only hits with a strictly positive score are listed, at most ``limit``
        of them; with no such hit the text is ``"No results found."``.
        """
        hits = [(idx, score) for idx, score in results if score > 0][:limit]
        if not hits:
            return "No results found."

        parts = [f"Top {len(hits)} results for query: '{query}'\n\n"]
        for idx, score in hits:
            try:
                source, doc = doc_sources[idx]
            except IndexError:
                parts.append(f"⚠️ Error: Document index {idx} out of range.\n\n")
            else:
                parts.append(f"{source} - Doc {idx+1} (Score: {score:.4f}):\n{doc}\n\n")
        return "".join(parts)

    def run_query(self):
        """Fit a fresh model on all loaded documents and show the top matches."""
        query = self.query_entry.get().strip()
        self.results_text.delete("1.0", tk.END)

        if not (self.documents1 or self.documents2 or self.documents3):
            messagebox.showwarning("Warning", "Please load at least one file first.")
            return

        if not query or query == self._PLACEHOLDER:
            messagebox.showwarning("Warning", "Please enter a search query.")
            return

        self.all_documents = []
        self.doc_sources = []
        for label, docs in (("File 1", self.documents1),
                            ("File 2", self.documents2),
                            ("File 3", self.documents3)):
            for doc in docs:
                self.all_documents.append(doc)
                self.doc_sources.append((label, doc))

        try:
            self.model = self.model_class()
            self.model.fit(self.all_documents)
            results = self.model.query(query)
        except Exception as e:
            # Exceptions raised inside a Tk callback would otherwise only
            # reach stderr, leaving the user without any feedback.
            messagebox.showerror("Error", f"Search failed:\n{e}")
            return

        self.results_text.insert(
            tk.END,
            self._format_results(query, results, self.doc_sources, self._MAX_RESULTS),
        )

# Entry point: open the search window, handing the GUI the model class it
# should instantiate for each search.
if __name__ == "__main__":
    LSI_GUI(model_class=LSI_Model)
