In [23]:
import tkinter as tk
from tkinter import filedialog
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import joblib

class LogAnalyzerGUI:
    def __init__(self):
        self.window = tk.Tk()
        self.window.title("Log Analyzer")
        self.window.geometry("400x300")

        # Set GUI colors and fonts
        self.window.configure(bg="#f2f2f2")
        self.label_bg_color = "#f2f2f2"
        self.button_bg_color = "#336699"
        self.button_fg_color = "#ffffff"
        self.result_text_color = "#333333"
        self.result_text_font = ("Arial", 10)

        # Buttons
        self.btn_create_model = self.create_button("Model Creation and Training", self.create_model)
        self.btn_use_saved_model = self.create_button("Use Saved Model and Test", self.use_saved_model)
        self.btn_further_training = self.create_button("Further Training of the Model", self.further_training)
        self.btn_back_to_menu = self.create_button("Back to Menu", self.back_to_menu)

        self.window.mainloop()

    def create_button(self, text, command):
        button = tk.Button(self.window, text=text, bg=self.button_bg_color, fg=self.button_fg_color, command=command)
        button.pack(pady=10)
        return button

    def clear_window(self):
        for widget in self.window.winfo_children():
            widget.destroy()

    def create_model(self):
        self.clear_window()

        # Select Training File
        self.training_file_path = filedialog.askopenfilename(filetypes=[("Text Files", "*.txt")])
        self.label_training_file = self.create_label("Training File: " + self.training_file_path)

        # Select Test File
        self.test_file_path = filedialog.askopenfilename(filetypes=[("Text Files", "*.txt")])
        self.label_test_file = self.create_label("Test File: " + self.test_file_path)

        # Print Matches
        self.btn_print_matches = self.create_button("Print Matches", self.print_matches)
        self.btn_clear_results = self.create_button("Clear Results", self.clear_results)

    def create_label(self, text):
        label = tk.Label(self.window, text=text, bg=self.label_bg_color)
        label.pack(pady=5)
        return label

    def print_matches(self):
        data = [content for content in self.get_log_contents(self.training_file_path)]
        nlp = spacy.load('en_core_web_sm')
        tokens = [' '.join([token.text for token in doc]) for doc in nlp.pipe(data, batch_size=1000, disable=["parser", "ner"])]
        tfidf_matrix = TfidfVectorizer().fit_transform(tokens)

        new_data = [content for content in self.get_log_contents(self.test_file_path)]
        new_tokens = [' '.join([token.text for token in doc]) for doc in nlp.pipe(new_data, batch_size=1000, disable=["parser", "ner"])]
        new_tfidf_matrix = TfidfVectorizer(vocabulary=tfidf_matrix.get_feature_names()).fit_transform(new_tokens)

        similarity_scores = cosine_similarity(new_tfidf_matrix, tfidf_matrix)
        threshold = 0.66

        self.results_text = self.create_text_widget()
        for i, data_point in enumerate(new_data):
            max_similarity = max(similarity_scores[i])
            found_match = False
            if max_similarity >= threshold:
                index = similarity_scores[i].argmax()
                matched_data = data[index]
                if data_point != matched_data:
                    result = f"New Data: {data_point}\nSimilar Data: {matched_data}\nSimilarity Score: {max_similarity}\n\n"
                    self.results_text.insert(tk.END, result)

            if not found_match:
                result = f"New Data: {data_point}\nNo similar data found.\n\n"
                self.results_text.insert(tk.END, result)

    def use_saved_model(self):
        self.clear_window()

        # Select Saved Model
        self.saved_model_path = filedialog.askopenfilename(filetypes=[("Pickle Files", "*.pkl")])
        self.label_saved_model = self.create_label("Saved Model: " + self.saved_model_path)

        # Select New Test File
        self.new_test_file_path = filedialog.askopenfilename(filetypes=[("Text Files", "*.txt")])
        self.label_new_test_file = self.create_label("New Test File: " + self.new_test_file_path)

        # Test Saved Model Again
        self.btn_test_saved_model = self.create_button("Test Saved Model", self.test_saved_model)
        self.btn_clear_results = self.create_button("Clear Results", self.clear_results)

    def test_saved_model(self):
        vectorizer = joblib.load(self.saved_model_path)

        new_data = [content for content in self.get_log_contents(self.new_test_file_path)]
        new_tokens = [' '.join([token.text for token in doc]) for doc in nlp.pipe(new_data, batch_size=1000, disable=["parser", "ner"])]
        new_tfidf_matrix = vectorizer.transform(new_tokens)

        similarity_scores = cosine_similarity(new_tfidf_matrix, vectorizer.transform(vectorizer.get_feature_names()))
        threshold = 0.66

        self.results_text = self.create_text_widget()
        for i, data_point in enumerate(new_data):
            max_similarity = max(similarity_scores[i])
            found_match = False
            if max_similarity >= threshold:
                index = similarity_scores[i].argmax()
                matched_data = vectorizer.get_feature_names()[index]
                if data_point != matched_data:
                    result = f"New Data: {data_point}\nSimilar Data: {matched_data}\nSimilarity Score: {max_similarity}\n\n"
                    self.results_text.insert(tk.END, result)

            if not found_match:
                result = f"New Data: {data_point}\nNo similar data found.\n\n"
                self.results_text.insert(tk.END, result)

    def further_training(self):
        self.clear_window()

        # Select Model for Further Training
        self.model_for_training_path = filedialog.askopenfilename(filetypes=[("Pickle Files", "*.pkl")])
        self.label_model_for_training = self.create_label("Model for Training: " + self.model_for_training_path)

        # Select Additional Training Data
        self.additional_training_data_path = filedialog.askopenfilename(filetypes=[("Text Files", "*.txt")])
        self.label_additional_training_data = self.create_label("Additional Training Data: " + self.additional_training_data_path)

        # Train Model with Additional Data
        self.btn_train_model = self.create_button("Train Model with Additional Data", self.train_model)
        self.btn_back_to_menu = self.create_button("Back to Menu", self.back_to_menu)

    def train_model(self):
        vectorizer = joblib.load(self.model_for_training_path)

        additional_training_data = [content for content in self.get_log_contents(self.additional_training_data_path)]
        additional_tokens = [' '.join([token.text for token in doc]) for doc in nlp.pipe(additional_training_data, batch_size=1000, disable=["parser", "ner"])]
        tfidf_matrix = vectorizer.transform(additional_tokens)

        joblib.dump(vectorizer, 'trained_model.pkl')
        sparse.save_npz('tfidf_matrix.npz', tfidf_matrix)

    def clear_results(self):
        self.results_text.delete("1.0", tk.END)

    def get_log_contents(self, file_path):
        with open(file_path, 'r') as file:
            return [re.sub(r'^\w{3} \d{2} \d{2}:\d{2}:\d{2} [\w.-]+ kernel:', '', line.strip()) for line in file if re.match(r'\w{3} \d{2} \d{2}:\d{2}:\d{2}', line.strip())]

    def create_text_widget(self):
        text_widget = tk.Text(self.window, bg=self.label_bg_color, fg=self.result_text_color, font=self.result_text_font)
        text_widget.pack(pady=10)
        return text_widget

    def back_to_menu(self):
        self.clear_window()
        self.__init__()

# Create the GUI
log_analyzer_gui = LogAnalyzerGUI()


Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\shiva\AppData\Local\Programs\Python\Python310\lib\tkinter\__init__.py", line 1921, in __call__
    return self.func(*args)
  File "C:\Users\shiva\AppData\Local\Temp\ipykernel_4064\4195535685.py", line 69, in print_matches
    new_tfidf_matrix = TfidfVectorizer(vocabulary=tfidf_matrix.get_feature_names()).fit_transform(new_tokens)
AttributeError: 'csr_matrix' object has no attribute 'get_feature_names'


In [51]:
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import joblib
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter.scrolledtext import ScrolledText


def preprocess_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue

            if re.match(r'\w{3} \d{2} \d{2}:\d{2}:\d{2}', line):
                content = re.sub(r'^\w{3} \d{2} \d{2}:\d{2}:\d{2} [\w.-]+ kernel:', '', line).strip()
                if content:
                    data.append(content)
    return data


def train_model(data):
    nlp = spacy.load('en_core_web_sm')
    tokens = []

    for doc in nlp.pipe(data, batch_size=1000, disable=["parser", "ner"]):
        doc_tokens = [token.text for token in doc]
        tokens.append(' '.join(doc_tokens))

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(tokens)

    # Save trained model
    joblib.dump(vectorizer, 'trained_model.pkl')
    sparse.save_npz('tfidf_matrix.npz', tfidf_matrix)


def load_trained_model():
    try:
        # Load trained model
        vectorizer = joblib.load('trained_model.pkl')
        tfidf_matrix = sparse.load_npz('tfidf_matrix.npz')
        return vectorizer, tfidf_matrix
    except FileNotFoundError:
        messagebox.showerror("Error", "Trained model and TF-IDF matrix not found. Run training step first.")
        return None, None


def process_new_data(new_data_file, vectorizer, tfidf_matrix):
    results = []
    with open(new_data_file, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Ignore date and time
            if re.match(r'\w{3} \d{2} \d{2}:\d{2}:\d{2}', line):
                content = re.sub(r'^\w{3} \d{2} \d{2}:\d{2}:\d{2} [\w.-]+ kernel:', '', line).strip()
                if content:
                    new_tfidf_matrix = vectorizer.transform([content])
                    similarity_scores = cosine_similarity(new_tfidf_matrix, tfidf_matrix)
                    max_similarity = similarity_scores.max()
                    threshold = 0.66  # Adjust the threshold based on your requirements
                    if max_similarity >= threshold:
                        index = similarity_scores.argmax()
                        matched_data = data[index]
                        if content != matched_data:
                            result = f"New Data: {content}\nSimilar Data: {matched_data}\nSimilarity Score: {max_similarity}\n"
                            results.append(result)
                    else:
                        result = f"New Data: {content}\nNo similar data found.\n"
                        results.append(result)
    return results


def select_data_file(entry):
    file_path = filedialog.askopenfilename(filetypes=[("Text Files", "*.txt")])
    if file_path:
        entry.delete(0, tk.END)
        entry.insert(tk.END, file_path)


def show_results():
    data_file = data_file_entry.get()
    new_data_file = new_data_file_entry.get()

    if not data_file or not new_data_file:
        messagebox.showwarning("Warning", "Please select both data files.")
        return

    data = preprocess_data(data_file)
    train_model(data)
    vectorizer, tfidf_matrix = load_trained_model()

    if vectorizer is not None and tfidf_matrix is not None:
        results = process_new_data(new_data_file, vectorizer, tfidf_matrix)
        if results:
            results_window = tk.Toplevel()
            results_window.title("Text Similarity Results")
            results_window.geometry("500x500")
            results_window.configure(background="#F0F0F0")

            results_text = ScrolledText(results_window, height=20, width=60, font=("Roboto", 12))
            results_text.pack(pady=20, padx=20)

            for result in results:
                results_text.insert(tk.END, result + '\n')
            results_text.config(state=tk.DISABLED)


# Create the main window
window = tk.Tk()
window.title("Text Similarity")
window.geometry("500x150")
window.configure(background="#F0F0F0")

# Create and place widgets
data_frame = tk.Frame(window, bg="#F0F0F0")
data_frame.pack(pady=20, padx=20)

data_file_label = tk.Label(data_frame, text="Select Data File:", bg="#F0F0F0", fg="#333333", font=("Roboto", 12))
data_file_label.grid(row=0, column=0, pady=(0, 5), sticky="w")

data_file_entry = tk.Entry(data_frame, width=50, font=("Roboto", 12))
data_file_entry.grid(row=1, column=0, pady=(0, 10), padx=5, sticky="w")

data_file_button = tk.Button(data_frame, text="Browse", command=lambda: select_data_file(data_file_entry),
                             font=("Roboto", 12))
data_file_button.grid(row=1, column=1, pady=(0, 10), padx=5, sticky="e")

new_data_file_label = tk.Label(data_frame, text="Select New Data File:", bg="#F0F0F0", fg="#333333",
                               font=("Roboto", 12))
new_data_file_label.grid(row=2, column=0, pady=(0, 5), sticky="w")

new_data_file_entry = tk.Entry(data_frame, width=50, font=("Roboto", 12))
new_data_file_entry.grid(row=3, column=0, pady=(0, 10), padx=5, sticky="w")

new_data_file_button = tk.Button(data_frame, text="Browse", command=lambda: select_data_file(new_data_file_entry),
                                 font=("Roboto", 12))
new_data_file_button.grid(row=3, column=1, pady=(0, 10), padx=5, sticky="e")

results_button = tk.Button(window, text="Show Results", command=show_results, font=("Roboto", 12),
                           bg="#333333", fg="#FFFFFF")
results_button.pack(pady=(0, 10), padx=20)

# Start the main loop
window.mainloop()
