In [5]:
import tkinter as tk
from tkinter import filedialog
from tkinter import ttk
import re
from datetime import datetime

def extract_lines():
    user_condition = condition_entry.get()
    pattern = rf'(\w+ \d+ \d+:\d+:\d+ .*{re.escape(user_condition)}.*)'

    file_path = filedialog.askopenfilename(filetypes=[('Text Files', '*.txt')])

    if file_path:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            data = file.read()
            matches = re.findall(pattern, data)
            matches = sorted(matches, key=lambda x: extract_timestamp(x))

            treeview.delete(*treeview.get_children())

            start_time_str = start_time_entry.get()
            end_time_str = end_time_entry.get()

            start_time = datetime.strptime(start_time_str, '%b %d %H:%M:%S')
            end_time = datetime.strptime(end_time_str, '%b %d %H:%M:%S')

            error_count = 0 

            for match in matches:
                timestamp = extract_timestamp(match)
                line = match.replace(timestamp, '', 1).strip()
                line_time = datetime.strptime(timestamp, '%b %d %H:%M:%S')
                if start_time <= line_time <= end_time:
                    if 'error' in line.lower(): 
                        treeview.insert('', tk.END, values=(timestamp, line), tags=('error',))
                        error_count += 1
                    else:
                        treeview.insert('', tk.END, values=(timestamp, line))

            count_label.config(text=f"Matches Found: {len(treeview.get_children())}")
            error_count_label.config(text=f"Error Count: {error_count}")

            file_name_label.config(text=f"Selected File: {file_path}")


def clear_results():
    treeview.delete(*treeview.get_children())
    count_label.config(text="Matches Found: 0")
    error_count_label.config(text="Error Count: 0")

def export_results():
    file_path = filedialog.asksaveasfilename(defaultextension='.txt', filetypes=[('Text Files', '*.txt')])

    if file_path:
        with open(file_path, 'w') as file:
            for item in treeview.get_children():
                line = treeview.item(item)['values'][1] + '\n'
                file.write(line)

def toggle_case_sensitivity():
    global case_sensitive
    case_sensitive = not case_sensitive
    case_sensitivity_button.config(text=f"Case Sensitivity: {'On' if case_sensitive else 'Off'}")

def extract_timestamp(line):
    timestamp = re.search(r'^(\w+ \d+ \d+:\d+:\d+)', line)
    if timestamp:
        return timestamp.group(1)
    return ''

window = tk.Tk()
window.title("LogExtractor")

# Styling options
window.geometry("800x600")
window.config(bg="#F0F0F0")

# Title label
title_label = tk.Label(window, text="LogExtractor", font=("Arial", 18, "bold"), pady=10, bg="#F0F0F0")
title_label.pack()

# Separator
separator = ttk.Separator(window, orient=tk.HORIZONTAL)
separator.pack(fill=tk.X, pady=10)

# Frame for inputs
input_frame = tk.Frame(window, bg="#F0F0F0")
input_frame.pack(pady=10)

# Labels and entry boxes
condition_label = tk.Label(input_frame, text="Enter the condition:", font=("Arial", 12), bg="#F0F0F0")
condition_label.grid(row=0, column=0, sticky='w', padx=10, pady=5)

condition_entry = tk.Entry(input_frame, font=("Arial", 12))
condition_entry.grid(row=0, column=1, padx=10, pady=5)

start_time_label = tk.Label(input_frame, text="Start Timestamp:", font=("Arial", 12), bg="#F0F0F0")
start_time_label.grid(row=1, column=0, sticky='w', padx=10, pady=5)

start_time_entry = tk.Entry(input_frame, font=("Arial", 12))
start_time_entry.grid(row=1, column=1, padx=10, pady=5)

end_time_label = tk.Label(input_frame, text="End Timestamp:", font=("Arial", 12), bg="#F0F0F0")
end_time_label.grid(row=2, column=0, sticky='w', padx=10, pady=5)

end_time_entry = tk.Entry(input_frame, font=("Arial", 12))
end_time_entry.grid(row=2, column=1, padx=10, pady=5)

# Select File button
select_file_button = tk.Button(window, text="Select File", command=extract_lines, font=("Arial", 12), bg="#4287f5", fg="white")
select_file_button.pack(pady=10)

# Frame for treeview
content_frame = tk.Frame(window)
content_frame.pack(fill=tk.BOTH, expand=True)

treeview = ttk.Treeview(content_frame, columns=('Timestamp', 'Line'), show='headings', selectmode='browse')
treeview.heading('Timestamp', text='Timestamp', anchor='w')
treeview.heading('Line', text='Line', anchor='w')

treeview.column('Timestamp', width=200, anchor='w')
treeview.column('Line', width=800, anchor='w')

treeview.tag_configure("error", background="yellow")  # Configure tag for error messages

scrollbar = ttk.Scrollbar(content_frame, orient=tk.VERTICAL, command=treeview.yview)
treeview.configure(yscroll=scrollbar.set)

treeview.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

count_label = tk.Label(window, text="Matches Found: 0", font=("Arial", 12), bg="#F0F0F0")
count_label.pack()

error_count_label = tk.Label(window, text="Error Count: 0", font=("Arial", 12), bg="#F0F0F0")
error_count_label.pack()
file_name_label = tk.Label(window, text="", font=("Arial", 12), bg="#F0F0F0")
file_name_label.pack()
select_file_button.pack(pady=(0, 10))
clear_results_button = tk.Button(window, text="Clear Results", command=clear_results, font=("Arial", 12))
clear_results_button.pack(pady=5)

export_results_button = tk.Button(window, text="Export Results", command=export_results, font=("Arial", 12))
export_results_button.pack(pady=5)


case_sensitive = False
case_sensitivity_button = tk.Button(window, text="Case Sensitivity: Off", command=toggle_case_sensitivity, font=("Arial", 12))
case_sensitivity_button.pack(pady=5)

window.mainloop()


In [5]:
import re

data = []

with open('30th.txt', 'r') as file:
    for line in file:
        line = line.strip()
        # Ignore empty lines
        if not line:
            continue

        # Ignore date and time
        if re.match(r'\w{3} \d{2} \d{2}:\d{2}:\d{2}', line):
            content = re.sub(r'^\w{3} \d{2} \d{2}:\d{2}:\d{2} [\w.-]+ kernel:', '', line).strip()
            if content:
                data.append(content)

print(data)




In [1]:
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data Preprocessing
data = []

with open('30th.txt', 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue

        if re.match(r'\w{3} \d{2} \d{2}:\d{2}:\d{2}', line):
            content = re.sub(r'^\w{3} \d{2} \d{2}:\d{2}:\d{2} [\w.-]+ kernel:', '', line).strip()
            if content:
                data.append(content)

# Model Training
if len(data) > 0:
    nlp = spacy.load('en_core_web_sm')
    tokens = []

    for doc in nlp.pipe(data):
        doc_tokens = [token.text for token in doc]
        tokens.append(' '.join(doc_tokens))

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(tokens)

    # New Data
    new_data = [
        'pci 0000:00:17.0: reg 0x14:pci 0000:00:17.0: reg 0x14: [mem 0x92b1e000-0x92b1e0ff] '
    ]

    new_tokens = []

    for doc in nlp.pipe(new_data):
        doc_tokens = [token.text for token in doc]
        new_tokens.append(' '.join(doc_tokens))

    new_tfidf_matrix = vectorizer.transform(new_tokens)

    # Calculate Similarity
    similarity_scores = cosine_similarity(new_tfidf_matrix, tfidf_matrix)

    threshold = 0.5  # Adjust the threshold based on your requirements

    for i, data_point in enumerate(new_data):
        max_similarity = max(similarity_scores[i])
        if max_similarity >= threshold:
            index = similarity_scores[i].argmax()
            matched_data = data[index]
            if data_point != matched_data:
                print(f"New Data: {data_point}\nSimilar Data: {matched_data}\nSimilarity Score: {max_similarity}\n")
        else:
            print(f"New Data: {data_point}\nNo similar data found.\n")

else:
    print("No data available for training.")


New Data: pci 0000:00:17.0: reg 0x14:pci 0000:00:17.0: reg 0x14: [mem 0x92b1e000-0x92b1e0ff] 
Similar Data: pci 0000:00:17.0: reg 0x14: [mem 0x92b1e000-0x92b1e0ff]
Similarity Score: 0.9455300147064646



In [8]:
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data Preprocessing
data = []

with open('30th.txt', 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue
        
        if re.match(r'\w{3} \d{2} \d{2}:\d{2}:\d{2}', line):
            content = re.sub(r'^\w{3} \d{2} \d{2}:\d{2}:\d{2} [\w.-]+ kernel:', '', line).strip()
            if content:
                data.append(content)

# Model Training
if len(data) > 0:
    nlp = spacy.load('en_core_web_sm')
    tokens = []

    for doc in nlp.pipe(data):
        doc_tokens = [token.text for token in doc]
        tokens.append(' '.join(doc_tokens))

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(tokens)

    # New Data from File
    new_data_file = 'naya.txt'
    new_data = []

    with open(new_data_file, 'r') as file:
        for line in file:
            line = line.strip()
            # Ignore date and time
            if re.match(r'\w{3} \d{2} \d{2}:\d{2}:\d{2}', line):
                content = re.sub(r'^\w{3} \d{2} \d{2}:\d{2}:\d{2} [\w.-]+ kernel:', '', line).strip()
                if content:
                    new_data.append(content)

    new_tokens = []

    for doc in nlp.pipe(new_data):
        doc_tokens = [token.text for token in doc]
        new_tokens.append(' '.join(doc_tokens))

    new_tfidf_matrix = vectorizer.transform(new_tokens)

    # Calculate Similarity
    similarity_scores = cosine_similarity(new_tfidf_matrix, tfidf_matrix)

    threshold = 0.8  # Adjust the threshold based on your requirements

    for i, data_point in enumerate(new_data):
        max_similarity = max(similarity_scores[i])
        if max_similarity >= threshold:
            index = similarity_scores[i].argmax()
            matched_data = data[index]
            if data_point != matched_data:
                print(f"New Data: {data_point}\nSimilar Data: {matched_data}\nSimilarity Score: {max_similarity}\n")
        else:
            print(f"New Data: {data_point}\nNo similar data found.\n")

else:
    print("No data available for training.")


New Data: Jun 13 17:07:04 localhost org.fedoraproject.Anaconda.Modules.Timezone[2556]: INFO:LMAOanaconda.core.dbus:Connecting to the Anaconda bus at unix:abstract=/tmp/dbus-woT7MQpZch,guid=066c46e250c2c54fd301e7416488e887.
Similar Data: Jun 30 21:54:06 localhost org.fedoraproject.Anaconda.Modules.Timezone[2619]: INFO:anaconda.core.dbus:Connecting to the Anaconda bus at unix:abstract=/tmp/dbus-ppKPsT8VVl,guid=7eb9282b096a521a679487f162be61cc.
Similarity Score: 0.8024806352543983

New Data: Jun 13 17:07:04 localhost org.fedoraproject.Anaconda.Modules.Timezone[2556]: DEBUG:dasbus.connection:Registering a service name org.fedoraproject.Anaconda.Modules.Timezone.
Similar Data: Jun 30 21:54:06 localhost org.fedoraproject.Anaconda.Modules.Timezone[2619]: DEBUG:dasbus.connection:Registering a service name org.fedoraproject.Anaconda.Modules.Timezone.
Similarity Score: 0.8355566468892596

New Data: Jun 13 17:07:04 localhost dbus-daemon[2556]: Successfully activated service 'org.fedoraproject.Ana

In [34]:
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tkinter import filedialog
from tkinter import *

# Function to handle button click event
def process_data():
    # Get the selected file paths
    data_file_path = data_file_entry.get()
    new_data_file_path = new_data_file_entry.get()

    # Data Preprocessing
    data = []

    with open(data_file_path, 'r',encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue

            if re.match(r'\w{3} \d{2} \d{2}:\d{2}:\d{2}', line):
                content = re.sub(r'^\w{3} \d{2} \d{2}:\d{2}:\d{2} [\w.-]+ kernel:', '', line).strip()
            if content:
                data.append(content)


    # Model Training
    if len(data) > 0:
        nlp = spacy.load('en_core_web_sm')
        tokens = []

        for doc in nlp.pipe(data):
            doc_tokens = [token.text for token in doc]
            tokens.append(' '.join(doc_tokens))

        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(tokens)

        # New Data from File
        new_data = []

        with open(new_data_file_path, 'r',encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if re.match(r'\w{3} \d{2} \d{2}:\d{2}:\d{2}', line):
                    continue

            # Extract content after ignoring date and time
                content = re.sub(r'\w{3} \d{2} \d{2}:\d{2}:\d{2} \w+ kernel:', '', line).strip()
                if content:
                    new_data.append(content)

        new_tokens = []

        for doc in nlp.pipe(new_data):
            doc_tokens = [token.text for token in doc]
            new_tokens.append(' '.join(doc_tokens))

        new_tfidf_matrix = vectorizer.transform(new_tokens)

        # Calculate Similarity
        similarity_scores = cosine_similarity(new_tfidf_matrix, tfidf_matrix)

        threshold = 0.5  # Adjust the threshold based on your requirements

        output_text.delete(1.0, END)  # Clear the output text

        for i, data_point in enumerate(new_data):
            max_similarity = max(similarity_scores[i])
            if max_similarity >= threshold:
                index = similarity_scores[i].argmax()
                matched_data = data[index]
                if data_point != matched_data:
                    output_text.insert(END, f"New Data: {data_point}\nSimilar Data: {matched_data}\nSimilarity Score: {max_similarity}\n\n")
            else:
                output_text.insert(END, f"New Data: {data_point}\nNo similar data found.\n\n")

    else:
        output_text.delete(1.0, END)  # Clear the output text
        output_text.insert(END, "No data available for training.")

# Create the GUI window
window = Tk()
window.title("Text Similarity")

def select_file(entry):
    file_path = filedialog.askopenfilename(initialdir="./", title="Select File", filetypes=(("Text Files", "*.txt"), ("All Files", "*.*")))
    entry.delete(0, END)
    entry.insert(END, file_path)

Label(window, text="Training Data File:").grid(row=0, column=0, sticky=W)
data_file_entry = Entry(window, width=50)
data_file_entry.grid(row=0, column=1, padx=5, pady=5)
Button(window, text="Browse", command=lambda: select_file(data_file_entry)).grid(row=0, column=2, padx=5, pady=5)

Label(window, text="New Data File:").grid(row=1, column=0, sticky=W)
new_data_file_entry = Entry(window, width=50)
new_data_file_entry.grid(row=1, column=1, padx=5, pady=5)
Button(window, text="Browse", command=lambda: select_file(new_data_file_entry)).grid(row=1, column=2, padx=5, pady=5)

Button(window, text="Process Data", command=process_data).grid(row=2, column=1, padx=5, pady=10)

Label(window, text="Output:").grid(row=3, column=0, sticky=W)
output_text = Text(window, height=10, width=60)
output_text.grid(row=3, column=1, padx=5, pady=5)
window.mainloop()


Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\shiva\AppData\Local\Programs\Python\Python310\lib\tkinter\__init__.py", line 1921, in __call__
    return self.func(*args)
  File "C:\Users\shiva\AppData\Local\Temp\ipykernel_4944\786866320.py", line 61, in process_data
    new_tfidf_matrix = vectorizer.transform(new_tokens)
  File "C:\Users\shiva\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_extraction\text.py", line 2104, in transform
    return self._tfidf.transform(X, copy=False)
  File "C:\Users\shiva\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\feature_extraction\text.py", line 1669, in transform
    X = self._validate_data(
  File "C:\Users\shiva\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 577, in _validate_data
    X = check_array(X, input_name="X", **check_params)
  File "C:\Users\shiva\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\vali

New Data: Jun 13 17:07:04 localhost org.fedoraproject.Anaconda.Modules.Timezone[2556]: INFO:LMAOanaconda.core.dbus:Connecting to the Anaconda bus at unix:abstract=/tmp/dbus-woT7MQpZch,guid=066c46e250c2c54fd301e7416488e887.
Similar Data: Jun 30 21:54:06 localhost org.fedoraproject.Anaconda.Modules.Timezone[2619]: INFO:anaconda.core.dbus:Connecting to the Anaconda bus at unix:abstract=/tmp/dbus-ppKPsT8VVl,guid=7eb9282b096a521a679487f162be61cc.
Similarity Score: 0.8024806352543983

New Data: Jun 13 17:07:04 localhost org.fedoraproject.Anaconda.Modules.Timezone[2556]: DEBUG:dasbus.connection:Registering a service name org.fedoraproject.Anaconda.Modules.Timezone.
Similar Data: Jun 30 21:54:06 localhost org.fedoraproject.Anaconda.Modules.Timezone[2619]: DEBUG:dasbus.connection:Registering a service name org.fedoraproject.Anaconda.Modules.Timezone.
Similarity Score: 0.8355566468892596

New Data: Jun 13 17:07:04 localhost dbus-daemon[2556]: Successfully activated service 'org.fedoraproject.Ana