In [None]:
## install fsearch package 
!python -m pip install -q -e .

: 

In [1]:

from fsearch.config import Config

# Construct a Config with its 'Server' field set to 'abc'.
config = Config(Server='abc')

In [2]:
from fsearch.utils import read_file

# Load the sample file once; the search comparison cell reuses `file_contents`.
filepath = "samples/200k.txt"
file_contents = read_file(filepath)

In [2]:
from fsearch.algorithms import native_search, regex_search, rabin_karp_search, kmp_search, aho_corasick_search

# Query under test. (A longer alternative that was previously assigned and then
# immediately overwritten: "13;0;1;26;0;9;4;0;")
query = "13;0;1;"

# Run every algorithm on the same input so the results can be compared line by line.
searchers = (
    ('native_search:', native_search),
    ('regex_search:', regex_search),
    ('rabin_karp:', rabin_karp_search),
    ('kmp_search:', kmp_search),
    ('aho_corasick:', aho_corasick_search),
)
for label, search in searchers:
    found = search(file_contents, query)
    print(label, found)

native_search: False
regex_search: False
rabin_karp: False
kmp_search: False
aho_corasick: False


In [16]:
# Example usage
# Small in-memory sample: one semicolon-delimited record per line.
contents = """13;0;1;26;0;9;4;0;
11;0;23;16;0;19;5;0;
9;0;1;6;0;10;5;0;
11;0;23;11;0;19;5;0;
17;0;1;26;0;7;3;0;
3;0;1;28;0;7;3;0;
4;0;1;28;0;8;3;0;
5;0;1;26;0;8;3;0; 
"""

# query_1 is identical to the fourth line of `contents`.
query_1 = "11;0;23;11;0;19;5;0;"
# query_2 is that same line without its trailing ";0;", i.e. only a prefix of a line.
query_2 = "11;0;23;11;0;19;5"

In [17]:
def rabin_karp_search(pattern, text):
    """
    Rabin-Karp style search for a full-line match of a pattern in the text.

    The text is split on '\\n'. Only lines whose length equals the pattern's
    length are hashed; a hash match is confirmed with a direct string comparison.

    :param pattern: The pattern string to search for.
    :param text: The text in which to search for the pattern.
    :return: True if the pattern matches any full line in the text, otherwise False.
             An empty pattern or empty text always yields False.
    """
    if not pattern or not text:
        return False

    # Base of the polynomial hash.
    prime = 101
    # Reduce modulo a large (Mersenne) prime so the hash stays a small integer
    # instead of growing by several bits for every character of the line.
    modulus = (1 << 61) - 1

    def polynomial_hash(chars):
        """Return the polynomial hash of `chars` (base `prime`, modulo `modulus`)."""
        value = 0
        for ch in chars:
            value = (prime * value + ord(ch)) % modulus
        return value

    m = len(pattern)
    pattern_hash = polynomial_hash(pattern)

    for line in text.split('\n'):
        # A full-line match requires equal lengths; skip everything else cheaply.
        if len(line) != m:
            continue

        # Equal hashes may still be a collision, so confirm with an exact comparison.
        if polynomial_hash(line) == pattern_hash and line == pattern:
            return True

    return False


# query_1 is passed as the pattern and `contents` as the text (pattern-first argument order).
print('rabin_karp (q1):', rabin_karp_search(query_1, contents))  # Output: True
print('rabin_karp (q2):', rabin_karp_search(query_2, contents))  # Output: False


rabin_karp (q1): True
rabin_karp (q2): False


In [18]:
def kmp_search(pattern, text):
    """
    KMP algorithm to find a full line match of a pattern in the text.

    The text is split on '\\n' and each line is compared against the pattern.
    The LPS (failure) table depends only on the pattern, so it is built once
    and reused for every line.

    :param pattern: The pattern string to search for.
    :param text: The text in which to search for the pattern.
    :return: True if the pattern matches any full line in the text, otherwise False.
             An empty pattern never matches.
    """
    def compute_lps(pattern):
        """
        Compute the longest prefix which is also suffix array (lps) for the pattern.

        :param pattern: The pattern string.
        :return: The lps array (one entry per pattern character).
        """
        m = len(pattern)
        lps = [0] * m
        length = 0  # length of the current longest prefix that is also a suffix
        i = 1

        while i < m:
            if pattern[i] == pattern[length]:
                length += 1
                lps[i] = length
                i += 1
            elif length != 0:
                # Fall back to the next shorter border without advancing i.
                length = lps[length - 1]
            else:
                lps[i] = 0
                i += 1

        return lps

    def kmp_search_line(pattern, line, lps):
        """
        KMP search for a pattern in a single line.

        :param pattern: The pattern string.
        :param line: The line of text.
        :param lps: The precomputed lps array of the pattern.
        :return: True if the pattern matches the full line, otherwise False.
        """
        m = len(pattern)
        n = len(line)

        # Only lines of exactly the pattern's length can be a full-line match.
        if m != n:
            return False

        i = 0  # index for line
        j = 0  # index for pattern

        while i < n:
            if pattern[j] == line[i]:
                i += 1
                j += 1

            if j == m:
                return True
            elif i < n and pattern[j] != line[i]:
                if j != 0:
                    j = lps[j - 1]
                else:
                    i += 1

        return False

    # Hoisted out of the per-line loop: the table is identical for every line.
    lps = compute_lps(pattern)

    for line in text.split('\n'):
        if kmp_search_line(pattern, line, lps):
            return True

    return False

# Example usage
# query_1 is passed as the pattern and `contents` as the text (pattern-first argument order).
print('kmp_search (q1):', kmp_search(query_1, contents))  # Output: True
print('kmp_search (q2):', kmp_search(query_2, contents))  # Output: False

kmp_search (q1): True
kmp_search (q2): False


In [21]:
class AhoCorasick:
    """
    Aho-Corasick automaton for finding occurrences of registered patterns in a text.

    Usage: call add_pattern() for every pattern, then build_automaton() once,
    then search(). build_automaton() must be called again if patterns are added later.
    """

    def __init__(self):
        # (state, char) -> next state; state 0 is the root of the trie.
        self.goto = {}
        # state -> list of patterns that end when this state is reached
        # (the state's own pattern plus any patterns inherited via failure links).
        self.output = {}
        # state -> state to fall back to when no transition exists.
        self.fail = {}
        # Highest state id allocated so far.
        self.new_state = 0

    def add_pattern(self, pattern):
        """
        Insert a pattern into the trie.

        :param pattern: The pattern string to register. Empty patterns are ignored.
        """
        if not pattern:
            return

        state = 0
        for char in pattern:
            if (state, char) not in self.goto:
                self.new_state += 1
                self.goto[(state, char)] = self.new_state
            state = self.goto[(state, char)]

        patterns = self.output.setdefault(state, [])
        if pattern not in patterns:
            patterns.append(pattern)

    def build_automaton(self):
        """
        Compute the failure links and propagate outputs along them.

        A state keeps its own pattern and additionally inherits every pattern
        recognised by its failure state, so a pattern that is a suffix of
        another one (e.g. "cd" and "abcd") no longer replaces the longer one.
        """
        from collections import defaultdict, deque

        # Index outgoing transitions per state once, instead of scanning the
        # whole transition table for every state visited.
        children = defaultdict(list)
        for (source, char), target in self.goto.items():
            children[source].append((char, target))

        queue = deque()
        for _, state in children[0]:
            self.fail[state] = 0
            queue.append(state)

        # Breadth-first: a state's failure target is always shallower, so its
        # output list is already complete when it is inherited below.
        while queue:
            r = queue.popleft()
            for char, s in children[r]:
                queue.append(s)
                state = self.fail[r]
                while (state, char) not in self.goto and state != 0:
                    state = self.fail[state]
                self.fail[s] = self.goto.get((state, char), 0)

                inherited = self.output.get(self.fail[s])
                if inherited:
                    own = self.output.setdefault(s, [])
                    for pattern in inherited:
                        if pattern not in own:
                            own.append(pattern)

    def search(self, text):
        """
        Find all occurrences of the registered patterns in the text.

        :param text: The text to scan.
        :return: A list of (start_index, pattern) tuples in order of match end position.
        """
        state = 0
        results = []
        for index, char in enumerate(text):
            while (state, char) not in self.goto and state != 0:
                state = self.fail[state]
            # No transition even from the root: stay at the root.
            state = self.goto.get((state, char), 0)
            for pattern in self.output.get(state, ()):
                results.append((index - len(pattern) + 1, pattern))
        return results

def aho_corasick_search(pattern, text):
    """
    Full-line search built on the AhoCorasick automaton.

    An automaton containing only `pattern` is built, the text is split on
    '\\n', and every line whose length equals the pattern's length is scanned.

    :param pattern: The pattern string to search for.
    :param text: The text in which to search for the pattern.
    :return: True if the pattern matches any full line in the text, otherwise False.
    """
    automaton = AhoCorasick()
    automaton.add_pattern(pattern)
    automaton.build_automaton()

    wanted_length = len(pattern)
    for candidate in text.split('\n'):
        # Lines of a different length cannot equal the pattern.
        if len(candidate) != wanted_length:
            continue
        if any(hit == pattern for _, hit in automaton.search(candidate)):
            return True

    return False

# Example usage
# query_1 is passed as the pattern and `contents` as the text (pattern-first argument order).
print('aho_corasick_search (q1):', aho_corasick_search(query_1, contents))  # Output: True
print('aho_corasick_search (q2):', aho_corasick_search(query_2, contents))  # Output: False

aho_corasick_search (q1): True
aho_corasick_search (q2): False


In [1]:
from fsearch.utils import benchmark_algorithms

file_path = "samples/200k.txt"
report_path = 'reports/benchmark.pdf'

# Uses the packaged benchmark_algorithms imported above, called here with a single file path.
benchmark_algorithms(file_path, report_path)



Algorithm            Time (seconds) 
-----------------------------------
Regex Search         0.018028       
Native Search        0.187474       
Rabin-Karp Search    0.282420       
Aho-Corasick Search  0.328304       
KMP Search           0.403537       
----------------------------------- 


Benchmark report saved to reports/benchmark.pdf


In [6]:
import os
import matplotlib.pyplot as plt
from io import BytesIO
import timeit
import weasyprint
import base64
import pandas as pd
from fsearch.algorithms import regex_search, rabin_karp_search, kmp_search, aho_corasick_search, native_search

def read_file(file_path: str) -> str:
    """Return the entire content of the text file at `file_path` as one string."""
    with open(file_path, 'r') as handle:
        contents = handle.read()
    return contents


In [23]:
import os
import random
import timeit
from typing import List, Dict
from io import BytesIO
import matplotlib.pyplot as plt
import weasyprint
import base64

# HTML skeleton of the PDF benchmark report. It is filled in with str.format():
# `{table_str}` and `{img_str}` are the placeholders (img_str is embedded as a
# base64 PNG data URI); the doubled braces in the CSS are literal braces escaped
# for str.format().
report_template : str = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Benchmark Results</title>
        <style>
            body {{
                font-family: Arial, sans-serif;
                margin: 5px;
            }}
            .header {{
                text-align: center;
                margin-bottom: 50px;
            }}
            .plot {{
                text-align: center;
                margin: 20px 5px;
            }}
            .table {{
                margin: 10px;
            }}
            
        </style>
    </head>
    <body>
        <div class="header">
            <h1>Benchmarking Search Algorithms</h1>
        </div>
        <div class="table">
            <pre>{table_str}</pre>
        </div>
        <div class="plot">
            <img src="data:image/png;base64,{img_str}" alt="Benchmark Plot">
        </div>
    </body>
    </html>
"""


def generate_samples(file_path: str, size: int = 10) -> List[str]:
    """
    Sample random lines from a file.

    Args:
        - file_path (str): Path to the file.
        - size (int): Number of lines to sample. Defaults to 10. Values larger
          than the number of lines are reduced to that number.

    Returns:
    list: A list of sampled lines (without their newline). Empty if the file
    contains no lines.

    Raises:
        ValueError: If size is negative (raised by random.sample).
    """
    lines = read_file(file_path).split("\n")

    # A file ending with a newline makes split() produce one trailing "" that is
    # not a real line; drop it so it is never sampled as an empty search pattern.
    if lines[-1] == "":
        lines.pop()

    size = min(size, len(lines))
    return random.sample(lines, size)

def plot_benchmarks(results: Dict[str, Dict[str, float]]) -> BytesIO:
    """
    Plots a grouped bar chart for the benchmark results and returns the BytesIO object.

    Args:
        results (dict): A dictionary containing the algorithm names as keys and another dictionary as values,
                        where the keys are file sizes and the values are execution times.
                        The file sizes of the first algorithm define the groups; every
                        algorithm must provide a value for each of them.

    Returns:
        BytesIO: The BytesIO object containing the plot image (PNG), positioned at its start.

    Raises:
        IndexError: If results is empty.
        KeyError: If an algorithm has no entry for one of the file sizes.
    """
    algorithms = list(results.keys())
    file_sizes = list(results[algorithms[0]].keys())

    fig, ax = plt.subplots(figsize=(7, 6))
    width = 0.15
    x = range(len(file_sizes))

    # One bar per algorithm inside each file-size group, shifted by one bar width each.
    for i, algorithm in enumerate(algorithms):
        times = [results[algorithm][file_size] for file_size in file_sizes]
        ax.bar([pos + i * width for pos in x], times, width, label=algorithm)

    ax.set_xlabel('File Size')
    ax.set_ylabel('Time (seconds)')
    ax.set_title('Benchmark of Search Algorithms')
    # Bars are centred on pos, pos + width, ..., pos + (k - 1) * width, so the
    # middle of a group of k bars is pos + width * (k - 1) / 2.
    ax.set_xticks([pos + width * (len(algorithms) - 1) / 2 for pos in x])
    ax.set_xticklabels(file_sizes)
    ax.legend()

    # Work on this figure explicitly rather than on pyplot's implicit "current" figure.
    buffer = BytesIO()
    fig.tight_layout()
    fig.savefig(buffer, format='png')
    buffer.seek(0)
    plt.close(fig)

    return buffer

def print_benchmarks(results: Dict[str, Dict[str, float]]) -> str:
    """
    Render the benchmark results as a fixed-width text table, print it and return it.

    The columns are: algorithm name (20 characters wide), one column per file
    size taken from the first algorithm's entry, and the per-algorithm average
    (15 characters wide each). Times are shown with six decimals.

    Args:
        results (dict[str, dict[str, float]]): A dictionary with algorithm names as keys and dictionaries of file sizes and times as values.

    Returns:
    str: The table string representation of the results.
    """
    first_timings = next(iter(results.values()))
    file_sizes = list(first_timings.keys())
    headers = ["Algorithm", *file_sizes, "Average"]

    value_columns = len(headers) - 1
    row_format = "{:<20}" + "{:<15}" * value_columns
    rule = "-" * (20 + 15 * value_columns) + "\n"

    pieces = [row_format.format(*headers) + "\n", rule]
    for algorithm, times in results.items():
        avg_time = sum(times.values()) / len(times)
        cells = [f"{times[file_size]:.6f}" for file_size in file_sizes]
        pieces.append(row_format.format(algorithm, *cells, f"{avg_time:.6f}") + "\n")
    pieces.append(rule)

    table_str = "".join(pieces)
    print(table_str)
    return table_str

def benchmark_algorithms(file_paths: List[str], report_path: str, sample_size: int = 11):
    """
    Benchmarks the different search algorithms using the content of the specified files and patterns
    sampled from the files, then creates a PDF report with the plotted benchmark results using WeasyPrint.

    For every file, up to `sample_size` lines are sampled as patterns; each algorithm
    is timed once per pattern and the timings are averaged per file. Files are
    labelled by their number of lines. Files that cannot be read or benchmarked
    are reported and skipped.

    Args:
        file_paths (list): A list of paths to the search files. A single path
            given as a plain string is also accepted.
        report_path (str): The path the benchmark PDF report will be saved to.
        sample_size (int): Number of lines to sample for generating patterns
            (at least one pattern is always used).

    Returns:
        None
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(file_paths, str):
        file_paths = [file_paths]

    # NOTE(review): the algorithms are called as algorithm(text, pattern); the
    # rabin_karp_search / kmp_search / aho_corasick_search defined in this notebook
    # take (pattern, text). Presumably the fsearch.algorithms versions are the ones
    # meant to be bound here - confirm before relying on the numbers.
    algorithms = {
        'Native Search': native_search,
        'Rabin-Karp Search': rabin_karp_search,
        'KMP Search': kmp_search,
        'Aho-Corasick Search': aho_corasick_search,
        'Regex Search': regex_search
    }

    results = {algorithm: {} for algorithm in algorithms.keys()}

    for file_path in file_paths:
        try:
            text = read_file(file_path)
            # Label the file by its line count; the handle is closed deterministically.
            with open(file_path, 'rb') as handle:
                file_size_label = sum(1 for _ in handle)
            patterns = generate_samples(file_path, max(1, sample_size))

            # Collect this file's timings separately and merge them only when every
            # run succeeded, so all algorithms always cover the same set of files.
            file_timings = {name: [] for name in algorithms}
            for pattern in patterns:
                for name, algorithm in algorithms.items():
                    timer = timeit.Timer(lambda: algorithm(text, pattern))
                    file_timings[name].append(timer.timeit(number=1))  # one timed run per pattern

            for name, timings in file_timings.items():
                if timings:
                    results[name].setdefault(file_size_label, []).extend(timings)

        except FileNotFoundError:
            print(f"File at path {file_path} not found.")
        except Exception as e:
            print(f"An error occurred with file {file_path}: {e}")

    # Nothing measured (e.g. every file was missing): averaging would divide by zero.
    if not any(results.values()):
        print("No benchmark results were collected; no report was generated.")
        return

    avg_results = {algorithm: {file_size: sum(times) / len(times) for file_size, times in result.items()} for algorithm, result in results.items()}
    # Fastest algorithm (lowest mean over all files) first.
    sorted_results = dict(sorted(avg_results.items(), key=lambda item: sum(item[1].values()) / len(item[1].values())))

    # Pretty print the results
    table_str = print_benchmarks(sorted_results)

    # Plot the results
    plot_img = plot_benchmarks(sorted_results)
    img_str = base64.b64encode(plot_img.read()).decode('utf-8')

    # Use a distinct local name: assigning to `report_template` here would make it
    # a local variable and raise UnboundLocalError when the template is read.
    report_html = report_template.format(table_str=table_str, img_str=img_str)
    weasyprint.HTML(string=report_html).write_pdf(report_path)
    print(f"Benchmark report saved to {report_path}")




In [22]:
# Example usage
# Benchmark two sample files and write the PDF report into the working directory.
file_paths = [
    "samples/200k.txt",
    "samples/400k.txt"
]
# NOTE(review): with the import below commented out, the call uses whichever
# benchmark_algorithms is currently bound in the kernel - confirm which one is intended.
#from fsearch.utils import benchmark_algorithms

report_path = 'benchmark_results.pdf'

# Arguments: file paths, report path, sample_size.
benchmark_algorithms(file_paths, report_path, 1)

samples/200k.txt
samples/400k.txt
Algorithm           271100         813300         Average        
-----------------------------------------------------------------
Regex Search        0.003694       0.001705       0.002699       
Native Search       0.019504       0.065433       0.042469       
Rabin-Karp Search   0.038471       0.062112       0.050291       
KMP Search          0.064064       0.068926       0.066495       
Aho-Corasick Search 0.057100       0.076279       0.066689       
-----------------------------------------------------------------

Benchmark report saved to benchmark_results.pdf


In [5]:
import os
# Print the path of every entry in the current working directory
# (os.scandir() without an argument scans '.').
for file in os.scandir():
    print(file.path)

./setup.py
./LICENSE
./README.md
./Makefile
./notebook.ipynb
./Archive.zip
./.git
./samples
./fsearch.egg-info
./.certs
./__pycache__
./fsearch
./.gitignore
./reports
./config.ini
./tests
./pytest.ini
./conftest.py
./requirements.txt
./.vscode
./.pytest_cache
./.coverage
./client.py


In [11]:
import itertools

# Scratch checks; only the value of the last expression is displayed by the notebook.
list(itertools.accumulate(range(10), pow))  # running pow() over 0..9
list(map(pow, range(10), itertools.repeat(2)))  # squares of 0..9
len(list(range(10000, 1000000, 100000)))  # how many values a step of 100000 yields from 10000 up to (excluding) 1000000

10

In [6]:
def loop_with_accumulate(start: int = 10000, end: int = 1000000, steps: int = 10, repeats: int = 10):
    """
    Builds the sequence from start towards end in `steps` values using
    itertools.accumulate, repeats it `repeats` times, prints and returns the result.

    The increment is (end - start) // (steps - 1), so when the range is not evenly
    divisible the last value of each repetition is below `end`.

    Args:
        start (int): The starting number. Defaults to 10,000.
        end (int): The ending number. Defaults to 1,000,000.
        steps (int): The number of values per repetition. Defaults to 10.
        repeats (int): How many times the sequence is repeated. Defaults to 10.

    Returns:
        list: The concatenation of all repetitions.

    Raises:
        ValueError: If steps is smaller than 1.
    """
    if steps < 1:
        raise ValueError("steps must be at least 1")

    # A single step has no increment; computing one would divide by zero.
    if steps == 1:
        increments = []
    else:
        step_size = (end - start) // (steps - 1)
        increments = [step_size] * (steps - 1)

    out = []
    for _ in range(repeats):
        # accumulate() turns [start, inc, inc, ...] into running sums.
        out.extend(itertools.accumulate([start] + increments))
    print(out)
    return out
# Example usage
# With the default arguments the result holds 10 repetitions of a 10-value sequence.
x = loop_with_accumulate()
print(len(x))

[10000, 120000, 230000, 340000, 450000, 560000, 670000, 780000, 890000, 1000000, 10000, 120000, 230000, 340000, 450000, 560000, 670000, 780000, 890000, 1000000, 10000, 120000, 230000, 340000, 450000, 560000, 670000, 780000, 890000, 1000000, 10000, 120000, 230000, 340000, 450000, 560000, 670000, 780000, 890000, 1000000, 10000, 120000, 230000, 340000, 450000, 560000, 670000, 780000, 890000, 1000000, 10000, 120000, 230000, 340000, 450000, 560000, 670000, 780000, 890000, 1000000, 10000, 120000, 230000, 340000, 450000, 560000, 670000, 780000, 890000, 1000000, 10000, 120000, 230000, 340000, 450000, 560000, 670000, 780000, 890000, 1000000, 10000, 120000, 230000, 340000, 450000, 560000, 670000, 780000, 890000, 1000000, 10000, 120000, 230000, 340000, 450000, 560000, 670000, 780000, 890000, 1000000]
100


In [None]:
Write a function that prints this Python value
```
{
    '10000-kb': {
        10: {38.31242800151813, 63.6}
    },
    '20000-kb': {10: {33.3, 60.6}},
}
```
as a table string in the format shown below:

Queries| 10000-kb  | 20000-kb   |
------- ----------- ------------
10     | 38.3, 63.6| 33.3, 60.6 |

In [12]:
def format_dict_to_table(data: dict) -> str:
    """
    Render a nested dictionary as a plain-text table.

    The top-level keys become the columns (after a leading 'Queries' column),
    the keys of the inner dictionaries become the rows, and each cell lists the
    sorted values for that row/column with one decimal, separated by ', '.
    Cells without values are left empty. Columns are padded to a common width
    and joined with ' | '.

    Args:
        data (dict): The dictionary to format.

    Returns:
        str: The formatted table string, one line per row.
    """
    column_keys = list(data.keys())
    header = ['Queries'] + column_keys

    # Every query id that occurs under any column, in ascending order.
    query_ids = set()
    for per_query in data.values():
        query_ids.update(per_query.keys())

    grid = [header]
    for query_id in sorted(query_ids):
        cells = [str(query_id)]
        for column_key in column_keys:
            values = data[column_key].get(query_id)
            if values:
                cells.append(', '.join(f"{value:.1f}" for value in sorted(values)))
            else:
                cells.append('')
        grid.append(cells)

    # Width of a column = its longest cell, header included.
    widths = [max(len(str(cell)) for cell in column) for column in zip(*grid)]

    rendered = []
    for cells in grid:
        padded = (str(cell).ljust(width) for cell, width in zip(cells, widths))
        rendered.append(" | ".join(padded) + "\n")

    return "".join(rendered)

# Example usage
# Each top-level key becomes a column; each inner key (here 10) becomes a row.
data = {
    '10000-kb': {10: {38.31242800151813, 63.6}},
    '20000-kb': {10: {33.3, 60.6}},
}

table_str = format_dict_to_table(data)
print(table_str)

Queries | 10000-kb   | 20000-kb  
10      | 38.3, 63.6 | 33.3, 60.6



In [15]:
def format_dict_to_table(data: dict) -> str:
    """
    Render a nested dictionary as a plain-text table with a dashed rule under the header.

    The top-level keys become the columns (after a leading 'Queries' column) and
    the keys of the inner dictionaries become the rows. Each cell lists the sorted
    values for that row/column with one decimal, joined with ' | ' - the same
    string that separates the columns, so multi-value cells look like extra columns.

    Args:
        data (dict): The dictionary to format.

    Returns:
        str: The formatted table string: header line, rule line, then one line per row.
    """
    column_keys = list(data.keys())

    def cell_text(column_key, query_id):
        """Return the formatted values for one cell, or '' when there are none."""
        values = data[column_key].get(query_id)
        if not values:
            return ''
        return ' | '.join(f"{value:.1f}" for value in sorted(values))

    # Every query id that occurs under any column, in ascending order.
    query_ids = sorted({query_id for per_query in data.values() for query_id in per_query.keys()})

    header = ['Queries'] + column_keys
    body = [
        [str(query_id)] + [cell_text(column_key, query_id) for column_key in column_keys]
        for query_id in query_ids
    ]

    # Width of a column = its longest cell, header included.
    widths = [max(len(str(cell)) for cell in column) for column in zip(header, *body)]

    def render(cells):
        """Pad every cell to its column width and join the cells into one line."""
        return " | ".join(str(cell).ljust(width) for cell, width in zip(cells, widths)) + "\n"

    lines = [render(header), " | ".join('-' * width for width in widths) + "\n"]
    lines.extend(render(cells) for cells in body)
    return "".join(lines)

# Example usage
# Same input as the previous example; this version adds a dashed rule under the header.
data = {
    '10000-kb': {10: {38.31242800151813, 63.6}},
    '20000-kb': {10: {33.3, 60.6}},
}

table_str = format_dict_to_table(data)
print(table_str)


Queries | 10000-kb    | 20000-kb   
------- | ----------- | -----------
10      | 38.3 | 63.6 | 33.3 | 60.6



In [19]:
# Scratch check: range(1) yields a single value, 0.
list(range(1))

[0]

In [None]:
import configparser

def write_config(config_path: str, configs):
    """
    Store the given settings in the 'DEFAULT' section of an INI-style file.

    Parameters:
        config_path : str
            The file path where the configuration should be written. The file
            is created or overwritten.

        configs : Dict[str, str]
            A dictionary containing configuration key-value pairs to be written
            under the 'DEFAULT' section of the configuration file.

    Returns:
        None
    """
    parser = configparser.ConfigParser()
    # Loading the mapping as the DEFAULT section is what item assignment does internally.
    parser.read_dict({"DEFAULT": configs})

    with open(config_path, "w") as target:
        parser.write(target)

# Sample settings; the values are a mix of int, str and bool.
c = {"host": 1234,"port": 8080, "linux_path":'jimmy', "REREAD_ON_QUERY": False}
# NOTE(review): the write below is commented out, so read_config presumably relies
# on a '.test-conf' file left over from an earlier run - confirm.
#write_config('.test-conf', c)
from fsearch.utils import read_config
read_config('.test-conf')

: 