### Get resources

In [1]:
import os
import sys

# Resolve the repository root (the parent of the working directory) and its
# `src` folder up front; both are needed on sys.path so that the `scripts`
# package and the modules under `src` can be imported by later cells.
repo_root = os.path.abspath('..')
src_path = os.path.abspath('../src')

# Prepend each directory only when it is missing, so re-running this cell
# does not pile up duplicate entries. `src_path` is inserted last and thus
# takes precedence over `repo_root` when both are newly added.
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Input data: everything lives in <repo>/dataset.
DATASET_DIR = os.path.abspath(os.path.join(os.getcwd(), '..', 'dataset'))
RULESETS_FILE, PATTERNS_FILE, SHORT_PATTERNS_FILE = (
    os.path.join(DATASET_DIR, filename)
    for filename in ('rulesets.txt', 'patterns.txt', '100_short_patterns.txt')
)

# Results produced by the matchers are written under <repo>/output.
OUTPUT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..', 'output'))

In [2]:
# Import the download helpers by name instead of `from downloads import *`,
# so it is clear where each function comes from and the kernel namespace is
# not filled with unrelated names. `download_dataset` is imported here
# because the dataset cell further down calls it.
from downloads import download_dataset, download_fdr

# Fetch the FDR source files from the Hyperscan repository (progress is
# printed per file).
download_fdr()

FDR String Matcher Setup

Downloading 122 files from Hyperscan repository...

[1/122] https://raw.githubusercontent.com/intel/hyperscan/master/src/fdr/fdr.c ... OK
[2/122] https://raw.githubusercontent.com/intel/hyperscan/master/src/fdr/fdr.h ... OK
[3/122] https://raw.githubusercontent.com/intel/hyperscan/master/src/fdr/fdr_internal.h ... OK
[4/122] https://raw.githubusercontent.com/intel/hyperscan/master/src/fdr/fdr_loadval.h ... OK
[5/122] https://raw.githubusercontent.com/intel/hyperscan/master/src/fdr/fdr_confirm.h ... OK
[6/122] https://raw.githubusercontent.com/intel/hyperscan/master/src/fdr/fdr_confirm_runtime.h ... OK
[7/122] https://raw.githubusercontent.com/intel/hyperscan/master/src/fdr/fdr_compile.cpp ... OK
[8/122] https://raw.githubusercontent.com/intel/hyperscan/master/src/fdr/fdr_compile.h ... OK
[9/122] https://raw.githubusercontent.com/intel/hyperscan/master/src/fdr/fdr_compile_internal.h ... OK
[10/122] https://raw.githubusercontent.com/intel/hyperscan/master/src/fd

In [None]:
# Download the Snort rulesets and derive the pattern files from them.
# Each rule is treated as a "packet" to be scanned, and the patterns are
# substrings taken from the rules, preferring the part after `content`.
# Per the printed log, this writes patterns.txt, short_patterns.txt
# (patterns of <= 8 bytes) and the combined, comment-free rulesets.txt
# into the dataset directory.
download_dataset()


Downloading Snort Rulesets for Testing

[1/2] ET-Open ruleset ... OK
      Extracting ... OK
[2/2] Talos Community ruleset ... OK
      Extracting ... OK

Rulesets downloaded to: D:\Projects\string-matchers\dataset\rulesets/

Extracting Patterns from Rules

Found 48 rule files

Extracted 29605 unique patterns
Patterns saved to: D:\Projects\string-matchers\dataset\patterns.txt
Pattern length: min=1, max=660, avg=15

Filtered 7982 patterns (<=8 bytes)
Short patterns saved to: D:\Projects\string-matchers\dataset\short_patterns.txt
Short pattern length: min=1, max=8, avg=5

Combined rulesets (no comments) into: D:\Projects\string-matchers\dataset\rulesets.txt
Rule lines: 51568, Size: 34.53 MB


In [2]:
from extract_patterns import extract_patterns

# Pick the 100 short patterns (at most 8 bytes long) that FDR will be tested
# with. Patterns that occur most often in the rulesets are preferred, since
# frequent matches are the most likely to expose errors; the trade-off is
# that this selection step is rather slow.
extract_patterns(
    PATTERNS_FILE,
    SHORT_PATTERNS_FILE,
    cond=lambda pattern: len(pattern) <= 8,
    count=100,
    rulesets_path=RULESETS_FILE,
)

Processed 7900/7967 patterns...

100

### Build and Run with short patterns

In [4]:
import os

from build import build_matcher
from run import run_matcher

# Compile the FDR matcher, then run it over the rulesets with the short
# pattern set; results are written to <OUTPUT_DIR>/fdr. The value returned
# by run_matcher is left as the cell's last expression so it is displayed.
build_matcher('fdr')
run_matcher(
    'fdr',
    patterns_file=SHORT_PATTERNS_FILE,
    rulesets_file=RULESETS_FILE,
    max_patterns=100,
    output_dir=os.path.join(OUTPUT_DIR, 'fdr'),
)


Building FDR String Matcher

Cleaning existing build directory...

Configuring CMake...
CMake configuration successful

Building...
Build successful

Built executable(s):
D:\Projects\string-matchers\src\fdr\build\Debug\fdr.exe

Running FDR Matcher

Executing D:\Projects\string-matchers\src\fdr\build\Debug\fdr.exe
=== FDR String Matcher Application ===

Loading patterns from: d:\Projects\string-matchers\dataset\100_short_patterns.txt
Loaded 100 patterns
Loaded patterns: 100, using 100 valid patterns

Compiling FDR engine...
SUCCESS: FDR engine compiled in 3 ms

Scanning rulesets from: d:\Projects\string-matchers\dataset\rulesets.txt
  Scanned 1000 rulesets...
  Scanned 2000 rulesets...
  Scanned 3000 rulesets...
  Scanned 4000 rulesets...
  Scanned 5000 rulesets...
  Scanned 6000 rulesets...
  Scanned 7000 rulesets...
  Scanned 8000 rulesets...
  Scanned 9000 rulesets...
  Scanned 10000 rulesets...
  Scanned 11000 rulesets...
  Scanned 12000 rulesets...
  Scanned 13000 rulesets...
  Sc

True

### Validate against naive matcher

In [3]:
from naive.naive import naive_match

# Produce reference results with the naive matcher on the same rulesets and
# short patterns; they are written to <OUTPUT_DIR>/naive for comparison.
naive_match(
    RULESETS_FILE,
    SHORT_PATTERNS_FILE,
    os.path.join(OUTPUT_DIR, 'naive'),
)

  Scanned 100 rulesets...
  Scanned 200 rulesets...
  Scanned 300 rulesets...
  Scanned 400 rulesets...
  Scanned 500 rulesets...
  Scanned 600 rulesets...
  Scanned 700 rulesets...
  Scanned 800 rulesets...
  Scanned 900 rulesets...
  Scanned 1000 rulesets...
  Scanned 1100 rulesets...
  Scanned 1200 rulesets...
  Scanned 1300 rulesets...
  Scanned 1400 rulesets...
  Scanned 1500 rulesets...
  Scanned 1600 rulesets...
  Scanned 1700 rulesets...
  Scanned 1800 rulesets...
  Scanned 1900 rulesets...
  Scanned 2000 rulesets...
  Scanned 2100 rulesets...
  Scanned 2200 rulesets...
  Scanned 2300 rulesets...
  Scanned 2400 rulesets...
  Scanned 2500 rulesets...
  Scanned 2600 rulesets...
  Scanned 2700 rulesets...
  Scanned 2800 rulesets...
  Scanned 2900 rulesets...
  Scanned 3000 rulesets...
  Scanned 3100 rulesets...
  Scanned 3200 rulesets...
  Scanned 3300 rulesets...
  Scanned 3400 rulesets...
  Scanned 3500 rulesets...
  Scanned 3600 rulesets...
  Scanned 3700 rulesets...
  Scanned 

In [6]:
from compare_results import compare_results

# Compare the FDR results against the naive reference results.
# Each path component is passed to os.path.join separately: a literal
# backslash inside one component (r'fdr\results.txt') is only a separator on
# Windows and would name a non-existent file on other platforms.
compare_results(os.path.join(OUTPUT_DIR, 'fdr', 'results.txt'),
                os.path.join(OUTPUT_DIR, 'naive', 'results.txt'))

Comparing results:
 A: d:\Projects\string-matchers\output\fdr\results.txt
 B: d:\Projects\string-matchers\output\naive\results.txt

counts: A=51568 B=51568 common=51568
mismatches: 0

Files coincide (match lists identical for all indices).


0

### Run the Python implementation

In [7]:
from py_fdr.FDR import fdr_match

# Run the Python FDR implementation on the same inputs as the other
# matchers; its results are written to <OUTPUT_DIR>/fdr_py.
fdr_match(
    RULESETS_FILE,
    SHORT_PATTERNS_FILE,
    os.path.join(OUTPUT_DIR, 'fdr_py'),
)

Detected 24 CPU cores. Run with 12 workers.
  Scanned 100 rulesets...
  Scanned 200 rulesets...
  Scanned 300 rulesets...
  Scanned 400 rulesets...
  Scanned 500 rulesets...
  Scanned 600 rulesets...
  Scanned 700 rulesets...
  Scanned 800 rulesets...
  Scanned 900 rulesets...
  Scanned 1000 rulesets...
  Scanned 1100 rulesets...
  Scanned 1200 rulesets...
  Scanned 1300 rulesets...
  Scanned 1400 rulesets...
  Scanned 1500 rulesets...
  Scanned 1600 rulesets...
  Scanned 1700 rulesets...
  Scanned 1800 rulesets...
  Scanned 1900 rulesets...
  Scanned 2000 rulesets...
  Scanned 2100 rulesets...
  Scanned 2200 rulesets...
  Scanned 2300 rulesets...
  Scanned 2400 rulesets...
  Scanned 2500 rulesets...
  Scanned 2600 rulesets...
  Scanned 2700 rulesets...
  Scanned 2800 rulesets...
  Scanned 2900 rulesets...
  Scanned 3000 rulesets...
  Scanned 3100 rulesets...
  Scanned 3200 rulesets...
  Scanned 3300 rulesets...
  Scanned 3400 rulesets...
  Scanned 3500 rulesets...
  Scanned 3600 rules

In [8]:
from compare_results import compare_results

# Compare the naive reference results against the Python FDR results.
# Each path component is passed to os.path.join separately: a literal
# backslash inside one component (r'naive\results.txt') is only a separator
# on Windows and would name a non-existent file on other platforms.
compare_results(os.path.join(OUTPUT_DIR, 'naive', 'results.txt'),
                os.path.join(OUTPUT_DIR, 'fdr_py', 'results.txt'))

Comparing results:
 A: d:\Projects\string-matchers\output\naive\results.txt
 B: d:\Projects\string-matchers\output\fdr_py\results.txt

counts: A=51568 B=51568 common=51568
mismatches: 0

Files coincide (match lists identical for all indices).


0

In [9]:
# Optional cleanup of downloaded and generated files (intentionally disabled; uncomment the lines below to run):

# from scripts.clean import clean
# clean(root='..', yes=True, verbose=True)

In [6]:
from pathlib import Path

# Alternative, fast way to build the 100-pattern file: take the first 100
# entries of short_patterns.txt instead of ranking patterns by frequency.
# Both paths are derived from the notebook's config constants (DATASET_DIR,
# SHORT_PATTERNS_FILE) rather than a hardcoded absolute drive path, so the
# cell works from any checkout location and on any platform.
short_patterns_file = Path(DATASET_DIR) / 'short_patterns.txt'
output_file = Path(SHORT_PATTERNS_FILE)

# Read first 100 valid patterns: blank lines and lines starting with '#'
# are skipped, and reading stops as soon as 100 patterns are collected.
# NOTE(review): strip() also drops leading/trailing whitespace from each
# pattern, and a pattern that itself begins with '#' would be skipped;
# confirm this matches the format of short_patterns.txt.
patterns = []
with open(short_patterns_file, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line and not line.startswith('#'):
            patterns.append(line)
            if len(patterns) >= 100:
                break

# Write to output file, one pattern per line (overwrites any existing file).
with open(output_file, 'w', encoding='utf-8') as f:
    for pattern in patterns:
        f.write(pattern + '\n')

print(f"Written {len(patterns)} patterns to {output_file}")

Written 100 patterns to d:\Projects\string-matchers\dataset\100_short_patterns.txt
