Initial version

mre committed Jan 28, 2018
commit 82c1d36835348718f04c9ca0dd2c1ebf8b19a312 (0 parents)
Showing with 320 additions and 0 deletions.
  1. +3 −0 .gitignore
  2. +202 −0 Cargo.lock
  3. +10 −0 Cargo.toml
  4. +1 −0 README.md
  5. +104 −0 src/main.rs
.gitignore
@@ -0,0 +1,3 @@

/target/
**/*.rs.bk

Cargo.lock
Generated file; diff not rendered by default.

Cargo.toml
@@ -0,0 +1,10 @@
[package]
authors = ["Matthias Endler <matthias-endler@gmx.net>"]
name = "tinysearch"
version = "0.1.0"

[dependencies]
bloom = "0.3.2"
lazy_static = "1.0.0"
structopt = "0.1.7"
structopt-derive = "0.1.6"
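# Note: bloom provides the Bloom filter, structopt/structopt-derive power the CLI
# argument parsing in src/main.rs; lazy_static is declared here but does not yet
# appear in src/main.rs in this initial commit.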
README.md
@@ -0,0 +1 @@
This is a Rust implementation of ["Writing a full-text search engine using Bloom filters"](https://www.stavros.io/posts/bloom-filter-search-engine/).
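
To try it out (a sketch based on the CLI defined in `src/main.rs` below): `cargo run -- <corpus_path> "<search terms>"`, where `<corpus_path>` is a directory of plain-text posts and the quoted search terms are split on whitespace.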
src/main.rs
@@ -0,0 +1,104 @@
extern crate bloom;

extern crate structopt;
#[macro_use]
extern crate structopt_derive;

use std::path::PathBuf;
use std::ffi::OsString;
use std::fs;
use std::fs::File;
use std::collections::{HashMap, HashSet};
use std::io::Read;

use std::error::Error;
use bloom::{BloomFilter, ASMS};
use structopt::StructOpt;

#[derive(StructOpt, Debug)]
struct Opt {
#[structopt(help = "Path to input files (search corpus)")]
corpus_path: String,

#[structopt(help = "Search terms")]
search_terms: String,
}

fn main() {
    let opt = Opt::from_args();
    // TODO: Proper error handling
    run(opt.corpus_path, opt.search_terms).unwrap();
}

fn run(corpus_path: String, search_terms: String) -> Result<(), Box<Error>> {
    let filters = generate(corpus_path)?;
    let matches = search(search_terms, filters);
    println!("Found the following matches: {:#?}", matches);
    Ok(())
}

// def search(search_string):
// search_terms = re.split("\W+", search_string)
// return [name for name, filter in filters.items() if all(term in filter for term in search_terms)]
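// Note: Bloom filters can report false positives, so a match may occasionally
// name a post that does not actually contain every term; posts that do contain
// all terms are never missed.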
fn search(query: String, filters: HashMap<OsString, BloomFilter>) -> HashSet<OsString> {
    let search_terms: HashSet<String> =
        query.split_whitespace().map(|s| s.to_lowercase()).collect();
    let mut results = HashSet::new();
    for (name, filter) in filters {
        if search_terms.iter().all(|term| filter.contains(term)) {
            results.insert(name);
        }
    }
    results
}

// Read all my posts.
// # posts = {post_name: open(POST_DIR + post_name).read() for post_name in os.listdir(POST_DIR)}
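// Read every regular file in `dir` into memory, then build one Bloom filter
// per post from its lowercased, whitespace-separated words.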
fn generate(dir: String) -> Result<HashMap<OsString, BloomFilter>, Box<Error>> {
    let paths: Vec<PathBuf> = fs::read_dir(dir)?
        .filter_map(Result::ok)
        .map(|f| f.path())
        .collect();

    let mut posts: HashMap<OsString, String> = HashMap::new();
    for path in paths {
        if !path.is_file() {
            continue;
        }
        let mut post = File::open(&path)?;
        let mut contents = String::new();
        post.read_to_string(&mut contents)?;
        posts.insert(
            path.file_name().ok_or("Not a file")?.to_os_string(),
            contents,
        );
    }

    // Create a dictionary of {"post name": "lowercase word set"}.
    // split_posts = {name: set(re.split("\W+", contents.lower())) for name, contents in posts.items()}
    let split_posts: HashMap<OsString, HashSet<String>> = posts
        .into_iter()
        .map(|(post, content)| {
            (
                post,
                content
                    .split_whitespace()
                    .map(|s| s.to_lowercase())
                    .collect::<HashSet<String>>(),
            )
        })
        .collect();

    // At this point, we have a dictionary of posts and a normalized set of words in each.
    // We could do more things, like stemming, removing common words (a, the, etc), but
    // we’re going for naive, so let’s just create the filters for now:
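    // `with_rate(0.01, n)` sizes each filter for roughly a 1% false-positive
    // rate, where `n` is the number of distinct words in the post.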
    let mut filters = HashMap::new();
    for (name, words) in split_posts {
        let mut filter: BloomFilter = BloomFilter::with_rate(0.01, words.len() as u32);
        for word in words {
            filter.insert(&word);
        }
        filters.insert(name, filter);
    }
    Ok(filters)
}
