Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

a bloom filter

  • Loading branch information...
commit 90974b898080b210f662010f91490330e322e8e4 1 parent cfe6dd6
Cliff Moon authored
View
104 c/bloom.c
@@ -0,0 +1,104 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "bloom.h"
+#include "murmur.h"
+
+
+// Bit-addressing helpers for the filter's bit array. Every macro argument is
+// parenthesized so that expression arguments (e.g. `h % m`) expand correctly.
+#define BYTE_INDEX(index) ((int)((index) / 8))
+#define BIT_INDEX(index) ((index) % 8)
+
+// Sets bit `index` of `buff`; the expansion is a single parenthesized expression.
+#define SET_BIT(buff, index) ((buff)[BYTE_INDEX(index)] |= (1 << BIT_INDEX(index)))
+// Evaluates to non-zero when bit `index` of `buff` is set, zero otherwise.
+#define GET_BIT(buff, index) ((int)((buff)[BYTE_INDEX(index)] & (1 << BIT_INDEX(index))))
+
+#define
+
+//internal function headers
+static void
+static void read_header(FILE *file, bloom_t *bloom);
+static void write_bloom(FILE *file, bloom_t *bloom);
+
+// Opens an existing bloom filter file for reading and writing ("r+").
+//
+// filename: path of the filter file. The pointer itself is stored in the
+//           returned struct (the string is not copied).
+// Returns a heap-allocated bloom_t, or NULL when the file cannot be opened or
+// memory cannot be allocated.
+bloom_t *bloom_open(char *filename) {
+  bloom_t *bloom;
+  FILE *file;
+
+  if (NULL == (file = fopen(filename, "r+"))) {
+    return NULL;
+  }
+
+  // calloc so that no field starts out as garbage.
+  bloom = calloc(1, sizeof(bloom_t));
+  if (NULL == bloom) {
+    fclose(file); // do not leak the handle when allocation fails
+    return NULL;
+  }
+  // bloom_destroy() hands `bits` to free(), so it must be a valid pointer or
+  // NULL even though nothing here loads the bit array.
+  bloom->bits = NULL;
+  read_header(file, bloom);
+
+  bloom->file = file;
+  bloom->filename = filename;
+  return bloom;
+}
+
+// Creates a new, empty bloom filter backed by the file `filename`.
+//
+// filename: path of the file to create or truncate (opened with "w+"). The
+//           pointer is stored in the returned struct, not copied.
+// n: number of keys the filter is sized for; must be positive.
+// e: target false-positive probability; must lie strictly between 0 and 1.
+// Returns a heap-allocated bloom_t, or NULL on invalid arguments, when the
+// file cannot be opened, or when memory cannot be allocated.
+//
+// The signature follows the declaration in bloom.h; the body already relied
+// on a `filename` argument.
+bloom_t *bloom_create(char *filename, long n, double e) {
+  bloom_t *bloom;
+  FILE *file;
+  size_t nbytes;
+
+  // Outside these ranges the sizing formulas below produce zero, negative or
+  // NaN results, and a zero `m` would make `hash % m` divide by zero.
+  if (n <= 0 || !(e > 0.0 && e < 1.0)) {
+    return NULL;
+  }
+
+  if (NULL == (file = fopen(filename, "w+"))) {
+    return NULL;
+  }
+
+  bloom = malloc(sizeof(bloom_t));
+  if (NULL == bloom) {
+    fclose(file);
+    return NULL;
+  }
+
+  bloom->file = file;
+  bloom->filename = filename;
+  bloom->n = n;
+  bloom->e = e;
+  bloom->keys = 0;
+  bloom->seed = rand();
+
+  // m = -n * ln(e) / (ln 2)^2 bits, k = ln(2) * m / n hash rounds.
+  bloom->m = (unsigned int) ceil(n * log(e) / log(1.0 / pow(2, log(2))));
+  bloom->k = (unsigned int) round(log(2) * bloom->m / n);
+  if (bloom->k < 1) {
+    bloom->k = 1; // with zero rounds bloom_has() would report every key as present
+  }
+
+  // Round the byte count up so the last partial byte is allocated, and zero
+  // the array: a bit that starts out set is an immediate false positive.
+  nbytes = ((size_t) bloom->m + 7) / 8;
+  bloom->bits = calloc(nbytes, sizeof(char));
+  if (NULL == bloom->bits) {
+    free(bloom);
+    fclose(file);
+    return NULL;
+  }
+
+  write_bloom(file, bloom);
+  return bloom;
+}
+
+// Adds the `len`-byte key at `buff` to the filter and increments the key
+// counter. Runs `k` chained MurmurHash2 rounds: the first is seeded with the
+// filter's seed, each later one with the previous round's hash. Every hash,
+// reduced modulo `m`, selects one bit to set.
+void bloom_put(bloom_t *bloom, char *buff, int len) {
+  unsigned int h = bloom->seed;
+  unsigned int round_no = 0;
+
+  while (round_no < bloom->k) {
+    unsigned int bit;
+
+    h = MurmurHash2(buff, len, h);
+    bit = h % bloom->m;
+    SET_BIT(bloom->bits, bit);
+    round_no++;
+  }
+
+  bloom->keys++;
+}
+
+// Tests whether the `len`-byte key at `buff` may have been added.
+// Replays the same chained MurmurHash2 rounds as bloom_put() and stops at the
+// first selected bit that is clear.
+// Returns 0 when the key is definitely absent, 1 when every selected bit is set.
+int bloom_has(bloom_t *bloom, char *buff, int len) {
+  unsigned int h = bloom->seed;
+  unsigned int round_no = 0;
+
+  while (round_no < bloom->k) {
+    unsigned int bit;
+
+    h = MurmurHash2(buff, len, h);
+    bit = h % bloom->m;
+    if (GET_BIT(bloom->bits, bit) == 0) {
+      return 0;
+    }
+    round_no++;
+  }
+
+  return 1;
+}
+
+// Frees the bit array and the filter struct.
+// Passing NULL is a no-op: the driver's stop callback hands over its filter
+// pointer, which is still NULL when no filter was ever created.
+// NOTE(review): bloom->file is not closed here -- confirm who owns the handle.
+void bloom_destroy(bloom_t *bloom) {
+  if (NULL == bloom) {
+    return;
+  }
+  free(bloom->bits);
+  free(bloom);
+}
+
+// Presumably meant to populate `bloom` from the header stored in `file`.
+// NOTE(review): currently an empty stub -- nothing is read and no field of
+// `bloom` is set.
+static void read_header(FILE *file, bloom_t *bloom) {
+
+}
View
21 c/bloom.h
@@ -0,0 +1,21 @@
+
+#ifndef BLOOM_H
+#define BLOOM_H
+
+#include <stdio.h> // FILE, so the header works regardless of include order
+
+// State of one bloom filter.
+typedef struct _bloom_t {
+  char *filename;     // path passed to bloom_open (pointer stored, not copied)
+  FILE *file;         // handle of the backing file
+  char *bits;         // the bit array, addressed bit by bit
+  unsigned long n;    // number of keys the filter was sized for
+  double e;           // requested false-positive probability
+  unsigned int m;     // size of the bit array in bits
+  unsigned int k;     // hash rounds applied per key
+  unsigned long keys; // number of bloom_put() calls so far
+  unsigned int seed;  // seed of the first hash round
+} bloom_t;
+
+// Opens an existing filter file; returns NULL if it cannot be opened.
+bloom_t *bloom_open(char *filename);
+// Creates a filter sized for `n` keys at false-positive probability `e`.
+bloom_t *bloom_create(char *filename, long n, double e);
+// Adds the `len`-byte key at `buff` to the filter.
+void bloom_put(bloom_t* bloom, char *buff, int len);
+// Returns 0 if the key is definitely absent, 1 if it may be present.
+int bloom_has(bloom_t* bloom, char *buff, int len);
+// Frees the bit array and the filter struct.
+void bloom_destroy(bloom_t* bloom);
+
+// Number of keys added so far.
+#define bloom_key_size(bloom) ((bloom)->keys)
+// Size of the bit array in whole bytes (integer division, rounds down).
+#define bloom_mem_size(bloom) ((bloom)->m / 8)
+
+#endif // BLOOM_H
View
164 c/bloom_drv.c
@@ -0,0 +1,164 @@
+
+#include "bloom.h"
+#include <erl_driver.h>
+#include <ei.h>
+#include <stdio.h>
+#include <string.h>
+
+// Per-port driver state: the port handle plus the filter it serves.
+typedef struct _bloom_drv_t {
+ ErlDrvPort port;
+ bloom_t *bloom; // NULL until a SETUP command creates the filter
+} bloom_drv_t;
+
+// NOTE(review): this union is not referenced anywhere in the visible code.
+typedef union {
+ long i;
+ double d;
+ char c[8];
+} uni;
+
+//=======================================================================
+// COMMAND CODES
+// First byte of every message written to the port. The values must stay in
+// sync with the ?SETUP/?PUT/?HAS/?MEM_SIZE/?KEY_SIZE macros in elibs/bloom.erl.
+#define SETUP 's'
+#define PUT 'p'
+#define HAS 'h'
+#define MEM_SIZE 'm'
+#define KEY_SIZE 'k'
+
+//=======================================================================
+// ERL_DRIVER CALLBACKS
+static ErlDrvData init(ErlDrvPort port, char *cmd);
+static void stop(ErlDrvData handle);
+static void output(ErlDrvData handle, char *buf, int len);
+
+//=======================================================================
+// Internal functions
+static void setup(bloom_drv_t *driver, char *buf, int len);
+static void put(bloom_drv_t *driver, char *buf, int len);
+static void has(bloom_drv_t *driver, char *buf, int len);
+static void mem_size(bloom_drv_t *driver);
+static void key_size(bloom_drv_t *driver);
+
+//=======================================================================
+// ERL_DRIVER CALLBACKS
+// Start callback: allocates the per-port state when a port is opened.
+// The filter itself is created later by the SETUP command, so `bloom` starts
+// out NULL. Returns ERL_DRV_ERROR_GENERAL when the state cannot be allocated.
+static ErlDrvData init(ErlDrvPort port, char *cmd) {
+  bloom_drv_t *driver;
+
+  driver = (bloom_drv_t *) driver_alloc(sizeof(bloom_drv_t));
+  if (NULL == driver) {
+    return ERL_DRV_ERROR_GENERAL;
+  }
+  driver->port = port;
+  driver->bloom = NULL;
+
+  return (ErlDrvData)driver;
+}
+
+// Stop callback: releases the filter (if one was created) and the port state.
+static void stop(ErlDrvData handle) {
+  bloom_drv_t *driver = (bloom_drv_t*)handle;
+
+  // `bloom` is still NULL when the port is closed before any SETUP command.
+  if (NULL != driver->bloom) {
+    bloom_destroy(driver->bloom);
+  }
+  driver_free(driver);
+}
+
+// Output callback: dispatches one message written to the port.
+// buf[0] is the command code, the remaining len-1 bytes are its payload.
+// Empty messages and unknown command codes are ignored. A filter command that
+// arrives before SETUP has created the filter fails the port instead of
+// dereferencing a NULL filter.
+static void output(ErlDrvData handle, char *buf, int len) {
+  bloom_drv_t *driver = (bloom_drv_t *)handle;
+  char command;
+
+  if (len < 1) {
+    return; // no command byte to read
+  }
+  command = buf[0];
+
+  switch (command) {
+  case SETUP:
+    setup(driver, &buf[1], len-1);
+    return;
+  case PUT:
+  case HAS:
+  case MEM_SIZE:
+  case KEY_SIZE:
+    break;
+  default:
+    return;
+  }
+
+  // Everything below needs an existing filter.
+  if (NULL == driver->bloom) {
+    driver_failure_atom(driver->port, "bloom_not_setup");
+    return;
+  }
+
+  switch (command) {
+  case PUT:
+    put(driver, &buf[1], len-1);
+    break;
+  case HAS:
+    has(driver, &buf[1], len-1);
+    break;
+  case MEM_SIZE:
+    mem_size(driver);
+    break;
+  case KEY_SIZE:
+    key_size(driver);
+    break;
+  }
+}
+
+//=======================================================================
+//internal
+
+// Handles SETUP: decodes an external-term-format {N, E} tuple from `buf` and
+// creates the filter from it. N must decode as an integer, E as a float.
+// A payload that does not decode leaves the driver state untouched.
+// NOTE(review): bloom.h declares bloom_create(filename, n, e); this call
+// passes no filename -- reconcile the two.
+static void setup(bloom_drv_t *driver, char *buf, int len) {
+  long n;
+  double e;
+  int arity;
+  int index = 0;
+
+  // The ei_decode_* functions return 0 on success; bail out on the first
+  // failure so `n` and `e` are never used uninitialized.
+  if (ei_decode_version(buf, &index, NULL) != 0 ||
+      ei_decode_tuple_header(buf, &index, &arity) != 0 ||
+      arity != 2 ||
+      ei_decode_long(buf, &index, &n) != 0 ||
+      ei_decode_double(buf, &index, &e) != 0) {
+    return;
+  }
+
+  // A repeated SETUP replaces the filter; free the old one instead of leaking it.
+  if (NULL != driver->bloom) {
+    bloom_destroy(driver->bloom);
+  }
+  driver->bloom = bloom_create(n, e);
+}
+
+// Handles PUT: the whole payload is the key to add; nothing is sent back.
+static void put(bloom_drv_t *driver, char *buf, int len) {
+  bloom_t *filter = driver->bloom;
+
+  bloom_put(filter, buf, len);
+}
+
+// Handles HAS: looks the payload up as a key and sends the answer back to the
+// port as a versioned external-term-format boolean.
+static void has(bloom_drv_t *driver, char *buf, int len) {
+  ei_x_buff reply;
+  int found = bloom_has(driver->bloom, buf, len);
+
+  ei_x_new_with_version(&reply);
+  ei_x_encode_boolean(&reply, found);
+  driver_output(driver->port, reply.buff, reply.index);
+
+  ei_x_free(&reply);
+}
+
+// Handles MEM_SIZE: sends bloom_mem_size() of the filter back to the port as a
+// versioned external-term-format integer.
+static void mem_size(bloom_drv_t *driver) {
+  ei_x_buff reply;
+  long bytes = bloom_mem_size(driver->bloom);
+
+  ei_x_new_with_version(&reply);
+  ei_x_encode_long(&reply, bytes);
+  driver_output(driver->port, reply.buff, reply.index);
+
+  ei_x_free(&reply);
+}
+
+// Handles KEY_SIZE: sends bloom_key_size() of the filter back to the port as a
+// versioned external-term-format integer.
+static void key_size(bloom_drv_t *driver) {
+  ei_x_buff reply;
+  long count = bloom_key_size(driver->bloom);
+
+  ei_x_new_with_version(&reply);
+  ei_x_encode_long(&reply, count);
+  driver_output(driver->port, reply.buff, reply.index);
+
+  ei_x_free(&reply);
+}
+
+// Callback table handed to the emulator by DRIVER_INIT.
+// Slots are positional, so each comment names the ErlDrvEntry field it fills.
+static ErlDrvEntry bloom_driver_entry = {
+  NULL, /* init */
+  init, /* start (implemented by this file's init()) */
+  stop, /* stop */
+  output, /* output */
+  NULL, /* ready_input */
+  NULL, /* ready_output */
+  "bloom_drv", /* driver_name */
+  NULL, /* finish */
+  NULL, /* handle */
+  NULL, /* control */
+  NULL, /* timeout */
+  NULL, /* outputv */
+  NULL, /* ready_async */
+  NULL, /* flush */
+  NULL, /* call */
+  NULL, /* event */
+  ERL_DRV_EXTENDED_MARKER, /* extended_marker */
+  ERL_DRV_EXTENDED_MAJOR_VERSION, /* major_version */
+  ERL_DRV_EXTENDED_MINOR_VERSION, /* minor_version (previously given the major version) */
+  ERL_DRV_FLAG_USE_PORT_LOCKING /* driver_flags */
+};
+
+// Entry point called when the driver is loaded; returns the callback table.
+// The macro argument is kept identical to the driver_name in the entry
+// ("bloom_drv"), as the erl_driver documentation asks.
+DRIVER_INIT(bloom_drv) {
+  return &bloom_driver_entry;
+}
View
8 doc/BLOOM_FORMAT.txt
@@ -0,0 +1,8 @@
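+All multi-byte fields are stored BIG ENDIAN.
+
+Header fields, in order (sizes are in bits unless stated otherwise):
+
+VERSION:8/integer, N:64/integer, E:64/float,
+M:32/integer, K:32/integer, Keys:64/integer,
+Seed:32/integer, reserved:64 bytes
+
+The header is followed by BITS, the filter's bit array.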
View
81 elibs/bloom.erl
@@ -0,0 +1,81 @@
+%%%-------------------------------------------------------------------
+%%% File: bloom.erl
+%%% @author Cliff Moon <cliff@powerset.com> []
+%%% @copyright 2009 Cliff Moon
+%%% @doc
+%%%
+%%% @end
+%%%
+%%% @since 2009-04-18 by Cliff Moon
+%%%-------------------------------------------------------------------
+-module(bloom).
+-author('cliff@powerset.com').
+
+%% API
+-export([start/2, put/2, has/2, mem_size/1, key_size/1, stop/1]).
+
+%% COMMANDS
+-define(SETUP, $s).
+-define(PUT, $p).
+-define(HAS, $h).
+-define(MEM_SIZE, $m).
+-define(KEY_SIZE, $k).
+
+-ifdef(TEST).
+-include("etest/bloom_test.erl").
+-endif.
+
+%%====================================================================
+%% API
+%%====================================================================
+%%--------------------------------------------------------------------
+%% @spec
+%% @doc
+%% @end
+%%--------------------------------------------------------------------
+
+%% @doc Loads the bloom_drv driver, opens a port to it and sends the SETUP
+%% command carrying {N, E}. Returns {ok, {bloom, Port}}, or {error, Msg}
+%% with Msg produced by erl_ddll:format_error/1 when the driver fails to load.
+start(N, E) ->
+    open_filter(load_driver(), N, E).
+
+%% Dispatches on the result of loading the driver.
+open_filter(ok, N, E) ->
+    Port = open_port({spawn, 'bloom_drv'}, [binary]),
+    port_command(Port, [?SETUP, term_to_binary({N, E})]),
+    {ok, {bloom, Port}};
+open_filter({error, Reason}, _N, _E) ->
+    {error, erl_ddll:format_error(Reason)}.
+
+%% @doc Sends Key to the filter's port as a PUT command. No reply is awaited;
+%% the result is whatever port_command/2 returns.
+put({bloom, Port}, Key) ->
+    Request = [?PUT, Key],
+    port_command(Port, Request).
+
+%% @doc Asks the filter whether Key may have been added; returns the decoded
+%% reply from the driver (a boolean).
+has({bloom, P}, Key) ->
+    call_port(P, [?HAS, Key]).
+
+%% @doc Returns the driver's reply to the MEM_SIZE command.
+mem_size({bloom, P}) ->
+    call_port(P, [?MEM_SIZE]).
+
+%% @doc Returns the driver's reply to the KEY_SIZE command.
+key_size({bloom, P}) ->
+    call_port(P, [?KEY_SIZE]).
+
+%% Sends Request to port P and blocks until the port delivers a data message,
+%% which is decoded with binary_to_term/1. Shared by has/2, mem_size/1 and
+%% key_size/1, which previously each repeated this sequence.
+call_port(P, Request) ->
+    port_command(P, Request),
+    receive
+        {P, {data, Bin}} -> binary_to_term(Bin)
+    end.
+
+%% @doc Unlinks the calling process from the filter's port, then closes the port.
+stop({bloom, Port}) ->
+    true = unlink(Port),
+    port_close(Port).
+
+%%====================================================================
+%% Internal functions
+%%====================================================================
+
+%% Loads bloom_drv from the "priv" directory that sits next to the directory
+%% holding this module's beam file. Returns the result of erl_ddll:load/2.
+load_driver() ->
+    BeamDir = filename:dirname(code:which(bloom)),
+    PrivDir = filename:join([BeamDir, "..", "priv"]),
+    erl_ddll:load(PrivDir, "bloom_drv").
+
+
View
36 etest/bloom_test.erl
@@ -0,0 +1,36 @@
+-include_lib("eunit/include/eunit.hrl").
+
+%% A key that was put is reported present; a key that was never put is not.
+simple_bloom_test() ->
+    {ok, Filter} = bloom:start(10000, 0.001),
+    bloom:put(Filter, "wut"),
+    Present = bloom:has(Filter, "wut"),
+    ?assertEqual(true, Present),
+    Absent = bloom:has(Filter, "fuck"),
+    ?assertEqual(false, Absent),
+    bloom:stop(Filter).
+
+%% Every one of 10000 inserted keys must be reported present afterwards
+%% (a bloom filter has no false negatives).
+insert_many_things_test() ->
+    {ok, Filter} = bloom:start(10000, 0.001),
+    Inserted = [begin
+                    Key = "Key" ++ float_to_list(random:uniform()),
+                    bloom:put(Filter, Key),
+                    Key
+                end || _ <- lists:seq(1, 10000)],
+    lists:foreach(fun(K) -> ?assert(bloom:has(Filter, K)) end, Inserted),
+    bloom:stop(Filter).
+
+%% Fills a filter built for 10000 keys at error rate 0.001, then probes it with
+%% 10000 keys that were never inserted and checks the observed false-positive
+%% rate and the key counter.
+false_positive_error_rate_test() ->
+    {ok, Filter} = bloom:start(10000, 0.001),
+    lists:foreach(fun(_) ->
+        bloom:put(Filter, "Key" ++ float_to_list(random:uniform()))
+    end, lists:seq(1, 10000)),
+    Answers = [bloom:has(Filter, "butt" ++ float_to_list(random:uniform()))
+               || _ <- lists:seq(1, 10000)],
+    FalsePositiveCount = length([A || A <- Answers, A == true]),
+    FPRate = FalsePositiveCount / 10000,
+    ?debugFmt("false positives: ~p", [FalsePositiveCount]),
+    ?debugFmt("false positives: ~p", [FPRate]),
+    ?debugFmt("mem size ~p", [bloom:mem_size(Filter)]),
+    ?assert(FPRate < 0.001),
+    ?assertEqual(10000, bloom:key_size(Filter)),
+    bloom:stop(Filter).
+
Please sign in to comment.
Something went wrong with that request. Please try again.