Skip to content

Commit

Permalink
use an ad-block c++ library to speed up the url matching
Browse files Browse the repository at this point in the history
  • Loading branch information
parkouss committed Jun 29, 2017
1 parent 22b5230 commit 826ee77
Show file tree
Hide file tree
Showing 8 changed files with 205 additions and 16 deletions.
9 changes: 9 additions & 0 deletions .gitmodules
@@ -0,0 +1,9 @@
[submodule "vendor/hashset-cpp"]
path = vendor/hashset-cpp
url = https://github.com/bbondy/hashset-cpp
[submodule "vendor/bloom-filter-cpp"]
path = vendor/bloom-filter-cpp
url = https://github.com/bbondy/bloom-filter-cpp
[submodule "vendor/ad-block"]
path = vendor/ad-block
url = https://github.com/brave/ad-block
142 changes: 142 additions & 0 deletions c/adblock.c
@@ -0,0 +1,142 @@
#include <Python.h>
#include "structmember.h"

#include "ad_block_client.h"

/*
 * Instance layout of the Python-visible AdBlock object: the standard object
 * header followed by a pointer to the native AdBlockClient it wraps.
 */
typedef struct {
PyObject_HEAD

/* Created with `new` in AdBlock_init and released with `delete` in
 * AdBlock_dealloc. */
AdBlockClient * client;
} AdBlock;


/*
 * tp_dealloc slot: destroy the wrapped AdBlockClient, then hand the object's
 * memory back through the tp_free slot of its type.
 */
static void
AdBlock_dealloc(AdBlock* self)
{
    PyTypeObject *type = Py_TYPE(self);

    delete self->client;
    type->tp_free((PyObject*)self);
}

/*
 * tp_new slot: allocate a new instance through the type's tp_alloc.
 *
 * args and kwds are ignored. Returns whatever tp_alloc returns (NULL on
 * allocation failure). The client pointer is not set up here; that is done
 * by AdBlock_init.
 */
static PyObject *
AdBlock_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    return type->tp_alloc(type, 0);
}

/*
 * tp_init slot: give the instance a fresh, empty AdBlockClient.
 *
 * args and kwds are ignored. Returns 0 on success, or -1 with a Python
 * exception set if the client could not be created.
 *
 * __init__ may legitimately be called more than once on the same object, so
 * any client left over from a previous call is released instead of leaked.
 */
static int
AdBlock_init(AdBlock *self, PyObject *args, PyObject *kwds)
{
    AdBlockClient *fresh = NULL;

    /* A C++ exception must never unwind into the interpreter's C frames.
     * Allocation failure is the expected cause here, so it is reported as
     * MemoryError. */
    try {
        fresh = new AdBlockClient;
    } catch (...) {
        PyErr_NoMemory();
        return -1;
    }

    /* Swap only after the new client exists, so a failed re-init leaves the
     * object usable. `delete` on a NULL pointer is a no-op. */
    delete self->client;
    self->client = fresh;
    return 0;
}

/*
 * AdBlock.parse(data): feed a filter list (a single str argument) to the
 * wrapped client.
 *
 * Returns None, or NULL with an exception set when the argument is not a
 * string or the object has no client yet.
 */
static PyObject *
AdBlock_parse(AdBlock* self, PyObject *args)
{
    const char *data;

    if (!PyArg_ParseTuple(args, "s", &data))
        return NULL;

    /* The type is subclassable and tp_new does not create the client, so an
     * instance whose __init__ never ran has a NULL client. Raise instead of
     * crashing the interpreter on a NULL dereference. */
    if (self->client == NULL) {
        PyErr_SetString(PyExc_RuntimeError,
                        "AdBlock object is not initialized");
        return NULL;
    }

    self->client->parse(data);

    Py_RETURN_NONE;
}

/*
 * AdBlock.matches(url, domain): ask the wrapped client whether `url`,
 * requested in the context of `domain`, matches a loaded filter
 * (checked with FONoFilterOption).
 *
 * Returns True or False, or NULL with an exception set when the arguments
 * are not two strings or the object has no client yet.
 */
static PyObject *
AdBlock_matches(AdBlock* self, PyObject *args)
{
    const char *url, *domain;

    if (!PyArg_ParseTuple(args, "ss", &url, &domain))
        return NULL;

    /* The type is subclassable and tp_new does not create the client, so an
     * instance whose __init__ never ran has a NULL client. Raise instead of
     * crashing the interpreter on a NULL dereference. */
    if (self->client == NULL) {
        PyErr_SetString(PyExc_RuntimeError,
                        "AdBlock object is not initialized");
        return NULL;
    }

    if (self->client->matches(url, FONoFilterOption, domain)) {
        Py_RETURN_TRUE;
    } else {
        Py_RETURN_FALSE;
    }
}

/* Method table of the AdBlock type; terminated by an all-zero entry. */
static PyMethodDef AdBlock_methods[] = {
    {
        "parse",
        (PyCFunction)AdBlock_parse,
        METH_VARARGS,
        "Parse adblock data string, like the content of an easylist."
    },
    {
        "matches",
        (PyCFunction)AdBlock_matches,
        METH_VARARGS,
        "matches an url, returns True if it should be filtered."
    },
    {NULL, NULL, 0, NULL}  /* Sentinel */
};

/*
 * Type object for adblock.AdBlock.
 *
 * The initializer is positional, so every slot up to tp_new is listed in
 * declaration order even when unused. The type is subclassable
 * (Py_TPFLAGS_BASETYPE), exposes AdBlock_methods, and uses AdBlock_dealloc
 * and AdBlock_init for destruction and initialization.
 * Note: PyInit__adblock overwrites tp_new with PyType_GenericNew before
 * calling PyType_Ready, so the AdBlock_new entry below is not the one in
 * effect at run time.
 */
static PyTypeObject AdBlockType = {
PyVarObject_HEAD_INIT(NULL, 0)
"adblock.AdBlock", /* tp_name */
sizeof(AdBlock), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)AdBlock_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_reserved */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT |
Py_TPFLAGS_BASETYPE, /* tp_flags */
"Adblock objects", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
AdBlock_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)AdBlock_init, /* tp_init */
0, /* tp_alloc */
AdBlock_new, /* tp_new */
};


/*
 * Module definition passed to PyModule_Create in PyInit__adblock.
 * NOTE(review): the module name here is "adblock" while the init function is
 * PyInit__adblock (i.e. the extension is imported as `_adblock`) -- confirm
 * the mismatch is intended.
 */
static PyModuleDef adblockmodule = {
PyModuleDef_HEAD_INIT,
"adblock", /* m_name */
"Module to speed up ad filtering.", /* m_doc */
-1, /* m_size */
NULL, NULL, NULL, NULL, NULL /* m_methods, m_slots, m_traverse, m_clear, m_free */
};

/*
 * Module initialization function, called by the interpreter on
 * `import _adblock`.
 *
 * Finalizes AdBlockType, creates the module and publishes the type on it as
 * "AdBlock". Returns the new module, or NULL with an exception set on
 * failure.
 */
PyMODINIT_FUNC
PyInit__adblock(void)
{
    PyObject* m;

    AdBlockType.tp_new = PyType_GenericNew;
    if (PyType_Ready(&AdBlockType) < 0)
        return NULL;

    m = PyModule_Create(&adblockmodule);
    if (m == NULL)
        return NULL;

    /* PyModule_AddObject steals the reference only on success; on failure
     * both the extra type reference and the half-built module must be
     * released here. */
    Py_INCREF(&AdBlockType);
    if (PyModule_AddObject(m, "AdBlock", (PyObject *)&AdBlockType) < 0) {
        Py_DECREF(&AdBlockType);
        Py_DECREF(m);
        return NULL;
    }
    return m;
}
45 changes: 45 additions & 0 deletions setup.py
@@ -0,0 +1,45 @@
import os

from distutils.core import setup, Extension


# Directory containing this setup.py; every source path below is built from it.
THIS_DIR = os.path.dirname(os.path.realpath(__file__))

# Locations of the vendored C++ libraries under vendor/.
bloom_dir = os.path.join(THIS_DIR, "vendor", "bloom-filter-cpp")
hashset_dir = os.path.join(THIS_DIR, "vendor", "hashset-cpp")
adblock_dir = os.path.join(THIS_DIR, "vendor", "ad-block")


if "CC" not in os.environ:
# Default to g++ unless the caller already chose a compiler; with gcc the
# build fails. NOTE(review): presumably because c/adblock.c has a .c
# extension but contains C++ (new/delete), which gcc compiles as C -- confirm.
os.environ["CC"] = "g++"

# The `_adblock` extension: the binding in c/adblock.c compiled together with
# the vendored library sources it depends on.
adblocker = Extension(
'_adblock',
define_macros=[],
language="c++",
include_dirs=[bloom_dir, hashset_dir, adblock_dir],
# -g0 disables debug information, so the built module has no debug symbols.
# NOTE(review): it is unclear whether this has any effect on speed.
extra_compile_args=["-g0"],
sources=[
os.path.join(bloom_dir, "BloomFilter.cpp"),
os.path.join(bloom_dir, "hashFn.cpp"),
os.path.join(hashset_dir, "HashSet.cpp"),
os.path.join(adblock_dir, "ad_block_client.cc"),
os.path.join(adblock_dir, "filter.cc"),
os.path.join(adblock_dir, "cosmetic_filter.cc"),
os.path.join(THIS_DIR, "c", "adblock.c"),
])

# Package metadata; the only build product declared here is the extension above.
setup(
name='webmacs',
version='1.0',
description='Keyboard driven web browser, emacs-like',
author='Julien Pagès',
author_email='j.parkouss@gmail.com',
url='todo',
long_description='''
Work in progress.
''',
ext_modules=[adblocker])
1 change: 1 addition & 0 deletions vendor/ad-block
Submodule ad-block added at eb1e8f
1 change: 1 addition & 0 deletions vendor/bloom-filter-cpp
Submodule bloom-filter-cpp added at 5e5a53
1 change: 1 addition & 0 deletions vendor/hashset-cpp
Submodule hashset-cpp added at 6a6ca1
20 changes: 5 additions & 15 deletions webmacs/adblock.py
@@ -1,11 +1,10 @@
import os
import time

from adblockparser import AdblockRules, AdblockRule
from _adblock import AdBlock
from concurrent.futures import ThreadPoolExecutor
import urllib.request

from PyQt5.QtCore import QRegExp

EASYLIST = (
"https://easylist.to/easylist/easylist.txt",
Expand All @@ -14,13 +13,6 @@
)


class BlockRule(AdblockRule):
def _url_matches(self, url):
if self.regex_re is None:
self.regex_re = QRegExp(self.regex)
return self.regex_re.indexIn(url) != -1


class Adblocker(object):
def __init__(self, cache_path):
if not os.path.isdir(cache_path):
Expand Down Expand Up @@ -51,13 +43,11 @@ def _fetch_urls(self):
executor.submit(self._download_file, url, path)

def generate_rules(self):
adblock = AdBlock()
self._fetch_urls()
rules = []
for path in self._urls.values():
print (path)
with open(path) as f:
for line in f:
line = line.strip()
if line:
rules.append(line)
return AdblockRules(rules, rule_cls=BlockRule)
adblock.parse(f.read())
print (path)
return adblock
2 changes: 1 addition & 1 deletion webmacs/application.py
Expand Up @@ -25,7 +25,7 @@ def __init__(self, app):

def interceptRequest(self, request):
url = request.requestUrl().toString()
if self._adblock.should_block(url):
if self._adblock.matches(url, ""):
logging.info("filtered: %s", url)
request.block(True)

Expand Down

0 comments on commit 826ee77

Please sign in to comment.