Skip to content
This repository has been archived by the owner on Dec 18, 2019. It is now read-only.

Commit

Permalink
Backend: Label guessing: Remove the custom tokenisation function. Res…
Browse files Browse the repository at this point in the history
…ults are much better without it.

Signed-off-by: jflesch <jflesch@gmail.com>
  • Loading branch information
jflesch committed Nov 16, 2015
1 parent a5ec3e7 commit eada91e
Showing 1 changed file with 3 additions and 12 deletions.
15 changes: 3 additions & 12 deletions src/paperwork/backend/labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
import simplebayes

from paperwork.backend.util import mkdir_p
from paperwork.backend.util import split_words
from paperwork.backend.util import strip_accents


Expand Down Expand Up @@ -121,14 +120,6 @@ def __str__(self):
% (self.get_html_color(), self.name))


def _split_words_hook(text):
    """
    Tokenise a unicode text and hand it back as a single 'str'.

    simplebayes works with 'str', so every word produced by split_words()
    is encoded to UTF-8 before the words are joined with single spaces.
    """
    encoded_tokens = (token.encode("utf-8") for token in split_words(text))
    return " ".join(encoded_tokens)


class LabelGuessUpdater(object):
def __init__(self, guesser):
self.guesser = guesser
Expand All @@ -138,7 +129,7 @@ def add_doc(self, doc):
doc_txt = doc.text
if doc_txt == u"":
return
doc_txt = _split_words_hook(doc_txt)
doc_txt = doc_txt.encode("utf-8")

labels = {label.name for label in doc.labels}

Expand All @@ -160,7 +151,7 @@ def del_doc(self, doc):
doc_txt = doc.text
if doc_txt == u"":
return
doc_txt = _split_words_hook(doc_txt)
doc_txt = doc_txt.encode("utf-8")

labels = {label.name for label in doc._previous_labels}

Expand Down Expand Up @@ -214,7 +205,7 @@ def guess(self, doc):
doc_txt = doc.text
if doc_txt == u"":
return set()
doc_txt = _split_words_hook(doc_txt)
doc_txt = doc_txt.encode("utf-8")
label_names = set()
for (label_name, guesser) in self._bayes.iteritems():
# we balance ourselves the scores, otherwise 'no' wins
Expand Down

0 comments on commit eada91e

Please sign in to comment.