Skip to content
This repository has been archived by the owner on Dec 18, 2019. It is now read-only.

Commit

Permalink
Label guessing: Use simplebayes instead of numpy+scipy
Browse files Browse the repository at this point in the history
Signed-off-by: jflesch <jflesch@gmail.com>
  • Loading branch information
jflesch committed Nov 16, 2015
1 parent d9aa93b commit 4a9c686
Show file tree
Hide file tree
Showing 9 changed files with 222 additions and 221 deletions.
14 changes: 7 additions & 7 deletions scripts/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def main():
print("========")

all_labels = set([l.name for l in dsearch.label_list])
label_keys = [ 'global', 'positive', 'negative' ] # for the order
label_keys = ['global', 'positive', 'negative'] # for the order
total_label_accuracy = {
'global': 0,
'positive': 0,
Expand All @@ -52,7 +52,7 @@ def main():
if doc.nb_pages > max_pages:
max_pages = doc.nb_pages

### Keyword stats
# Keyword stats
for page in doc.pages:
sys.stdout.write("%d " % (page.page_nb + 1))
sys.stdout.flush()
Expand All @@ -63,10 +63,10 @@ def main():
# ignore words too short to be useful
if (len(word) < 4):
continue
if not word in words:
if word not in words:
words.add(word)
total_nb_unique_words += 1
if not word in doc_words:
if word not in doc_words:
doc_words.add(word)
total_nb_unique_words_per_doc += 1

Expand All @@ -75,9 +75,9 @@ def main():
if max_word_len < len(word):
max_word_len = len(word)

### Label predictions stats
doc_labels = set([l.name for l in doc.labels])
predicated_labels = set(dsearch.predict_label_list(doc))
# Label predictions stats
doc_labels = {l.name for l in doc.labels}
predicated_labels = {l.name for l in dsearch.guess_labels(doc)}
accurate = {
'global': 0,
'negative': 0,
Expand Down
9 changes: 3 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,9 @@
"python-Levenshtein",
"pyinsane >= 1.3.8",
"pyocr >= 0.3.0",
"numpy",
"scipy",
"scikit-learn",
"scikit-image",
"termcolor", # used by paperwork-chkdeps
"Whoosh",
"simplebayes",
# "PyGObject", # doesn't work with virtualenv
# Missing due to the use of gobject introspection:
# - gtk
Expand All @@ -178,8 +175,8 @@
# Missing because non-python libraries:
# - sane
# - tesseract/cuneiform
],
)
]
)

print ("======================================================================")
print ("======================================================================")
Expand Down
12 changes: 12 additions & 0 deletions src/paperwork/backend/common/doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@ def __init__(self, docpath, docid=None):
self.path = docpath
self.__cache = {}

# We need to keep track of the labels:
# When updating bayesian filters for label guessing,
# we need to know the new label list, but also the *previous* label
# list
self._previous_labels = self.labels

def drop_cache(self):
self.__cache = {}

Expand Down Expand Up @@ -209,6 +215,9 @@ def _get_text(self):
txt = u""
for page in self.pages:
txt += u"\n".join([unicode(line) for line in page.text])
extra_txt = self.extra_text
if extra_txt != u"":
txt += u"\n" + extra_txt + u"\n"
txt = txt.strip()
return txt

Expand Down Expand Up @@ -422,3 +431,6 @@ def __set_extra_text(self, txt):
def hash_file(path):
    """
    Compute a numeric hash of the content of a file.

    Arguments:
        path --- path of the file to hash

    Returns:
        The SHA-256 digest of the file content, converted to an integer.
    """
    hasher = hashlib.sha256()
    # The context manager guarantees the file descriptor is closed as soon
    # as hashing is done, instead of waiting for garbage collection.
    with open(path, 'rb') as fd:
        # Feed the hash chunk by chunk so that big files (scans, PDFs)
        # never have to be entirely loaded in memory.
        for chunk in iter(lambda: fd.read(65536), b''):
            hasher.update(chunk)
    return int(hasher.hexdigest(), 16)

# Placeholder: cloning is not implemented here, so every call raises
# NotImplementedError (presumably subclasses provide the real
# implementation -- TODO confirm against the document classes).
def clone(self):
raise NotImplementedError()
Loading

0 comments on commit 4a9c686

Please sign in to comment.