Label guessing: Use simplebayes instead of numpy+scipy

Signed-off-by: jflesch <jflesch@gmail.com>
openpaperwork · Nov 16, 2015 · 4a9c686 · 4a9c686
1 parent d9aa93b
commit 4a9c686
Show file tree

Hide file tree

Showing 9 changed files with 222 additions and 221 deletions.
diff --git a/scripts/stats.py b/scripts/stats.py
@@ -31,7 +31,7 @@ def main():
     print("========")
 
     all_labels = set([l.name for l in dsearch.label_list])
-    label_keys = [ 'global', 'positive', 'negative' ]  # for the order
+    label_keys = ['global', 'positive', 'negative']  # for the order
     total_label_accuracy = {
         'global': 0,
         'positive': 0,
@@ -52,7 +52,7 @@ def main():
         if doc.nb_pages > max_pages:
             max_pages = doc.nb_pages
 
-        ### Keyword stats
+        # Keyword stats
         for page in doc.pages:
             sys.stdout.write("%d " % (page.page_nb + 1))
             sys.stdout.flush()
@@ -63,10 +63,10 @@ def main():
                     # ignore words too short to be useful
                     if (len(word) < 4):
                         continue
-                    if not word in words:
+                    if word not in words:
                         words.add(word)
                         total_nb_unique_words += 1
-                    if not word in doc_words:
+                    if word not in doc_words:
                         doc_words.add(word)
                         total_nb_unique_words_per_doc += 1
 
@@ -75,9 +75,9 @@ def main():
                     if max_word_len < len(word):
                         max_word_len = len(word)
 
-        ### Label predictions stats
-        doc_labels = set([l.name for l in doc.labels])
-        predicated_labels = set(dsearch.predict_label_list(doc))
+        # Label predictions stats
+        doc_labels = {l.name for l in doc.labels}
+        predicated_labels = {l.name for l in dsearch.guess_labels(doc)}
         accurate = {
             'global': 0,
             'negative': 0,

diff --git a/setup.py b/setup.py
@@ -164,12 +164,9 @@
         "python-Levenshtein",
         "pyinsane >= 1.3.8",
         "pyocr >= 0.3.0",
-        "numpy",
-        "scipy",
-        "scikit-learn",
-        "scikit-image",
         "termcolor",  # used by paperwork-chkdeps
         "Whoosh",
+        "simplebayes",
         # "PyGObject",  # doesn't work with virtualenv
         # Missing due to the use of gobject introspection:
         # - gtk
@@ -178,8 +175,8 @@
         # Missing because non-python libraries:
         # - sane
         # - tesseract/cuneiform
-    ],
-    )
+    ]
+)
 
 print ("======================================================================")
 print ("======================================================================")

diff --git a/src/paperwork/backend/common/doc.py b/src/paperwork/backend/common/doc.py
@@ -72,6 +72,12 @@ def __init__(self, docpath, docid=None):
             self.path = docpath
         self.__cache = {}
 
+        # We need to keep track of the labels:
+        # when updating the Bayesian filters for label guessing,
+        # we need to know the new label list, but also the *previous* label
+        # list.
+        self._previous_labels = self.labels
+
     def drop_cache(self):
         self.__cache = {}
 
@@ -209,6 +215,9 @@ def _get_text(self):
         txt = u""
         for page in self.pages:
             txt += u"\n".join([unicode(line) for line in page.text])
+        extra_txt = self.extra_text
+        if extra_txt != u"":
+            txt += u"\n" + extra_txt + u"\n"
         txt = txt.strip()
         return txt
 
@@ -422,3 +431,6 @@ def __set_extra_text(self, txt):
     def hash_file(path):
         dochash = hashlib.sha256(open(path, 'rb').read()).hexdigest()
         return int(dochash, 16)
+
+    def clone(self):
+        raise NotImplementedError()