1st draft of SpamBayes based auto-classification of mail

mailpile · Dec 27, 2013 · beebd50 · beebd50
1 parent 0af5213
commit beebd50
Show file tree

Hide file tree

Showing 6 changed files with 322 additions and 5 deletions.
diff --git a/Makefile b/Makefile
@@ -8,7 +8,8 @@ dev:
 
 debian-dev:
 	sudo apt-get install python-imaging python-lxml python-jinja2 pep8 \
-	                     rubygems ruby-dev yui-compressor python-nose
+	                     rubygems ruby-dev yui-compressor python-nose \
+	                     spambayes
 	sudo gem install therubyracer less
 
 docs:

diff --git a/mailpile/plugins/__init__.py b/mailpile/plugins/__init__.py
@@ -13,13 +13,13 @@
 # These are the plugins we ship/import by default
 __all__ = [
     'search', 'tags', 'contacts', 'compose', 'groups',
-    'dates', 'sizes',
+    'dates', 'sizes', 'autotag',
     'setup_magic', 'networkgraph', 'exporters',
     'vcard_carddav', 'vcard_gnupg', 'vcard_gravatar', 'vcard_mork',
     'hacks'
 ]
 BUILTIN = (__all__[:] + [
-    'spambayes'
+    'autotag_sb'
 ])
 
 # These are plugins which we consider required

diff --git a/mailpile/plugins/autotag.py b/mailpile/plugins/autotag.py
@@ -0,0 +1,37 @@
+# This is an auto-tagging plugin that invokes external tools.
+
+import math
+import time
+import datetime
+from gettext import gettext as _
+
+import mailpile.plugins
+import mailpile.config
+
+
+##[ Configuration ]###########################################################
+
+mailpile.plugins.register_config_section('prefs', 'autotag', ["Auto-tag",
+{
+    'command': ['Shell command to invoke', str, ''],
+    'match_tag': ['Tag we are adding to automatically', str, ''],
+    'unsure_tag': ['If unsure, add to this tag', str, ''],
+    'ignore_kws': ['Ignore messages with these keywords', str, []],
+}, {}])
+
+
+##[ Keywords ]################################################################
+
+def filter_hook(session, msg_mid, msg, keywords):
+    """Classify this message."""
+    config = session.config.prefs.autotag
+
+    # FIXME: Iterate through all the autotag config options, invoke
+    #        the external command and interpret the response.
+
+    return keywords
+
+
+# We add a filter post-hook with a high (late) priority, to maximize
+# the amount of data we are feeding to the classifier.
+mailpile.plugins.register_filter_hook_post('90-autotag', filter_hook)
diff --git a/mailpile/plugins/autotag_sb.py b/mailpile/plugins/autotag_sb.py
@@ -0,0 +1,273 @@
+# This is an auto-tagging plugin that uses SpamBayes to classify messages.
+#
+# We feed the classifier the same terms as go into the search engine,
+# which should allow us to actually introspect a bit into the behavior
+# of the classifier.
+
+import math
+import time
+import datetime
+from gettext import gettext as _
+
+from spambayes.classifier import Classifier
+
+import mailpile.config
+import mailpile.plugins
+from mailpile.commands import Command
+from mailpile.mailutils import Email
+
+
+##[ Configuration ]###########################################################
+
+mailpile.plugins.register_config_section('prefs', 'autotag_sb', ["Spambayes",
+{
+    'match_tag': ['Tag we are adding to automatically', str, ''],
+    'unsure_tag': ['If unsure, add to this tag', str, ''],
+    'exclude_tags': ['Tags on messages we should never match (ham)', str, []],
+    'ignore_kws': ['Ignore messages with these keywords', str, []],
+    'corpus_size': ['How many messages do we train on?', int, 1000],
+    'threshold': ['Size of the sure/unsure ranges', float, 0.1],
+}, []])
+
+
+def SaveClassifier(config, match_tag):
+    config.save_pickle(config.autotag_sb[match_tag],
+                       'autotag_sb.%s' % match_tag)
+
+
+def LoadClassifier(config, match_tag):
+    if not hasattr(config, 'autotag_sb'):
+        config.autotag_sb = {}
+    if match_tag not in config.autotag_sb:
+        cfn = 'autotag_sb.%s' % match_tag
+        try:
+            config.autotag_sb[match_tag] = config.load_pickle(cfn)
+        except IOError:
+            config.autotag_sb[match_tag] = Classifier()
+            SaveClassifier(config, match_tag)
+        config.autotag_sb[match_tag].setup()
+    return config.autotag_sb[match_tag]
+
+
+mailpile.config.ConfigManager.load_sb_classifier = LoadClassifier;
+mailpile.config.ConfigManager.save_sb_classifier = SaveClassifier;
+
+
+##[ Commands ]################################################################
+
+
+class AutoTagCommand(Command):
+    ORDER = ('Tagging', 9)
+
+    def _get_keywords(self, e):
+        idx = self._idx()
+        kws, snippet = idx.read_message(self.session,
+            e.msg_mid(),
+            e.get_msg_info(field=idx.MSG_ID),
+            e.get_msg(),
+            e.get_msg_size(),
+            int(e.get_msg_info(field=idx.MSG_DATE), 36)
+        )
+        return kws
+
+
+#   - Add a command for retraining a particular tag, or all tags.
+#       - Should collect N messages for training, prefer messages
+#         that are interesting - having been read, replied to, tagged
+#         by hand or placed in the "ham" corpus.
+#       - Pad up to corpus_size with random selection biased towards
+#         recent mail?
+class Retrain(AutoTagCommand):
+    SYNOPSIS = (None, 'autotag_sb/retrain', None, '[<tags>]')
+
+    def command(self):
+        session, config, idx = self.session, self.session.config, self._idx()
+        tags = self.args or [asb['match_tag']
+                             for asb in config.prefs.autotag_sb]
+        tids = [config.get_tag(t)._key for t in tags if t]
+
+        session.ui.mark(_('Retraining SpamBayes autotaggers'))
+        if not hasattr(config, 'autotag_sb'):
+            config.autotag_sb = {}
+
+        # Find all the interesting messages! We don't look in the trash,
+        # but we do look at interesting spam.
+        #
+        # Note: By specifically stating that we DON'T want trash, we
+        #       disable the search engine's default result suppression
+        #       and guarantee these results don't corrupt the somewhat
+        #       lame/broken result cache.
+        #
+        no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
+        interest = {}
+        for ttype in ('replied', 'read', 'tagged'):
+            interest[ttype] = set()
+            for tag in config.get_tags(type=ttype):
+                interest[ttype] |= idx.search(session,
+                                              ['in:%s' % tag.slug] + no_trash
+                                              ).as_set()
+            session.ui.notify(_('Have %d interesting %s messages'
+                                ) % (len(interest[ttype]), ttype))
+
+        retrained = []
+        for asb in config.prefs.autotag_sb:
+            asb_tag = config.get_tag(asb.match_tag)
+            if asb_tag and asb_tag._key in tids:
+                session.ui.mark('Retraining: %s' % asb_tag.name)
+
+                yn = [(set(), set(), 'in:%s' % asb_tag.slug, True),
+                      (set(), set(), '-in:%s' % asb_tag.slug, False)]
+
+                # Get the current message sets: tagged and untagged messages
+                # excluding trash.
+                for tset, mset, srch, which in yn:
+                    mset |= idx.search(session, [srch] + no_trash).as_set()
+
+                # If we have any exclude_tags, they are particularly
+                # interesting, so we'll look at them first.
+                interesting = []
+                for etagid in asb.exclude_tags:
+                    etag = config.get_tag(etagid)
+                    if etag._key not in interest:
+                        srch = ['in:%s' % etag._key] + no_trash
+                        interest[etag._key] = idx.search(session, srch
+                                                         ).as_set()
+                    interesting.append(etag._key)
+                interesting.extend(['replied', 'read', 'tagged'])
+
+                # Go through the interest types in order of preference and
+                # while we still lack training data, add to the training set.
+                for ttype in interesting:
+                    for tset, mset, srch, which in yn:
+                        # FIXME: Is this a good idea? No single data source
+                        # is allowed to be more than 50% of the corpus, to
+                        # try and encourage diversity.
+                        want = min(asb.corpus_size / 4,
+                                   max(0, asb.corpus_size / 2 - len(tset)))
+                        if want:
+                            if ttype:
+                                adding = sorted(list(mset & interest[ttype]))
+                            else:
+                                adding = sorted(list(mset))
+                            adding = set(list(reversed(adding))[:want])
+                            tset |= adding
+                            mset -= adding
+
+                # Start with a fresh classifier
+                bayes = Classifier()
+                count_all = 0
+                for tset, mset, srch, which in yn:
+                    count = 0
+                    for msg_idx in tset:
+                        e = Email(idx, msg_idx)
+                        count += 1
+                        count_all += 1
+                        session.ui.mark(('Reading %s (%d/%d, %s=%s)'
+                                         ) % (e.msg_mid(), count, len(tset),
+                                              asb_tag.name, which))
+                        bayes.learn(self._get_keywords(e), which)
+
+                # We got this far without crashing, so save the result.
+                config.autotag_sb[asb_tag._key] = bayes
+                config.save_sb_classifier(asb_tag._key)
+                retrained.append(asb_tag.name)
+
+        session.ui.mark(_('Retrained SpamBayes auto-tagging for %s'
+                          ) % ', '.join(retrained))
+        return {'retrained': retrained, 'read_messages': count_all}
+
+
+class Classify(AutoTagCommand):
+    SYNOPSIS = (None, 'autotag_sb/classify', None, '<msgs>')
+    ORDER = ('Tagging', 9)
+
+    def _classify(self, emails):
+        session, config, idx = self.session, self.session.config, self._idx()
+        results = {}
+        for e in emails:
+            kws = self._get_keywords(e)
+            result = results[e.msg_mid()] = {}
+            for asb in config.prefs.autotag_sb:
+                asb_tag = config.get_tag(asb.match_tag)
+                bayes = config.load_sb_classifier(asb_tag._key)
+                prob, evidence = bayes.chi2_spamprob(kws, evidence=True)
+                result[asb_tag._key] = (prob, evidence)
+        return results
+
+    def command(self):
+        session, config, idx = self.session, self.session.config, self._idx()
+        emails = [Email(idx, mid) for mid in self._choose_messages(self.args)]
+        return self._classify(emails)
+
+
+class AutoTag(Classify):
+    SYNOPSIS = (None, 'autotag_sb', None, '<msgs>')
+    ORDER = ('Tagging', 9)
+
+    def command(self):
+        session, config, idx = self.session, self.session.config, self._idx()
+        emails = [Email(idx, mid) for mid in self._choose_messages(self.args)]
+        scores = self._classify(emails)
+        tag = {}
+        for mid in scores:
+            for asb in config.prefs.autotag_sb:
+                asb_tag = config.get_tag(asb.match_tag)
+                score = scores[mid].get(asb_tag._key, (0, ))[0]
+
+                if score > (1.0 - asb.threshold):
+                    if asb.match_tag not in tag:
+                        tag[asb.match_tag] = [mid]
+                    else:
+                        tag[asb.match_tag].append(mid)
+
+                elif asb.unsure_tag and score >= asb.threshold:
+                    if asb.unsure_tag not in tag:
+                        tag[asb.unsure_tag] = [mid]
+                    else:
+                        tag[asb.unsure_tag].append(mid)
+
+        for tid in tag:
+            idx.add_tag(session, tid, msg_idxs=[int(i, 36) for i in tag[tid]])
+
+        return tag
+
+
+mailpile.plugins.register_commands(Retrain, Classify, AutoTag)
+
+
+##[ Cron jobs ]###############################################################
+
+# FIXME: We should periodically retrain?
+
+
+##[ Keywords ]################################################################
+
+def filter_hook(session, msg_mid, msg, keywords):
+    """Classify this message."""
+    config = session.config
+    for asb in config.prefs.autotag_sb:
+        try:
+            asb_tag = config.get_tag(asb.match_tag)
+            bayes = config.load_sb_classifier(asb_tag._key)
+            score = bayes.chi2_spamprob(keywords)
+            if score > (1 - asb.threshold):
+                if 'autotag' in config.sys.debug:
+                    session.ui.debug(('Autotagging %s with %s (score=%.3f)'
+                                      ) % (msg_mid, asb_tag.name, score))
+                keywords.add('%s:tag' % asb_tag._key)
+            elif asb.unsure_tag and score >= asb.threshold:
+                unsure_tag = config.get_tag(asb.unsure_tag)
+                if 'autotag' in config.sys.debug:
+                    session.ui.debug(('Autotagging %s with %s (score=%.3f)'
+                                      ) % (msg_mid, unsure_tag.name, score))
+                keywords.add('%s:tag' % unsure_tag._key)
+        except (KeyError, AttributeError, ValueError):
+            pass
+
+    return keywords
+
+
+# We add a filter pre-hook with a high (late) priority.  Late priority to
+# maximize the amount of data we are feeding to the classifier, but a
+# pre-hook so normal filter rules will override the autotagging.
+mailpile.plugins.register_filter_hook_post('90-autotag_sb', filter_hook)
diff --git a/mailpile/search.py b/mailpile/search.py
@@ -702,6 +702,9 @@ def _loader(p):
                 keywords.extend(['%s:email' % e for e in emails])
                 if 'list' in key_lower:
                     keywords.extend(['%s:list' % t for t in words])
+        for key in EXPECTED_HEADERS:
+            if not msg[key]:
+                keywords.append('missing:%s' % key)
 
         for extract in plugins.get_meta_kw_extractors():
             keywords.extend(extract(self, msg_mid, msg, msg_size, msg_ts))

diff --git a/mailpile/util.py b/mailpile/util.py
@@ -35,13 +35,16 @@
                          ':\"|;\'\\\<\>\?,\.\/\-]{2,}')
 
 STOPLIST = set(['an', 'and', 'are', 'as', 'at', 'by', 'for', 'from',
-                'has', 'http', 'in', 'is', 'it', 'mailto', 'og', 'or',
-                're', 'so', 'the', 'to', 'was'])
+                'has', 'http', 'https', 'i', 'in', 'is', 'it',
+                'mailto', 'me',
+                'og', 'or', 're', 'so', 'the', 'to', 'was', 'you'])
 
 BORING_HEADERS = ('received', 'date',
                   'content-type', 'content-disposition', 'mime-version',
                   'dkim-signature', 'domainkey-signature', 'received-spf')
 
+EXPECTED_HEADERS = ('from', 'to', 'subject', 'date')
+
 B64C_STRIP = '\n='
 
 B64C_TRANSLATE = string.maketrans('/', '_')