Skip to content

Commit

Permalink
Autotagging fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
BjarniRunar committed Dec 27, 2013
1 parent 28ee338 commit 6dc679d
Showing 1 changed file with 33 additions and 16 deletions.
49 changes: 33 additions & 16 deletions mailpile/plugins/autotag_sb.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,19 @@


def SaveClassifier(config, match_tag):
config.save_pickle(config.autotag_sb[match_tag],
'autotag_sb.%s' % match_tag)
if config.autotag_sb[match_tag].trained:
config.save_pickle(config.autotag_sb[match_tag],
'pickled-autotag_sb.%s' % match_tag)


def LoadClassifier(config, match_tag):
if not hasattr(config, 'autotag_sb'):
config.autotag_sb = {}
if match_tag not in config.autotag_sb:
cfn = 'autotag_sb.%s' % match_tag
cfn = 'pickled-autotag_sb.%s' % match_tag
try:
config.autotag_sb[match_tag] = config.load_pickle(cfn)
config.autotag_sb[match_tag].trained = True
except IOError:
config.autotag_sb[match_tag] = Classifier()
config.autotag_sb[match_tag].trained = False
Expand All @@ -62,14 +64,19 @@ class AutoTagCommand(Command):

def _get_keywords(self, e):
idx = self._idx()
kws, snippet = idx.read_message(self.session,
e.msg_mid(),
e.get_msg_info(field=idx.MSG_ID),
e.get_msg(),
e.get_msg_size(),
int(e.get_msg_info(field=idx.MSG_DATE), 36)
)
return kws
if not hasattr(self, 'rcache'):
self.rcache = {}
mid = e.msg_mid()
if mid not in self.rcache:
kws, snippet = idx.read_message(self.session,
mid,
e.get_msg_info(field=idx.MSG_ID),
e.get_msg(),
e.get_msg_size(),
int(e.get_msg_info(field=idx.MSG_DATE), 36)
)
self.rcache[mid] = kws
return self.rcache[mid]


# - Add a command for retraining a particular tag, or all tags.
Expand Down Expand Up @@ -111,14 +118,15 @@ def command(self):
) % (len(interest[ttype]), ttype))

retrained = []
count_all = 0
for asb in config.prefs.autotag_sb:
asb_tag = config.get_tag(asb.match_tag)
if asb_tag and asb_tag._key in tids:
session.ui.mark('Retraining: %s' % asb_tag.name)

yn = [(set(), set(), 'in:%s' % asb_tag.slug, True),
(set(), set(), '-in:%s' % asb_tag.slug, False)]

# Get the current message sets: tagged and untagged messages
# excluding trash.
for tset, mset, srch, which in yn:
Expand All @@ -134,8 +142,8 @@ def command(self):
interest[etag._key] = idx.search(session, srch
).as_set()
interesting.append(etag._key)
interesting.extend(['replied', 'read', 'tagged'])
interesting.extend(['replied', 'read', 'tagged', None])

# Go through the interest types in order of preference and
# while we still lack training data, add to the training set.
for ttype in interesting:
Expand All @@ -156,8 +164,6 @@ def command(self):

# Start with a fresh classifier
bayes = Classifier()
bayes.trained = True
count_all = 0
for tset, mset, srch, which in yn:
count = 0
for msg_idx in tset:
Expand All @@ -170,6 +176,7 @@ def command(self):
bayes.learn(self._get_keywords(e), which)

# We got this far without crashing, so save the result.
bayes.trained = True
config.autotag_sb[asb_tag._key] = bayes
config.save_sb_classifier(asb_tag._key)
retrained.append(asb_tag.name)
Expand All @@ -186,11 +193,19 @@ class Classify(AutoTagCommand):
def _classify(self, emails):
session, config, idx = self.session, self.session.config, self._idx()
results = {}
unknown = []
for e in emails:
kws = self._get_keywords(e)
result = results[e.msg_mid()] = {}
for asb in config.prefs.autotag_sb:
if not asb.match_tag:
continue
continue
asb_tag = config.get_tag(asb.match_tag)
if not asb_tag and asb.match_tag not in unknown:
session.ui.error(_('Unknown tag: %s') % asb.match_tag)
unknown.append(asb.match_tag)
continue
bayes = config.load_sb_classifier(asb_tag._key)
if bayes.trained:
prob, evidence = bayes.chi2_spamprob(kws, evidence=True)
Expand All @@ -215,6 +230,8 @@ def command(self):
for mid in scores:
for asb in config.prefs.autotag_sb:
asb_tag = config.get_tag(asb.match_tag)
if not asb_tag:
continue
score = scores[mid].get(asb_tag._key, (0, ))[0]

if score > (1.0 - asb.threshold):
Expand Down

0 comments on commit 6dc679d

Please sign in to comment.