# -*- coding: utf-8 -*-
from __future__ import division
"""
    Simple bag-of-ngrams classifier for tweets.

    Classifies messages based on their trigrams.
    If a trigram in the message is not in the model then it backs off to bigrams.
    If a bigram in the message is not in the model then it backs off to unigrams.
    Trigram, bigram and unigram likelihoods are all smoothed.
    A tunable threshold is applied to the posterior log odds when classifying.
    The parameters for all of these are at the start of BayesClassifier.
"""
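# A minimal usage sketch (hypothetical messages; assumes the `preprocessing` module
# provides extract_words(), get_ngrams() and WORD_DELIMITER, and that training
# examples are (label, message) pairs with label True for positives, False for negatives):
#
#   training_data = [(True, 'example of a positive tweet'),
#                    (False, 'example of a negative tweet')]
#   classifier = BayesClassifier(training_data)
#   is_positive, log_odds = classifier.classify('a new tweet to classify', detailed=True)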
import math
import preprocessing
class BayesClassifier:
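    # Tunable parameters: the smooth_* values are the additive smoothing alphas that
    # get_score() uses for each ngram order; backoff_bigram and backoff_trigram are the
    # pre-normalisation weights applied to unigram and bigram scores in get_weights()
    # (trigram scores are weighted 1.0); threshold is the log-odds cut-off above which
    # classify() calls a message positive.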
    # Precision = 0.946, Recall = 0.685, F1 = 0.794
    smooth_unigram = 12.9621
    smooth_bigram = 4.5806
    smooth_trigram = 0.5088
    backoff_bigram = 0.8687
    backoff_trigram = 0.4266
    threshold = 1.7553
    @staticmethod
    def set_params(smooth_unigram, smooth_bigram, smooth_trigram,
                   backoff_bigram, backoff_trigram, threshold):
        BayesClassifier.smooth_unigram = smooth_unigram
        BayesClassifier.smooth_bigram = smooth_bigram
        BayesClassifier.smooth_trigram = smooth_trigram
        BayesClassifier.backoff_bigram = backoff_bigram
        BayesClassifier.backoff_trigram = backoff_trigram
        BayesClassifier.threshold = threshold
    @staticmethod
    def get_params():
        return (BayesClassifier.smooth_unigram, BayesClassifier.smooth_bigram, BayesClassifier.smooth_trigram,
                BayesClassifier.backoff_bigram, BayesClassifier.backoff_trigram, BayesClassifier.threshold)

    @staticmethod
    def get_param_names():
        return ('smooth_unigram', 'smooth_bigram', 'smooth_trigram',
                'backoff_bigram', 'backoff_trigram', 'threshold')
    @staticmethod
    def get_weights():
        weights = {
            1: BayesClassifier.backoff_bigram,
            2: BayesClassifier.backoff_trigram,
            3: 1.0
        }
        total = sum(weights.values())
        return dict((n, w / total) for n, w in weights.items())
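    # With the default parameters above, total = 0.8687 + 0.4266 + 1.0 = 2.2953,
    # so the normalised weights are roughly {1: 0.378, 2: 0.186, 3: 0.436}.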
    @staticmethod
    def get_smoothings():
        return {
            1: BayesClassifier.smooth_unigram,
            2: BayesClassifier.smooth_bigram,
            3: BayesClassifier.smooth_trigram
        }
    @staticmethod
    def get_features(ngrams):
        """Make features to classify on from ngrams: the ngrams themselves plus
            extra COUNT features for ngrams that occur more than once.
        """
        counts = {}
        for w in ngrams:
            counts[w] = counts.get(w, 0) + 1
        features = list(counts.keys())
        for k, v in counts.items():
            for cnt in range(2, min(3, v) + 1):
                features.append('%s (COUNT=%d)' % (k, cnt))
        return features
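    # e.g. get_features(['a|b', 'a|b', 'c|d']) gives ['a|b', 'c|d', 'a|b (COUNT=2)']
    # (in some order): repeated ngrams contribute extra COUNT features.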
    @staticmethod
    def get_cntv(counts):
        """counts is a dict of ngram:(neg,pos) where
                neg is the number of times ngram has appeared in a -ve example
                pos is the number of times ngram has appeared in a +ve example
            Returns cntn, cntp, v
                cntn: total number of negatives
                cntp: total number of positives
                v: number of unique ngrams
        """
        cntn = sum(neg for neg, pos in counts.values())
        cntp = sum(pos for neg, pos in counts.values())
        v = len(counts)
        return cntn, cntp, v
    @staticmethod
    def cnt_positivity(np):
        """np is (neg,pos) where neg is the number of negatives and pos is the
            number of positives. Return how positive this result is.
        """
        neg, pos = np
        if neg == 0 or pos == 0:
            return pos - neg
        return (pos - neg) / (pos + neg)
    @staticmethod
    def cnt_show(ngram, np):
        """Return a display string for an ngram with np = (neg,pos)
            negative and positive counts.
        """
        return "[%3d,%3d] %4.1f '%s'" % (np[0], np[1], BayesClassifier.cnt_positivity(np), ngram)
    def __init__(self, training_data):
        """BayesClassifier initialization

            ngram_counts are dicts of positive and negative
                counts for each ngram for n = 1,2,3
                ngram_counts[n][term] = [neg,pos]
            ngram_keys[n] is ngram_counts[n]'s key set
            class_count = [neg,pos] is the total counts of negative and
                positive training examples
        """
        self.class_count = [0, 0]
        self.ngram_counts = dict((n, {}) for n in (1, 2, 3))
        self.ngram_keys = dict((n, set()) for n in (1, 2, 3))
        self.train(training_data)
    def __repr__(self):

        def counts_str(counts):
            def n(g):
                return -BayesClassifier.cnt_positivity(counts[g]), g.lower()
            return '\n'.join(BayesClassifier.cnt_show(key, counts[key]) for key in sorted(counts, key=n))

        def show_counts(name, counts):
            return '%s\n%s\n%s\n' % ('-' * 80, name, counts_str(counts))

        totals = [
            '         (neg, pos, cnt)',
            'TRIGRAMS %s' % str(self.cntv_ngrams[3]),
            ' BIGRAMS %s' % str(self.cntv_ngrams[2]),
            'UNIGRAMS %s' % str(self.cntv_ngrams[1]),
        ]
        totals_string = '\n'.join(totals) + '\n'
        return totals_string \
            + show_counts('TRIGRAMS', self.ngram_counts[3]) \
            + show_counts('BIGRAMS', self.ngram_counts[2]) \
            + show_counts('UNIGRAMS', self.ngram_counts[1])
    def _add_example(self, cls, message):
        """Add a training example."""
        words = preprocessing.extract_words(message)
        if not words:
            # Treat cases where 'paper cut' is removed by pre-processing as negatives
            cls = False
        self.class_count[cls] += 1

        # Update ngram_counts and ngram_keys for this example's ngrams and cls
        for n in (1, 2, 3):
            ngrams = preprocessing.get_ngrams(n, words)
            ngrams = BayesClassifier.get_features(ngrams)
            for g in ngrams:
                count = self.ngram_counts[n].get(g, [0, 0])
                count[cls] += 1
                self.ngram_counts[n][g] = count
                self.ngram_keys[n].add(g)
    def train(self, training_data):
        for cls, message in training_data:
            self._add_example(cls, message)
        self.cntv_ngrams = dict((n, BayesClassifier.get_cntv(self.ngram_counts[n])) for n in (1, 2, 3))
    def classify(self, message, detailed=False):
        """message is a string to classify. Returns a (classification, log odds) pair
            where classification is True or False.

            Method is to calculate log odds from a likelihood based on
            trigram, bigram and unigram (neg,pos) counts in the training set.

            For each trigram
                return smoothed trigram score if trigram in training set, else
                for the 2 bigrams in the trigram
                    return smoothed bigram score if bigram in training set, else
                    for the 2 unigrams in the bigram
                        return smoothed unigram score

            get_score() shows the smoothed scoring
            ngram_score() applies the backoff weights and smoothing factors
        """
        words = preprocessing.extract_words(message)
        if detailed:
            print words
        if not words:
            return False, 0.0

        # Using dicts with offset keys prevents the same ngram being included twice
        ngrams = dict((n, {}) for n in (1, 2, 3))

        from preprocessing import WORD_DELIMITER

        # Back off from trigrams to bigrams to unigrams based on which ngrams were seen in training
        for i in range(len(words) - 2):
            tri = WORD_DELIMITER.join(words[i:i + 3])
            if tri in self.ngram_keys[3]:
                ngrams[3][i] = tri
            else:
                for j in (0, 1):
                    bi = WORD_DELIMITER.join(words[i + j:i + j + 2])
                    if bi in self.ngram_keys[2]:
                        ngrams[2][i + j] = bi
                    else:
                        for k in (0, 1):
                            ngrams[1][i + j + k] = words[i + j + k]
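        # e.g. for words = ['a', 'b', 'c', 'd'] with no matching trigrams or bigrams, the
        # overlapping backoffs visit positions 1 and 2 more than once, but the offset-keyed
        # dicts collapse the duplicates so ngrams[1] ends up with each word exactly once.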
        for n in (1, 2, 3):
            ngrams[n] = BayesClassifier.get_features(ngrams[n].values())
        def get_score(counts, cntv, alpha):
            """Get a smoothed score for an ngram

                counts = (neg, pos)
                    neg: number of negatives for ngram in training set
                    pos: number of positives for ngram in training set
                cntv = (cntn, cntp, v) for an ngram training dict
                    cntn: total number of negatives
                    cntp: total number of positives
                    v: number of unique ngrams
                alpha: smoothing factor

                Returns: a smoothed score for the ngram
            """
            neg, pos = counts
            if neg == pos:
                return 0
            cntn, cntp, v = cntv
            return math.log((pos + alpha) / (cntp + v * alpha)) - math.log((neg + alpha) / (cntn + v * alpha))
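        # e.g. get_score((1, 4), (100, 100, 50), 0.5) = log(4.5/125) - log(1.5/125)
        #                                             = log(3) ~= 1.10, i.e. leans positive.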
        if detailed:
            def _dbg(n, score, g):
                print '%d%s [%.2f] %s' % (n, ' ' * (3 - n), score, g)
        else:
            def _dbg(n, score, g):
                pass
        weights = BayesClassifier.get_weights()
        smoothings = BayesClassifier.get_smoothings()

        def ngram_score(n, g):
            score = get_score(self.ngram_counts[n].get(g, [0, 0]), self.cntv_ngrams[n], smoothings[n])
            _dbg(n, score, g)
            return score

        neg, pos = self.class_count
        prior = math.log(pos) - math.log(neg)
        likelihood = sum(weights[n] * sum(ngram_score(n, g) for g in ngrams[n]) for n in (1, 2, 3))
        log_odds = prior + likelihood
        if detailed:
            n_gram_dict = {}
            for n in (1, 2, 3):
                for g in ngrams[n]:
                    n_gram_dict[g] = ngram_score(n, g) * weights[n]
            print 'ngram scores --------------'
            for k in sorted(n_gram_dict, key=lambda x: n_gram_dict[x]):
                print '%6.3f : %s' % (n_gram_dict[k], k)

        return log_odds > BayesClassifier.threshold, log_odds
if __name__ == '__main__':
    # Just dump the BayesClassifier class to stdout
    for k in sorted(BayesClassifier.__dict__):
        if 'gram' in k:
            print '%20s : %s' % (k, BayesClassifier.__dict__[k])