## Eliminate low-information features
Source: http://streamhacker.com/tag/chi-square/

In [13]:
import collections, itertools
import nltk.classify.util, nltk.metrics
from nltk.metrics.scores import precision
from nltk.metrics.scores import recall
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews, stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
 
def evaluate_classifier(featx):
    """Train and score a Naive Bayes classifier on the movie_reviews corpus.

    Each review is converted to a feature dict by ``featx`` and labelled
    'neg' or 'pos'. The first three quarters of each class are used for
    training and the remaining quarter for testing. Accuracy, per-class
    precision and recall, and the classifier's most informative features
    are printed; nothing is returned.

    :param featx: callable that takes the word tokens of one review and
        returns a feature dict suitable for ``NaiveBayesClassifier``.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # Floor division keeps the cutoffs integral on both Python 2 and 3;
    # plain `/` yields a float on Python 3 and breaks the slices below.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # refsets: test indices grouped by true label.
    # testsets: test indices grouped by predicted label.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    # Single-argument print(...) produces the same text on Python 2 and 3.
    # precision/recall may be None when the relevant set is empty.
    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
    print('pos precision: %s' % precision(refsets['pos'], testsets['pos']))
    print('pos recall: %s' % recall(refsets['pos'], testsets['pos']))
    print('neg precision: %s' % precision(refsets['neg'], testsets['neg']))
    print('neg recall: %s' % recall(refsets['neg'], testsets['neg']))
    classifier.show_most_informative_features()
 
def word_feats(words):
    """Return a bag-of-words feature dict mapping each distinct word to True."""
    return dict.fromkeys(words, True)
 
# Baseline: every word in a review is a feature.
print('evaluating single word features')
evaluate_classifier(word_feats)

# Overall and per-label frequencies of the lower-cased corpus words.
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for label in ('pos', 'neg'):
    for token in movie_reviews.words(categories=[label]):
        lowered = token.lower()
        word_fd[lowered] += 1
        label_word_fd[label][lowered] += 1

# Arguments handed to BigramAssocMeasures.chi_sq below:
# n_ii = label_word_fd[label][word]
# n_ix = word_fd[word]
# n_xi = label_word_fd[label].N()
# n_xx = label_word_fd.N()

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

# A word's score is the sum of its chi-square association with each label.
# NOTE(review): the counts cover the whole corpus, including the quarter
# that evaluate_classifier later holds out for testing.
word_scores = {}

for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(
        label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(
        label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

# Keep the 10000 highest-scoring words as the feature vocabulary.
best = sorted(word_scores.items(), key=lambda pair: pair[1], reverse=True)[:10000]
bestwords = set(entry[0] for entry in best)
 
def best_word_feats(words):
    """Return a feature dict containing only the words found in ``bestwords``.

    ``bestwords`` is the module-level set of top-scoring words; every kept
    word maps to True.
    """
    kept = (word for word in words if word in bestwords)
    return dict.fromkeys(kept, True)
 
# Second run: features restricted to the words in `bestwords`.
print('evaluating best word features')
evaluate_classifier(best_word_feats)
 
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Combine the top-``n`` bigrams of ``words`` with its best single words.

    :param words: word tokens of one document; it is consumed twice (once
        for bigram finding, once by ``best_word_feats``).
    :param score_fn: association measure passed to ``nbest``.
    :param n: number of bigrams to keep.
    :return: dict mapping each kept bigram tuple and each kept word to True.
    """
    finder = BigramCollocationFinder.from_words(words)
    feats = dict.fromkeys(finder.nbest(score_fn, n), True)
    feats.update(best_word_feats(words))
    return feats
 
# Third run: best single words plus each review's top chi-square bigrams.
print('evaluating best words + bigram chi_sq word features')
evaluate_classifier(best_bigram_word_feats)

evaluating single word features
accuracy: 0.728
pos precision: 0.651595744681
pos recall: 0.98
neg precision: 0.959677419355
neg recall: 0.476
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0
evaluating best word features
accuracy: 0.932
pos precision: 0.894160583942
pos r

In [10]:
from nltk.metrics.scores import precision

In [30]:
# Spot-check: features that best_word_feats keeps from the first 20 tokens
# of one positive review, paired with the review's label.
(best_word_feats(movie_reviews.words('pos/cv006_15448.txt')[:20]), 'pos')

({u',': True,
  u'a': True,
  u'american': True,
  u'apparently': True,
  u'director': True,
  u'film': True,
  u'had': True,
  u'his': True,
  u'history': True,
  u'kaye': True,
  u'new': True,
  u'tony': True,
  u'with': True},
 'pos')

[u'apparently',
 u',',
 u'director',
 u'tony',
 u'kaye',
 u'had',
 u'a',
 u'major',
 u'battle',
 u'with',
 u'new',
 u'line',
 u'regarding',
 u'his',
 u'new',
 u'film',
 u',',
 u'american',
 u'history',
 u'x']

In [11]:
import collections, itertools
import nltk.classify.util, nltk.metrics
from nltk.metrics.scores import precision
from nltk.metrics.scores import recall
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews, stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
 
def evaluate_classifier(featx):
    """Train and score a Naive Bayes classifier on the tweet data frames.

    Tweets from ``no_rt_all_tweet_df`` are labelled 'nonpoplr' and tweets
    from ``all_tweet_df`` are labelled 'poplr'. Each tweet is tokenized
    with the module-level ``tknzr`` and converted to a feature dict by
    ``featx``. The first three quarters of each class are used for
    training and the remaining quarter for testing. Accuracy, per-class
    precision and recall, and the most informative features are printed;
    nothing is returned.

    :param featx: callable that takes the tokens of one tweet and returns
        a feature dict suitable for ``NaiveBayesClassifier``.
    """
    # Iterating the 'tweet' column yields the tweet strings themselves, so
    # they go straight to the tokenizer (a str has no ``as_matrix``).
    # NOTE(review): tokens are not lower-cased here, whereas the word
    # scoring code lower-cases them - confirm tknzr normalises case.
    non_poplr_feats = [(featx(tknzr.tokenize(tweet)), 'nonpoplr')
                       for tweet in no_rt_all_tweet_df['tweet']]
    poplr_feats = [(featx(tknzr.tokenize(tweet)), 'poplr')
                   for tweet in all_tweet_df['tweet']]

    # Floor division keeps the cutoffs integral on both Python 2 and 3;
    # plain `/` yields a float on Python 3 and breaks the slices below.
    nonpoplrcutoff = len(non_poplr_feats) * 3 // 4
    poplrcutoff = len(poplr_feats) * 3 // 4

    trainfeats = non_poplr_feats[:nonpoplrcutoff] + poplr_feats[:poplrcutoff]
    testfeats = non_poplr_feats[nonpoplrcutoff:] + poplr_feats[poplrcutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # refsets: test indices grouped by true label.
    # testsets: test indices grouped by predicted label.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    # Single-argument print(...) produces the same text on Python 2 and 3.
    # precision/recall may be None when the relevant set is empty.
    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))
    print('poplr precision: %s' % precision(refsets['poplr'], testsets['poplr']))
    print('poplr recall: %s' % recall(refsets['poplr'], testsets['poplr']))
    print('nonpoplr precision: %s' % precision(refsets['nonpoplr'], testsets['nonpoplr']))
    print('nonpoplr recall: %s' % recall(refsets['nonpoplr'], testsets['nonpoplr']))
    classifier.show_most_informative_features()
 
def word_feats(words):
    """Map every distinct token in ``words`` to True (bag-of-words features)."""
    return dict((token, True) for token in words)
 
import re  # this cell strips URLs with `re` but never imported it itself

# Baseline: every token in a tweet is a feature.
print('evaluating single word features')
evaluate_classifier(word_feats)

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

# Count every token occurrence per label. The earlier version collapsed the
# tweets into a set via `.apply(set.update)`; that rebinds the name to a
# Series of None (so the join fails) and would discard the frequencies the
# chi-square scores below depend on. 'nonpoplr' is read from
# no_rt_all_tweet_df, the same frame evaluate_classifier uses for that label.
url_pattern = re.compile(r"http\S+")
for label, frame in (('poplr', all_tweet_df), ('nonpoplr', no_rt_all_tweet_df)):
    # .str.lower() turns non-string cells into NaN; drop them before re.sub.
    for tweet in frame['tweet'].str.lower().dropna():
        for word in tknzr.tokenize(url_pattern.sub("", tweet)):
            word = word.lower()
            word_fd[word] += 1
            label_word_fd[label][word] += 1

# Arguments handed to BigramAssocMeasures.chi_sq below:
# n_ii = label_word_fd[label][word]
# n_ix = word_fd[word]
# n_xi = label_word_fd[label].N()
# n_xx = label_word_fd.N()

poplr_word_count = label_word_fd['poplr'].N()
nonpoplr_word_count = label_word_fd['nonpoplr'].N()
total_word_count = poplr_word_count + nonpoplr_word_count

# A word's score is the sum of its chi-square association with each label.
word_scores = {}

for word, freq in word_fd.items():
    poplr_score = BigramAssocMeasures.chi_sq(label_word_fd['poplr'][word],
        (freq, poplr_word_count), total_word_count)
    nonpoplr_score = BigramAssocMeasures.chi_sq(label_word_fd['nonpoplr'][word],
        (freq, nonpoplr_word_count), total_word_count)
    word_scores[word] = poplr_score + nonpoplr_score

# Keep the 10000 highest-scoring words as the feature vocabulary.
# (A tuple-unpacking lambda is Python 2-only, so the pair is indexed instead.)
best = sorted(word_scores.items(), key=lambda pair: pair[1], reverse=True)[:10000]
bestwords = set(w for w, s in best)
 
def best_word_feats(words):
    """Keep only the tokens present in the module-level ``bestwords`` set.

    Returns a feature dict mapping each kept token to True.
    """
    feats = {}
    for token in words:
        if token in bestwords:
            feats[token] = True
    return feats
 
# Second run: features restricted to the tokens in `bestwords`.
print('evaluating best word features')
evaluate_classifier(best_word_feats)
 
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Return features for the top-``n`` bigrams plus the best single tokens.

    :param words: tokens of one document; read once by the collocation
        finder and once more by ``best_word_feats``.
    :param score_fn: association measure used to rank bigrams.
    :param n: how many bigrams to keep.
    :return: dict whose keys are bigram tuples and tokens, all mapped to True.
    """
    top_bigrams = BigramCollocationFinder.from_words(words).nbest(score_fn, n)
    features = dict((pair, True) for pair in top_bigrams)
    features.update(best_word_feats(words))
    return features
 
# Third run: best single tokens plus each tweet's top chi-square bigrams.
print('evaluating best words + bigram chi_sq word features')
evaluate_classifier(best_bigram_word_feats)

Help on function precision in module nltk.metrics.scores:

precision(reference, test)
    Given a set of reference values and a set of test values, return
    the fraction of test values that appear in the reference set.
    In particular, return card(``reference`` intersection ``test``)/card(``test``).
    If ``test`` is empty, then return None.
    
    :type reference: set
    :param reference: A set of reference values.
    :type test: set
    :param test: A set of values to compare against the reference set.
    :rtype: float or None



In [40]:
# Number of word tokens in the 'pos' category of the movie_reviews corpus.
len(movie_reviews.words(categories=['pos']))

832564

In [45]:
# Sanity check: build (features, 'neg') pairs for the first three negative
# reviews and confirm one pair is produced per file id.
len([(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in movie_reviews.fileids('neg')[:3]])

3

In [43]:
# `negids` is only assigned as a local inside evaluate_classifier, so
# evaluating it at notebook scope raises NameError.
negids

NameError: name 'negids' is not defined