In [1]:
import re, math, collections, itertools, os
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk import precision,recall

In [2]:
path = 'manual_labels'
print('subdirectories are:' + str(os.listdir(path)))
def get_files(path):
    return sorted([path+ os.sep +f for f in os.listdir(path) if f.endswith(".txt")])

subdirectories are:['test', 'train']


In [3]:
pos_train_files = get_files(path + os.sep + 'train' + os.sep + 'pos')
neg_train_files = get_files(path + os.sep + 'train' + os.sep + 'neg')
all_train_files = pos_train_files + neg_train_files

pos_test_files = get_files(path + os.sep + 'test' + os.sep + 'pos')
neg_test_files = get_files(path + os.sep + 'test' + os.sep + 'neg')
all_test_files = pos_test_files + neg_test_files

print('found %d positive and %d negative training files' %
      (len(pos_train_files), len(neg_train_files)))

print('found %d positive and %d negative test files' %
      (len(pos_test_files), len(neg_test_files)))

print('first positive file: %s' % pos_train_files[0])
print('first negative file: %s' % neg_train_files[0])

found 79 positive and 33 negative training files
found 26 positive and 7 negative test files
first positive file: manual_labels/train/pos/1.txt
first negative file: manual_labels/train/neg/102.txt


In [4]:
def evaluate_features(feature_select):
    positiveFeatures = []
    negativeFeatures = []
    for pos_file in pos_train_files:
        with open(pos_file, 'r') as posSentences:
            for i in posSentences:
                posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
                posWords = [feature_select(posWords), 'pos']
                positiveFeatures.append(posWords)
    for neg_file in neg_train_files:
        with open(neg_file, 'r') as negSentences:
            for i in negSentences:
                negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
                negWords = [feature_select(negWords), 'neg']
                negativeFeatures.append(negWords)

    #70% to train and remaining to test
    posCutoff = int(math.floor(len(positiveFeatures)*7/10))
    negCutoff = int(math.floor(len(negativeFeatures)*3/10))
    trainFeatures = positiveFeatures[:posCutoff] + negativeFeatures[:negCutoff]
    testFeatures = positiveFeatures[posCutoff:] + negativeFeatures[negCutoff:]

    classifier = NaiveBayesClassifier.train(trainFeatures)

    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    print 'train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures))
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testFeatures)
    print 'pos precision:', precision(referenceSets['pos'], testSets['pos'])
    print 'pos recall:', recall(referenceSets['pos'], testSets['pos'])
    print 'neg precision:', precision(referenceSets['neg'], testSets['neg'])
    print 'neg recall:', recall(referenceSets['neg'], testSets['neg'])
    classifier.show_most_informative_features(10)

#creates a feature selection mechanism that uses all words
def make_full_dict(words):
    return dict([(word, True) for word in words])

#tries using all words as the feature selection mechanism
print 'Now am using all the words as feature set'
evaluate_features(make_full_dict)

Now am using all the words as feature set
train on 342 instances, test on 274 instances
accuracy: 0.43795620438
pos precision: 0.432432432432
pos recall: 0.941176470588
neg precision: 0.533333333333
neg recall: 0.0516129032258
Most Informative Features
                       2 = True              neg : pos    =     12.4 : 1.0
                ordering = True              neg : pos    =      6.9 : 1.0
                 already = True              neg : pos    =      6.9 : 1.0
                    Naan = True              neg : pos    =      6.9 : 1.0
               something = True              neg : pos    =      6.9 : 1.0
                    give = True              neg : pos    =      6.9 : 1.0
                   hours = True              neg : pos    =      6.9 : 1.0
                   Don't = True              neg : pos    =      6.9 : 1.0
                    make = True              neg : pos    =      6.9 : 1.0
                  Sunday = True              neg : pos    =      6.9 : 1

In [5]:
def create_word_scores():
    posWords = []
    negWords = []
    for pos_file in pos_train_files:
        with open(pos_file, 'r') as posSentences:
            for i in posSentences:
                posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
                posWords.append(posWord)
    for neg_file in neg_train_files:
        with open(neg_file, 'r') as negSentences:
            for i in negSentences:
                negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
                negWords.append(negWord)
        posWords = list(itertools.chain(*posWords))
        negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1


    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores

In [6]:
word_scores = create_word_scores()

In [7]:
def find_best_words(word_scores, number):
    best_vals = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:number]
    best_words = set([w for w, s in best_vals])
    return best_words

In [8]:
def best_word_features(words):
    return dict([(word, True) for word in words if word in best_words])

In [9]:
numbers_to_test = [10, 100, 1000, 10000, 15000]
for num in numbers_to_test:
    print 'evaluating best %d word features' % (num)
    best_words = find_best_words(word_scores, num)
    evaluate_features(best_word_features)

evaluating best 10 word features
train on 342 instances, test on 274 instances
accuracy: 0.434306569343
pos precision: 0.434306569343
pos recall: 1.0
neg precision: None
neg recall: 0.0
Most Informative Features
                       ! = True              pos : neg    =      5.1 : 1.0
                      of = True              pos : neg    =      1.8 : 1.0
                     the = True              pos : neg    =      1.6 : 1.0
                     was = True              neg : pos    =      1.4 : 1.0
                     but = True              pos : neg    =      1.4 : 1.0
                      it = True              neg : pos    =      1.3 : 1.0
                     the = None              neg : pos    =      1.3 : 1.0
                    food = True              neg : pos    =      1.3 : 1.0
                       ! = None              neg : pos    =      1.2 : 1.0
                      of = None              neg : pos    =      1.1 : 1.0
evaluating best 100 word features
trai