In [12]:
# Andy Bromberg's Simple Sentiment Analysis System
# Uses data from Pang & Lee (2005)
# Uses a Naive Bayes classifier to train the system

import re, math, collections, itertools, sys, os
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

from nltk.tokenize.regexp import WordPunctTokenizer

from nltk.corpus import stopwords
from nltk import precision
from nltk import recall



# Module-level WordPunctTokenizer instance, created once and reused by tokenize().
nltk_tokenizer = WordPunctTokenizer()


#__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

def tokenize(text):
    """Tokenize an iterable of strings as one lowercased string.

    The items of ``text`` are joined with single spaces, the result is
    lowercased, and the module-level ``nltk_tokenizer`` splits it into tokens.

    Args:
        text: Iterable of strings to be joined and tokenized.

    Returns:
        Whatever ``nltk_tokenizer.tokenize`` returns for the joined,
        lowercased string.
    """
    lowered = ' '.join(text).lower()
    return nltk_tokenizer.tokenize(lowered)

def _strip_stopwords(lines, stop_words):
    """Reduce each line to its purely alphabetic, non-stopword tokens.

    Each line is split on whitespace; tokens that are in ``stop_words`` or
    that fail ``str.isalpha()`` are dropped, and the survivors are re-joined
    with single spaces. One output string is produced per input line.
    """
    return [
        ' '.join(
            word for word in line.split()
            if word not in stop_words and word.isalpha()
        )
        for line in lines
    ]


def _label_sentences(sentences, feature_select, label):
    """Build ``(featureset, label)`` instances from cleaned sentences.

    Each sentence is broken into word / punctuation tokens with a regex, and
    the token list is handed to ``feature_select`` to produce the featureset.
    """
    token_pattern = re.compile(r"[\w']+|[.,!?;]")
    return [
        (feature_select(token_pattern.findall(sentence)), label)
        for sentence in sentences
    ]


def evaluate_features(feature_select):
    """Train and evaluate a Naive Bayes polarity classifier.

    Reads ``rt-polarity-pos.txt`` and ``rt-polarity-neg.txt`` from the current
    working directory, removes English stopwords and non-alphabetic tokens,
    converts every sentence to a featureset with ``feature_select``, trains on
    the first three quarters of each class and tests on the remainder.
    Per-sentence predictions, accuracy, per-class precision/recall and the ten
    most informative features are printed to stdout.

    Args:
        feature_select: Callable taking a list of word tokens and returning
            the featureset passed to the classifier.

    Returns:
        A tuple ``(posSentences_nostop, negSentences_nostop)``: the cleaned
        positive and negative sentences, one string per input line.

    Raises:
        OSError: If either corpus file cannot be opened.
    """
    # The stopword list is loop-invariant: load it once and use a set so each
    # membership test is O(1) instead of re-reading and scanning a list for
    # every single word.
    stop_words = frozenset(stopwords.words("english"))

    # Context managers guarantee both files are closed, even on error.
    with open('rt-polarity-neg.txt', 'r', encoding="utf8") as negSentences, \
            open('rt-polarity-pos.txt', 'r', encoding="utf8") as posSentences:
        print(type(negSentences))
        posSentences_nostop = _strip_stopwords(posSentences, stop_words)
        negSentences_nostop = _strip_stopwords(negSentences, stop_words)

    # Break the sentences into individual words and create the labelled
    # instance structures the classifier expects.
    posFeatures = _label_sentences(posSentences_nostop, feature_select, 'pos')
    negFeatures = _label_sentences(negSentences_nostop, feature_select, 'neg')

    # First 3/4 of each class is used for training, the rest for testing.
    posCutoff = len(posFeatures) * 3 // 4
    negCutoff = len(negFeatures) * 3 // 4
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    # Train the classifier on the training split.
    classifier = NaiveBayesClassifier.train(trainFeatures)

    # Collect, per label, the indices of test instances that truly carry that
    # label (reference) and those predicted to carry it (test).
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        print("I predicted {} as {}".format(features, predicted))
        testSets[predicted].add(i)

    # Outputs
    print('train on %s instances, test on %s instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision:', precision(referenceSets['pos'], testSets['pos']))
    print('pos recall:', recall(referenceSets['pos'], testSets['pos']))
    print('neg precision:', precision(referenceSets['neg'], testSets['neg']))
    print('neg recall:', recall(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)

    return posSentences_nostop, negSentences_nostop
    
def make_full_dict(words):
    """Map every word in ``words`` to ``True``.

    Args:
        words: Iterable of hashable tokens.

    Returns:
        A dict with one key per distinct word, each mapped to ``True``.
        Duplicate words collapse into a single key.
    """
    return dict.fromkeys(words, True)

# Baseline run: make_full_dict turns every word of a sentence into a feature.
# The cleaned positive / negative sentences are kept for later inspection.
print('using all words as features')
pos_stuff, neg_stuff = evaluate_features(make_full_dict)

using all words as features
<class '_io.TextIOWrapper'>
I predicted {'delicious': True, 'crime': True, 'drama': True, 'par': True, 'slickest': True, 'mamet': True} as pos
I predicted {'charming': True, 'witty': True, 'also': True, 'somewhat': True, 'clumsy': True} as pos
I predicted {'directed': True, 'purpose': True, 'finesse': True, 'roger': True, 'mitchell': True, 'handily': True, 'makes': True, 'move': True, 'pleasing': True, 'relatively': True, 'lightweight': True, 'commercial': True, 'fare': True, 'notting': True, 'hill': True, 'real': True, 'thematic': True, 'heft': True} as pos
I predicted {'escapes': True, 'precious': True, 'trappings': True, 'romantic': True, 'comedies': True, 'infusing': True, 'story': True, 'real': True, 'complicated': True, 'emotions': True} as pos
I predicted {'big': True, 'screen': True, 'caper': True, 'good': True, 'bark': True, 'far': True} as neg
I predicted {'manages': True, 'breathe': True, 'life': True, 'somewhat': True, 'tired': True, 'premise': T

In [8]:
print(pos_stuff)


NameError: name 'pos_stuff' is not defined