In [1]:
import csv
import re

In [2]:
# Read the labelled tweets and split each one into tokens: runs of word
# characters/apostrophes, or a single one of . , ! ? : ;
# Sentiment '1' rows are stored as [tokens, 'pos'], Sentiment '0' rows as
# [tokens, 'neg']; rows with any other Sentiment value are skipped.
tokenizer = re.compile(r"[\w']+|[.,!?:;]")
pre_posFeatures = []
pre_negFeatures = []
with open('Sentiment Analysis Dataset.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    print("start importing text data...")
    for row in reader:
        sentiment = row['Sentiment']
        if sentiment not in ('1', '0'):
            continue
        # Trailing whitespace is stripped before tokenizing.
        tokens = tokenizer.findall(row['SentimentText'].rstrip())
        if sentiment == '1':
            pre_posFeatures.append([tokens, 'pos'])
        else:
            pre_negFeatures.append([tokens, 'neg'])
    print("posFeatures size: " + str(len(pre_posFeatures)))
    print("negFeatures size: " + str(len(pre_negFeatures)))

start importing text data...
posFeatures size: 790178
negFeatures size: 788436


In [3]:
# Sanity check: show the first tokenized entry of each class.
first_raw_pos = pre_posFeatures[0]
print("pre_posFeatures: " + str(first_raw_pos))
first_raw_neg = pre_negFeatures[0]
print("pre_negFeatures: " + str(first_raw_neg))

pre_posFeatures: [['omg', 'its', 'already', '7', ':', '30', ':', 'O'], 'pos']
pre_negFeatures: [['is', 'so', 'sad', 'for', 'my', 'APL', 'friend', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.'], 'neg']


In [4]:
#pre-process data
import nltk
from nltk.corpus import stopwords
import string


# Individual punctuation characters, as a set, so the filter below is a true
# per-token membership test rather than a substring search in one long string.
PUNCTUATION = frozenset(string.punctuation)


def clean_sentences(labelled_sentences, label):
    """Return a new list of [tokens, label] pairs with noise tokens removed.

    labelled_sentences: iterable whose items hold a token list at index 0.
    label: the class label stored at index 1 of every returned pair.

    Tokens made up only of digits and tokens that are a single punctuation
    character are dropped; the remaining tokens are lower-cased.
    Stopwords are NOT removed here.
    """
    cleaned = []
    for sentence in labelled_sentences:
        tokens = [word.lower() for word in sentence[0]
                  if not word.isdigit() and word not in PUNCTUATION]
        # Inner pairs stay lists because later cells assign to pair[0].
        cleaned.append([tokens, label])
    return cleaned


print("start pre-processing text data...")
# Remove punctuation and digit-only tokens, then lower-case what is left.
posFeatures = clean_sentences(pre_posFeatures, 'pos')
negFeatures = clean_sentences(pre_negFeatures, 'neg')
print("end pre-processing text data")


start pre-processing text data...
end pre-processing text data


In [5]:
# Sanity check: show the first pre-processed entry of each class.
first_clean_pos = posFeatures[0]
print("posFeature: " + str(first_clean_pos))
first_clean_neg = negFeatures[0]
print("negFeature: " + str(first_clean_neg))

posFeature: [['omg', 'its', 'already', 'o'], 'pos']
negFeature: [['is', 'so', 'sad', 'for', 'my', 'apl', 'friend'], 'neg']


In [6]:
# Replace each positive entry's token list, in place, with a dict that maps
# every distinct token to True.
for labelled in posFeatures:
    labelled[0] = dict.fromkeys(labelled[0], True)

In [7]:
# Replace each negative entry's token list, in place, with a dict that maps
# every distinct token to True.
for labelled in negFeatures:
    labelled[0] = dict.fromkeys(labelled[0], True)

In [8]:
import math

# Split each class at the 3/4 mark (rounded down): the leading part goes to
# training, the remainder to testing. Positives come first in both lists.
posCutoff = (3 * len(posFeatures)) // 4
negCutoff = (3 * len(negFeatures)) // 4

trainFeatures = list(posFeatures[:posCutoff])
trainFeatures.extend(negFeatures[:negCutoff])

testFeatures = list(posFeatures[posCutoff:])
testFeatures.extend(negFeatures[negCutoff:])

In [9]:
from nltk.classify import NaiveBayesClassifier

print("start Naive Bayes Classifier...")

# Fit the Naive Bayes model on the training portion of both classes.
classifier = NaiveBayesClassifier.train(trainFeatures)

start Naive Bayes Classifier...


In [12]:
import collections

# Per-label index sets, filled by the evaluation loop and read by the
# precision/recall report:
#   trueSets[label] -> indices of test examples whose actual label is `label`
#   predSets[label] -> indices of test examples the classifier labelled `label`
# defaultdict(set) creates an empty set the first time a label is looked up.
trueSets = collections.defaultdict(set)
predSets = collections.defaultdict(set)

In [14]:
# For every test example, file its index under its actual label in trueSets
# and under the classifier's predicted label in predSets.
for index, example in enumerate(testFeatures):
    featureset, gold = example
    trueSets[gold].add(index)
    predSets[classifier.classify(featureset)].add(index)

In [19]:
import nltk.classify.util
from nltk.metrics.scores import precision as precision
from nltk.metrics.scores import recall as recall

def _report(caption, value):
    """Print caption and value separated by one space."""
    print(caption + ' ' + str(value))


# Split sizes, overall accuracy, per-class precision and recall, and the ten
# most informative features.
print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
_report('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
_report('pos precision:', precision(trueSets['pos'], predSets['pos']))
_report('pos recall:', recall(trueSets['pos'], predSets['pos']))
_report('neg precision:', precision(trueSets['neg'], predSets['neg']))
_report('neg recall:', recall(trueSets['neg'], predSets['neg']))
classifier.show_most_informative_features(10)

 train on 1183960 instances, test on 394654 instances
accuracy: 0.746818732358
pos precision: 0.869035018749
pos recall: 0.581887671164
neg precision: 0.685207064509
neg recall: 0.912114616786
Most Informative Features
            tweeteradder = True              pos : neg    =    487.3 : 1.0
              banksyart2 = True              pos : neg    =     57.5 : 1.0
                 mcmahon = True              neg : pos    =     54.3 : 1.0
                    sadd = True              neg : pos    =     51.8 : 1.0
                 saddens = True              neg : pos    =     49.1 : 1.0
             shareholder = True              pos : neg    =     48.2 : 1.0
                    owie = True              neg : pos    =     39.1 : 1.0
                  farrah = True              neg : pos    =     35.2 : 1.0
                  boohoo = True              neg : pos    =     33.3 : 1.0
         iammaxathotspot = True              pos : neg    =     30.3 : 1.0
