In [2]:
 # Michael Morales - IST 664 Final Project

In [36]:
import os
import sys
import random
import time
import pandas
import re

import nltk
from nltk.corpus import subjectivity
from nltk.corpus import sentence_polarity
from nltk.corpus import stopwords
from nltk.corpus.reader.api import *
from nltk.tokenize import *
from nltk.collocations import *
from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
import sentiment_read_LIWC_pos_neg_words

In [37]:
# Set directory path

dirPath = 'C:/Users/madmo/OneDrive/Syracuse/IST664 - NLP/Final Project/kagglemoviereviews'

In [38]:
# Function: define features

def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    return features

In [39]:
# Function: cross-validation

def cross_validation_PRF(num_folds, featuresets, labels):
    subset_size = int(len(featuresets)/num_folds)
    print('Each fold size:', subset_size)
    # for the number of labels - start the totals lists with zeroes
    num_labels = len(labels)
    total_precision_list = [0] * num_labels
    total_recall_list = [0] * num_labels
    total_F1_list = [0] * num_labels

    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        # evaluate against test_this_round to produce the gold and predicted labels
        goldlist = []
        predictedlist = []
        for (features, label) in test_this_round:
            goldlist.append(label)
            predictedlist.append(classifier.classify(features))

        # computes evaluation measures for this fold and
        #   returns list of measures for each label
        print('Fold', i)
        (precision_list, recall_list, F1_list) \
                  = eval_measures(goldlist, predictedlist, labels)
        # take off triple string to print precision, recall and F1 for each fold
        '''
        print('\tPrecision\tRecall\t\tF1')
        # print measures for each label
        for i, lab in enumerate(labels):
            print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
              "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))
        '''
        # for each label add to the sums in the total lists
        for i in range(num_labels):
            # for each label, add the 3 measures to the 3 lists of totals
            total_precision_list[i] += precision_list[i]
            total_recall_list[i] += recall_list[i]
            total_F1_list[i] += F1_list[i]

    # find precision, recall and F measure averaged over all rounds for all labels
    # compute averages from the totals lists
    precision_list = [tot/num_folds for tot in total_precision_list]
    recall_list = [tot/num_folds for tot in total_recall_list]
    F1_list = [tot/num_folds for tot in total_F1_list]
    # the evaluation measures in a table with one row per label
    print('\nAverage Precision\tRecall\t\tF1 \tPer Label')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
          "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))
    
    # print macro average over all labels - treats each label equally
    print('\nMacro Average Precision\tRecall\t\tF1 \tOver All Labels')
    print('\t', "{:10.3f}".format(sum(precision_list)/num_labels), \
          "{:10.3f}".format(sum(recall_list)/num_labels), \
          "{:10.3f}".format(sum(F1_list)/num_labels))

    # for micro averaging, weight the scores for each label by the number of items
    #    this is better for labels with imbalance
    # first intialize a dictionary for label counts and then count them
    label_counts = {}
    for lab in labels:
      label_counts[lab] = 0 
    # count the labels
    for (doc, lab) in featuresets:
      label_counts[lab] += 1
    # make weights compared to the number of documents in featuresets
    num_docs = len(featuresets)
    label_weights = [(label_counts[lab] / num_docs) for lab in labels]
    print('\nLabel Counts', label_counts)
    #print('Label weights', label_weights)
    # print macro average over all labels
    print('Micro Average Precision\tRecall\t\tF1 \tOver All Labels')
    precision = sum([a * b for a,b in zip(precision_list, label_weights)])
    recall = sum([a * b for a,b in zip(recall_list, label_weights)])
    F1 = sum([a * b for a,b in zip(F1_list, label_weights)])
    print( '\t', "{:10.3f}".format(precision), \
      "{:10.3f}".format(recall), "{:10.3f}".format(F1))

In [40]:
# Function: compute precision, recall, and F1

def eval_measures(gold, predicted, labels):
    
    # these lists have values for each label 
    recall_list = []
    precision_list = []
    F1_list = []

    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        # for small numbers, guard against dividing by zero in computing measures
        if (TP == 0) or (FP == 0) or (FN == 0):
          recall_list.append (0)
          precision_list.append (0)
          F1_list.append(0)
        else:
          recall = TP / (TP + FP)
          precision = TP / (TP + FN)
          recall_list.append(recall)
          precision_list.append(precision)
          F1_list.append( 2 * (recall * precision) / (recall + precision))

    # the evaluation measures in a table with one row per label
    return (precision_list, recall_list, F1_list)

In [41]:
# Function: define bigram features

def bigram_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)    
    return features

In [42]:
# Function: write featuresets to CSV file

def writeFeatureSets(featuresets, outpath):
    # Open outpath for writing
    f = open(outpath, 'w')
    # Get the feature names from the feature dictionary in the first featureset
    featurenames = featuresets[0][0].keys()
    # Create the first list of the file as comma-separated feature names
    #   with the word class as the last feature name
    featurenameline = ''
    for featurename in featurenames:
        # Replace forbidden characters with text abbreviations
        featurename = featurename.replace(',','CM')
        featurename = featurename.replace("'","DQ")
        featurename = featurename.replace('"','QU')
        featurenameline += featurename + ','
        featurenameline += 'class'
    # Write this as the first line in the csv file
    f.write(featurenameline)
    f.write('\n')
    # Convert each feature set to a line in the file with comma separated feature values,
    # each feature value is converted to a string 
    #   for booleans this is the words true and false
    #   for numbers, this is the string with the number
    for featureset in featuresets:
        featureline = ''
        for key in featurenames:
            try:
                featureline += str(featureset[0].get(key, []))+','
            except KeyError:
                continue
        featureline += str(featureset[1])
        # Write each feature set values to the file
        f.write(featureline)
        f.write('\n')
    f.close()

In [43]:
# Function: define sentiment lexicon features

def readSubjectivity(path):
	flexicon = open(path, 'r')
	# initialize an empty dictionary
	sldict = { }
	for line in flexicon:
		fields = line.split()   # default is to split on whitespace
		# split each field on the '=' and keep the second part as the value
		strength = fields[0].split("=")[1]
		word = fields[2].split("=")[1]
		posTag = fields[3].split("=")[1]
		stemmed = fields[4].split("=")[1]
		polarity = fields[5].split("=")[1]
		if (stemmed == 'y'):
			isStemmed = True
		else:
			isStemmed = False
		# put a dictionary entry with the word as the keyword
		#     and a list of the other values
		sldict[word] = [strength, posTag, isStemmed, polarity]
	return sldict

In [44]:
# Function: define sentiment lexicon features

def SL_features(document, word_features, SL):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    # count variables for the 4 classes of subjectivity
    weakPos = 0
    strongPos = 0
    weakNeg = 0
    strongNeg = 0
    for word in document_words:
        if word in SL:
            strength, posTag, isStemmed, polarity = SL[word]
            if strength == 'weaksubj' and polarity == 'positive':
                weakPos += 1
            if strength == 'strongsubj' and polarity == 'positive':
                strongPos += 1
            if strength == 'weaksubj' and polarity == 'negative':
                weakNeg += 1
            if strength == 'strongsubj' and polarity == 'negative':
                strongNeg += 1
            features['positivecount'] = weakPos + (2 * strongPos)
            features['negativecount'] = weakNeg + (2 * strongNeg)      
    return features

In [45]:
# Function: retrieve poslist and neglist from LIWC dictionary

def read_words():
  poslist = []
  neglist = []

  flexicon = open('C:/Users/madmo/OneDrive/Syracuse/IST664 - NLP/Final Project/liwcdic2007.dic', encoding='latin1')
  # read all LIWC words from file
  wordlines = [line.strip() for line in flexicon]
  # each line has a word or a stem followed by * and numbers of the word classes it is in
  # word class 126 is positive emotion and 127 is negative emotion
  for line in wordlines:
    if not line == '':
      items = line.split()
      word = items[0]
      classes = items[1:]
      for c in classes:
        if c == '126':
          poslist.append( word )
        if c == '127':
          neglist.append( word )
  return (poslist, neglist)

poslist, neglist = read_words()

In [46]:
# Function: define LIWC sentiment lexicon features
# https://github.com/bjprogrammer/Kaggle-Movie-Review/blob/master/kagglemoviereviews/classifyKaggle.py

def liwc_features(doc, word_features,poslist,neglist):
  doc_words = set(doc)
  features = {}
  for word in word_features:
    features['contains({})'.format(word)] = (word in doc_words)
  pos = 0
  neg = 0
  for word in doc_words:
    if sentiment_read_LIWC_pos_neg_words.isPresent(word,poslist):
      pos += 1
    if sentiment_read_LIWC_pos_neg_words.isPresent(word,neglist):
      neg += 1
    features['positivecount'] = pos
    features['negativecount'] = neg
  if 'positivecount' not in features:
    features['positivecount']=0
  if 'negativecount' not in features:
    features['negativecount']=0  
  return features

In [47]:
# Function: define a combination of SL and LIWC lexicons
# https://github.com/bjprogrammer/Kaggle-Movie-Review/blob/master/kagglemoviereviews/classifyKaggle.py

def SL_liwc_features(doc, word_features, SL,poslist,neglist):
  document_words = set(doc)
  features = {}
  for word in word_features:
    features['contains({})'.format(word)] = (word in document_words)
  # count variables for the 4 classes of subjectivity
  weakPos = 0
  strongPos = 0
  weakNeg = 0
  strongNeg = 0
  for word in document_words:
    if sentiment_read_LIWC_pos_neg_words.isPresent(word,poslist):
      strongPos += 1
    elif sentiment_read_LIWC_pos_neg_words.isPresent(word,neglist):
      strongNeg += 1
    elif word in SL:
      strength, posTag, isStemmed, polarity = SL[word]
      if strength == 'weaksubj' and polarity == 'positive':
        weakPos += 1
      if strength == 'strongsubj' and polarity == 'positive':
        strongPos += 1
      if strength == 'weaksubj' and polarity == 'negative':
        weakNeg += 1
      if strength == 'strongsubj' and polarity == 'negative':
        strongNeg += 1
    features['positivecount'] = weakPos + (2 * strongPos)
    features['negativecount'] = weakNeg + (2 * strongNeg)
  
  if 'positivecount' not in features:
    features['positivecount']=0
  if 'negativecount' not in features:
    features['negativecount']=0      
  return features

In [48]:
# Function: define part-of-speech tagging features

def POS_features(document, word_features):
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

In [49]:
# Function: Representing negation

def NOT_features(document, word_features, negationwords):
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = False
        features['V_NOT{}'.format(word)] = False
    # go through document words in order
    for i in range(0, len(document)):
        word = document[i]
        if ((i + 1) < len(document)) and ((word in negationwords) or (word.endswith("n't"))):
            i += 1
            features['V_NOT{}'.format(document[i])] = (document[i] in word_features)
        else:
            features['V_{}'.format(word)] = (word in word_features)
    return features

negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor']

In [50]:
# Function: Bing Liu's Opinion Lexicon

dirPath = 'C:/Users/madmo/OneDrive/Syracuse/IST664 - NLP/Final Project/kagglemoviereviews'

def read_opinionlexicon():
    POSITIVE_REVIEWS = 'C:/Users/madmo/OneDrive/Syracuse/IST664 - NLP/Final Project/rt-polarity-pos.txt'
    NEGATIVE_REVIEWS = 'C:/Users/madmo/OneDrive/Syracuse/IST664 - NLP/Final Project/rt-polarity-neg.txt'
    
    pos_features = []
    neg_features = []
    for line in open(POSITIVE_REVIEWS, 'r').readlines()[35:]:
        pos_words = re.findall(r"[\w']+|[.,!?;]", line.rstrip())
        pos_features.append(pos_words[0])
        
    for line in open(NEGATIVE_REVIEWS, 'r').readlines()[35:]:
        neg_words = re.findall(r"[\w']+|[.,!?;]", line.rstrip())
        neg_features.append(neg_words[0])
  
    return pos_features,neg_features

poslist2,neglist2 = read_opinionlexicon()

In [51]:
# I could not get this code to work when encapsulated in the processkaggle() function
# Featureset1: Bag of words / unigram (baseline)

vocab_size = 500
limit = int(10000)
os.chdir(dirPath)
f = open('./train.tsv', 'r')
phrasedata=[]
for line in f:
    if (not line.startswith('Phrase')):
        line = line.strip()
        phrasedata.append(line.split('\t')[2:4])
random.shuffle(phrasedata)
phraselist = phrasedata[:limit]
phrasedocs = []
for phrase in phraselist:
    tokens = nltk.word_tokenize(phrase[0])
    phrasedocs.append((tokens, int(phrase[1])))
docs = []
for phrase in phrasedocs:
    lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
    docs.append(lowerphrase)
all_words_list = [word for (sent, cat) in docs for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))
word_items = all_words.most_common(vocab_size)
word_features = [word for (word, count) in word_items]
featuresets = [(document_features(d, word_features), c) for (d,c) in docs]

10357


In [52]:
label_list = [c for (d,c) in docs]
labels = list(set(label_list))
num_folds = 5

In [53]:
start = time.time()
cross_validation_PRF(num_folds, featuresets, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.208      0.178      0.190
1 	      0.214      0.344      0.263
2 	      0.826      0.620      0.709
3 	      0.215      0.390      0.277
4 	      0.179      0.277      0.217

Macro Average Precision	Recall		F1 	Over All Labels
	      0.328      0.362      0.331

Label Counts {0: 444, 1: 1691, 2: 5160, 3: 2119, 4: 586}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.528      0.485      0.490
39.65596914291382  seconds elapsed.


In [54]:
# Naive Bayes for featuresets(baseline)

train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.504

In [55]:
# Featureset2: bigram

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(all_words_list)
bigram_features = finder.nbest(bigram_measures.chi_sq, 500)
# print(bigram_features[:50])
featuresets2 = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featuresets2, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.208      0.177      0.190
1 	      0.214      0.344      0.263
2 	      0.827      0.620      0.709
3 	      0.215      0.390      0.277
4 	      0.179      0.277      0.217

Macro Average Precision	Recall		F1 	Over All Labels
	      0.328      0.362      0.331

Label Counts {0: 444, 1: 1691, 2: 5160, 3: 2119, 4: 586}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.528      0.485      0.490
79.61417579650879  seconds elapsed.


In [56]:
# Naive Bayes for featuresets2(bigrams)

train_set, test_set = featuresets2[1000:], featuresets2[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.504

In [57]:
# Featureset3: Sentiment Lexicon

SLpath = 'C:/Users/madmo/OneDrive/Syracuse/IST664 - NLP/Final Project/kagglemoviereviews/SentimentLexicons/subjclueslen1-HLTEMNLP05.tff'
SL = readSubjectivity(SLpath)

featureset3 = [(SL_features(d, word_features, SL), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset3, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.248      0.182      0.209
1 	      0.242      0.328      0.279
2 	      0.751      0.666      0.706
3 	      0.353      0.418      0.383
4 	      0.255      0.280      0.267

Macro Average Precision	Recall		F1 	Over All Labels
	      0.370      0.375      0.369

Label Counts {0: 444, 1: 1691, 2: 5160, 3: 2119, 4: 586}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.529      0.512      0.517
39.48142886161804  seconds elapsed.


In [58]:
# Naive Bayes for featureset3 (sentiment lexicon)

train_set, test_set = featureset3[1000:], featureset3[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.487

In [59]:
# Featureset4 : LIWC

featureset4 = [(liwc_features(d, word_features,poslist,neglist), c) for (d,c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset4, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.232      0.195      0.211
1 	      0.213      0.349      0.264
2 	      0.817      0.639      0.717
3 	      0.276      0.424      0.334
4 	      0.227      0.293      0.255

Macro Average Precision	Recall		F1 	Over All Labels
	      0.353      0.380      0.356

Label Counts {0: 444, 1: 1691, 2: 5160, 3: 2119, 4: 586}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.540      0.504      0.510
39.65498113632202  seconds elapsed.


In [60]:
# Naive Bayes for featureset4 (LIWC sentiment lexicon)

train_set, test_set = featureset4[1000:], featureset4[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.509

In [61]:
# Featureset5: Combination SL and LIWC

featureset5 = [(SL_liwc_features(d, word_features, SL, poslist, neglist), c) for (d,c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset5, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.241      0.192      0.213
1 	      0.219      0.353      0.270
2 	      0.812      0.647      0.720
3 	      0.289      0.421      0.342
4 	      0.245      0.314      0.275

Macro Average Precision	Recall		F1 	Over All Labels
	      0.361      0.385      0.364

Label Counts {0: 444, 1: 1691, 2: 5160, 3: 2119, 4: 586}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.542      0.509      0.515
38.23581027984619  seconds elapsed.


In [62]:
# Naive Bayes for featureset5 (combination SL and LIWC)

train_set, test_set = featureset5[1000:], featureset5[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.515

In [63]:
# Featureset 6: Part-of-speech tagging

featureset6 = [(POS_features(d, word_features), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset6, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.247      0.154      0.189
1 	      0.234      0.344      0.279
2 	      0.803      0.630      0.706
3 	      0.198      0.381      0.260
4 	      0.208      0.272      0.235

Macro Average Precision	Recall		F1 	Over All Labels
	      0.338      0.356      0.334

Label Counts {0: 444, 1: 1691, 2: 5160, 3: 2119, 4: 586}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.519      0.487      0.489
39.859434604644775  seconds elapsed.


In [64]:
# Naive Bayes for featureset6 (part-of-speech tagging)

train_set, test_set = featureset6[1000:], featureset6[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.495

In [65]:
# Featureset 7: Representing negation

featureset7 = [(NOT_features(d, word_features, negationwords), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset7, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.470      0.139      0.214
1 	      0.217      0.348      0.267
2 	      0.680      0.687      0.684
3 	      0.261      0.425      0.323
4 	      0.381      0.216      0.275

Macro Average Precision	Recall		F1 	Over All Labels
	      0.402      0.363      0.352

Label Counts {0: 444, 1: 1691, 2: 5160, 3: 2119, 4: 586}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.486      0.522      0.492
77.46689009666443  seconds elapsed.


In [66]:
# Naive Bayes for featureset7 (representing negation)

train_set, test_set = featureset7[1000:], featureset7[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.474

In [67]:
# Featureset 8: Using Bing Liu's Opinion Lexicon, obtained at:
#   https://www.cs.uic.edu/~liub/

featureset8 = [(liwc_features(d, word_features,poslist2,neglist2), c) for (d,c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset8, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.225      0.141      0.172
1 	      0.230      0.324      0.269
2 	      0.790      0.626      0.698
3 	      0.205      0.378      0.266
4 	      0.197      0.272      0.228

Macro Average Precision	Recall		F1 	Over All Labels
	      0.329      0.348      0.327

Label Counts {0: 444, 1: 1691, 2: 5160, 3: 2119, 4: 586}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.511      0.480      0.483
37.89269280433655  seconds elapsed.


In [68]:
# Naive Bayes for featureset8 (Bing Liu's Opinion Lexicon)

train_set, test_set = featureset8[1000:], featureset8[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.483

In [69]:
# Featureset1: Bag of words / unigram (baseline)

vocab_size = 1000
limit = int(10000)
os.chdir(dirPath)
f = open('./train.tsv', 'r')
phrasedata=[]
for line in f:
    if (not line.startswith('Phrase')):
        line = line.strip()
        phrasedata.append(line.split('\t')[2:4])
random.shuffle(phrasedata)
phraselist = phrasedata[:limit]
phrasedocs = []
for phrase in phraselist:
    tokens = nltk.word_tokenize(phrase[0])
    phrasedocs.append((tokens, int(phrase[1])))
docs = []
for phrase in phrasedocs:
    lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
    docs.append(lowerphrase)
all_words_list = [word for (sent, cat) in docs for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))
word_items = all_words.most_common(vocab_size)
word_features = [word for (word, count) in word_items]
featuresets = [(document_features(d, word_features), c) for (d,c) in docs]

10366


In [70]:
# Featureset 1: Bag of words / unigram (baseline), vocabulary size 1000

start = time.time()
cross_validation_PRF(num_folds, featuresets, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.222      0.165      0.189
1 	      0.243      0.387      0.298
2 	      0.826      0.630      0.715
3 	      0.215      0.392      0.277
4 	      0.195      0.282      0.229

Macro Average Precision	Recall		F1 	Over All Labels
	      0.340      0.371      0.342

Label Counts {0: 455, 1: 1745, 2: 5136, 3: 2026, 4: 638}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.533      0.496      0.499
92.4079442024231  seconds elapsed.


In [71]:
# Naive Bayes for featuresets(baseline), vocab 1000

train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.528

In [72]:
# Featureset2: bigram, vocab 1000

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(all_words_list)
bigram_features = finder.nbest(bigram_measures.chi_sq, 500)
# print(bigram_features[:50])
featuresets2 = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featuresets2, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.222      0.165      0.189
1 	      0.243      0.387      0.298
2 	      0.826      0.630      0.715
3 	      0.215      0.392      0.277
4 	      0.195      0.282      0.229

Macro Average Precision	Recall		F1 	Over All Labels
	      0.340      0.371      0.342

Label Counts {0: 455, 1: 1745, 2: 5136, 3: 2026, 4: 638}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.533      0.496      0.499
129.47090649604797  seconds elapsed.


In [73]:
# Naive Bayes for featuresets2(bigrams), vocab 1000

train_set, test_set = featuresets2[1000:], featuresets2[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.528

In [75]:
# Featureset3: Sentiment Lexicon, vocab 1000

SLpath = 'C:/Users/madmo/OneDrive/Syracuse/IST664 - NLP/Final Project/kagglemoviereviews/kagglemoviereviews/SentimentLexicons/subjclueslen1-HLTEMNLP05.tff'
SL = readSubjectivity(SLpath)

featureset3 = [(SL_features(d, word_features, SL), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset3, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.270      0.191      0.223
1 	      0.297      0.400      0.340
2 	      0.764      0.674      0.716
3 	      0.345      0.423      0.379
4 	      0.288      0.309      0.296

Macro Average Precision	Recall		F1 	Over All Labels
	      0.393      0.399      0.391

Label Counts {0: 455, 1: 1745, 2: 5136, 3: 2026, 4: 638}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.545      0.530      0.533
81.77736592292786  seconds elapsed.


In [76]:
# Naive Bayes for featureset3 (sentiment lexicon), vocab 1000

train_set, test_set = featureset3[1000:], featureset3[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.521

In [77]:
# Featureset4 : LIWC, vocab 1000

featureset4 = [(liwc_features(d, word_features,poslist,neglist), c) for (d,c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset4, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.248      0.196      0.218
1 	      0.258      0.403      0.314
2 	      0.821      0.648      0.724
3 	      0.271      0.431      0.332
4 	      0.252      0.307      0.275

Macro Average Precision	Recall		F1 	Over All Labels
	      0.370      0.397      0.373

Label Counts {0: 455, 1: 1745, 2: 5136, 3: 2026, 4: 638}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.549      0.519      0.521
80.67737126350403  seconds elapsed.


In [78]:
# Naive Bayes for featureset4 (LIWC sentiment lexicon), vocab 1000

train_set, test_set = featureset4[1000:], featureset4[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.538

In [79]:
# Featureset5: Combination SL and LIWC, vocab 1000

featureset5 = [(SL_liwc_features(d, word_features, SL, poslist, neglist), c) for (d,c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset5, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.268      0.204      0.231
1 	      0.263      0.411      0.320
2 	      0.816      0.654      0.726
3 	      0.289      0.437      0.347
4 	      0.260      0.308      0.280

Macro Average Precision	Recall		F1 	Over All Labels
	      0.379      0.403      0.381

Label Counts {0: 455, 1: 1745, 2: 5136, 3: 2026, 4: 638}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.552      0.525      0.527
82.95425748825073  seconds elapsed.


In [80]:
# Naive Bayes for featureset5 (combination SL and LIWC), vocab 1000

train_set, test_set = featureset5[1000:], featureset5[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.527

In [81]:
# Featureset 6: Part-of-speech tagging, vocab 1000

featureset6 = [(POS_features(d, word_features), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset6, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.261      0.153      0.193
1 	      0.251      0.378      0.301
2 	      0.812      0.638      0.714
3 	      0.202      0.392      0.266
4 	      0.209      0.276      0.236

Macro Average Precision	Recall		F1 	Over All Labels
	      0.347      0.368      0.342

Label Counts {0: 455, 1: 1745, 2: 5136, 3: 2026, 4: 638}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.527      0.498      0.497
82.21129083633423  seconds elapsed.


In [82]:
# Naive Bayes for featureset6 (part-of-speech tagging), vocab 1000

train_set, test_set = featureset6[1000:], featureset6[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.52

In [83]:
# Featureset 7: Representing negation, vocab 1000

featureset7 = [(NOT_features(d, word_features, negationwords), c) for (d, c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset7, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.441      0.159      0.233
1 	      0.255      0.390      0.308
2 	      0.741      0.677      0.708
3 	      0.238      0.409      0.300
4 	      0.305      0.241      0.268

Macro Average Precision	Recall		F1 	Over All Labels
	      0.396      0.375      0.363

Label Counts {0: 455, 1: 1745, 2: 5136, 3: 2026, 4: 638}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.513      0.521      0.506
155.97499585151672  seconds elapsed.


In [84]:
# Naive Bayes for featureset7 (representing negation), vocab 1000

train_set, test_set = featureset7[1000:], featureset7[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.5

In [85]:
# Featureset 8: Using Bing Liu's Opinion Lexicon, vocab 1000

featureset8 = [(liwc_features(d, word_features,poslist2,neglist2), c) for (d,c) in docs]

# Cross-validation
start = time.time()
cross_validation_PRF(num_folds, featureset8, labels)
end = time.time()
print(end-start, " seconds elapsed.")

Each fold size: 2000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4

Average Precision	Recall		F1 	Per Label
0 	      0.272      0.158      0.200
1 	      0.254      0.364      0.299
2 	      0.800      0.635      0.708
3 	      0.195      0.379      0.257
4 	      0.213      0.277      0.240

Macro Average Precision	Recall		F1 	Over All Labels
	      0.347      0.363      0.341

Label Counts {0: 455, 1: 1745, 2: 5136, 3: 2026, 4: 638}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.521      0.492      0.492
88.94922804832458  seconds elapsed.


In [86]:
# Naive Bayes for featureset8 (Bing Liu's Opinion Lexicon), vocab 1000

train_set, test_set = featureset8[1000:], featureset8[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.505

In [87]:
# Featureset1: Bag of words / unigram (baseline), vocab 500, 10 folds

vocab_size = 500
limit = int(10000)
os.chdir(dirPath)
f = open('./train.tsv', 'r')
phrasedata=[]
for line in f:
    if (not line.startswith('Phrase')):
        line = line.strip()
        phrasedata.append(line.split('\t')[2:4])
random.shuffle(phrasedata)
phraselist = phrasedata[:limit]
phrasedocs = []
for phrase in phraselist:
    tokens = nltk.word_tokenize(phrase[0])
    phrasedocs.append((tokens, int(phrase[1])))
docs = []
for phrase in phrasedocs:
    lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
    docs.append(lowerphrase)
all_words_list = [word for (sent, cat) in docs for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))
word_items = all_words.most_common(vocab_size)
word_features = [word for (word, count) in word_items]
featuresets = [(document_features(d, word_features), c) for (d,c) in docs]

num_folds = 10
start = time.time()
cross_validation_PRF(num_folds, featuresets, labels)
end = time.time()
print(end-start, " seconds elapsed.")

10448
Each fold size: 1000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9

Average Precision	Recall		F1 	Per Label
0 	      0.171      0.175      0.173
1 	      0.239      0.357      0.286
2 	      0.817      0.607      0.696
3 	      0.233      0.412      0.297
4 	      0.169      0.244      0.197

Macro Average Precision	Recall		F1 	Over All Labels
	      0.326      0.359      0.330

Label Counts {0: 437, 1: 1794, 2: 4986, 3: 2182, 4: 601}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.519      0.479      0.483
60.70177412033081  seconds elapsed.


In [88]:
# Naive Bayes for featuresets(baseline), vocab 500, 10 folds

train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.522

In [89]:
# Featureset1: Bag of words / unigram (baseline), vocab 1000, 10 folds

vocab_size = 1000
limit = int(10000)
os.chdir(dirPath)
f = open('./train.tsv', 'r')
phrasedata=[]
for line in f:
    if (not line.startswith('Phrase')):
        line = line.strip()
        phrasedata.append(line.split('\t')[2:4])
random.shuffle(phrasedata)
phraselist = phrasedata[:limit]
phrasedocs = []
for phrase in phraselist:
    tokens = nltk.word_tokenize(phrase[0])
    phrasedocs.append((tokens, int(phrase[1])))
docs = []
for phrase in phrasedocs:
    lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
    docs.append(lowerphrase)
all_words_list = [word for (sent, cat) in docs for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))
word_items = all_words.most_common(vocab_size)
word_features = [word for (word, count) in word_items]
featuresets = [(document_features(d, word_features), c) for (d,c) in docs]

num_folds = 10
start = time.time()
cross_validation_PRF(num_folds, featuresets, labels)
end = time.time()
print(end-start, " seconds elapsed.")

10367
Each fold size: 1000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9

Average Precision	Recall		F1 	Per Label
0 	      0.290      0.216      0.247
1 	      0.241      0.386      0.296
2 	      0.817      0.630      0.711
3 	      0.255      0.434      0.321
4 	      0.217      0.301      0.251

Macro Average Precision	Recall		F1 	Over All Labels
	      0.364      0.393      0.365

Label Counts {0: 468, 1: 1743, 2: 5116, 3: 2106, 4: 567}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.540      0.508      0.509
127.32099318504333  seconds elapsed.


In [90]:
# Naive Bayes for featuresets(baseline), vocab 1000, 10 folds

train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.533

In [91]:
# Featureset1: Bag of words / unigram (baseline), vocab 1500, 10 folds

vocab_size = 1500
limit = int(10000)
os.chdir(dirPath)
f = open('./train.tsv', 'r')
phrasedata=[]
for line in f:
    if (not line.startswith('Phrase')):
        line = line.strip()
        phrasedata.append(line.split('\t')[2:4])
random.shuffle(phrasedata)
phraselist = phrasedata[:limit]
phrasedocs = []
for phrase in phraselist:
    tokens = nltk.word_tokenize(phrase[0])
    phrasedocs.append((tokens, int(phrase[1])))
docs = []
for phrase in phrasedocs:
    lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
    docs.append(lowerphrase)
all_words_list = [word for (sent, cat) in docs for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))
word_items = all_words.most_common(vocab_size)
word_features = [word for (word, count) in word_items]
featuresets = [(document_features(d, word_features), c) for (d,c) in docs]

num_folds = 10
start = time.time()
cross_validation_PRF(num_folds, featuresets, labels)
end = time.time()
print(end-start, " seconds elapsed.")

10465
Each fold size: 1000
Fold 0
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9

Average Precision	Recall		F1 	Per Label
0 	      0.263      0.227      0.241
1 	      0.228      0.340      0.273
2 	      0.817      0.628      0.710
3 	      0.244      0.418      0.307
4 	      0.237      0.321      0.271

Macro Average Precision	Recall		F1 	Over All Labels
	      0.358      0.387      0.360

Label Counts {0: 489, 1: 1718, 2: 5060, 3: 2138, 4: 595}
Micro Average Precision	Recall		F1 	Over All Labels
	      0.532      0.496      0.500
189.41062307357788  seconds elapsed.


In [92]:
# Naive Bayes for featuresets(baseline), vocab 1500, 10 folds

train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.561