# Classification of Cornell Movie Review Polarity Dataset

In [None]:
import nltk, re
from os import listdir
import string
from string import punctuation
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
from collections import Counter
from nltk import tokenize
from nltk.tokenize import word_tokenize 
nltk.download('punkt')
from nltk import FreqDist
import random
from nltk.classify import apply_features

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Mount Google Drive at /content/gdrive (Colab-specific); the dataset paths used later live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Data
The Movie Review Data, gathered by Bo Pang and Lillian Lee, is a collection of various movie reviews obtained from imdb.com. The dataset is available in the Cornell Movie Review data repository. It contains 1000 positive review documents and 1000 negative review documents, and does not include any explicit tagging other than being stored in two folders labeled 'neg' for negative reviews and 'pos' for positive reviews.
In every document, every line corresponds to a single sentence and the data is unprocessed. The rating information was removed from the reviews, unless the author included it as part of the review. The authors determined that a review was positive if it was rated at least a B letter grade, or at least three and a half stars out of five. If it was below a C- grade, or if it received fewer than one and a half stars, then the review was treated as negative.

In [None]:
# Google Drive folders containing the negative ('neg') and positive ('pos') review text files.
ndata_loc = '/content/gdrive/My Drive/Datasets/txt_sentoken/neg/'
pdata_loc = '/content/gdrive/My Drive/Datasets/txt_sentoken/pos/'

Function to read the files and store the tokens in two separate lists of lists, one for positive and one for negative reviews.
Every document will be a list of tokens inside a list.

In [None]:
def load_files(dir):
  """Read every file in a directory and tokenize its text into words.

  Args:
    dir: Path of the directory holding the review files. A trailing path
      separator is optional.

  Returns:
    A list with one entry per file; each entry is the list of tokens that
    nltk.word_tokenize produces for that file's full text. Files are
    processed in sorted filename order, so the result does not depend on
    the order in which the filesystem lists them.
  """
  import os  # local import: the notebook's import cell only brings in os.listdir
  words_list = []
  # listdir returns names in arbitrary order; sort for a reproducible corpus order.
  for filename in sorted(listdir(dir)):
    # The context manager closes each file handle as soon as it has been read.
    with open(os.path.join(dir, filename), 'r') as file:
      content = file.read()
    words_list.append(nltk.word_tokenize(content))
  return words_list
    
# Tokenized corpora: one list of tokens per file in the negative / positive folder.
nsents = load_files(ndata_loc)
psents = load_files(pdata_loc)

Tag the document (now a list of tokens) with the respective sentiment (pos or neg)

In [None]:
def tag_docs(sents, rtype):
  """Pair every document in `sents` with the sentiment label `rtype`.

  Returns a new list of (document, rtype) tuples in the original order.
  """
  return [(doc, rtype) for doc in sents]
  
tagn = tag_docs(nsents, "neg")
tagp = tag_docs(psents, "pos")

#Combine the negative and positive tagged review data into one list
all_tagged = tagn + tagp

#The data is shuffled to remove prior ordering (all negatives first, then all positives).
#A dedicated, fixed-seed generator makes the shuffle -- and therefore the
#train/dev/test split later taken from this list -- identical on every run for
#the same document order, without touching the global random state.
random.Random(42).shuffle(all_tagged)

In [None]:
#Get all the words from all the review documents as one list
all_words = []
def get_words(sents):
  """Flatten a list of token lists into one flat list of lowercased tokens."""
  return [token.lower() for doc in sents for token in doc]

#Lowercased tokens of every negative review followed by those of every positive review
all_words = get_words(nsents) + get_words(psents)
print("Total words:", len(all_words))

Total words: 1524490


In [None]:
#Corpus-level summary statistics over the lowercased token list
num_words = len(all_words)
num_chars = sum(map(len, all_words))
num_vocab = len({w.lower() for w in all_words})
num_docs = len(all_tagged)

print('Number of documents:', num_docs)
print('Average characters per document:', round(num_chars / num_docs))
print('Average words per document:', round(num_words / num_docs))
#Lexical diversity here is tokens per distinct word, i.e. average uses of each word
print('Lexical diversity:', round(num_words / num_vocab))

Number of documents: 2000
Average characters per document: 3122
Average words per document: 762
Lexical diversity: 33


# Describe the Data

In [None]:
#Frequency distribution over all lowercased tokens (before any cleaning)
 
aw_freq = FreqDist(all_words)
 
print (aw_freq)
 
# 10 most frequent tokens with their counts
print (aw_freq.most_common(10))

<FreqDist with 46716 samples and 1524490 outcomes>
[(',', 77717), ('the', 76217), ('.', 65876), ('a', 37980), ('and', 35404), ('of', 33972), ('to', 31772), ('is', 26054), ('in', 21611), ("'s", 18128)]


# Data Cleaning

In [None]:
#Remove stop words
wo_stopwords = [w for w in all_words if w not in stop_words]

#Remove punctuation tokens. A token counts as punctuation when every one of its
#characters is a punctuation mark, so multi-character tokens such as "``", "''",
#"--" or "...." are dropped as well. (Note that `w in string.punctuation` would be
#a substring test against one long string, not a per-character check.)
punct_chars = set(string.punctuation)
clean = [w for w in wo_stopwords if not all(ch in punct_chars for ch in w)]

#Remove very short words: keep only tokens longer than 3 characters
clean = [w for w in clean if len(w) > 3]
print("Length of clean dataset:", len(clean))

Length of clean dataset: 638260


Data was cleaned to remove stop words and punctuation marks. Very short words (three characters or fewer) were also removed. The total number of words in the whole dataset was reduced from 1,524,490 to 638,260.

In [None]:
#Check frequency distribution again for the clean data
#(this rebinds aw_freq, replacing the distribution computed on the raw tokens)
aw_freq = FreqDist(clean)
 
print (aw_freq)
 
# 10 most frequent cleaned tokens, then the number of distinct tokens
print (aw_freq.most_common(10))
print(len(aw_freq))

#For the feature set, take the 2000 most frequent words of the cleaned data
most_common  = aw_freq.most_common(2000)

#Get only words and not counts
word_features = [m[0] for m in most_common]

<FreqDist with 45060 samples and 638260 outcomes>
[('film', 9443), ('movie', 5671), ('like', 3545), ('even', 2556), ('good', 2316), ('time', 2282), ('would', 2264), ('story', 2145), ('much', 2024), ('character', 1996)]
45060


# The Analysis

After data cleaning was performed, the frequency distribution was measured again and the most frequent words were viewed. The most frequent words now included words such as "movie", "good" or "like", which are more relevant to the problem that is under consideration in this project.

From the list of the most commonly occurring words, the top 2000 were chosen to function as the feature set which will tell the classifier what aspects of the review document it should pay attention to. A feature extractor function was defined to check if the words in any given review are present in the feature set.

In [None]:
#Function to apply feature set
def document_features(document): 
    """Build the bag-of-words feature dict for one document.

    Args:
        document: Iterable of string tokens for a single review.

    Returns:
        A dict with one 'contains(<word>)' key per entry of the module-level
        `word_features` list, mapped to True when that word occurs in the
        document (compared case-insensitively) and False otherwise.
    """
    # word_features is built from lowercased tokens, so normalize the document
    # tokens the same way; otherwise capitalized occurrences would never match.
    document_words = {token.lower() for token in document}
    return {'contains({})'.format(word): (word in document_words)
            for word in word_features}
  

# Test-Train

The dataset was split into training, development testing and testing set. Eighty (80) percent of the dataset was used for training, ten percent (10%) for development testing to analyze errors and the rest of the ten percent (10%) was used for testing. The size of the training set, development testing set and testing sets were 1600, 200 and 200 respectively. The training set was used to train a Naive Bayes classifier.

In [None]:
#Turn every tagged document into a (feature dict, label) pair
featuresets = [(document_features(tokens), label) for tokens, label in all_tagged]
data_length = len(featuresets)
print("Length of feature set:", data_length)

#80% train / 10% dev-test / 10% test, sliced in order from featuresets
train_end = int(data_length*.8)
dev_end = int(data_length*.9)
train_set = featuresets[:train_end]
devtest_set = featuresets[train_end:dev_end]
test_set = featuresets[dev_end:]

print("Size  of training set: ", len(train_set))
print("Size of dev set:", len(devtest_set))
print("Size of test set", len(test_set))

#Train on the training slice only and report accuracy on the held-out test slice
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("Accuracy = ", nltk.classify.accuracy(classifier, test_set))

Length of feature set: 2000
Size  of training set:  1600
Size of dev set: 200
Size of test set 200
Accuracy =  0.775


# Results

The designed Naive Bayes classifier achieved an accuracy of 77.5% on the testing set, which is a good result in the world of text classification. The performance of the classifier was analyzed further by looking into the most informative features, and confusion matrix values.

As an example of the most informative features, the word "outstanding" appeared 21.6 times as often in positive reviews as in negative reviews. Similarly, the word "seagal" (probably Steven Seagal) appeared 9.7 times as often in negative reviews as in positive reviews.

In [None]:
#Print the classifier's 20 most informative features together with their pos/neg ratios
classifier.show_most_informative_features(20)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     21.6 : 1.0
        contains(seagal) = True              neg : pos    =      9.7 : 1.0
    contains(schumacher) = True              neg : pos    =      9.1 : 1.0
        contains(poorly) = True              neg : pos    =      6.8 : 1.0
         contains(damon) = True              pos : neg    =      6.1 : 1.0
         contains(flynt) = True              pos : neg    =      5.6 : 1.0
          contains(lame) = True              neg : pos    =      5.3 : 1.0
   contains(masterpiece) = True              pos : neg    =      5.2 : 1.0
         contains(waste) = True              neg : pos    =      4.9 : 1.0
   contains(wonderfully) = True              pos : neg    =      4.6 : 1.0
         contains(awful) = True              neg : pos    =      4.6 : 1.0
    contains(ridiculous) = True              neg : pos    =      4.6 : 1.0
         contains(worst) = True              neg : pos    =      4.6 : 1.0

In [None]:
#Errors
errors = []

##devtest_set holds (feature dict, label) pairs, so each item is classified
##directly. Running document_features on a feature dict would treat the feature
##names as tokens and produce an all-False feature dict, making every guess
##independent of the actual review.
for (features, tag) in devtest_set:
    ##guess at the classification
    guess = classifier.classify(features)
    ##if wrong, put it in the errors list
    if guess != tag:
        errors.append( (tag, guess, features) )
        
##show one misclassified item as (true label, predicted label, feature dict)
errors[1]

('pos',
 'neg',
 {'contains(film)': True,
  'contains(movie)': False,
  'contains(like)': False,
  'contains(even)': True,
  'contains(good)': False,
  'contains(time)': True,
  'contains(would)': True,
  'contains(story)': False,
  'contains(much)': True,
  'contains(character)': False,
  'contains(also)': True,
  'contains(characters)': False,
  'contains(first)': False,
  'contains(well)': True,
  'contains(could)': True,
  'contains(make)': True,
  'contains(really)': False,
  'contains(films)': False,
  'contains(little)': False,
  'contains(life)': False,
  'contains(plot)': True,
  'contains(people)': False,
  'contains(scene)': True,
  'contains(never)': False,
  'contains(best)': False,
  'contains(many)': False,
  'contains(scenes)': False,
  'contains(know)': False,
  'contains(movies)': False,
  'contains(great)': True,
  'contains(another)': True,
  'contains(director)': False,
  'contains(love)': False,
  'contains(action)': False,
  'contains(something)': False,
  'conta

The development test set was used to produce a list of errors that the classifier was making when predicting the movie sentiment. By examining individual errors or creating a frequency distribution of the error data, additional information could be found that could be made available to the classifier so that it can make a correct prediction.

In [None]:
from nltk.metrics.scores import (precision, recall, f_measure)
import collections

def _label_index_sets(labeled_featuresets):
    """Group item positions by true label and by the classifier's predicted label.

    Returns a (reference, predicted) pair of defaultdict(set) objects, each
    mapping a label to the set of positions in `labeled_featuresets` that
    carry that label (true label for reference, classifier output for predicted).
    """
    reference = collections.defaultdict(set)
    predicted = collections.defaultdict(set)
    for index, (feats, label) in enumerate(labeled_featuresets):
        reference[label].add(index)
        predicted[classifier.classify(feats)].add(index)
    return reference, predicted

#Index sets for the test set and for the development-test set
refsets, testsets = _label_index_sets(test_set)
refsetsd, devsets = _label_index_sets(devtest_set)

In [None]:
#True labels of the test set, in order; used below as the reference side of the confusion matrix
answer = [tag for (document, tag) in test_set]

print(len(answer))
print(answer[:50])

200
['pos', 'neg', 'pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos', 'neg', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos', 'neg', 'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg']


In [None]:
##classifier predictions for the test set, in the same order as its items
##(each test_set entry is a (feature dict, label) pair)
guesses = [classifier.classify(feats) for (feats, _label) in test_set]

Confusion matrix 

In [None]:
#Rows are the true labels (answer) and columns the predicted labels (guesses); cells are printed as percentages
cm = nltk.ConfusionMatrix(answer, guesses)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

    |      n      p |
    |      e      o |
    |      g      s |
----+---------------+
neg | <40.0%> 10.0% |
pos |  12.5% <37.5%>|
----+---------------+
(row = reference; col = test)



In [None]:
print("Precision, Recall and F-Measure for positive review guesses on test set")
#Compare the true 'pos' index set with the predicted 'pos' index set of the test set
for metric_label, metric in (('Precision:', precision), ('Recall:', recall), ('F-Measure:', f_measure)):
    print(metric_label, metric(refsets['pos'], testsets['pos']))

Precision, Recall and F-Measure for positive review guesses on test set
Precision: 0.7894736842105263
Recall: 0.75
F-Measure: 0.7692307692307694


In [None]:
print("Precision, Recall and F-Measure for negative guesses on test set")
#Compare the true 'neg' index set with the predicted 'neg' index set of the test set
for metric_label, metric in (('Precision:', precision), ('Recall:', recall), ('F-Measure:', f_measure)):
    print(metric_label, metric(refsets['neg'], testsets['neg']))

Precision, Recall and F-Measure for negative guesses on test set
Precision: 0.7619047619047619
Recall: 0.8
F-Measure: 0.7804878048780488


In [None]:
print("Precision, Recall and F-Measure for positive guesses on dev set")
#Compare the true 'pos' index set with the predicted 'pos' index set of the dev set
for metric_label, metric in (('Precision:', precision), ('Recall:', recall), ('F-Measure:', f_measure)):
    print(metric_label, metric(refsetsd['pos'], devsets['pos']))

Precision, Recall and F-Measure for positive guesses on dev set
Precision: 0.7857142857142857
Recall: 0.6804123711340206
F-Measure: 0.7292817679558011


In [None]:
print("Precision, Recall and F-Measure for negative guesses on dev set")
#Compare the true 'neg' index set with the predicted 'neg' index set of the dev set
for metric_label, metric in (('Precision:', precision), ('Recall:', recall), ('F-Measure:', f_measure)):
    print(metric_label, metric(refsetsd['neg'], devsets['neg']))

Precision, Recall and F-Measure for negative guesses on dev set
Precision: 0.7327586206896551
Recall: 0.8252427184466019
F-Measure: 0.776255707762557
