# Sentiment Analysis with Python

## Import and read data

In [4]:
import numpy as np
import pandas as pd
import nltk
import random
import string

from nltk.corpus import movie_reviews

# Fetch the NLTK data packages this notebook relies on. The stop-word list and
# the movie_reviews corpus are both used in the cells below.
# NOTE(review): 'punkt' is not referenced in the cells visible here -- confirm
# it is still required before keeping the download.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('movie_reviews')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nswam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nswam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\nswam\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


True

In [5]:
# Count the file ids in the movie_reviews corpus.
len(movie_reviews.fileids())

2000

In [8]:
# Peek at the raw, untokenised text of the first file in the corpus.
movie_reviews.raw(movie_reviews.fileids()[0])

'plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\'t snag this one correctly . \nthey seem to have taken this pretty neat concept , but executed it terribly . \nso what are the problems with the movie ? \nwell , its main problem is that it\'s simply too jumbled . \nit starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience membe

In [9]:
# Ten most frequent tokens before any cleaning. Punctuation and function words
# dominate, which is why the next cell filters them out.
nltk.FreqDist(movie_reviews.words()).most_common(10)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822)]

In [20]:
# Tokens to discard before counting: English stop words plus the individual
# punctuation characters. `stopwords` stays a list (as before); the set copy
# gives O(1) membership tests when filtering every token of the corpus.
stopwords = nltk.corpus.stopwords.words('english') + list(string.punctuation)
stopwords_set = set(stopwords)

# How many of the most frequent cleaned tokens form the feature vocabulary.
NUM_WORD_FEATURES = 2000

# Lower-case lazily so the corpus is walked only once while filtering.
all_words = (w.lower() for w in movie_reviews.words())
all_words_clean = [word for word in all_words if word not in stopwords_set]

all_words_freq = nltk.FreqDist(all_words_clean)

# most_common() yields (token, count) pairs; only the tokens are kept.
word_features = [token for token, _count in all_words_freq.most_common(NUM_WORD_FEATURES)]
print(all_words_freq)


<FreqDist with 39587 samples and 710579 outcomes>


In [21]:
word_features

['film',
 'one',
 'movie',
 'like',
 'even',
 'good',
 'time',
 'story',
 'would',
 'much',
 'character',
 'also',
 'get',
 'two',
 'well',
 'characters',
 'first',
 '--',
 'see',
 'way',
 'make',
 'life',
 'really',
 'films',
 'plot',
 'little',
 'people',
 'could',
 'scene',
 'man',
 'bad',
 'never',
 'best',
 'new',
 'scenes',
 'many',
 'director',
 'know',
 'movies',
 'action',
 'great',
 'another',
 'love',
 'go',
 'made',
 'us',
 'big',
 'end',
 'something',
 'back',
 'still',
 'world',
 'seems',
 'work',
 'makes',
 'however',
 'every',
 'though',
 'better',
 'real',
 'audience',
 'enough',
 'seen',
 'take',
 'around',
 'going',
 'year',
 'performance',
 'role',
 'old',
 'gets',
 'may',
 'things',
 'think',
 'years',
 'last',
 'comedy',
 'funny',
 'actually',
 'long',
 'look',
 'almost',
 'thing',
 'fact',
 'nothing',
 'say',
 'right',
 'john',
 'although',
 'played',
 'find',
 'script',
 'come',
 'ever',
 'cast',
 'since',
 'star',
 'plays',
 'young',
 'show',
 'comes',
 'part',

In [23]:
# Pair every review's token list with the category it is filed under.
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

# Shuffle with a dedicated, seeded generator so the order -- and therefore the
# train/test split and the reported accuracy -- is identical on every
# Restart & Run All, without touching the global `random` state.
random.Random(42).shuffle(documents)


In [26]:
# Inspect one (tokens, category) pair after shuffling.
documents[0]  # index further, e.g. [0][0], to see only the first token

(['nosferatu',
  'the',
  'vampyre',
  '(',
  'germany',
  '1979',
  ')',
  'a',
  'film',
  'review',
  'by',
  'mike',
  'watson',
  'copyright',
  '1999',
  'mike',
  'watson',
  'this',
  'extraordinary',
  're',
  '-',
  'telling',
  'of',
  'bram',
  'stoker',
  "'",
  's',
  '"',
  'dracula',
  '"',
  'by',
  'german',
  'filmmaker',
  'werner',
  'herzog',
  'deserves',
  'the',
  'most',
  'prominent',
  'of',
  'places',
  'in',
  'cinematic',
  'vampire',
  'lore',
  '.',
  'inspired',
  'by',
  'f',
  '.',
  'w',
  '.',
  'murnau',
  "'",
  's',
  '1922',
  'silent',
  'film',
  'of',
  'the',
  'same',
  'name',
  ',',
  'herzog',
  "'",
  's',
  'film',
  'is',
  'a',
  'work',
  'of',
  'exquisite',
  'bleakness',
  ',',
  'an',
  'oddly',
  'touching',
  'tragedy',
  'with',
  'a',
  'beautiful',
  'and',
  'uniquely',
  'haunting',
  'quality',
  'that',
  'lingers',
  'long',
  'afterwards',
  '.',
  'the',
  'original',
  'full',
  '-',
  'length',
  'english',
  'an

In [45]:
def document_features(document, vocabulary=None):
    """Encode a tokenised document as boolean ``contains(word)`` features.

    Args:
        document: Iterable of hashable tokens making up one document.
        vocabulary: Iterable of words to test for. When omitted, the
            notebook-level ``word_features`` list is used, so existing
            single-argument calls behave exactly as before.

    Returns:
        dict mapping ``'contains(<word>)'`` to ``True``/``False``, with one
        entry per vocabulary word.
    """
    if vocabulary is None:
        # Looked up at call time, so the vocabulary cell must have run first.
        vocabulary = word_features
    # A set makes each membership test O(1) regardless of document length.
    document_words = set(document)
    return {
        'contains({})'.format(word): (word in document_words)
        for word in vocabulary
    }


In [46]:
# Turn every (tokens, label) pair into a (feature-dict, label) pair, hold out
# the first 100 for evaluation, and fit Naive Bayes on the remainder.
featuresets = [(document_features(tokens), label) for tokens, label in documents]
test_set = featuresets[:100]
train_set = featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [47]:
# Accuracy of the trained classifier on the held-out test_set.
print(nltk.classify.accuracy(classifier, test_set))

0.8


In [49]:
# Show the 10 features the classifier ranks as most informative, with the
# pos/neg ratio printed for each.
classifier.show_most_informative_features(10)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.3 : 1.0
        contains(seagal) = True              neg : pos    =      8.1 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.2 : 1.0
         contains(damon) = True              pos : neg    =      6.1 : 1.0
         contains(flynt) = True              pos : neg    =      5.7 : 1.0
        contains(wasted) = True              neg : pos    =      5.6 : 1.0
    contains(ridiculous) = True              neg : pos    =      5.4 : 1.0
          contains(jedi) = True              pos : neg    =      5.3 : 1.0
          contains(lame) = True              neg : pos    =      5.2 : 1.0
        contains(poorly) = True              neg : pos    =      5.2 : 1.0
