In [32]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

# Build a small labelled sample: the first n_instances sentences from each
# subjectivity category, each paired with its class label ('subj' / 'obj').
n_instances = 100
subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

# Call form of print: identical output for a single argument, and consistent
# with the print(...) calls used in the other cells of this notebook.
print(subj_docs)

[([u'smart', u'and', u'alert', u',', u'thirteen', u'conversations', u'about', u'one', u'thing', u'is', u'a', u'small', u'gem', u'.'], 'subj'), ([u'color', u',', u'musical', u'bounce', u'and', u'warm', u'seas', u'lapping', u'on', u'island', u'shores', u'.', u'and', u'just', u'enough', u'science', u'to', u'send', u'you', u'home', u'thinking', u'.'], 'subj'), ([u'it', u'is', u'not', u'a', u'mass-market', u'entertainment', u'but', u'an', u'uncompromising', u'attempt', u'by', u'one', u'artist', u'to', u'think', u'about', u'another', u'.'], 'subj'), ([u'a', u'light-hearted', u'french', u'film', u'about', u'the', u'spiritual', u'quest', u'of', u'a', u'fashion', u'model', u'seeking', u'peace', u'of', u'mind', u'while', u'in', u'a', u'love', u'affair', u'with', u'a', u'veterinarian', u'who', u'is', u'a', u'non-practicing', u'jew', u'.'], 'subj'), ([u'my', u'wife', u'is', u'an', u'actress', u'has', u'its', u'moments', u'in', u'looking', u'at', u'the', u'comic', u'effects', u'of', u'jealousy', u'

In [3]:
# Sanity check: number of labelled documents per class, displayed as a tuple.
tuple(len(docs) for docs in (subj_docs, obj_docs))

(100, 100)

In [9]:
# Split each class into the first 80 documents for training and the next 20
# (indices 80..99) for testing, then concatenate the two classes.
train_subj_docs = subj_docs[:80]
test_subj_docs = subj_docs[80:100]
train_obj_docs = obj_docs[:80]
test_obj_docs = obj_docs[80:100]
training_docs = train_subj_docs + train_obj_docs
testing_docs = test_subj_docs + test_obj_docs

# Collect the vocabulary of the training documents after mark_negation() has
# been applied to each one (negated tokens show up with a "_NEG" suffix in the
# feature list printed further down).
sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
# Call form of print: same output for a single argument, consistent with the
# print(...) calls used elsewhere in this notebook.
print(all_words_neg)



In [13]:
# Select the unigram features from the negation-marked training vocabulary,
# passing min_freq=4 as the frequency cutoff.
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
# The bare len(unigram_feats) expression that used to sit here was dropped: it
# was not the last expression of the cell, so its value was never displayed.
print(unigram_feats)

[u'.', u'the', u',', u'a', u'and', u'of', u'to', u'is', u'in', u'with', u'it', u'that', u'his', u'for', u'on', u'an', u'who', u'by', u'he', u'her', u'"', u'from', u'as', u'film', u'movie', u'this', u'their', u'but', u'one', u'at', u'the_NEG', u'about', u'a_NEG', u"there's", u'to_NEG', u'story', u'are', u'(', u'when', u'so', u'they', u',_NEG', u'be', u')', u'life', u'not', u'you', u'all', u'what', u'into', u'out', u'have', u'she', u'will', u'like', u'even', u'has', u'can', u'only', u'--', u'more', u'its', u':', u';', u'if', u'where', u'search', u'most', u'him', u'look', u"it's", u'home', u'them', u'begins', u'make', u'love', u'but_NEG', u'of_NEG', u'two', u'both', u'some', u'which', u'made']


In [10]:
# Register the unigram extractor first; apply_features() below relies on the
# extractors that have been added to the analyzer.
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

# Turn the labelled documents into feature sets, training split first.
training_set, test_set = [
    sentim_analyzer.apply_features(docs)
    for docs in (training_docs, testing_docs)
]

In [11]:
# Train a Naive Bayes classifier through the analyzer, then report every
# evaluation metric on the held-out test set in alphabetical order.
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)

evaluation = sentim_analyzer.evaluate(test_set)
for metric in sorted(evaluation):
    print('{0}: {1}'.format(metric, evaluation[metric]))
        

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.8
F-measure [obj]: 0.8
F-measure [subj]: 0.8
Precision [obj]: 0.8
Precision [subj]: 0.8
Recall [obj]: 0.8
Recall [subj]: 0.8


In [14]:
# Vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Example inputs for the VADER analyzer; each entry is annotated with the
# phenomenon it is meant to illustrate.
sentences = [
    "VADER is smart, handsome, and funny.",  # plain positive sentence
    "VADER is smart, handsome, and funny!",  # punctuation emphasis
    "VADER is very smart, handsome, and funny.",  # booster word
    "VADER is VERY SMART, handsome, and FUNNY.",  # ALLCAPS emphasis
    "VADER is VERY SMART, handsome, and FUNNY!!!",  # caps + punctuation combined
    "VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!",  # boosters + punctuation
    "The book was good.",  # positive sentence
    "The book was kind of good.",  # qualified positive sentence
    "The plot was good, but the characters are uncompelling and the dialog is not great.",  # mixed, with negation
    "A really bad, horrible book.",  # negative sentence with booster words
    "At least it isn't a horrible book.",  # negated negative with a contraction
    ":) and :D",  # emoticons
    "",  # empty string
    "Today sux",  # negative slang
    "Today sux!",  # negative slang + punctuation emphasis
    "Today SUX!",  # negative slang + capitalization emphasis
    "Today kinda sux! But I'll get by, lol",  # mixed sentiment, slang, contrastive "but"
]

# Multi-sentence review; adjacent literals are concatenated into one string,
# with the trailing spaces keeping the sentences separated.
paragraph = (
    "It was one of the worst movies I've seen, despite good reviews. "
    "Unbelievably bad acting!! Poor direction. VERY poor production. "
    "The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"
)



In [16]:
from nltk import tokenize
# Split the review paragraph into individual sentences.
lines_list = tokenize.sent_tokenize(paragraph)
print(lines_list)

# Append the paragraph sentences to the example list. Only sentences not
# already present are added, so re-running this cell does not keep growing
# `sentences` with duplicates (the first run is unchanged).
for line in lines_list:
    if line not in sentences:
        sentences.append(line)
print(sentences)

["It was one of the worst movies I've seen, despite good reviews.", 'Unbelievably bad acting!!', 'Poor direction.', 'VERY poor production.', 'The movie was bad.', 'Very bad movie.', 'VERY bad movie.', 'VERY BAD movie.', 'VERY BAD movie!']
['VADER is smart, handsome, and funny.', 'VADER is smart, handsome, and funny!', 'VADER is very smart, handsome, and funny.', 'VADER is VERY SMART, handsome, and FUNNY.', 'VADER is VERY SMART, handsome, and FUNNY!!!', 'VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!', 'The book was good.', 'The book was kind of good.', 'The plot was good, but the characters are uncompelling and the dialog is not great.', 'A really bad, horrible book.', "At least it isn't a horrible book.", ':) and :D', '', 'Today sux', 'Today sux!', 'Today SUX!', "Today kinda sux! But I'll get by, lol", "It was one of the worst movies I've seen, despite good reviews.", 'Unbelievably bad acting!!', 'Poor direction.', 'VERY poor production.', 'The movie was bad.', 'Very ba

In [20]:
# Score every example sentence with VADER and print its polarity scores
# (keys seen in the output: compound, neg, neu, pos) in sorted key order.
sid = SentimentIntensityAnalyzer()
for sentence in sentences:
    print(sentence)
    scores = sid.polarity_scores(sentence)
    for key in sorted(scores):
        # One score per line; the trailing ", " is kept as in the original output.
        print('{0}: {1}, '.format(key, scores[key]))
    # Blank separator between sentences (prints "\n" plus print's own newline).
    print("\n")
    

VADER is smart, handsome, and funny.
compound: 0.8316, 
neg: 0.0, 
neu: 0.254, 
pos: 0.746, 


VADER is smart, handsome, and funny!
compound: 0.8439, 
neg: 0.0, 
neu: 0.248, 
pos: 0.752, 


VADER is very smart, handsome, and funny.
compound: 0.8545, 
neg: 0.0, 
neu: 0.299, 
pos: 0.701, 


VADER is VERY SMART, handsome, and FUNNY.
compound: 0.9227, 
neg: 0.0, 
neu: 0.246, 
pos: 0.754, 


VADER is VERY SMART, handsome, and FUNNY!!!
compound: 0.9342, 
neg: 0.0, 
neu: 0.233, 
pos: 0.767, 


VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!
compound: 0.9469, 
neg: 0.0, 
neu: 0.294, 
pos: 0.706, 


The book was good.
compound: 0.4404, 
neg: 0.0, 
neu: 0.508, 
pos: 0.492, 


The book was kind of good.
compound: 0.3832, 
neg: 0.0, 
neu: 0.657, 
pos: 0.343, 


The plot was good, but the characters are uncompelling and the dialog is not great.
compound: -0.7042, 
neg: 0.327, 
neu: 0.579, 
pos: 0.094, 


A really bad, horrible book.
compound: -0.8211, 
neg: 0.791, 
neu: 0.209, 
pos: 0

In [25]:
# get synonyms
from nltk.corpus import wordnet as wn
# All synsets for 'food', then the definition and lemma names (synonyms) of
# its first noun sense.
print(wn.synsets('food'))
print(wn.synset('food.n.01').definition())
print(wn.synset('food.n.01').lemma_names())

# Same inspection for 'service'.
# NOTE(review): the definition is read from service.n.03 but the lemma names
# from service.n.02 -- confirm which sense is intended and use it for both.
print(wn.synsets('service'))
print(wn.synset('service.n.03').definition())
print(wn.synset('service.n.02').lemma_names())

print(wn.synset('pizza.n.01').lemma_names())


[Synset('food.n.01'), Synset('food.n.02'), Synset('food.n.03')]
any substance that can be metabolized by an animal to give energy and build tissue
[u'food', u'nutrient']
[Synset('service.n.01'), Synset('service.n.02'), Synset('service.n.03'), Synset('service.n.04'), Synset('service.n.05'), Synset('military_service.n.01'), Synset('service.n.07'), Synset('avail.n.01'), Synset('service.n.09'), Synset('servicing.n.01'), Synset('service.n.11'), Synset('serve.n.01'), Synset('service.n.13'), Synset('overhaul.n.01'), Synset('service.n.15'), Synset('service.v.01'), Synset('service.v.02'), Synset('serve.v.14')]
the act of public worship following prescribed rules
[u'service']
[u'pizza', u'pizza_pie']


<bound method Synset.definition of Synset('car.n.01')>

In [18]:
# hypernym paths (from the root of the hierarchy down to each sense of the word)
from nltk.corpus import wordnet as wn

# For every sense of 'dish', print how many hypernym paths it has and the
# synset names along the first of those paths.
for synset in wn.synsets('dish'):
    paths = synset.hypernym_paths()
    print(len(paths))
    # Use a distinct comprehension variable: in Python 2 the comprehension
    # variable leaks into the enclosing scope and would overwrite `synset`.
    print([node.name() for node in paths[0]])


2
[u'entity.n.01', u'physical_entity.n.01', u'object.n.01', u'whole.n.02', u'artifact.n.01', u'instrumentality.n.03', u'container.n.01', u'dish.n.01']
1
[u'entity.n.01', u'physical_entity.n.01', u'matter.n.03', u'substance.n.07', u'food.n.01', u'nutriment.n.01', u'dish.n.02']
1
[u'entity.n.01', u'abstraction.n.06', u'measure.n.02', u'indefinite_quantity.n.01', u'containerful.n.01', u'dish.n.03']
4
[u'entity.n.01', u'physical_entity.n.01', u'causal_agent.n.01', u'person.n.01', u'adult.n.01', u'woman.n.01', u'smasher.n.02']
1
[u'entity.n.01', u'physical_entity.n.01', u'object.n.01', u'whole.n.02', u'artifact.n.01', u'instrumentality.n.03', u'device.n.01', u'electrical_device.n.01', u'antenna.n.01', u'directional_antenna.n.01', u'dish.n.05']
1
[u'entity.n.01', u'abstraction.n.06', u'psychological_feature.n.01', u'event.n.01', u'act.n.02', u'activity.n.01', u'cup_of_tea.n.01']
1
[u'transfer.v.05', u'give.v.03', u'provide.v.02', u'serve.v.06']
1
[u'change.v.01', u'shape.v.03', u'dish.v.02']

In [17]:
# Synonyms
# Print the lemma names (synonyms) of every sense of 'sauce', one list per sense.
for sense in wn.synsets('sauce'):
    names = sense.lemma_names()
    print(names)


[u'sauce']
[u'sauce']
[u'sauce']
[u'sauce']


In [4]:
# POS tag frequencies and consecutive (word, tag) pairs in the Brown news category
import nltk
from nltk.corpus import brown
# Tagged words of the Brown 'news' category using the universal tagset, and a
# frequency distribution over the tags.
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
# The bare tag_fd.most_common() call that used to sit here was dropped: it was
# not the last expression of the cell, so its result was computed and discarded.

# Print every pair of consecutive (word, tag) items.
word_tag_pairs = nltk.bigrams(brown_news_tagged)
for pair in word_tag_pairs:
    # Print the pair as one tuple. `print (a, b)` only did that because print
    # was a statement; under the print function it would print two arguments.
    print(pair)


((u'The', u'DET'), (u'Fulton', u'NOUN'))
((u'Fulton', u'NOUN'), (u'County', u'NOUN'))
((u'County', u'NOUN'), (u'Grand', u'ADJ'))
((u'Grand', u'ADJ'), (u'Jury', u'NOUN'))
((u'Jury', u'NOUN'), (u'said', u'VERB'))
((u'said', u'VERB'), (u'Friday', u'NOUN'))
((u'Friday', u'NOUN'), (u'an', u'DET'))
((u'an', u'DET'), (u'investigation', u'NOUN'))
((u'investigation', u'NOUN'), (u'of', u'ADP'))
((u'of', u'ADP'), (u"Atlanta's", u'NOUN'))
((u"Atlanta's", u'NOUN'), (u'recent', u'ADJ'))
((u'recent', u'ADJ'), (u'primary', u'NOUN'))
((u'primary', u'NOUN'), (u'election', u'NOUN'))
((u'election', u'NOUN'), (u'produced', u'VERB'))
((u'produced', u'VERB'), (u'``', u'.'))
((u'``', u'.'), (u'no', u'DET'))
((u'no', u'DET'), (u'evidence', u'NOUN'))
((u'evidence', u'NOUN'), (u"''", u'.'))
((u"''", u'.'), (u'that', u'ADP'))
((u'that', u'ADP'), (u'any', u'DET'))
((u'any', u'DET'), (u'irregularities', u'NOUN'))
((u'irregularities', u'NOUN'), (u'took', u'VERB'))
((u'took', u'VERB'), (u'place', u'NOUN'))
((u'place'

In [30]:
import nltk
from nltk.collocations import *


# Association-measure helpers for scoring bigram / trigram collocations.
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# Tokenize a toy sentence and score each bigram with the raw_freq measure.
text = "I do not like green eggs and ham, I do not like them Sam I am!"
tokens = nltk.wordpunct_tokenize(text)
finder = BigramCollocationFinder.from_words(tokens)
scored = finder.score_ngrams(bigram_measures.raw_freq)
print(scored)
# The bare sorted(...) expression (with a leftover doctest directive) that used
# to follow was dropped: its result was neither stored nor displayed.

print("\n")
# Plain n-gram listings of the same tokens, in text order.
print(list(nltk.bigrams(tokens)))
print(list(nltk.trigrams(tokens)))



[(('I', 'do'), 0.1111111111111111), (('do', 'not'), 0.1111111111111111), (('not', 'like'), 0.1111111111111111), ((',', 'I'), 0.05555555555555555), (('I', 'am'), 0.05555555555555555), (('Sam', 'I'), 0.05555555555555555), (('am', '!'), 0.05555555555555555), (('and', 'ham'), 0.05555555555555555), (('eggs', 'and'), 0.05555555555555555), (('green', 'eggs'), 0.05555555555555555), (('ham', ','), 0.05555555555555555), (('like', 'green'), 0.05555555555555555), (('like', 'them'), 0.05555555555555555), (('them', 'Sam'), 0.05555555555555555)]


[('I', 'do'), ('do', 'not'), ('not', 'like'), ('like', 'green'), ('green', 'eggs'), ('eggs', 'and'), ('and', 'ham'), ('ham', ','), (',', 'I'), ('I', 'do'), ('do', 'not'), ('not', 'like'), ('like', 'them'), ('them', 'Sam'), ('Sam', 'I'), ('I', 'am'), ('am', '!')]
[('I', 'do', 'not'), ('do', 'not', 'like'), ('not', 'like', 'green'), ('like', 'green', 'eggs'), ('green', 'eggs', 'and'), ('eggs', 'and', 'ham'), ('and', 'ham', ','), ('ham', ',', 'I'), (',', 'I', 