In [1]:
# Install packages and import libraries

import nltk

# NLTK data packages requested by this notebook.
NLTK_RESOURCES = [
    "names",
    "stopwords",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt",
]

# Request every package in a single call. Kept as the cell's last expression
# so the value returned by nltk.download is displayed.
nltk.download(NLTK_RESOURCES)

[nltk_data] Downloading package names to /usr/local/share/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/l

True

In [2]:
# Extracting concordance

# Wrap the full State of the Union token stream (nothing filtered out) in an
# nltk.Text and print five concordance lines for "america".
text = nltk.Text(nltk.corpus.state_union.words())
text.concordance("america", lines=5)

# Keep only purely alphabetic tokens, then build one collocation finder per
# n-gram size (2, 3 and 4 words).
words = list(filter(str.isalpha, nltk.corpus.state_union.words()))
bigramfinder = nltk.collocations.BigramCollocationFinder.from_words(words)
trigramfinder = nltk.collocations.TrigramCollocationFinder.from_words(words)
quadgramfinder = nltk.collocations.QuadgramCollocationFinder.from_words(words)

Displaying 5 of 1079 matches:
 would want us to do . That is what America will do . So much blood has already
ay , the entire world is looking to America for enlightened leadership to peace
beyond any shadow of a doubt , that America will continue the fight for freedom
 to make complete victory certain , America will never become a party to any pl
nly in law and in justice . Here in America , we have labored long and hard to 


In [3]:
# Using VADER 
from nltk.sentiment import SentimentIntensityAnalyzer
# Single analyzer instance, referenced as a global by the helpers in this notebook.
sia = SentimentIntensityAnalyzer()

# Test VADER with the twitter_samples and movie_reviews corpora

# Load twitter_samples as a list of strings, replacing "://" with "//" so that
# URLs are rendered inactive.
tweets = list(
    map(
        lambda raw_tweet: raw_tweet.replace("://", "//"),
        nltk.corpus.twitter_samples.strings(),
    )
)

# Use polarity_scores function to classify tweets
from random import shuffle

def is_positive(tweet:str) -> bool:
    """Report whether a tweet is scored as positive.

    Args:
        tweet: Text passed to the module-level analyzer ``sia``.

    Returns:
        True when the "compound" value returned by ``sia.polarity_scores``
        is strictly greater than 0; False otherwise (including exactly 0).
    """
    compound_score = sia.polarity_scores(tweet)['compound']
    return compound_score > 0

# Shuffle the tweets in place, then print the classification of the first ten.
shuffle(tweets)
for tweet in tweets[:10]:
    verdict = is_positive(tweet)
    print(">", verdict, tweet)

# File ids of the movie reviews for each category, then both lists combined
# (positive ids first).
positive_review_ids, negative_review_ids = (
    nltk.corpus.movie_reviews.fileids(categories=[label])
    for label in ("pos", "neg")
)
all_review_ids = positive_review_ids + negative_review_ids

# Rewrite is_positive to work on an entire review

from statistics import mean

def is_positive(review_id: str) -> bool:
    """Report whether a movie review is scored as positive on average.

    The raw review text is split into sentences, each sentence is scored with
    the module-level analyzer ``sia``, and the per-sentence "compound" values
    are averaged.

    Note: this definition rebinds the name ``is_positive``, replacing the
    tweet-level function defined earlier in the same cell.

    Args:
        review_id: File id understood by ``nltk.corpus.movie_reviews.raw``.

    Returns:
        True when the mean compound score is strictly greater than 0.

    Raises:
        statistics.StatisticsError: Raised by ``mean`` when no sentences are
            produced for the review (empty score list).
    """
    sentence_scores = []
    for sentence in nltk.sent_tokenize(nltk.corpus.movie_reviews.raw(review_id)):
        sentence_scores.append(sia.polarity_scores(sentence)["compound"])
    return mean(sentence_scores) > 0

> True Firmly believe Farage was brilliant then
Hashtag speaks for itself http//t.co/YTlpVBXJ6t
> False @ItsMe_Ryry online please :(
> False Tbh, bestfriend breakups are even worse than relationship breakups. :(
> False RT @StephenNoon: Miliband has just sacrificed @ScottishLabour - he'd let the Tories stay in government rather than work with the #SNP #mili…
> True "Good morning, beautiful :)" That's all it takes.
> True Yes, Pearson had that reputation. Does Nikkei have that reputation? Hmmm...ask Michael Woodford. :D @W7VOA @FT
> False Ok, back to work :( http//t.co/QFtLZ2Cl0w
> True @undeux you look amazing April love the glasses :D
> True It sure sounded like it to me. Amatain! https//t.co/LGtaTDp5Hr
> True RT @NicolaSturgeon: If Miliband is going to let Tories in rather than work with SNP, we will definitely need lots of SNP MPs to protect Sco…


In [4]:
# Revealing effects of feature selection (specifically positive/negative) on SA

# Words to exclude: English stopwords plus the lower-cased entries of the
# names corpus.
unwanted = nltk.corpus.stopwords.words('english')
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])

# Frozen copy of `unwanted` used for membership tests. Testing `word in list`
# scans the list for every token; a hash-based set lookup avoids that scan.
# This is a snapshot: later changes to `unwanted` are not reflected here.
unwanted_lookup = frozenset(unwanted)

# Predicate used with filter() to drop unwanted tokens
def skip_unwanted(pos_tuple):
    """Decide whether a POS-tagged token should be kept.

    Despite its name, this is a keep-predicate: it returns True for tokens
    that should survive ``filter`` and False for tokens to discard.

    Args:
        pos_tuple: A ``(word, tag)`` pair.

    Returns:
        False when ``word`` is not purely alphabetic, when ``word`` is in the
        unwanted-word set, or when ``tag`` starts with "NN"; True otherwise.
    """
    word, tag = pos_tuple
    if not word.isalpha() or word in unwanted_lookup:
        return False
    # Tags beginning with "NN" are rejected (noun tags, assuming the tagger's
    # default Penn Treebank tagset).
    if tag.startswith("NN"):
        return False
    return True

# Create sets of positive and negative words
# Determine which occur most frequently


# POS-tag every token of each category and keep the word of each
# (word, tag) pair that skip_unwanted accepts.
positive_words = [
    tagged[0]
    for tagged in nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["pos"]))
    if skip_unwanted(tagged)
]
negative_words = [
    tagged[0]
    for tagged in nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["neg"]))
    if skip_unwanted(tagged)
]

# Count how often each retained word occurs in each category.
positive_fd = nltk.FreqDist(positive_words)
negative_fd = nltk.FreqDist(negative_words)

# Words present in both the positive and the negative distribution.
common_set = {word for word in positive_fd if word in negative_fd}

# Remove the shared words from both distributions so that only
# category-exclusive words remain.
for word in common_set:
    for fd in (positive_fd, negative_fd):
        del fd[word]

# The (up to) 100 most frequent remaining words per category, as sets.
top_100_positive = {entry[0] for entry in positive_fd.most_common(100)}
top_100_negative = {entry[0] for entry in negative_fd.most_common(100)}

# Find meaningful collocations

# Ignore list: English stopwords plus the lower-cased entries of the names
# corpus (kept as a list, as before).
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])

# Frozen copy used only for filtering below. Testing membership against the
# list scans it for every corpus token; a hash-based set lookup avoids that.
unwanted_lookup = frozenset(unwanted)

# Bigram finder over the alphabetic, non-ignored tokens of the positive reviews
positive_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
    w for w in nltk.corpus.movie_reviews.words(categories=["pos"])
    if w.isalpha() and w not in unwanted_lookup
])

# Bigram finder over the alphabetic, non-ignored tokens of the negative reviews
negative_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
    w for w in nltk.corpus.movie_reviews.words(categories=["neg"])
    if w.isalpha() and w not in unwanted_lookup
])

In [5]:
def extract_features(text):
    """Build the feature dictionary for one text.

    The text is split into sentences; each sentence is scored once with the
    module-level analyzer ``sia`` and its tokens are checked against the
    module-level set ``top_100_positive``.

    Args:
        text: Raw text to analyse.

    Returns:
        A dict with three entries:
          - "mean_compound": mean of the per-sentence "compound" scores, plus 1
            (presumably shifted so the value is non-negative -- confirm).
          - "mean_positive": mean of the per-sentence "pos" scores.
          - "wordcount": number of tokens whose lower-cased form is in
            ``top_100_positive``.

    Raises:
        statistics.StatisticsError: Raised by ``mean`` when the text yields no
            sentences (empty score lists).
    """
    features = dict()
    wordcount = 0
    compound_scores = list()
    positive_scores = list()

    for sentence in nltk.sent_tokenize(text):
        wordcount += sum(
            1
            for word in nltk.word_tokenize(sentence)
            if word.lower() in top_100_positive
        )
        # Score the sentence once and read both values from the same result,
        # instead of running the analyzer twice per sentence.
        sentence_scores = sia.polarity_scores(sentence)
        compound_scores.append(sentence_scores["compound"])
        positive_scores.append(sentence_scores["pos"])

    features["mean_compound"] = mean(compound_scores) + 1
    features["mean_positive"] = mean(positive_scores)
    features["wordcount"] = wordcount

    return features

# extract_features returns a dictionary with three features:
# average compound score, average positive score, amount of words in the text
# that are also in the top 100 words for all positive reviews

# Build the list of (feature dict, label) pairs once: every "pos" review
# first, then every "neg" review. Feature extraction runs over the raw text
# of every review, so it is not repeated.
features = [
    (extract_features(nltk.corpus.movie_reviews.raw(review)), label)
    for label in ("pos", "neg")
    for review in nltk.corpus.movie_reviews.fileids(categories=[label])
]

In [6]:
from random import Random

# Fixed seed so that a fresh Restart & Run All produces the same shuffle, and
# therefore the same training subset and the same output below. A dedicated
# Random instance is used so the result does not depend on earlier use of the
# global random state.
SHUFFLE_SEED = 42

# Train on the first quarter of the shuffled examples. `features` is still
# shuffled in place, so re-running this cell alone shuffles it again.
train_count = len(features) // 4
Random(SHUFFLE_SEED).shuffle(features)
classifier = nltk.NaiveBayesClassifier.train(features[:train_count])
classifier.show_most_informative_features(10)


Most Informative Features
               wordcount = 2                 pos : neg    =      9.8 : 1.0
               wordcount = 4                 pos : neg    =      3.1 : 1.0
               wordcount = 0                 neg : pos    =      1.6 : 1.0
               wordcount = 1                 pos : neg    =      1.3 : 1.0
