# NLTK: Twitter Sentiment Analysis


# Natural Language Processing (NLP)

In [8]:
from nltk.corpus import twitter_samples

# Show which JSON files the twitter_samples corpus reader exposes.
print(twitter_samples.fileids())

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']


In [9]:
# Read the tweet text of each corpus file into its own list of strings.
pos_tweets, neg_tweets, all_tweets = (
    twitter_samples.strings(fileid)
    for fileid in (
        'positive_tweets.json',
        'negative_tweets.json',
        'tweets.20150430-223406.json',
    )
)

# Report how many tweets each list holds, one count per line, in load order.
for corpus in (pos_tweets, neg_tweets, all_tweets):
    print(len(corpus))

5000


5000


20000


In [10]:
# Preview the first five positive tweets, one per line.
for sample_tweet in pos_tweets[:5]:
    print(sample_tweet)

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!
@97sides CONGRATS :)
yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days


In [11]:
import re
import string

from nltk.corpus import stopwords, twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

# Shared preprocessing resources read by clean_tweets:
# the English stopword list and a Porter stemmer instance.
stopwords_english = stopwords.words('english')
stemmer = PorterStemmer()

print(twitter_samples.fileids())

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']


# Tokenize Tweets

In [None]:

def clean_tweets(tweet):
    """Turn a raw tweet string into a list of cleaned, stemmed tokens.

    Steps, in order: strip stock tickers, a leading "RT" marker, hyperlinks
    and '#' signs; tokenize with TweetTokenizer (preserve_case=False,
    strip_handles=True, reduce_len=True); drop English stopwords and
    punctuation tokens; Porter-stem whatever is left.

    Args:
        tweet: The raw tweet text.

    Returns:
        A list of stemmed tokens in their original order. Emoticons such
        as ':)' survive the punctuation filter and are kept as tokens.
    """
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT" (only at the very start of the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks: match only the URL itself (up to the next whitespace)
    # so that words following a link on the same line are not thrown away
    tweet = re.sub(r'https?://\S*', '', tweet)

    # keep the hashtag word, drop only the '#' sign
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # string.punctuation is a str, so `in` is a substring test: single
    # punctuation marks are dropped while emoticons like ':)' are retained.
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]
 







In [14]:
# Raw text of the sixth positive tweet, before any cleaning.
print(pos_tweets[5])

@BhaktisBanter @PallaviRuhail This one is irresistible :)
#FlipkartFashionFriday http://t.co/EbZ0L2VENM


In [15]:
# The same tweet after clean_tweets has tokenized, filtered and stemmed it.
print(clean_tweets(pos_tweets[5]))

['one', 'irresist', ':)', 'flipkartfashionfriday']


# Feature Extraction
We define a simple bag_of_words function that extracts unigram features from the tweets.

In [16]:
# Feature extractor: unigram presence features for a single tweet.
def bag_of_words(tweet):
    """Map every cleaned token of `tweet` to True.

    The tweet is first normalised with clean_tweets; each resulting token
    becomes a dictionary key whose value is True.
    """
    return {token: True for token in clean_tweets(tweet)}

In [17]:
# Run the feature extractor on a hand-written tweet that contains a retweet
# marker, handles, hashtags and a link.
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"
print(bag_of_words(custom_tweet))

In [18]:
# Pair each tweet's bag-of-words features with its sentiment label.
pos_tweets_set = [(bag_of_words(text), 'pos') for text in pos_tweets]
neg_tweets_set = [(bag_of_words(text), 'neg') for text in neg_tweets]

print(len(pos_tweets_set), len(neg_tweets_set))


5000 5000


# Create Train and Test Set

There are 5000 positive and 5000 negative tweets in the feature sets. We take 20% (i.e. 1000) of the
positive tweets and 20% (i.e. 1000) of the negative tweets as the test set. The remaining positive and
negative tweets form the training set.

In [19]:
# Randomize pos_tweets_set and neg_tweets_set in place, then hold out a fixed
# share of each class for testing.
# With SPLIT_SEED = None the shuffle differs on every run, so the accuracy
# reported later differs too; set it to an integer for a reproducible split.

from random import Random, shuffle  # `shuffle` stays imported for any cell that still uses it

TEST_FRACTION = 0.2  # share of each class held out as test data
SPLIT_SEED = None    # e.g. 42 for a repeatable split

split_rng = Random(SPLIT_SEED)
split_rng.shuffle(pos_tweets_set)
split_rng.shuffle(neg_tweets_set)

# Derive the hold-out sizes from the data instead of hard-coding 1000, so the
# split stays at TEST_FRACTION if the corpus size changes.
n_pos_test = round(len(pos_tweets_set) * TEST_FRACTION)
n_neg_test = round(len(neg_tweets_set) * TEST_FRACTION)

test_set = pos_tweets_set[:n_pos_test] + neg_tweets_set[:n_neg_test]
train_set = pos_tweets_set[n_pos_test:] + neg_tweets_set[n_neg_test:]
print(len(test_set), len(train_set))


2000 8000


# Training Classifier and Calculating Accuracy

We train a Naive Bayes classifier on the training set and then measure the classification accuracy of
the trained classifier on the test set.

In [20]:
from nltk import classify
from nltk import NaiveBayesClassifier

# Fit a Naive Bayes model on the (feature dict, label) pairs in train_set.
classifier = NaiveBayesClassifier.train(train_set)

# Share of held-out tweets that are labelled correctly. The exact value
# changes between runs whenever the train/test split is reshuffled.
accuracy = classify.accuracy(classifier, test_set)
print(accuracy)

# show_most_informative_features() prints its table itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None".
classifier.show_most_informative_features(10)


0.9915
Most Informative Features


                      :) = True              pos : neg    =   1099.0 : 1.0
                     via = True              pos : neg    =     35.7 : 1.0
                     sad = True              neg : pos    =     27.6 : 1.0
                     bam = True              pos : neg    =     26.3 : 1.0
                     x15 = True              neg : pos    =     20.3 : 1.0
                     ugh = True              neg : pos    =     16.3 : 1.0
                    blog = True              pos : neg    =     16.3 : 1.0
                 perfect = True              pos : neg    =     14.3 : 1.0
                      aw = True              neg : pos    =     13.8 : 1.0
                    glad = True              pos : neg    =     13.0 : 1.0
None


# Testing Classifier with Custom Tweet

I provide custom tweets and check the classification output of the trained classifier. The classifier
correctly predicts both the negative and the positive tweet provided.

# NEGATIVE TWEET

In [21]:
# Classify a hand-written negative review; the expected label is "neg".
custom_tweet = "I hated the film. It was a disaster. Poor direction, bad acting."
custom_tweet_set = bag_of_words(custom_tweet)
print(classifier.classify(custom_tweet_set))

# Look at the probability distribution behind that decision.
prob_result = classifier.prob_classify(custom_tweet_set)
print(prob_result)        # repr of the distribution object
print(prob_result.max())  # most probable label
for label in ("neg", "pos"):
    # probability assigned to each label; exact values vary with the split
    print(prob_result.prob(label))

neg
<ProbDist with 2 samples>
neg
0.9128249642254496
0.08717503577454862


# POSITIVE TWEET

In [22]:
# Classify a hand-written positive review; the expected label is "pos".
custom_tweet = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
custom_tweet_set = bag_of_words(custom_tweet)
print(classifier.classify(custom_tweet_set))

# Look at the probability distribution behind that decision.
prob_result = classifier.prob_classify(custom_tweet_set)
print(prob_result)        # repr of the distribution object
print(prob_result.max())  # most probable label
for label in ("neg", "pos"):
    # probability assigned to each label; exact values vary with the split
    print(prob_result.prob(label))

pos
<ProbDist with 2 samples>
pos
0.0009002610289191033
0.9990997389710786
