In [1]:
import nltk
from nltk.corpus import twitter_samples

In [2]:
# List the file ids bundled with the twitter_samples corpus.
print (twitter_samples.fileids())

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']


In [3]:
# Load each bundled file as a list of raw tweet strings.
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')
all_tweets = twitter_samples.strings('tweets.20150430-223406.json')

# Report the sizes in the order: positive, negative, all.
for corpus in (pos_tweets, neg_tweets, all_tweets):
    print(len(corpus))

5000
5000
20000


In [4]:

from nltk.tokenize import TweetTokenizer
# Tokenizer used to preview tweets below. Options:
#   preserve_case=False -> tokens are lower-cased
#   strip_handles=True  -> @username mentions are removed
#   reduce_len=True     -> long runs of a repeated character are shortened
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

In [5]:
# Preview the tokenizer output for the first five positive tweets.
for tweet in pos_tweets[:5]:
    tokens = tweet_tokenizer.tokenize(tweet)
    print(tokens)

['#followfriday', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['hey', 'james', '!', 'how', 'odd', ':/', 'please', 'call', 'our', 'contact', 'centre', 'on', '02392441234', 'and', 'we', 'will', 'be', 'able', 'to', 'assist', 'you', ':)', 'many', 'thanks', '!']
['we', 'had', 'a', 'listen', 'last', 'night', ':)', 'as', 'you', 'bleed', 'is', 'an', 'amazing', 'track', '.', 'when', 'are', 'you', 'in', 'scotland', '?', '!']
['congrats', ':)']
['yeaaah', 'yipppy', '!', '!', '!', 'my', 'accnt', 'verified', 'rqst', 'has', 'succeed', 'got', 'a', 'blue', 'tick', 'mark', 'on', 'my', 'fb', 'profile', ':)', 'in', '15', 'days']


In [6]:
import string
import re

from nltk.corpus import stopwords
# NOTE: this rebinds `stopwords` from the imported corpus reader to the
# English stopword list it returns; clean_tweets() tests membership
# against this name.
stopwords=stopwords.words('english')

from nltk.stem import PorterStemmer
# Single stemmer instance shared by every clean_tweets() call.
stemmer=PorterStemmer()

# Emoticons treated as "happy".
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}

# Emoticons treated as "sad".
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}

# Every known emoticon (happy and sad combined).
emoticons = emoticons_happy | emoticons_sad

def clean_tweets(tweet):
    """Normalize a raw tweet into a list of stemmed tokens.

    Processing steps, in order:
      1. remove stock tickers such as ``$GE``;
      2. remove a leading old-style retweet marker ``RT``;
      3. remove hyperlinks (only the URL itself);
      4. remove the ``#`` sign from hashtags, keeping the word;
      5. tokenize with ``TweetTokenizer(preserve_case=False,
         strip_handles=True, reduce_len=True)``;
      6. drop stopwords, known emoticons and punctuation tokens;
      7. stem the remaining tokens.

    Relies on the module-level ``stopwords``, ``emoticons`` and ``stemmer``
    objects.

    Args:
        tweet (str): raw tweet text.

    Returns:
        list[str]: stemmed tokens in their original order (duplicates kept).
    """
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks -- match only the URL (up to the next whitespace).
    # The earlier pattern `https?:\/\/.*[\r\n]*` was greedy to the end of the
    # line, so any words that followed a link were silently thrown away.
    tweet = re.sub(r'https?://\S+', '', tweet)

    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # string.punctuation is a str, so the last condition is a substring test:
    # it filters out single punctuation characters such as '!' or '.'.
    return [
        stemmer.stem(token)
        for token in tweet_tokens
        if token not in stopwords
        and token not in emoticons
        and token not in string.punctuation
    ]

In [7]:
# Sample tweet containing an RT marker, @handles, an emoticon, hashtags and a URL.
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

In [8]:
# Show the stemmed tokens left after cleaning the sample tweet.
print(clean_tweets(custom_tweet))

['hello', 'great', 'day', 'good', 'morn']


In [9]:
def bag_of_words(tweet):
    """Build a presence-only feature dict for a raw tweet.

    The tweet is passed through ``clean_tweets`` and every resulting token
    becomes a key mapped to ``True``; repeated tokens collapse into one key.

    Args:
        tweet (str): raw tweet text.

    Returns:
        dict: ``{token: True}`` for each distinct cleaned token.
    """
    features = {}
    for token in clean_tweets(tweet):
        features[token] = True
    return features
# Demo: the sample tweet is (re)assigned here and converted to a feature dict.
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"
print(bag_of_words(custom_tweet))

{'morn': True, 'hello': True, 'day': True, 'good': True, 'great': True}


In [16]:
# positive tweets: (feature dict, 'pos') pairs
positive_tweets = [(bag_of_words(tweet), 'pos') for tweet in pos_tweets]

# negative tweets: (feature dict, 'neg') pairs
negative_tweet = [(bag_of_words(tweet), 'neg') for tweet in neg_tweets]
    

In [17]:
# Sanity check: number of labeled positive feature sets.
print (len(positive_tweets))

5000


In [18]:
from random import Random, shuffle  # `shuffle` stays imported in case another cell uses it

# Fixed seed so the split (and therefore the reported accuracy) is the same
# on every Restart & Run All.
SPLIT_SEED = 42
# Number of tweets per class held out for testing.
TEST_SIZE_PER_CLASS = 1000

split_rng = Random(SPLIT_SEED)

# Work on shuffled copies rather than shuffling in place, so re-running this
# cell always produces the same split from the same inputs.
pos_shuffled = split_rng.sample(positive_tweets, len(positive_tweets))
neg_shuffled = split_rng.sample(negative_tweet, len(negative_tweet))

test_set = pos_shuffled[:TEST_SIZE_PER_CLASS] + neg_shuffled[:TEST_SIZE_PER_CLASS]
train_set = pos_shuffled[TEST_SIZE_PER_CLASS:] + neg_shuffled[TEST_SIZE_PER_CLASS:]

print(len(train_set),len(test_set))

8000 2000


In [19]:
# Train a Naive Bayes classifier and measure its accuracy on the held-out set
from nltk import NaiveBayesClassifier, classify

classifier = NaiveBayesClassifier.train(train_set)

# Score the trained classifier against the labeled test set.
accuracy = classify.accuracy(classifier, test_set)
print(accuracy)

0.7355


In [20]:
# show_most_informative_features() prints its table itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None".
classifier.show_most_informative_features(10)

Most Informative Features
                     via = True              pos : neg    =     34.3 : 1.0
                     x15 = True              neg : pos    =     18.3 : 1.0
                 appreci = True              pos : neg    =     16.3 : 1.0
                     ugh = True              neg : pos    =     16.3 : 1.0
                     sad = True              neg : pos    =     15.9 : 1.0
                      ff = True              pos : neg    =     15.8 : 1.0
                    glad = True              pos : neg    =     14.6 : 1.0
                   arriv = True              pos : neg    =     14.4 : 1.0
               goodnight = True              pos : neg    =     14.3 : 1.0
                opportun = True              pos : neg    =     13.7 : 1.0
None


In [35]:
# Classify an ad-hoc sentence with the trained model.
custom_tweet = "go for this"
custom_tweet_set = bag_of_words(custom_tweet)
predicted_label = classifier.classify(custom_tweet_set)
print(predicted_label)

neg
