# Twitter data sentiment analysis

In [1]:
import re, string
import random
import nltk
#nltk.download('twitter_samples')
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('stopwords')
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk import FreqDist
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
stop_words = stopwords.words('english')

## Load data

In [2]:
# Raw tweet texts from NLTK's twitter_samples corpus (each call returns a list of strings).
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')
# Third sample file; it is not used again in the visible part of this notebook.
test_tweets = twitter_samples.strings('tweets.20150430-223406.json')
# Preview the tweets at indices 1-4 (the slice skips the first tweet).
pos_tweets[1:5]

['@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
 '@97sides CONGRATS :)',
 'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days']

Positive tweets

In [3]:
# Preview the positive tweets at indices 1-4.
pos_tweets[1:5]

['@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',
 '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',
 '@97sides CONGRATS :)',
 'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days']

Negative tweets

In [4]:
# Preview the negative tweets at indices 1-4.
neg_tweets[1:5]

["Everything in the kids section of IKEA is so cute. Shame I'm nearly 19 in 2 months :(",
 '@Hegelbon That heart sliding into the waste basket. :(',
 '“@ketchBurning: I hate Japanese call him "bani" :( :(”\n\nMe too',
 'Dang starting next week I have "work" :(']

### Tokenizing

In [5]:
# tokenized() yields each tweet as a list of token strings.
tweets = twitter_samples.tokenized('positive_tweets.json')
# Keep the first positive tweet as the running example for the next sections.
tweet = tweets[0]
tweet

['#FollowFriday',
 '@France_Inte',
 '@PKuchly57',
 '@Milipol_Paris',
 'for',
 'being',
 'top',
 'engaged',
 'members',
 'in',
 'my',
 'community',
 'this',
 'week',
 ':)']

### Tagging

In [6]:
# Part-of-speech tag the example tweet; the result is a list of (token, tag) pairs.
pos_tag(tweet)

[('#FollowFriday', 'JJ'),
 ('@France_Inte', 'NNP'),
 ('@PKuchly57', 'NNP'),
 ('@Milipol_Paris', 'NNP'),
 ('for', 'IN'),
 ('being', 'VBG'),
 ('top', 'JJ'),
 ('engaged', 'VBN'),
 ('members', 'NNS'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('community', 'NN'),
 ('this', 'DT'),
 ('week', 'NN'),
 (':)', 'NN')]

### Lemmatizing

In [7]:
def lemmatize(tokens):
    """Return the WordNet lemma of every token, preserving order.

    Tokens are POS-tagged with ``pos_tag``. A tag starting with ``NN`` is
    lemmatized as a noun, one starting with ``VB`` as a verb, and anything
    else as an adjective.
    """
    wordnet = WordNetLemmatizer()

    def wordnet_pos(tag):
        # Collapse the tagger's tag into one of the three classes used here.
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    return [
        wordnet.lemmatize(word, wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

# Lemmatize the example tweet to see the effect (e.g. 'being' -> 'be').
lemmatize(tweet)

['#FollowFriday',
 '@France_Inte',
 '@PKuchly57',
 '@Milipol_Paris',
 'for',
 'be',
 'top',
 'engage',
 'member',
 'in',
 'my',
 'community',
 'this',
 'week',
 ':)']

### Stop words removal

In [8]:
# Compiled once; raw strings avoid the invalid "\(" escape of the plain-string form.
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
    r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_MENTION_RE = re.compile(r'(@[A-Za-z0-9_]+)')


def sw_remove(tweet_tokens, stop_words = ()):
    """Clean, lemmatize and lowercase the tokens of a single tweet.

    For every token, URLs and @-mentions are stripped, the remainder is
    lemmatized with WordNet (noun for ``NN*`` tags, verb for ``VB*`` tags,
    adjective otherwise) and the lowercased result is kept unless it is
    empty, punctuation, or a stop word. POS tags are computed on the
    original tokens, i.e. before URLs and mentions are stripped.

    Parameters
    ----------
    tweet_tokens : list of str
        Tokens of one tweet.
    stop_words : iterable of str, optional
        Words to drop. They are compared against the lowercased token, so
        they should be lowercase. Defaults to no stop words.

    Returns
    -------
    list of str
        The surviving tokens, lowercased, in their original order.
    """
    # Both of these are loop-invariant: build them once per call.
    lemmatizer = WordNetLemmatizer()
    stop_set = frozenset(stop_words)

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = _URL_RE.sub('', token)
        token = _MENTION_RE.sub('', token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        # `in string.punctuation` is a substring test: it drops single
        # punctuation characters and also any longer token that appears
        # contiguously inside that string.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_set:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

# Clean the example tweet using the English stop-word list loaded in the import cell.
sw_remove(tweet, stop_words)

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']

### Putting it all together

In [9]:
# Tokenize both labelled corpora, then run every tweet through sw_remove.
pos_tokens = twitter_samples.tokenized('positive_tweets.json')
neg_tokens = twitter_samples.tokenized('negative_tweets.json')

pos_clean = [sw_remove(tweet_tokens, stop_words) for tweet_tokens in pos_tokens]
neg_clean = [sw_remove(tweet_tokens, stop_words) for tweet_tokens in neg_tokens]

In [10]:
# Compare one positive tweet (index 500) before and after cleaning.
print('Raw  : \t', pos_tokens[500])
print('Clean :\t', pos_clean[500])

Raw  : 	 ['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
Clean :	 ['dang', 'rad', '#fanart', ':d']


In [11]:
# Compare one negative tweet (index 500) before and after cleaning.
print('Raw  : \t', neg_tokens[500])
print('Clean :\t', neg_clean[500])

Raw  : 	 ['Can', 'u', 'feel', 'it', '?', ':(', '(:', '(', '#exo', 'http://t.co/ghsa262ORm']
Clean :	 ['u', 'feel', ':(', '(:', '#exo']


### Word density

In [12]:
def generator(cleaned_tokens_list):
    """Lazily yield every token from an iterable of token lists, in order."""
    for tweet_tokens in cleaned_tokens_list:
        yield from tweet_tokens

# Calling generator() only creates a lazy iterator; no tokens are produced until it is iterated.
gen = generator(pos_clean)
gen

<generator object generator at 0x0000022CAF193660>

In [13]:
# Build a fresh token stream here instead of consuming the shared one-shot
# `gen`: a generator is exhausted after one pass, so re-running this cell
# would otherwise silently produce an empty distribution.
freq = FreqDist(generator(pos_clean))
freq.most_common(10)

[(':)', 3691),
 (':-)', 701),
 (':d', 658),
 ('thanks', 388),
 ('follow', 357),
 ('love', 333),
 ('...', 290),
 ('good', 283),
 ('get', 263),
 ('thank', 253)]

### Data preparation

In [14]:
def get_tokens(tokens):
    """Yield one feature dict per tweet, mapping each of its tokens to ``True``.

    Parameters
    ----------
    tokens : iterable of iterable of str
        One token sequence per tweet.

    Yields
    ------
    dict
        ``{token: True, ...}`` for each tweet; duplicate tokens collapse
        into a single key.
    """
    # A distinct loop name avoids rebinding (shadowing) the `tokens` parameter.
    for tweet_tokens in tokens:
        yield dict.fromkeys(tweet_tokens, True)

# Materialise the feature dicts as lists: get_tokens() returns a single-use
# generator, so leaving it lazy would make any cell that iterates these
# names produce empty results when it is run a second time.
pos_tokens = list(get_tokens(pos_clean))
neg_tokens = list(get_tokens(neg_clean))

In [15]:
# Attach the class label to every per-tweet feature dict.
pos_ds = [(features, "Positive") for features in pos_tokens]
neg_ds = [(features, "Negative") for features in neg_tokens]

ds = pos_ds + neg_ds

# Shuffle with a fixed seed so the train/test split, and therefore the
# reported accuracy, is reproducible. A dedicated Random instance leaves the
# global random state untouched.
SEED = 42
random.Random(SEED).shuffle(ds)

split_index = 7000
# Fail loudly instead of training/evaluating on an empty or truncated set
# (e.g. if the feature iterables above were already consumed).
assert len(ds) > split_index, "dataset too small for the split - re-run the data preparation cells"
ds_train = ds[:split_index]
ds_test = ds[split_index:]

In [16]:
# Inspect a few (feature dict, label) training examples.
ds_train[1:10]

[({'think': True, 'hard': True, 'bed': True, ':p': True}, 'Positive'),
 ({'suspect': True,
   'travel': True,
   'widely': True,
   'eventually': True,
   'return': True,
   'home': True,
   'reaapearing': True,
   'fridge': True,
   ':-)': True},
  'Positive'),
 ({'ah': True, 'jaysus': True, ':(': True}, 'Negative'),
 ({'phone': True,
   'shit': True,
   'always': True,
   'run': True,
   'memory': True,
   ':(': True,
   '...': True,
   '2': True,
   'many': True,
   'nude': True},
  'Negative'),
 ({'anyone': True,
   'need': True,
   'ride': True,
   '#educampakl': True,
   'tomorrow': True,
   'leave': True,
   'rotorua': True,
   'early': True,
   'morning': True,
   'pick': True,
   'people': True,
   'way': True,
   'like': True,
   'come': True,
   ':)': True},
  'Positive'),
 ({':)': True, '☕': True, 'thank': True, 'rita': True}, 'Positive'),
 ({'laguna': True, ':(': True}, 'Negative'),
 ({'powys': True,
   'close': True,
   'shropshire': True,
   'border': True,
   ':)': True

### Build the model

In [17]:
# Train on the training split and report accuracy on the held-out split.
model = NaiveBayesClassifier.train(ds_train)

print("Accuracy : ", classify.accuracy(model, ds_test))
# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output.
model.show_most_informative_features(10)

Accuracy :  0.996
Most Informative Features
                      :( = True           Negati : Positi =   2050.5 : 1.0
                      :) = True           Positi : Negati =    994.6 : 1.0
                     sad = True           Negati : Positi =     33.3 : 1.0
                  arrive = True           Positi : Negati =     24.4 : 1.0
                    glad = True           Positi : Negati =     21.1 : 1.0
                follower = True           Positi : Negati =     17.4 : 1.0
                     x15 = True           Negati : Positi =     16.3 : 1.0
               community = True           Positi : Negati =     15.7 : 1.0
                followed = True           Negati : Positi =     15.0 : 1.0
                     ugh = True           Negati : Positi =     13.0 : 1.0
None


### Testing

In [18]:
tweet1 = "I ordered just once from TerribleCo, they screwed up, never used the app again."
# Pass stop_words so this tweet is preprocessed the same way as the training data.
# NOTE(review): word_tokenize may split emoticons/hashtags differently from the
# tokenizer behind twitter_samples.tokenized() - confirm the features still match.
tokens = sw_remove(word_tokenize(tweet1), stop_words)
model.classify(dict([token, True] for token in tokens))

'Negative'

In [19]:
tweet2 = 'Congrats #SportStar on your 7th best goal from last season winning goal of the year :) #Baller #Topbin #oneofmanyworldies'
# Pass stop_words so this tweet is preprocessed the same way as the training data.
# NOTE(review): word_tokenize may split emoticons/hashtags differently from the
# tokenizer behind twitter_samples.tokenized() - confirm the features still match.
tokens = sw_remove(word_tokenize(tweet2), stop_words)
model.classify(dict([token, True] for token in tokens))

'Positive'

## Credits & Links

https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk