# Twitter Sentiment Analysis

### By Netra Mittal

## Introduction

This project trains a naive Bayes classifer to predict sentiments of tweets. 

In [1]:
import nltk
from nltk.corpus import twitter_samples
import numpy as np

In [2]:
## Download Twitter dataset
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [3]:
## We can load the text fields of the positive and negative tweets by using the module's strings() method

all_pos_tweets = twitter_samples.strings('positive_tweets.json')
all_neg_tweets = twitter_samples.strings('negative_tweets.json')

In [5]:
## Print report with no. of positive and negative tweets

print('Number of positive tweets : ', len(all_pos_tweets))
print('Number of negitive tweets : ', len(all_neg_tweets))

print('\nType of all_pos_tweets is : ', type(all_pos_tweets))
print('Type of all_neg_tweets is : ', type(all_neg_tweets))

Number of positive tweets :  5000
Number of negitive tweets :  5000

Type of all_pos_tweets is :  <class 'list'>
Type of all_neg_tweets is :  <class 'list'>


In [6]:
# Examples of tweets

print("Positive Tweet Ex: \n", all_pos_tweets[0])
print("\nNegative Tweet Ex: \n", all_neg_tweets[0])

Positive Tweet Ex: 
 #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

Negative Tweet Ex: 
 hopeless for tmr :(


## Clean the dataset/ Preprocess the tweets

In [7]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer 

### Removing hyperlinks, Twitter marks and styles

We don't want to use every word in a tweet as many have hastags, retweet marks, and hyperlinks. We'll use regex to remove these from a tweet.

In [8]:
def remove_1(tweet):

  new_tweet = re.sub(r'^RT[\s]+', '', tweet) #removing retweets
  new_tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', new_tweet) #removing hyperlinks
  new_tweet = re.sub(r'#', '', new_tweet) #removing hashtags 

  return new_tweet 


### Tokenizing the string

Splitting the strings into words

In [9]:
#don't want to preserve capitals, handles, or repeated words
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

def tokenize_tweet(tweet):

  tweet_tokens = tokenizer.tokenize(tweet)

  return tweet_tokens

### Removing stop words, punctuations

In [10]:
from nltk.tokenize.sonority_sequencing import punctuation
nltk.download('stopwords')

## importing english stop words from nltk
stopwords_eng = stopwords.words('english')

punctuations = string.punctuation 

def remove_stopwords_punctuations(tweet_tokens):

  tweets_clean = []

  for word in tweet_tokens:
    if (word not in stopwords_eng and word not in punctuations):
      tweets_clean.append(word)

  return tweets_clean

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Stemming 

Next, we'll convert the word into the most basic form. Ex: Learning/ Learned/ Learnt converts to the base form 'Learn'.

In [11]:
stemmer = PorterStemmer()

def get_stem(tweets_clean):

  tweets_stem = []

  for word in tweets_clean:
    stem_word = stemmer.stem(word)
    tweets_stem.append(stem_word)

  return tweets_stem

We can also check how a tweet is processed through each of these steps: 

In [14]:
ex_tweet = all_pos_tweets[7]

step1 = remove_1(ex_tweet)
step2 = tokenize_tweet(step1)
step3 = remove_stopwords_punctuations(step2)
step4 = get_stem(step3)

print(ex_tweet, '\n\n1. Removed hyperlinks etc. : ', step1, 
      '\n\n2. Tokenized tweet : ', step2,
      '\n\n3. Removed stopwords : ', step3,
      '\n\n4. Stem words : ', step4)

@Impatientraider On second thought, there’s just not enough time for a DD :) But new shorts entering system. Sheep must be buying. 

1. Removed hyperlinks etc. :  @Impatientraider On second thought, there’s just not enough time for a DD :) But new shorts entering system. Sheep must be buying. 

2. Tokenized tweet :  ['on', 'second', 'thought', ',', 'there', '’', 's', 'just', 'not', 'enough', 'time', 'for', 'a', 'dd', ':)', 'but', 'new', 'shorts', 'entering', 'system', '.', 'sheep', 'must', 'be', 'buying', '.'] 

3. Removed stopwords :  ['second', 'thought', '’', 'enough', 'time', 'dd', ':)', 'new', 'shorts', 'entering', 'system', 'sheep', 'must', 'buying'] 

4. Stem words :  ['second', 'thought', '’', 'enough', 'time', 'dd', ':)', 'new', 'short', 'enter', 'system', 'sheep', 'must', 'buy']


## Combining all the functions into one process

In [15]:
def process_tweets(tweet):
  step1 = remove_1(tweet)
  step2 = tokenize_tweet(step1)
  step3 = remove_stopwords_punctuations(step2)
  final = get_stem(step3)

  return final

In [16]:
## check function 

process_tweets(all_neg_tweets[7])

['work', 'neighbour', 'motor', 'ask', 'said', 'hate', 'updat', 'search', ':(']

### Split data into training and testing sets

In [18]:
test_pos = all_pos_tweets[4000:]
train_pos = all_pos_tweets[:4000]

test_neg = all_neg_tweets[4000:]
train_neg = all_neg_tweets[:4000]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg))) #1-- pos, 0--neg
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

### Creating frequency dictionary

In [23]:
def create_frequency(tweets, ys): #list of tweets and corresponding y value
  freq_d = {}

  for tweet,y in zip(tweets,ys): #iterate through each word and val
    for word in process_tweets(tweet): #turns into list of keywords

      pair = (word, y) #tuple of the word and the y value

      if pair in freq_d: #if the pair is already in the dictionary, increase val by 1
        freq_d[pair] += 1

      else: #initialize the key value starting at one
        freq_d[pair] = freq_d.get(pair, 1)

  return freq_d

In [25]:
## testing the function 

tweets = ['i am happy', 'i was tricked', 'i am angry', 'i am excited', 'i am happy']
ys = [1, 0, 0, 1, 1]

freq_d = create_frequency(tweets, ys)
print(freq_d)

{('happi', 1): 2, ('trick', 0): 1, ('angri', 0): 1, ('excit', 1): 1}


### Training the model using Naive Bayes

In [28]:
def train_naive_bayes(freqs, train_x, train_y):
  '''
  Inputs:
    freqs: dict from (word, label) for how often a word appears
    train_x: list of tweets
    train_y: list of labels corresponding to the tweets
  Outputs:
    logprior: log prior
    loglikelihood: log likelihood of naive bayes eq

  '''

  loglikelihood = {}
  logprior = 0

  #calculate unique words
  unique_words = set([pair[0] for pair in freqs.keys()])
  V = len(unique_words)

  #calculate N_pos and N_neg words
  N_pos = N_neg = 0
  for pair in freqs.keys():
    if pair[1] > 0: #if label is positive
      N_pos += freqs[(pair)]

    else: #neg label
      N_neg += freqs[(pair)]

    D = train_y.shape[0] #total tweets

    D_pos = sum(train_y) #total pos tweets (note: different (<) from pos words)

    D_neg = D - D_pos #total neg

  logprior = np.log(D_pos) - np.log(D_neg) #log to avoid extreme values

  for word in unique_words: #iterate through each unique word and get no. of times
                            #the word appeared in pos vs neg tweets

    freq_pos = freqs.get((word,1), 0) #getting frequency
    freq_neg = freqs.get((word, 0), 0)

    p_w_pos = (freq_pos + 1) / (N_pos + V) #getting probability 
    p_w_neg = (freq_neg + 1) / (N_neg + V)

    loglikelihood[word] = np.log(p_w_pos / p_w_neg) #log likelihood of a word
    #this tells us that if loglikelihood > 1 --> word is likely positive

  return logprior, loglikelihood

In [33]:
freqs = create_frequency(train_x, train_y)
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)

print(logprior, '\n', len(loglikelihood))

0.0 
 9085


### Predicting tweets

In [36]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
  '''
  Output: 
    p: sum of all the loglikelihoods + logprior

  '''

  word_l = process_tweets(tweet)

  p = 0 #initialize prob to 0

  p += logprior

  for word in word_l:
    if word in loglikelihood:
      p += loglikelihood[word]

  return p

In [37]:
list_of_twt = ['I am happy that I went for a walk today!', 'Good good great', 'Bad bad horrible', 'That movie was horribly depressing', 'I hope my wish to study at Harvard comes true!', 'It was immaculate. Exceptional taste. Exceptional, reveal.']

for tweet in list_of_twt:
  p = naive_bayes_predict(tweet, logprior, loglikelihood)

  print(f'{tweet} --> {p:.2f}')

I am happy that I went for a walk today! --> 1.66
Good good great --> 3.80
Bad bad horrible --> -4.09
That movie was horribly depressing --> -3.10
I hope my wish to study at Harvard comes true! --> -0.36
It was immaculate. Exceptional taste. Exceptional, reveal. --> -0.85


We can see that longer tweets have a higher chance of being diagnosed wrong. Meanwhile, short tweets with recurring key words have a greater chance of being diagnosed correctly. 