# Sentiment Analysis

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
print(tf.__version__)

2.3.0


In [None]:
import nltk

nltk.download('brown')
from nltk.corpus import brown

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [None]:
len(brown.words())

1161192

### Vocab

In [None]:
# word frequency
from nltk import ngrams, FreqDist

def word_freq(words_list):
  """Count how many times each word occurs in `words_list`.

  Returns a dict mapping each distinct word to its number of occurrences.
  """
  tally = {}
  for token in words_list:
    # .get supplies 0 the first time a word is seen
    tally[token] = tally.get(token, 0) + 1
  return tally

In [None]:
def lower_case(word_list):
  """Return a new list holding the lower-cased form of every word in `word_list`."""
  lowered = []
  for token in word_list:
    lowered.append(token.lower())
  return lowered

assert ['my', 'university'] == lower_case(['My', 'University'])

In [None]:
# Vocab is all Unique words in the corpus
def Vocab(corpus_words):
  """Return the list of distinct lower-cased words found in `corpus_words`.

  The words are lower-cased first, then de-duplicated by taking the keys of
  their frequency table.
  """
  frequencies = word_freq(lower_case(corpus_words))
  return list(frequencies)

print(len(Vocab(brown.words())))

49815


### Convert words into sparse vectors

In [None]:
# The vector has one slot per vocabulary word, so it is long and mostly zeros.
def sparse_represent(sentence, V):
  """Encode `sentence` as a binary bag-of-words vector over the vocabulary `V`.

  Args:
    sentence: iterable of hashable word tokens.
    V: sequence of vocabulary words.

  Returns:
    A list of len(V) ints; position i is 1 if V[i] occurs in `sentence`,
    otherwise 0. Words of `sentence` that are not in `V` are ignored.
  """
  # A set gives constant-time membership tests, so the encoding is a single
  # pass over V instead of a full scan of V for every word of the sentence.
  present = set(sentence)
  return [1 if word in present else 0 for word in V]

# Build the vocabulary of the Brown corpus and encode a small test sentence.
V = Vocab(brown.words())
test_sentence = ['i', 'am', 'happy']
# The sparse vector must have one slot per vocabulary word
# (49815 is the vocabulary size printed for the Brown corpus above).
assert len(sparse_represent(test_sentence, V)) == 49815

### Using nltk Sentiment analyser

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the VADER lexicon before the analyzer is constructed below.
nltk.download('vader_lexicon')

# Analyzer instance used for the two example sentences in this cell.
sid = SentimentIntensityAnalyzer()

# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' entries.
a1 = 'This was the best, most awesome movie EVER MADE!!!'
print(sid.polarity_scores(a1))
# A sentence that mixes a negative and a positive statement.
a2 = 'I hate this movie, but I am happy'
print(sid.polarity_scores(a2))

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
{'neg': 0.0, 'neu': 0.425, 'pos': 0.575, 'compound': 0.8877}
{'neg': 0.206, 'neu': 0.351, 'pos': 0.443, 'compound': 0.5719}


### Stopwords

In [None]:
from nltk.corpus import stopwords
# The corpus has to be on disk before `stopwords.words` is called; without the
# download this cell raises LookupError on a fresh environment.
nltk.download('stopwords')
print(stopwords.words('english')[:5])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we']


### Sentiment Analysis on tweets using logistic regression and NLTK

In [None]:
from nltk.corpus import twitter_samples
# The corpus has to be on disk before it is read; without the download this
# cell raises LookupError on a fresh environment.
nltk.download('twitter_samples')

# Raw tweet texts, one list per sentiment class.
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


In [None]:
# The first 4000 tweets of each class are used for training, the rest for validation.
train_pos, val_pos = pos_tweets[:4000], pos_tweets[4000:]
train_neg, val_neg = neg_tweets[:4000], neg_tweets[4000:]

# Positive tweets come first in both splits; the labels below follow that order.
train_x = train_pos + train_neg
val_x = val_pos + val_neg

# Label 1 marks a positive tweet, 0 a negative one.
train_y = np.concatenate([np.ones(len(train_pos)), np.zeros(len(train_neg))])
val_y = np.concatenate([np.ones(len(val_pos)), np.zeros(len(val_neg))])

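Process tweets and remove all noise: HTML entities, @mentions, tickers, hyperlinks, hashtags, punctuation and words of one or two characters.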

In [None]:
import re
import string
# helper function to clean tweets
def process_sentence(sentence):
    """Clean a raw tweet and return it as lower-cased, space-separated text.

    The steps run in this order: strip HTML entities, @mentions and $tickers,
    lower-case, strip hyperlinks and #hashtags, turn punctuation into spaces,
    drop words of one or two characters, collapse whitespace, strip leading
    spaces, and drop characters outside the Basic Multilingual Plane.

    Args:
        sentence: the raw tweet text (str).

    Returns:
        The cleaned text. Leading spaces are removed; a single trailing space
        can remain when the tweet ended with tokens that were removed.
    """
    # Remove HTML special entities (e.g. &amp;)
    sentence = re.sub(r'&\w*;', '', sentence)
    # Remove @username mentions (raw string: '\S' is not a valid str escape)
    sentence = re.sub(r'@\S+', '', sentence)
    # Remove tickers
    sentence = re.sub(r'\$\w*', '', sentence)
    # To lowercase
    sentence = sentence.lower()
    # Remove hyperlinks. A URL ends at the next whitespace; a greedy '.*' would
    # also delete ordinary words up to the last '/' in the tweet and would miss
    # URLs that have no path component.
    sentence = re.sub(r'https?://\S+', '', sentence)
    # Remove hashtags
    sentence = re.sub(r'#\w*', '', sentence)
    # Replace punctuation with a space (this also splits 's, 't, 've off their
    # word). The characters are escaped so that ']', '\', '^' and '-' are taken
    # literally inside the character class.
    punctuation = re.escape(string.punctuation.replace('@', ''))
    sentence = re.sub(r'[' + punctuation + r']+', ' ', sentence)
    # Remove words with 2 or fewer characters
    sentence = re.sub(r'\b\w{1,2}\b', '', sentence)
    # Collapse every run of whitespace, including single newlines and tabs
    sentence = re.sub(r'\s+', ' ', sentence)
    # Remove the space remaining at the front of the tweet.
    sentence = sentence.lstrip(' ')
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    sentence = ''.join(c for c in sentence if c <= '\uFFFF')
    return sentence

In [None]:
process_sentence(train_x[1])

'hey james how odd please call our contact centre 02392441234 and will able assist you many thanks '

In [None]:
len(train_x), len(val_x)

(8000, 2000)

In [None]:
def features_of_sentences(sentences, analyzer=None):
  """Turn each sentence into a two-element feature vector [pos, neg].

  Args:
    sentences: iterable of raw sentence strings.
    analyzer: optional object exposing `polarity_scores(text)` that returns a
      mapping with 'pos' and 'neg' entries. When omitted, one
      SentimentIntensityAnalyzer is created and reused for every sentence.

  Returns:
    A list with one [pos, neg] pair per input sentence, in input order.
  """
  # Build the analyzer once instead of once per sentence.
  if analyzer is None:
    analyzer = SentimentIntensityAnalyzer()
  features = []
  for sentence in sentences:
    # Score each sentence a single time and read both values from that result.
    scores = analyzer.polarity_scores(sentence)
    features.append([scores['pos'], scores['neg']])
  return features

In [None]:
# Compute the [pos, neg] feature pair for every training and validation tweet.
train_x_features = features_of_sentences(train_x)
val_x_features = features_of_sentences(val_x)

In [None]:
# Convert the lists of feature pairs to NumPy arrays before fitting the model.
val_x_features = np.array(val_x_features)
train_x_features = np.array(train_x_features)

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression on the [pos, neg] features; random_state is fixed
# so that repeated runs use the same seed.
logreg = LogisticRegression(random_state=0)
logreg.fit(train_x_features, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
print(logreg.score(val_x_features, val_y))

0.8785


In [None]:
# Two hand-written sentences to try the fitted classifier on.
test_sentences = ["I love machine learning", "I like machine learning but it's stupid"]
test_sentences_features = features_of_sentences(test_sentences)
prediction = logreg.predict(test_sentences_features)
prediction_proba = logreg.predict_proba(test_sentences_features)
# Report the predicted label and the class probabilities for each sentence.
for sentence, label, proba in zip(test_sentences, prediction, prediction_proba):
  print(sentence, " : ", label, " with probablities", proba)

I love machine learning  :  1.0  with probablities [0.023144 0.976856]
I like machine learning but it's stupid  :  0.0  with probablities [0.97238472 0.02761528]
