# Sentiment Analysis

##### Sentiment analysis performed using twitter samples data from nltk

## Import Libraries

In [1]:
# Import libraries (Note: we might import more libraries depending on our analysis)
import nltk                                  
from nltk.corpus import twitter_samples      
import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string 
import random

## Twitter Samples using nltk

In [2]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

In [3]:
print('Total Positive Tweets', len(all_positive_tweets))
print('Total Negative Tweets', len(all_negative_tweets))

Total Positive Tweets 5000
Total Negative Tweets 5000


## Shuffle tweet list

In [4]:
random.shuffle(all_positive_tweets)
random.shuffle(all_negative_tweets)

## Train Test Split

In [5]:
# Get asssign 80% of tweets to training set and 20% to test set
X_train = all_positive_tweets[:4000] + all_negative_tweets[:4000]
X_test = all_positive_tweets[4000:] + all_negative_tweets[4000:]

# Get label for train-test data. 1 indicates positive and 0 indicates negative
y_train = np.append(np.ones(4000), np.zeros(4000))
y_test = np.append(np.ones(1000), np.zeros(1000))

## Function to clean text

In [6]:
def clean_text(text):
    # Get Stopwords 
    stopwords = nltk.corpus.stopwords.words('english') 
    # Create porterstemmer object
    ps = nltk.PorterStemmer()
    
    # Removing any hyperlinks 
    text = re.sub('https?:\S+', '', text)
    
    # Removing @mentions,
    text = re.sub('@\w+', '', text)
    
    # Replacing happy and sad face smileys with placeholders possmiley11 and negsmiley10 respectively
    text = re.sub(':\)|:-\)|;\)', 'possmiley11', text)
    text = re.sub(':\(|:-\(|;-\(', 'negsmiley10', text)
    
    # Removing any punctuations
    text = ''.join([word.lower() for word in text if word not in string.punctuation])
    
    # Tokenize
    tokens = re.split('\W+', text)
    
    # Applying stemming
    text = ' '.join([ps.stem(word) for word in tokens if word not in stopwords and word != ''])
    
    return text

In [7]:
# Update X_train and X_test by cleaning the text
X_train = [clean_text(text) for text in X_train]
X_test = [clean_text(text) for text in X_test]

## Vectorizing

In [8]:
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf = tfidf_vect.transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

## Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import cross_val_score

lr = LogisticRegression()

scores = cross_val_score(lr, X_train_tfidf, y_train,  cv=5, scoring='accuracy')
print('Accuracy at each step at each step: ', scores)
print('Average for accuracy: ', np.mean(scores))
print('Standard deviation for accuracy', np.std(scores))

Accuracy at each step at each step:  [0.989375 0.990625 0.985625 0.98875  0.98625 ]
Average for accuracy:  0.9881249999999999
Standard deviation for accuracy 0.0018957188610128986


Based on accuracy, we should consider Logistic Regression as one of the possible model for sentiment analysis

In [10]:
# Fit the training data
lr.fit(X_train_tfidf, y_train)

# Prediction on test data
y_pred = lr.predict(X_test_tfidf)

# Metrics evaluation
from sklearn.metrics import precision_recall_fscore_support as score

scores = score(y_test, y_pred, pos_label=0, average='binary')

print('Precision:', scores[0])
print('Recall:', scores[1])
print('F1-Score:', scores[2])

Precision: 0.9979570990806946
Recall: 0.977
F1-Score: 0.9873673572511369


## Naive Bayes

In [11]:
from sklearn.naive_bayes import BernoulliNB
nb =  BernoulliNB()

scores = cross_val_score(nb, X_train_tfidf, y_train,  cv=5, scoring='accuracy')
print('Accuracy at each step at each step: ', scores)
print('Average for accuracy: ', np.mean(scores))
print('Standard deviation for accuracy', np.std(scores))

Accuracy at each step at each step:  [0.985    0.98375  0.981875 0.98625  0.98375 ]
Average for accuracy:  0.984125
Standard deviation for accuracy 0.001457737973711294


Based on accuracy, we should consider Bernoulli Naive Bayes as one of the possible model for sentiment analysis

In [12]:
# Fit the training data
nb.fit(X_train_tfidf, y_train)

# Prediction on test data
y_pred = nb.predict(X_test_tfidf)

# Metrics evaluation
scores = score(y_test, y_pred, pos_label=0, average='binary')

print('Precision:', scores[0])
print('Recall:', scores[1])
print('F1-Score:', scores[2])

Precision: 0.987
Recall: 0.987
F1-Score: 0.987


## Random Forest Classifier

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestClassifier(n_jobs=-1)
param_grid = {'n_estimators': [i for i in range(5, 100, 5)], 
             'max_depth': [i for i in range(5, 41, 5)], 
             'min_samples_split': [int(i) for i in np.linspace(2, 10, 5)], 
             'min_samples_leaf': [int(i) for i in range(1, 10, 2)], 
             'bootstrap': [True, False], 
             'max_features': [None, 'sqrt', 'log2']}


rf_rsearch = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=10, cv=5, 
                                scoring='accuracy', verbose=1, random_state=75)


# Fit the model on training data
rf_rsearch.fit(X_train_tfidf, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1),
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [5, 10, 15, 20, 25, 30, 35,
                                                      40],
                                        'max_features': [None, 'sqrt', 'log2'],
                                        'min_samples_leaf': [1, 3, 5, 7, 9],
                                        'min_samples_split': [2, 4, 6, 8, 10],
                                        'n_estimators': [5, 10, 15, 20, 25, 30,
                                                         35, 40, 45, 50, 55, 60,
                                                         65, 70, 75, 80, 85, 90,
                                                         95]},
                   random_state=75, scoring='accuracy', verbose=1)

In [14]:
# Finding the best paramters
print(rf_rsearch.best_params_)

{'n_estimators': 65, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 20, 'bootstrap': True}


In [15]:
# Predict in test data
y_pred = rf_rsearch.predict(X_test_tfidf)

# Model Evaluation
scores = score(y_test, y_pred, pos_label=0, average='binary')

print('Precision:', scores[0])
print('Recall:', scores[1])
print('F1-Score:', scores[2])

Precision: 1.0
Recall: 0.977
F1-Score: 0.9883662114314619


## Gradient Boosting Classifier

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
param_grid = {'n_estimators': [i for i in range(5, 100, 5)], 
             'max_depth': [i for i in range(5, 41, 5)], 
             'min_samples_split': [int(i) for i in np.linspace(2, 10, 5)], 
             'min_samples_leaf': [int(i) for i in range(1, 10, 2)], 
             'max_features': [None, 'sqrt', 'log2'], 
             'learning_rate': [0.01, 0.05, 0.1]}

gb_rsearch = RandomizedSearchCV(estimator=gb, param_distributions=param_grid, n_iter=10, cv=5, 
                                scoring='accuracy', verbose=1, random_state=75)

# Fit model on training data
gb_rsearch.fit(X_train_tfidf, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=GradientBoostingClassifier(),
                   param_distributions={'learning_rate': [0.01, 0.05, 0.1],
                                        'max_depth': [5, 10, 15, 20, 25, 30, 35,
                                                      40],
                                        'max_features': [None, 'sqrt', 'log2'],
                                        'min_samples_leaf': [1, 3, 5, 7, 9],
                                        'min_samples_split': [2, 4, 6, 8, 10],
                                        'n_estimators': [5, 10, 15, 20, 25, 30,
                                                         35, 40, 45, 50, 55, 60,
                                                         65, 70, 75, 80, 85, 90,
                                                         95]},
                   random_state=75, scoring='accuracy', verbose=1)

In [17]:
print(gb_rsearch.best_params_)

{'n_estimators': 90, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 15, 'learning_rate': 0.01}


In [18]:
# Prediction on test data
y_pred = gb_rsearch.predict(X_test_tfidf)

# Model Evaluation
scores = score(y_test, y_pred, pos_label=0, average='binary')

print('Precision:', scores[0])
print('Recall:', scores[1])
print('F1-Score:', scores[2])

Precision: 1.0
Recall: 0.978
F1-Score: 0.9888776541961577


Based on precision, recall and f-score, all models are doing extremely well. We can pick any model but let's use logistic regression model for making new predictions.

In [19]:
# Find Sentiments of new tweets
def new_tweets(text):
    output = lr.predict(tfidf_vect.transform([clean_text(text)]))
    if output[0] == 1:
        print('Sentiment - This is a positive tweet!!!')
    else:
        print('Sentiment - This is a negative tweet!!!')

In [20]:
# Check tweets
print('To quit type: -1')

while True:
    text = input('Enter or copy a tweet: ')
    if text == '-1':
        break
    new_tweets(text)
    
print('Done, Thank you!')

To quit type: -1
Enter or copy a tweet: I enjoyed working on sentiment analysis. This was fun!!!
Sentiment - This is a positive tweet!!!
Enter or copy a tweet: I do not know what to say. Very very disappointed :(
Sentiment - This is a negative tweet!!!
Enter or copy a tweet: That's the way to do it guys, what a game. You made us proud :-)
Sentiment - This is a positive tweet!!!
Enter or copy a tweet: Sorry but this is bad that you lost to a weaker opponent! Everyone is shocked and un-happy...
Sentiment - This is a negative tweet!!!
Enter or copy a tweet: -1
Done, Thank you!


## Saving the model and Vectorizer as picke file

In [21]:
import pickle

with open('LogisticReg.pkl', 'wb') as pickle_file:
    pickle.dump(lr, pickle_file)
    
with open('tfidf.pkl', 'wb') as pickle_file2:
    pickle.dump(tfidf_vect, pickle_file2)