# Sentiment Analysis 

In [1]:
#Import all the required libraries.
import pandas as pd
import numpy as np
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Input, Flatten, Conv1D, LSTM, GRU, Bidirectional
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers.convolutional import MaxPooling1D
from tensorflow import keras

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Read the data file. If tun in Jupyter use the other code line.
tweets = pd.read_csv('/content/Sentiment140.tenPercent.sample.tweets.tsv',sep='\t')
#tweets = pd.read_csv('Sentiment140.tenPercent.sample.tweets.tsv',sep='\t') Use with Jupyter

#tweets.isnull().values.any()
#tweets.shape

In [None]:
#Have a look of the data.Sentiment labe 0 and 4. No neutrals (2).
tweets.head()

In [None]:
#Data types and if anything missing.
tweets.info()

In [None]:
#Check one sample data.
tweets["tweet_text"][1200]

In [None]:
#How balanced is the sentiment data.
import seaborn as sns

sns.countplot(x='sentiment_label', data=tweets)

In [3]:
#Removing all the unnecessary characters.
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    # Removing quotes
    sentence = re.sub('quot', '', sentence)
    #sentence = sentence.strip('quot')
    # Removing amps
    sentence = re.sub('amp', '', sentence)
    #sentence = sentence.strip('amp')

    #sentence = re.sub('@[^\s]+','',sentence)

    return sentence

In [4]:
#Call the function to remove unwanted characters.
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [5]:
#Create the features variable array.
X = []
sentences = list(tweets['tweet_text'])
for sen in sentences:
    X.append(preprocess_text(sen))

In [None]:
#Check for most common wording.
import collections
import itertools
# Create counter

# List of all words across tweets
#all_words_no_urls = list(itertools.chain(*X))
counts_no_urls = collections.Counter(X)
counts_no_urls.most_common(20)

In [None]:
#Show most common wording on a bar plot.
fig, ax = plt.subplots(figsize=(8, 8))
clean_tweets_no_urls = pd.DataFrame(counts_no_urls.most_common(20),
                             columns=['words', 'count'])
# Plot horizontal bar graph
clean_tweets_no_urls.sort_values(by='count').plot.barh(x='words',
                      y='count',
                      ax=ax,
                      color="blue")

ax.set_title("Common Words Found in Tweets (Including All Words)")

plt.show()



In [None]:
from wordcloud import WordCloud, STOPWORDS 

comment_words = '' 
stopwords = set(STOPWORDS) 

# iterate through the csv file 
all_tweets = pd.DataFrame(tweets)
for val in X: 
      
    # typecaste each val to string 
    val = str(val) 
  
    # split the value 
    tokens = val.split() 
      
    # Converts each token into lowercase 
    for i in range(len(tokens)): 
        tokens[i] = tokens[i].lower() 
      
    comment_words += " ".join(tokens)+" "

wordcloud = WordCloud(width = 1600, height = 1200,
                max_words = 200000,       
                #background_color ='white', 
                stopwords = stopwords,
                colormap = 'Dark2', 
                min_font_size = 10).generate(comment_words) 
  
# plot the WordCloud image                        
plt.figure(figsize = (15, 15), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show() 

In [9]:
#Retrieve the label information.
y = np.array(tweets['sentiment_label'])
y = np.array(list(map(lambda x: 0 if x==0 else 1, y)))

In [10]:
#Split the data to training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
#DO NOT RUN. Removing stopwords. Takes LOOONG time to run it.Code can be completed without this. Not a huge impact on results.
print(stopwords.words('english'))

X_train = [word for word in X_train if not word in stopwords.words()]
X_test = [word for word in X_test if not word in stopwords.words()]

In [11]:
#Tokenize both training and test sets.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
#Check how the data looks.
X_train[1202]

In [14]:
# Do the padding.
vocab_size = len(tokenizer.word_index) + 1
#vocab_size = len(tokenizer.word_index)

maxlen = 50

X_train = pad_sequences(X_train, padding='pre' , maxlen=maxlen)
X_test = pad_sequences(X_test, padding='pre', maxlen=maxlen)

In [15]:
#Start enbedding the words using GloVe pre-trained word vectors.Use the commented line in case running this code in Jupyter.
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open('/content/glove.6B.50d.txt',encoding="utf8") 
#glove_file = open('glove.6B.50d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [16]:
#Complete embedding.
embedding_matrix = zeros((vocab_size, 50))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
#See how enbedding looks.
print(embedding_matrix.shape)
plt.plot(embedding_matrix[5])
plt.plot(embedding_matrix[10])
plt.plot(embedding_matrix[20])
plt.plot(embedding_matrix[1000])
plt.title('Embedding Vectors')

## RNN

In [None]:
#Build the network. layers are embedding, flatten and dense. See the vizualized layout at the bottom.
#Compile the model with optimize type, loss method and KPIs.

from tensorflow.keras import layers, optimizers, losses

model = keras.Sequential([                     
    layers.Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=maxlen),
    layers.Dropout(0.2),
    layers.Conv1D(128, 3, activation='relu'),
    layers.Dropout(0.2),
    layers.MaxPool1D(3),
    layers.Dropout(0.2),
    layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
    layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model.summary())

In [None]:
#Fit the model using hyperparameters.Save the theperformance data for further analysis.
history_RNN = model.fit(X_train, y_train, batch_size=64, epochs=6, verbose=1, validation_split=0.2)

score = model.evaluate(X_test, y_test, verbose=1)

In [22]:
#Check the accuracy.
score = model.evaluate(X_test, y_test, verbose=1)



In [23]:
#Print final model test performance.
print("Test Loss:", score[0])
print("Test Accuracy:", score[1])

Test Loss: 0.44850629568099976
Test Accuracy: 0.7910742163658142


In [None]:
#Have a look on prediction data.
prediction = model.predict(X_test)
prediction[:1]

In [None]:
#Plot the data for analysis.
plt.plot(history_RNN.history['acc'])
plt.plot(history_RNN.history['val_acc'])

plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'], loc = 'upper left')
plt.show()

plt.plot(history_RNN.history['loss'])
plt.plot(history_RNN.history['val_loss'])

plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc = 'upper left')
plt.show()

## RandomForest Classifier

In [23]:
#Print RF classifier prediction results for all the rounds training is done.
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))
    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [None]:
#Run RF training for multiple parameter settings. Takes pretty LONG time to run as max depth is not limited.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()
parameters = {
    'n_estimators': [50, 100, 200, 250],
    'max_depth': [20, None]
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

In [None]:
#Check three best models once again for final best fit.
rf1 = RandomForestClassifier(n_estimators=200, max_depth=20)
rf1.fit(X_train, y_train)

rf2 = RandomForestClassifier(n_estimators=200, max_depth=None)
rf2.fit(X_train, y_train)

rf3 = RandomForestClassifier(n_estimators=250, max_depth=None)
rf3.fit(X_train, y_train)

In [None]:
#Print results
from sklearn.metrics import accuracy_score, precision_score, recall_score

for mdl in [rf1, rf2, rf3]:
    y_pred = mdl.predict(X_test)
    accuracy = round(accuracy_score(y_test, y_pred), 3)
    precision = round(precision_score(y_test, y_pred), 3)
    recall = round(recall_score(y_test, y_pred), 3)
    print('MAX DEPTH: {} / # OF EST: {} -- Accuracy: {} / Presicion: {} / Recall: {}'.format(mdl.max_depth,
                                                                         mdl.n_estimators,
                                                                         accuracy,
                                                                         precision,
                                                                         recall))

In [None]:
#Confusion matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

## **Network Visualization**

In [28]:
#Import libraries to enable network architecture visualization.
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model

In [None]:
#Visualize network architecture
SVG(model_to_dot(model, dpi=64, show_shapes=True).create(prog="dot", format="svg"))

In [None]:
#Visualize the random forest tree.
from sklearn import tree
len(rf1.estimators_)
feat = pd.DataFrame(X_train)
plt.figure(figsize=(20,20))
_ = tree.plot_tree(rf1.estimators_[0], feature_names=feat.columns, filled=True, max_depth =3)