<a href="https://colab.research.google.com/github/paultsr/jecc/blob/master/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

import nltk
nltk.download('treebank')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SimpleRNN
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D
from keras.utils.np_utils import to_categorical
from keras.models import Model
from keras.engine import Input

import gensim
from gensim.models import KeyedVectors, Word2Vec, FastText

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [1]:
from google.colab import files
uploaded = files.upload()

Saving sa_data.csv to sa_data.csv


In [None]:
#Reading the data
data = pd.read_csv('sa_data.csv')
data.head()

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive


In [None]:
#Preprocessing - removing unwanted characters, tokenization, stop-word removal
def clean_data(tweet):
    x = re.sub("[^a-zA-Z0-9]", " ",tweet) 
    x = x.lower().split()                           
    stops = set(stopwords.words("english"))                  
    words = [w for w in x if not w in stops] 
    return( " ".join(words))

In [None]:
data['text'] = data['text'].apply(lambda x: clean_data(x))
print(data['text'])

0        rt nancyleegrahn everyone feel climate change ...
1        rt scottwalker catch full gopdebate last night...
2        rt tjmshow mention tamir rice gopdebate held c...
3        rt robgeorge carly fiorina trending hours deba...
4        rt danscavino gopdebate w realdonaldtrump deli...
                               ...                        
13866    rt cappy yarbrough love see men never faced pr...
13867    rt georgehenryw thought huckabee exceeded expe...
13868    rt lrihendry tedcruz president always tell tru...
13869    rt jrehling gopdebate donald trump says time p...
13870    rt lrihendry tedcruz headed presidential debat...
Name: text, Length: 13871, dtype: object


In [None]:
for i in range(len(data)):
    data['text'][i] = data['text'][i].replace('rt ','')

print(data['text'])

0        nancyleegrahn everyone feel climate change que...
1        scottwalker catch full gopdebate last night sc...
2        tjmshow mention tamir rice gopdebate held clev...
3        robgeorge carly fiorina trending hours debate ...
4        danscavino gopdebate w realdonaldtrump deliver...
                               ...                        
13866    cappy yarbrough love see men never faced pregn...
13867    georgehenryw thought huckabee exceeded expecta...
13868    lrihendry tedcruz president always tell truth ...
13869    jrehling gopdebate donald trump says time poli...
13870    lrihendry tedcruz headed presidential debates ...
Name: text, Length: 13871, dtype: object


In [None]:
#Converting the dataframe into list
reviews = data['text'].tolist()
sentiment = data['sentiment'].tolist()

### Machine Learning with Bag-of-Word features

In [None]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(reviews)
print(vectorizer.get_feature_names())



In [None]:
X = X.toarray()
print(X[0:2])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
le = LabelEncoder()
Y = le.fit_transform(sentiment)

In [None]:
print(X.shape,Y.shape)

(13871, 18525) (13871,)


In [None]:
#Splitting the data into train data and test data
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(11096, 18525) (11096,)
(2775, 18525) (2775,)


In [None]:
#Printing the size of the train data, train label, test data and test label
print("Shape train data = ",np.shape(X_train))
print("Shape of train label = ",np.shape(y_train))
print("Shape of test data = ",np.shape(X_test))
print("Shape of test label = ",np.shape(y_test))

Shape train data =  (11096, 18525)
Shape of train label =  (11096,)
Shape of test data =  (2775, 18525)
Shape of test label =  (2775,)


In [None]:
clf = GaussianNB() #Initializing the classifier
clf.fit(X_train, y_train) #Train the classifier using X_train and y_train

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
#Predict the class label for the test data using the trained model
y_pred = clf.predict(X_test)

In [None]:
#Generating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[736 493 493]
 [173 224 215]
 [ 88  80 273]]


In [None]:
#Computation of the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy = ','%.2f'%(accuracy*100))

Accuracy =  44.43


### Machine Learning with TF-IDF (Term Frequency - Inverse Document Frequency) features


In [None]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(reviews)
X = X.toarray()

In [None]:
print(vectorizer.get_feature_names())



In [None]:
X[0:2]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])


### Machine Learning with Word2vec features



In [None]:
corpus = []
for i in range(len(reviews)):
    corpus.append(word_tokenize(reviews[i]))

In [None]:
corpus[0:2]

[['nancyleegrahn',
  'everyone',
  'feel',
  'climate',
  'change',
  'question',
  'last',
  'night',
  'exactly',
  'gopdebate'],
 ['scottwalker',
  'catch',
  'full',
  'gopdebate',
  'last',
  'night',
  'scott',
  'best',
  'lines',
  '90',
  'seconds',
  'walker16',
  'http',
  'co',
  'zsff']]

In [None]:
#Creating word embedding for the words. Embedding dimension = 100
wvmodel = Word2Vec(corpus, size=100, window=3, min_count=1)

In [None]:
c = corpus[0:2]

for i in range(len(c)):
  for j in c[i]:
    print(j)
  print("..............")


nancyleegrahn
everyone
feel
climate
change
question
last
night
exactly
gopdebate
..............
scottwalker
catch
full
gopdebate
last
night
scott
best
lines
90
seconds
walker16
http
co
zsff
..............


In [None]:
np.shape(wvmodel['co'])

  """Entry point for launching an IPython kernel.


(100,)

In [None]:
#Creating the input data
X = np.zeros((len(corpus),100)) #Initializing the X matrix with zeros
for i in range(len(corpus)):
  emb = np.zeros((1,100))
  for w in corpus[i]:
    emb = emb +  wvmodel[w]
  X[i] = emb 

  


In [None]:
np.shape(X)

(13871, 100)

In [None]:
#Splitting the data into train data and test data
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(11096, 100) (11096,)
(2775, 100) (2775,)


In [None]:
clf = GaussianNB() #Initializing the classifier
clf.fit(X_train, y_train) #Train the classifier using X_train and y_train

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
#Predict the class label for the test data using the trained model
y_pred = clf.predict(X_test)

In [None]:
#Computation of the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy = ','%.2f'%(accuracy*100))

Accuracy =  35.50


### Deep Learning for Sentiment Analysis

In [None]:
data = pd.read_csv('sa_data.csv')

In [None]:
data.head()

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive


In [None]:
#Preprocessing - removing unwanted characters, tokenization, stop-word removal
def clean_data(tweet):
    x = re.sub("[^a-zA-Z0-9]", " ",tweet) 
    x = x.lower().split()                             
    stops = set(stopwords.words("english"))                  
    words = [w for w in x if not w in stops] 
    return( " ".join(words))

In [None]:
data['text'] = data['text'].apply(lambda x: clean_data(x))

In [None]:
for i in range(len(data)):
    data['text'][i] = data['text'][i].replace('rt ','')

In [None]:
#Converting the dataframe into list
reviews = data['text'].tolist()
sentiment = data['sentiment'].tolist()

In [None]:
#Tokenization
max_features = 1200 #number of words to keep. 1200 is the number of unique words in the corpus.
tokenizer = Tokenizer(nb_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)



In [None]:
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X, padding = 'post') #Zero padding at the end of the sequence

In [None]:
Y = to_categorical(Y)
print(Y)

[[0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]


In [None]:
#Splitting the data into train data and test data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [None]:
embed_dim = 500
hidden_layer = 100

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model.add(SimpleRNN(hidden_layer))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 24, 500)           600000    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 100)               60100     
_________________________________________________________________
dense (Dense)                (None, 3)                 303       
Total params: 660,403
Trainable params: 660,403
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
batch_size = 32
model.fit(X_train, y_train, epochs = 10, batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f5f2c8096a0>

In [None]:
score = model.evaluate(X_test, y_test, verbose = 1, batch_size = batch_size)
print("Accuracy: %.2f" % (score[1]*100))

Accuracy: 63.06


In [None]:
test = data['text'][0]
test = clean_data(test)
test = test.replace('rt ','')
test = [test]
test = tokenizer.texts_to_sequences(test)
test = pad_sequences(test, maxlen=24, padding = 'post')

In [None]:
class_label = model.predict_classes(test)
print(le.inverse_transform(class_label))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
['Neutral']
