# Tweet Sentiment Classifier

Import Libraries

In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import nltk
import matplotlib.pyplot as plt
nltk.download('stopwords')
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, LSTM, Embedding, Flatten, Dropout, Activation, BatchNormalization, Bidirectional, GlobalMaxPool1D, Conv1D, MaxPooling1D, SimpleRNN
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.models import Model, Sequential
from nltk.corpus import stopwords
from nltk import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer

stopwords = nltk.corpus.stopwords.words('english')
pd.set_option('display.max_colwidth', 200)

[nltk_data] Downloading package stopwords to /home/nbuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using TensorFlow backend.


Import Data

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head(10)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on the releases we already bought","Sons of ****,",negative
5,28b57f3990,http://www.dothebouncy.com/smf - some shameless plugging for the best Rangers forum on earth,http://www.dothebouncy.com/smf - some shameless plugging for the best Rangers forum on earth,neutral
6,6e0c6d75b1,2am feedings for the baby are fun when he is all smiles and coos,fun,positive
7,50e14c0bb8,Soooo high,Soooo high,neutral
8,e050245fbd,Both of you,Both of you,neutral
9,fc2cbefa9d,Journey!? Wow... u just became cooler. hehe... (is that possible!?),Wow... u just became cooler.,positive


Drop na columns

In [3]:
train = train.dropna(how='any', axis=0)

Remove Stop Words

In [4]:
train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stopwords))
train.head(10)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d responded, I going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I miss San Diego!!!,Sooo SAD,negative
2,088c60f138,boss bullying me...,bullying me,negative
3,9642c003ef,interview! leave alone,leave me alone,negative
4,358bd9e861,"Sons ****, couldn`t put releases already bought","Sons of ****,",negative
5,28b57f3990,http://www.dothebouncy.com/smf - shameless plugging best Rangers forum earth,http://www.dothebouncy.com/smf - some shameless plugging for the best Rangers forum on earth,neutral
6,6e0c6d75b1,2am feedings baby fun smiles coos,fun,positive
7,50e14c0bb8,Soooo high,Soooo high,neutral
8,e050245fbd,Both,Both of you,neutral
9,fc2cbefa9d,Journey!? Wow... u became cooler. hehe... (is possible!?),Wow... u just became cooler.,positive


Delete Punctuations, Convert To Lower Case & Delete Double Space

In [5]:
train['text'] = train['text'].apply(lambda x: re.sub('[!@#$:).;,?&*/]', "", x.lower()))
train['text'] = train['text'].apply(lambda x: re.sub(' ', ' ', x))
train.head(10)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,i`d responded i going,"I`d have responded, if I were going",neutral
1,549e992a42,sooo sad i miss san diego,Sooo SAD,negative
2,088c60f138,boss bullying me,bullying me,negative
3,9642c003ef,interview leave alone,leave me alone,negative
4,358bd9e861,sons couldn`t put releases already bought,"Sons of ****,",negative
5,28b57f3990,httpwwwdothebouncycomsmf - shameless plugging best rangers forum earth,http://www.dothebouncy.com/smf - some shameless plugging for the best Rangers forum on earth,neutral
6,6e0c6d75b1,2am feedings baby fun smiles coos,fun,positive
7,50e14c0bb8,soooo high,Soooo high,neutral
8,e050245fbd,both,Both of you,neutral
9,fc2cbefa9d,journey wow u became cooler hehe (is possible,Wow... u just became cooler.,positive


Seperate Training Text & Labels

In [6]:
list_sentences = train['text']
list_labels = train['sentiment']
train = train[['text', 'sentiment']]
train

Unnamed: 0,text,sentiment
0,i`d responded i going,neutral
1,sooo sad i miss san diego,negative
2,boss bullying me,negative
3,interview leave alone,negative
4,sons couldn`t put releases already bought,negative
5,httpwwwdothebouncycomsmf - shameless plugging best rangers forum earth,neutral
6,2am feedings baby fun smiles coos,positive
7,soooo high,neutral
8,both,neutral
9,journey wow u became cooler hehe (is possible,positive


# Data Preparation For Model Building

Train & Test Split With 80:20 Ratio

In [7]:
train_data, test_data = train_test_split(train, test_size=0.2)

Define the sequence lengths, number of max words and embedding dimensions

1. Sequence length of each sentence. If more, truncate if less, pad with zeros

In [8]:
MAX_SEQUENCE_LENGTH = 300

2. Number Of Max Words

- Top 20000 frequently occuring words

In [9]:
MAX_NB_WORDS = 20000

- Get frequently Occuring Words

In [10]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train.text)
train_sequences = tokenizer.texts_to_sequences(train_data.text)
test_sequences = tokenizer.texts_to_sequences(test_data.text)

Create a dictionary containing words and their index

In [11]:
word_index = tokenizer.word_index
print('Found {} unique tokens.'.format(len(word_index)))

Found 28976 unique tokens.


Get The Top Frequent Words On Train

In [12]:
training_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)

Get The Top Frequent Words On Test

In [13]:
testing_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [14]:
print(training_data.shape)
print(testing_data.shape)

(21984, 300)
(5496, 300)


In [15]:
train_labels = train_data['sentiment']
test_labels = test_data['sentiment']


Convert Sentiment From Character Array To Numeric Array

In [16]:
le = LabelEncoder()
le.fit(train_labels)
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)

print(le.classes_)
print(np.unique(train_labels, return_counts=True))
print(np.unique(test_labels, return_counts=True))

['negative' 'neutral' 'positive']
(array([0, 1, 2]), array([6236, 8900, 6848]))
(array([0, 1, 2]), array([1545, 2217, 1734]))


Changing Data Types

In [17]:
labels_train = to_categorical(np.asarray(train_labels))
labels_test = to_categorical(np.asarray(test_labels))

print('Shape of data tensor. ', training_data.shape)
print('Shape of data tensor. ', labels_train.shape)
print('Shape of data tensor. ', labels_test.shape)

Shape of data tensor.  (21984, 300)
Shape of data tensor.  (21984, 3)
Shape of data tensor.  (5496, 3)


In [18]:
EMBEDDING_DIM = 100
print(MAX_SEQUENCE_LENGTH)

300


# Model Building & Predicting

In [None]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS,
EMBEDDING_DIM,
input_length=MAX_SEQUENCE_LENGTH
))
model.add(Bidirectional(LSTM(60, return_sequences=True,
dropout=0.1, recurrent_dropout=0.1)))
model.add(Conv1D(32, kernel_size = 3, padding = "valid",
kernel_initializer = "glorot_uniform"))
model.add(GlobalMaxPool1D())
model.add(Dense(60, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'binary_crossentropy',
optimizer='adam',metrics = ['accuracy'])
model.fit(training_data, labels_train,
batch_size=32,
epochs=3,
validation_data=(testing_data, labels_test))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 21984 samples, validate on 5496 samples
Epoch 1/3