In [19]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.preprocessing import sequence
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
import re

In [3]:
data = pd.read_csv('sentiment_bn.csv', delimiter=";")
# Keeping only the neccessary columns
data = data[['text','sentiment']]
data.sentiment.unique()

array(['Love', 'Like', 'Consciousness', 'Protestant', 'Smiley', 'Angry',
       'Blush', 'Skip', 'Rocking', 'Fail', 'Shocking', 'WOW', 'Bad',
       'HaHa', 'Sad', 'Skeptical', 'Evil', 'Provocative'], dtype=object)

In [8]:
data = data[data.sentiment != "Neutral"]
# data['text'] = data.text.apply(lambda x: x.strip)
data['sentiment'] = data['sentiment'].apply((lambda x: re.sub('[^a-zA-z\s]','',x)))
# print(data['sentiment'])
data.sentiment.unique()
print(data[ data['sentiment'] == 'Love'].size)
print(data[ data['sentiment'] == 'Like'].size)
    
max_fatures = 100
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)
X

2624
22198


array([[ 0,  0,  0, ...,  7, 45, 14],
       [ 0,  0,  0, ...,  2, 19, 90],
       [ 0,  0,  0, ...,  0,  0, 68],
       ...,
       [ 0,  0,  0, ..., 62, 27, 52],
       [ 0,  0,  0, ...,  0, 24,  1],
       [ 0,  0,  0, ...,  0, 24,  1]])

In [14]:
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
X_train = sequence.pad_sequences(X_train, maxlen=200)
X_test = sequence.pad_sequences(X_test, maxlen=200)
print("Review length: ")
result = [len(x) for x in X]
print("Mean %.2f words (%f)" % (np.mean(result), np.std(result)))

(18579, 143) (18579, 18)
(9152, 143) (9152, 18)
Review length: 
Mean 143.00 words (0.000000)


In [None]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)

Epoch 1/7


In [27]:
seed = 7
np.random.seed(seed)
model = Sequential()
model.add(Embedding(200, 32, input_length=200))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(18, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 200, 32)           6400      
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 200, 32)           3104      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 100, 32)           0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 250)               800250    
_________________________________________________________________
dense_9 (Dense)              (None, 18)                4518      
Total params: 814,272
Trainable params: 814,272
Non-trainable params: 0
_________________________________________________________________
None

In [32]:
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=10, batch_size=128, verbose=2)
# Final evaluation of the model
validation_size = 2000

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size =128)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Train on 18579 samples, validate on 7152 samples
Epoch 1/10
 - 28s - loss: 0.1536 - acc: 0.9485 - val_loss: 0.1824 - val_acc: 0.9434
Epoch 2/10
 - 28s - loss: 0.1530 - acc: 0.9487 - val_loss: 0.1828 - val_acc: 0.9439
Epoch 3/10
 - 28s - loss: 0.1522 - acc: 0.9488 - val_loss: 0.1832 - val_acc: 0.9435
Epoch 4/10
 - 28s - loss: 0.1516 - acc: 0.9491 - val_loss: 0.1839 - val_acc: 0.9434
Epoch 5/10
 - 28s - loss: 0.1511 - acc: 0.9491 - val_loss: 0.1844 - val_acc: 0.9435
Epoch 6/10
 - 28s - loss: 0.1507 - acc: 0.9491 - val_loss: 0.1846 - val_acc: 0.9434
Epoch 7/10
 - 28s - loss: 0.1500 - acc: 0.9493 - val_loss: 0.1854 - val_acc: 0.9438
Epoch 8/10
 - 30s - loss: 0.1496 - acc: 0.9492 - val_loss: 0.1857 - val_acc: 0.9426
Epoch 9/10
 - 29s - loss: 0.1490 - acc: 0.9495 - val_loss: 0.1870 - val_acc: 0.9425
Epoch 10/10
 - 29s - loss: 0.1488 - acc: 0.9495 - val_loss: 0.1883 - val_acc: 0.9436
score: 0.19
acc: 0.94
Accuracy: 94.38%
