# Sentiment Analysis LSTM and GRU

In [1]:
import tensorflow as tf

In [2]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [5]:
import numpy as np
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.layers import Embedding, Reshape, Activation, Input, Lambda, Dense, GRU, LSTM, CuDNNLSTM, CuDNNGRU, Dropout, Conv1D, MaxPooling1D, Flatten, concatenate
from tensorflow.python.keras.layers.merge import Dot
from tensorflow.python.keras.utils import np_utils
from tensorflow.python.keras.utils.data_utils import get_file
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import skipgrams, pad_sequences
from tensorflow.python.keras.utils.np_utils import to_categorical
from tensorflow.python.keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
# from tensorflow.python.keras.optimizers import Adam
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import gensim

In [9]:
from tensorflow.python.keras.datasets import imdb

In [10]:
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer # takes into consideration the morphological analysis of the words
from nltk.stem.porter import PorterStemmer # cutting off the end or the beginning of the word.

In [11]:
remove_terms = punctuation + '0123456789'

In [12]:
num_words = 20000

Dividing Datasets

In [13]:
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=num_words)

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [14]:
print(len(X_train),'train_data')
print(len(X_test),'test_data')

25000 train_data
25000 test_data


In [15]:
print('text already in tokens:')
len(X_train[0]),len(X_train[1]),len(X_train[2])

text already in tokens:


(218, 189, 141)

Setting some parameters

In [16]:
max_len = 256
embedding_size = 10
batch_size = 128
n_epochs = 10

In [17]:
pad = 'pre' #'post' # with you want to pad pre or post the text

In [18]:
X_train_pad = pad_sequences(X_train, maxlen=max_len, padding=pad, truncating=pad)
X_test_pad = pad_sequences(X_test, maxlen=max_len, padding=pad, truncating=pad)

- Creating Model - 

In [14]:
model = Sequential()
model.add(Embedding(
    input_dim=num_words,output_dim=embedding_size,input_length=max_len,
    name='layer_embedding'                
    ))
model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(250,activation='relu'))
model.add(Dense(1,activation='sigmoid',name='classification'))

In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 256, 10)           200000    
_________________________________________________________________
conv1d (Conv1D)              (None, 256, 128)          3968      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 128, 128)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 128, 128)          49280     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 64, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 64, 128)           49280     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 32, 128)           0

In [16]:
model.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
callback_early_stopping = EarlyStopping(monitor='val_loss',patience=5,verbose=1)

In [18]:
model.fit(
    X_train_pad, y_train,
    epochs=n_epochs,
    batch_size=batch_size,
    validation_split=0.05,
    callbacks=[callback_early_stopping]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 00009: early stopping


<tensorflow.python.keras.callbacks.History at 0x7fe25c1a49a0>

In [20]:
eval_ = model.evaluate(X_test_pad, y_test)



In [21]:
print('Loss', eval_[0])
print('Accuracy', eval_[1])

Loss 0.416238933801651
Accuracy 0.8701599836349487


### Combining all approaches

In [48]:
model = Sequential()
model.add(Embedding(
    input_dim=num_words,output_dim=embedding_size,input_length=max_len,
    name='layer_embedding'                
    ))
model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Dropout(0.5))
model.add(Conv1D(filters=256, kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Dropout(0.5))
model.add(Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Dropout(0.5))
model.add(CuDNNLSTM(256))
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid',name='classification'))

In [49]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 256, 10)           200000    
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 256, 128)          6528      
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 85, 128)           0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 85, 128)           0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 85, 256)           164096    
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 28, 256)           0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 28, 256)          

In [50]:
model.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics=['accuracy'])

In [51]:
callback_early_stopping = EarlyStopping(monitor='val_loss',patience=5,verbose=1)

In [52]:
model.fit(
    X_train_pad, y_train,
    epochs=n_epochs,
    batch_size=batch_size,
    validation_split=0.05,
    callbacks=[callback_early_stopping]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 00010: early stopping


<tensorflow.python.keras.callbacks.History at 0x7fe21c5075e0>

In [53]:
eval_ = model.evaluate(X_test_pad, y_test)



In [54]:
print('Loss', eval_[0])
print('Accuracy', eval_[1])

Loss 0.6870551705360413
Accuracy 0.8304399847984314


### Trying Something New

In [66]:
conv = []
filter_sizes = [3,4,5]

embedding_layer = Embedding(
    input_dim=num_words,output_dim=embedding_size,input_length=max_len,
    name='layer_embedding'                
    )

In [67]:
seq_input = Input(shape=(max_len,),dtype='int32')
embed_seq = embedding_layer(seq_input)

In [68]:
for f in filter_sizes:
    _conv = Conv1D(filters=128, kernel_size=f, activation='relu')(embed_seq)
    _pool = MaxPooling1D(5)(_conv)
    conv.append(_pool)

In [69]:
_concat = concatenate(conv, axis=1) # axis = 1 for performing the operation in columns with same size
_conv1 = Conv1D(filters=128, kernel_size=5, activation='relu')(_concat)
_pool1 = MaxPooling1D(5)(_conv1)
_drop1 = Dropout(0.5)(_pool1)
_conv2 = Conv1D(filters=128, kernel_size=5, activation='relu')(_drop1)
_pool2 = MaxPooling1D(5)(_conv2)
_drop2 = Dropout(0.5)(_pool2)
_flat = Flatten()(_drop2)
_drop3 = Dropout(0.5)(_flat)
_dense = Dense(128,activation='relu')(_drop3)
preds = Dense(1, activation='sigmoid')(_dense)
model = Model(seq_input,preds)

In [70]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
layer_embedding (Embedding)     (None, 256, 10)      200000      input_3[0][0]                    
__________________________________________________________________________________________________
conv1d_22 (Conv1D)              (None, 254, 128)     3968        layer_embedding[0][0]            
__________________________________________________________________________________________________
conv1d_23 (Conv1D)              (None, 253, 128)     5248        layer_embedding[0][0]            
______________________________________________________________________________________________

In [71]:
model.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics=['accuracy'])

In [72]:
callback_early_stopping = EarlyStopping(monitor='val_loss',patience=5,verbose=1)

In [73]:
model.fit(
    X_train_pad, y_train,
    epochs=n_epochs,
    batch_size=batch_size,
    validation_split=0.05,
    callbacks=[callback_early_stopping]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 00010: early stopping


<tensorflow.python.keras.callbacks.History at 0x7fe1c034ba60>

In [74]:
eval_ = model.evaluate(X_test_pad, y_test)



In [76]:
print('Loss', eval_[0])
print('Accuracy', eval_[1])

Loss 0.626409113407135
Accuracy 0.8430799841880798


### Different

In [77]:
conv = []
filter_sizes = [3,4,5]

embedding_layer = Embedding(
    input_dim=num_words,output_dim=embedding_size,input_length=max_len,
    name='layer_embedding'                
    )

In [78]:
seq_input = Input(shape=(max_len,),dtype='int32')
embed_seq = embedding_layer(seq_input)

In [79]:
for f in filter_sizes:
    _conv = Conv1D(filters=256, kernel_size=f, activation='relu')(embed_seq)
    _pool = MaxPooling1D(5)(_conv)
    conv.append(_pool)

In [80]:
_concat = concatenate(conv, axis=1) # axis = 1 for performing the operation in columns with same size
_conv1 = Conv1D(filters=128, kernel_size=5, activation='relu')(_concat)
_pool1 = MaxPooling1D(5)(_conv1)
_drop1 = Dropout(0.5)(_pool1)
_conv2 = Conv1D(filters=128, kernel_size=5, activation='relu')(_drop1)
_pool2 = MaxPooling1D(5)(_conv2)
_drop2 = Dropout(0.5)(_pool2)
_flat = Flatten()(_drop2)
_drop3 = Dropout(0.5)(_flat)
_dense = Dense(128,activation='relu')(_drop3)
preds = Dense(1, activation='sigmoid')(_dense)
model = Model(seq_input,preds)

In [81]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
layer_embedding (Embedding)     (None, 256, 10)      200000      input_4[0][0]                    
__________________________________________________________________________________________________
conv1d_27 (Conv1D)              (None, 254, 256)     7936        layer_embedding[0][0]            
__________________________________________________________________________________________________
conv1d_28 (Conv1D)              (None, 253, 256)     10496       layer_embedding[0][0]            
____________________________________________________________________________________________

In [82]:
model.compile(optimizer='rmsprop',loss='binary_crossentropy', metrics=['accuracy'])

In [83]:
callback_early_stopping = EarlyStopping(monitor='val_loss',patience=5,verbose=1)

In [84]:
model.fit(
    X_train_pad, y_train,
    epochs=n_epochs,
    batch_size=batch_size,
    validation_split=0.05,
    callbacks=[callback_early_stopping]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 00008: early stopping


<tensorflow.python.keras.callbacks.History at 0x7fe1b069a220>

In [85]:
eval_ = model.evaluate(X_test_pad, y_test)



In [86]:
print('Loss', eval_[0])
print('Accuracy', eval_[1])

Loss 0.477815181016922
Accuracy 0.8456000089645386


# Auto Keras

In [19]:
%load_ext autoreload
%autoreload 2

In [20]:
import autokeras as ak

In [52]:
clf = ak.StructuredDataClassifier(max_trials=2000)
clf.fit(X_train_pad, y_train, epochs=20)

Trial 2 Complete [00h 00m 47s]
val_accuracy: 0.5026167631149292

Best val_accuracy So Far: 0.5346215963363647
Total elapsed time: 00h 01m 18s
INFO:tensorflow:Oracle triggered exit
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets


In [53]:
results = clf.predict(X_test_pad)



In [54]:
from sklearn.metrics import accuracy_score

In [55]:
accuracy_score(y_test,results)

0.51856