<a href="https://colab.research.google.com/github/rawar/tensorflow-notebooks/blob/master/lstm_sentiment_detection_raw.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tensorflow-gpu==2.0.0-alpha0



In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf


### get the Yelp sentiment data from GitHub Gist 

In [0]:
url = 'https://gist.githubusercontent.com/rawar/3d5343ce39dd16bc457ef4ae56c87f4a/raw/6e6c150745de44e002f0b7fb435d0ccd1e10faa7/yelp-sentiment-data.csv'
sentiment_data = pd.read_csv(url)

### take a look to the data

In [4]:
sentiment_data.head(5)

Unnamed: 0.1,Unnamed: 0,stars,text,sentiment
0,0,5.0,hands down best bloody mary ever so many thing...,pos
1,1,5.0,its true this hotel carries you while in vegas...,pos
2,2,4.0,first time lemme not exaggerate but yo nothin...,pos
3,3,4.0,sushisamba had very tasty sushis my table and ...,pos
4,4,5.0,good food nice setting jazz ambience\n\nmade f...,pos


### remove some empty values

In [0]:
sentiment_data = sentiment_data.dropna()

In [0]:
sentiment_data = sentiment_data.applymap(lambda x: x.strip() if type(x)==str else x)

In [7]:
print(sentiment_data['text'].values)

['hands down best bloody mary ever so many things in my bm veggies  mmm bacon  nailed it its an appetizer i also had the burger with fries who knew fries could be so good the presentation doesnt disappoint ether very nice great seating outside although we sat inside would love to see this restaurant near my hometown seriously get the bm you rock guy'
 'its true this hotel carries you while in vegas we visited the venetian to experience the awesome interior and romantic evening events the entire hotel sparkled its huge too  weekendinvegas'
 'first time  lemme not exaggerate but yo nothing but love and fun cups full and aint nothing but love nothing but good vibes tonight some people dapper some people coolin all age groups people eating'
 ...
 'if i could give this place  stars i would handsdown the best service ive received at a shisha spot worldwide \n\ni was taken care of from the moment i walked in to when i walked out the staff were so attentive and checked on me nonstop they def k

### tokenize text reviews

In [0]:
vocab_size = 4700

In [0]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, lower=False, split=" ")

In [0]:
tokenizer.fit_on_texts(sentiment_data['text'].values)

In [11]:
print('Number of words: %d' % len(tokenizer.word_index))

Number of words: 25474


In [0]:
X = tokenizer.texts_to_sequences(sentiment_data['text'].values)

### fill empty words with none to bring every review to the same world length

In [0]:
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=vocab_size)

In [14]:
print(X.shape[1])

4700


In [15]:
embed_dim = 128
lstm_out = 200

#model = tf.keras.models.Sequential([
#    tf.keras.layers.Embedding(vocab_size, embed_dim,input_length = X.shape[1]),
#    tf.keras.layers.LSTM(60, return_sequences=True, recurrent_dropout=0.5),
#    tf.keras.layers.Dropout(0.5),
#    tf.keras.layers.LSTM(60, recurrent_dropout=0.5),
#    tf.keras.layers.Dense(60, activation='relu'),
#    tf.keras.layers.Dense(2,activation='softmax') # classification layer with two classes (neg, pos)
#])

model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embed_dim,input_length = X.shape[1]),
    tf.keras.layers.LSTM(lstm_out, recurrent_dropout=0.5),
    tf.keras.layers.Dense(2,activation='softmax') # classification layer with two classes (neg, pos)
])


model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())  

W0414 12:11:13.707231 139894846355328 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f3b5f344940>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.
W0414 12:11:14.266299 139894846355328 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py:4081: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4700, 128)         601600    
_________________________________________________________________
unified_lstm (UnifiedLSTM)   (None, 200)               263200    
_________________________________________________________________
dense (Dense)                (None, 2)                 402       
Total params: 865,202
Trainable params: 865,202
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
print(sentiment_data['sentiment'].head(5))

0    pos
1    pos
2    pos
3    pos
4    pos
Name: sentiment, dtype: object


### 1 = postive, 0 = negativ

In [17]:
Y = pd.get_dummies(sentiment_data['sentiment']).values
print(Y)

[[0 1]
 [0 1]
 [0 1]
 ...
 [0 1]
 [1 0]
 [1 0]]


In [0]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X,Y, test_size = 0.2, random_state = 42)

In [19]:
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4690,4691,4692,4693,4694,4695,4696,4697,4698,4699
0,0,0,0,0,0,0,0,0,0,0,...,23,4,130,115,27,1,1282,228,435,19
1,0,0,0,0,0,0,0,0,0,0,...,150,11,6,133,4391,2,144,858,26,208
2,0,0,0,0,0,0,0,0,0,0,...,16,9,41,375,210,2,252,344,183,173
3,0,0,0,0,0,0,0,0,0,0,...,309,2,120,34,1706,16,29,9,4,387
4,0,0,0,0,0,0,0,0,0,0,...,579,6,219,18,2048,70,618,97,80,283


In [20]:
pd.DataFrame(Y_train).head()

Unnamed: 0,0,1
0,0,1
1,0,1
2,1,0
3,0,1
4,1,0


In [0]:
num_epochs = 5
batch_size = 10

In [0]:
history = model.fit(X_train, Y_train, validation_data=(X_val, Y_val), batch_size = batch_size, epochs = num_epochs, verbose = 1)

Train on 5251 samples, validate on 1313 samples
Epoch 1/5


In [0]:
history.history.keys()

In [0]:
from matplotlib import pyplot

pyplot.plot(history.history['accuracy'])
pyplot.plot(history.history['val_accuracy'])
pyplot.plot(history.history['loss'])
pyplot.plot(history.history['val_loss'])
pyplot.title('Training results')
pyplot.ylabel('accuracy and loss')
pyplot.xlabel('epoch')
pyplot.legend(['train_acc', 'val_acc', 'train_loss', 'val_loss'], loc='upper left')
pyplot.show()



In [0]:
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("Score: %.2f" % (score))
print("Validation Accuracy: %.2f" % (acc))

In [0]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(X_test)):
    
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    
    if np.argmax(result) == np.argmax(Y_test[x]):
        if np.argmax(Y_test[x]) == 0:
            neg_correct += 1
        else:
            pos_correct += 1
            
    if np.argmax(Y_test[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1

In [0]:
print("positive accuracy: %.2f" % (pos_correct/pos_cnt*100))
print("negative accuracy: %.2f" % (neg_correct/neg_cnt*100))

### Serialize the Keras model to JSON and save the model weight

In [0]:
from google.colab import files

In [0]:
model_json = model.to_json()
with open("yelp-sentiment-model.json", "w") as json_file:
    json_file.write(model_json)

In [0]:
files.download("yelp-sentiment-model.json")

In [0]:
model.save_weights("yelp-sentiment-model-model.h5")
print("Saved model to disk")

In [0]:
files.download('yelp-sentiment-model-model.h5')

### Load model and use it to predict new reviews

In [0]:
json_file = open('yelp-sentiment-model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()

loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights("yelp-sentiment-model.h5")
 
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
score = loaded_model.evaluate(X_val, Y_val, verbose=0)

In [0]:
xval = X_val[1].reshape(1,X_val.shape[1])
print(xval)

In [0]:
# Creating a reverse dictionary
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

# Function takes a tokenized sentence and returns the words
def sequence_to_text(list_of_indices):
    words = []
    # Looking up words in dictionary
    #words = [reverse_word_map.get(letter) for letter in list_of_indices]
    for letter in list_of_indices:
        w = reverse_word_map.get(letter)
        if(w != None):
            words.append(w)
    return(words)

In [0]:
xval_str = list(map(sequence_to_text, xval))

In [0]:
yresult = model.predict(xtest)
print("words=%s, predicted sentiment=%s" % (xtest_str, yresult))

In [0]:
# 1 = positiv, 0 = negativ
result = model.predict_classes(xtest)
print(result)

In [0]:
yhat = model.predict_proba(xtest) 
print(yhat)