### **1. Importing libraries and loading the dataset**


In [2]:
from tensorflow.keras.datasets import imdb
import pandas as pd
import numpy as np
from tensorflow.keras.layers import LSTM, Activation, Dropout, Dense, Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from tensorflow.keras.models import Model
import string
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/NLP/IMDB Dataset.csv')

data['review'] = data['review'].str.lower()


In [4]:
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", 
             "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during",
             "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", 
             "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into",
             "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
             "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", 
             "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
             "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up",
             "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's",
             "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've",
             "your", "yours", "yourself", "yourselves" ]

In [5]:
def remove_stopwords(data):
  data['review without stopwords'] = data['review'].apply(lambda x : ' '.join([word for word in x.split() if word not in (stopwords)]))
  return data

def remove_tags(string):
    result = re.sub('<.*?>','',string)
    return result


In [6]:
data_without_stopwords = remove_stopwords(data)
data_without_stopwords['clean_review']= data_without_stopwords['review without stopwords'].apply(lambda cw : remove_tags(cw))
data_without_stopwords['clean_review'] = data_without_stopwords['clean_review'].str.replace('[{}]'.format(string.punctuation), ' ')

In [7]:
data_without_stopwords.head()

Unnamed: 0,review,sentiment,review without stopwords,clean_review
0,one of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching just 1 oz epi...,one reviewers mentioned watching just 1 oz epi...
1,a wonderful little production. <br /><br />the...,positive,wonderful little production. <br /><br />the f...,wonderful little production the filming techn...
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,thought wonderful way spend time hot summer we...
3,basically there's a family where a little boy ...,negative,basically family little boy (jake) thinks zomb...,basically family little boy jake thinks zomb...
4,"petter mattei's ""love in the time of money"" is...",positive,"petter mattei's ""love time money"" visually stu...",petter mattei s love time money visually stu...


In [8]:

reviews = data_without_stopwords['clean_review']
reviews

0        one reviewers mentioned watching just 1 oz epi...
1        wonderful little production  the filming techn...
2        thought wonderful way spend time hot summer we...
3        basically family little boy  jake  thinks zomb...
4        petter mattei s  love time money  visually stu...
                               ...                        
49995    thought movie right good job  wasn t creative ...
49996    bad plot  bad dialogue  bad acting  idiotic di...
49997    catholic taught parochial elementary schools n...
49998    going disagree previous comment side maltin on...
49999    no one expects star trek movies high art  fans...
Name: clean_review, Length: 50000, dtype: object

In [9]:
reviews_list = []
for i in range(len(reviews)):
  reviews_list.append(reviews[i])



In [10]:
sentiment = data_without_stopwords['sentiment']

In [11]:
y = np.array(list(map(lambda x: 1 if x=="positive" else 0, sentiment)))

In [12]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [13]:
X_train, X_test,Y_train, Y_test = train_test_split(reviews_list, y, test_size=0.2, random_state = 45)

In [14]:
len(Y_train)

40000

In [15]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

In [16]:
words_to_index = tokenizer.word_index

In [17]:
len(words_to_index)

95419

In [18]:
def read_glove_vector(glove_vec):
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
      w_line = line.split()
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)



  return word_to_vec_map
    

In [19]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip

In [20]:
# !apt install unzip
# !unzip -u '/content/glove.6B.zip' -d '/content/drive/MyDrive/Colab Notebooks/NLP/glove.6B'

In [21]:
word_to_vec_map = read_glove_vector('/content/drive/MyDrive/Colab Notebooks/NLP/glove.6B/glove.6B.50d.txt')

In [22]:
maxLen = 150


In [23]:

vocab_len = len(words_to_index)
embed_vector_len = word_to_vec_map['moon'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is not None:
    emb_matrix[index, :] = embedding_vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)






In [24]:
def imdb_rating(input_shape):

  X_indices = Input(input_shape)

  embeddings = embedding_layer(X_indices)

  X = LSTM(128, return_sequences=True)(embeddings)

  X = Dropout(0.6)(X)

  X = LSTM(128, return_sequences=True)(X)

  X = Dropout(0.6)(X)

  X = LSTM(128)(X)

  X = Dense(1, activation='sigmoid')(X)

  model = Model(inputs=X_indices, outputs=X)

  return model

In [25]:
model = imdb_rating((maxLen,))
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 50)           4770950   
_________________________________________________________________
lstm (LSTM)                  (None, 150, 128)          91648     
_________________________________________________________________
dropout (Dropout)            (None, 150, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 150, 128)          131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 150, 128)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584

In [26]:
# def conv1d_model(input_shape):

#   X_indices = Input(input_shape)

#   embeddings = embedding_layer(X_indices)

#   X = Conv1D(512,3,activation='relu')(embeddings)
  
#   X = MaxPooling1D(3)(X)

#   X = Conv1D(256,3,activation='relu')(X)
  
#   X = MaxPooling1D(3)(X)

#   X = Conv1D(256,3,activation='relu')(X)
#   X = Dropout(0.8)(X)
#   X = MaxPooling1D(3)(X)

#   X = GlobalMaxPooling1D()(X)

#   X = Dense(256, activation='relu')(X)
#   X = Dense(1, activation='sigmoid')(X)

#   model = Model(inputs=X_indices, outputs=X)

#   return model
                                    
                      
              
                                      




In [27]:
# model_1d = conv1d_model((maxLen,))
# model_1d.summary()

In [28]:
X_train_indices = tokenizer.texts_to_sequences(X_train)

In [29]:
X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')
X_train_indices.shape

(40000, 150)

In [30]:
from tensorflow.keras.optimizers import Adam


In [31]:
# adam = Adam(learning_rate = 0.0001)
# model_1d.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [32]:
# model_1d.fit(X_train_indices, Y_train, batch_size=64, epochs=15)

In [33]:
adam = Adam(learning_rate = 0.0001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [34]:
model.fit(X_train_indices, Y_train, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f49401ead10>

In [35]:
X_test_indices = tokenizer.texts_to_sequences(X_test)
print(X_test[0], X_test_indices[0], sep="\n")
X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')
print(X_test[0], X_test_indices[0], sep="\n")

movie absolute waste time  nothing wanna be gangster movie  contains predictable plot  feelings unsympathetic characters  dialogue mediocre best  half time looking something else do  movie boring  since already know going happen  half time desperately hoping protagonist grows pair balls just ends life jumping off bridge something  also  secondary characters part one  dimensional  no depth characters movie  no depth 
[1, 1521, 375, 14, 81, 2832, 417, 1992, 1, 1367, 644, 43, 1317, 34, 331, 1514, 47, 223, 14, 176, 62, 241, 427, 1, 272, 151, 390, 49, 86, 511, 223, 14, 2917, 1383, 1875, 3354, 1967, 4980, 8, 589, 46, 3451, 53, 3102, 62, 26, 4894, 34, 90, 6, 2014, 13, 1076, 34, 1, 13, 1076]
movie absolute waste time  nothing wanna be gangster movie  contains predictable plot  feelings unsympathetic characters  dialogue mediocre best  half time looking something else do  movie boring  since already know going happen  half time desperately hoping protagonist grows pair balls just ends life jump

In [36]:
model.evaluate(X_test_indices, Y_test)



[0.3498759865760803, 0.8490999937057495]

In [37]:
# model_1d.evaluate(X_test_indices, Y_test)

In [38]:
preds = model.predict(X_test_indices)

In [39]:
print(X_test_indices)
len(X_test_indices)


[[   1 1521  375 ...    0    0    0]
 [ 318  161    1 ...    0    0    0]
 [ 307    5 4714 ...    0    0    0]
 ...
 [ 618 1891 4572 ...    0    0    0]
 [1920    3  103 ...   11  271   10]
 [1058  123  812 ...  736  629   79]]


10000

In [40]:
n = np.random.randint(0,9999)

X_test[n]


'spoiler alert   you can listen wong kar wai s movies like radio play  invisible vibrations characters  rooms stay in  rhythm presses ahead  attraction dislike   whole spectrum atmosphere played back sound track  dialogue mostly completely unimportant the narration similar childish amorous look beautiful woman sad man whose sorrows noticeable  helpless   in mood love  told child perspective  child never appears narrator  aesthetic film developed extreme light color dramaturgy  harsh cuts  unattached  almost documentary camera complex  unobtrusive sound the genius use nat king cole s  perhaps  perhaps  perhaps   whose mysterious power grows often repeated melancholic waltz helps graceful choreography two protagonists  maggie cheung beautiful dresses brilliant  perfect vis à vis handsome  stylish tony leung  audience assumes romance them  wong just sees sad resignation  two potential lovers revolving around like satellites  knowing never will share orbit  wish will find other  won t emot

In [41]:
if preds[n] > 0.5:
  print("probabilty = ", preds[n])
  print('predicted sentiment : positive')
else: 
  print("probabilty = ", preds[n])
  print('precicted sentiment : negative')

if (Y_test[n] == 1):
  print('correct sentiment : positive')
else:
  print('correct sentiment : negative')


probabilty =  [0.99352115]
predicted sentiment : positive
correct sentiment : positive


In [42]:
preds[n]

array([0.99352115], dtype=float32)

In [43]:
Y_test[n]

1

In [44]:
# model.save_weights('/content/drive/MyDrive/Colab Notebooks/NLP/imdb_weights_lstm.hdf5')
# model.save('/content/drive/MyDrive/Colab Notebooks/NLP/imdb_lstm.h5')

In [45]:
reviews_list_idx = tokenizer.texts_to_sequences(reviews_list)

In [46]:
def add_score_predictions(data, reviews_list_idx):

  data['sentiment score'] = 0

  reviews_list_idx = pad_sequences(reviews_list_idx, maxlen=maxLen, padding='post')

  review_preds = model.predict(reviews_list_idx)

  data['sentiment score'] = review_preds

  pred_sentiment = np.array(list(map(lambda x : 'positive' if x > 0.5 else 'negative',review_preds)))

  data['predicted sentiment'] = 0

  data['predicted sentiment'] = pred_sentiment

  return data

  

In [47]:
data = add_score_predictions(data, reviews_list_idx)

In [48]:
data

Unnamed: 0,review,sentiment,review without stopwords,clean_review,sentiment score,predicted sentiment
0,one of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching just 1 oz epi...,one reviewers mentioned watching just 1 oz epi...,0.573071,positive
1,a wonderful little production. <br /><br />the...,positive,wonderful little production. <br /><br />the f...,wonderful little production the filming techn...,0.989158,positive
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,thought wonderful way spend time hot summer we...,0.991080,positive
3,basically there's a family where a little boy ...,negative,basically family little boy (jake) thinks zomb...,basically family little boy jake thinks zomb...,0.150421,negative
4,"petter mattei's ""love in the time of money"" is...",positive,"petter mattei's ""love time money"" visually stu...",petter mattei s love time money visually stu...,0.992541,positive
...,...,...,...,...,...,...
49995,i thought this movie did a down right good job...,positive,thought movie right good job. wasn't creative ...,thought movie right good job wasn t creative ...,0.897397,positive
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative,"bad plot, bad dialogue, bad acting, idiotic di...",bad plot bad dialogue bad acting idiotic di...,0.009456,negative
49997,i am a catholic taught in parochial elementary...,negative,catholic taught parochial elementary schools n...,catholic taught parochial elementary schools n...,0.490923,negative
49998,i'm going to have to disagree with the previou...,negative,going disagree previous comment side maltin on...,going disagree previous comment side maltin on...,0.758660,positive


In [49]:
import tensorflow as tf

In [50]:
model = tf.keras.models.load_model("/content/drive/MyDrive/Colab Notebooks/NLP/imdb_lstm.h5")

In [51]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 50)           4770950   
_________________________________________________________________
lstm (LSTM)                  (None, 150, 128)          91648     
_________________________________________________________________
dropout (Dropout)            (None, 150, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 150, 128)          131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 150, 128)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584

In [73]:
#custom Testing
X_test_indices = tokenizer.texts_to_sequences(["dirty place"])
print(X_test_indices)
X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')
print(X_test_indices)
predict = model.predict(X_test_indices)
if predict > 0.5:
  print("probabilty = ", predict)
  print('predicted sentiment : positive')
else: 
  print("probabilty = ", predict)
  print('precicted sentiment : negative')


[[1574, 196]]
[[1574  196    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0]]
probabilty =  [[0.41769662]]
precicted sentiment : negative


In [60]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-0.88.0-py2.py3-none-any.whl (8.0 MB)
[K     |████████████████████████████████| 8.0 MB 4.3 MB/s 
[?25hCollecting pydeck>=0.1.dev5
  Downloading pydeck-0.7.0-py2.py3-none-any.whl (4.3 MB)
[K     |████████████████████████████████| 4.3 MB 19.9 MB/s 
Collecting blinker
  Downloading blinker-1.4.tar.gz (111 kB)
[K     |████████████████████████████████| 111 kB 32.0 MB/s 
[?25hCollecting validators
  Downloading validators-0.18.2-py3-none-any.whl (19 kB)
Collecting gitpython!=3.1.19
  Downloading GitPython-3.1.18-py3-none-any.whl (170 kB)
[K     |████████████████████████████████| 170 kB 31.9 MB/s 
Collecting watchdog
  Downloading watchdog-2.1.5-py3-none-manylinux2014_x86_64.whl (75 kB)
[K     |████████████████████████████████| 75 kB 3.6 MB/s 
Collecting base58
  Downloading base58-2.1.0-py3-none-any.whl (5.6 kB)
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.7-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.8 

In [3]:
%%writefile app.py
import streamlit as st
import tensorflow as tf
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)

@st.cache(allow_output_mutation=True)
def load_model():
  model=tf.keras.models.load_model("/content/drive/MyDrive/Colab Notebooks/NLP/imdb_lstm.h5")
  return model
with st.spinner('Model is being loaded..'):
  model=load_model()

st.write("""
         # Sentiment Analysis With LSTM
         """
         )

# file = st.file_uploader("Please upload an brain scan file", type=["jpg", "png"])
# import cv2
# from PIL import Image, ImageOps
# import numpy as np
text = st.text_input("Please Enter Your Statement...")
st.set_option('deprecation.showfileUploaderEncoding', False)
def import_and_predict(sentence, model):
        X_test_indices = tokenizer.texts_to_sequences([sentence])
        print(X_test_indices)
        X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')
        print(X_test_indices)
        predict = model.predict(X_test_indices)
        if predict > 0.5:
          probabilty = predict
          prediction = "Positive"
        else: 
          probabilty = predict
          prediction = "Negative"
          
        # size = (180,180)    
        # image = ImageOps.fit(image_data, size, Image.ANTIALIAS)
        # image = np.asarray(image)
        # img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # #img_resize = (cv2.resize(img, dsize=(75, 75),    interpolation=cv2.INTER_CUBIC))/255.
        
        # img_reshape = img[np.newaxis,...]
    
        # prediction = model.predict(img_reshape)
        
        return probabilty, prediction
if text is None:
    st.text("Please Enter an Statement ")
else:
    # image = Image.open(file)
    # st.image(image, use_column_width=True)
    probabilty, prediction = import_and_predict(text, model)
    # score = tf.nn.softmax(predictions[0])
    # st.write(probabilty)
    # st.write(prediction)
    string = f"This Statement most likely is {prediction} with probabilty of {probabilty}..."
    st.success(string)
    # print(
    # "This image most likely belongs to {} with a {:.2f} percent confidence."
    # .format(class_names[np.argmax(score)], 100 * np.max(score)))


Overwriting app.py


In [4]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-5.1.0.tar.gz (745 kB)
[?25l[K     |▍                               | 10 kB 21.2 MB/s eta 0:00:01[K     |▉                               | 20 kB 22.5 MB/s eta 0:00:01[K     |█▎                              | 30 kB 23.5 MB/s eta 0:00:01[K     |█▊                              | 40 kB 19.9 MB/s eta 0:00:01[K     |██▏                             | 51 kB 14.6 MB/s eta 0:00:01[K     |██▋                             | 61 kB 11.1 MB/s eta 0:00:01[K     |███                             | 71 kB 12.0 MB/s eta 0:00:01[K     |███▌                            | 81 kB 13.3 MB/s eta 0:00:01[K     |████                            | 92 kB 13.4 MB/s eta 0:00:01[K     |████▍                           | 102 kB 10.2 MB/s eta 0:00:01[K     |████▉                           | 112 kB 10.2 MB/s eta 0:00:01[K     |█████▎                          | 122 kB 10.2 MB/s eta 0:00:01[K     |█████▊                          | 133 kB 10.2 MB/s eta 0:00:01[K

In [5]:
!streamlit run app.py &>/dev/null&


In [9]:
from pyngrok import ngrok
 
public_url = ngrok.connect('8501')
public_url

t=2021-09-15T14:43:34+0000 lvl=warn msg="failed to start tunnel" pg=/api/tunnels id=f92eae5cc7bc75fc err="Your account may not run more than 2 tunnels over a single ngrok client session.\nThe tunnels already running on this session are:\ntn_1yBFCBMc1sNf9xVnvA84yCok3ow, tn_1yBFCFqVqSRHox3RVa8ikX0FQV0\n\r\n\r\nERR_NGROK_324\r\n"


PyngrokNgrokHTTPError: ignored