<a href="https://colab.research.google.com/github/mindyng/mindyng.github.io/blob/master/Sentiment_Analyzer_Deployed_via_Streamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up Kaggle Dependencies

In [1]:
# Install Kaggle library
!pip install -q kaggle
from google.colab import files
#upload the kaggle.json file that was downloaded
uploaded = files.upload()
# make a diectory in which kaggle.json is stored
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
#download the dataset into colab(paste API command after !)
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
#unzip the data
!unzip imdb-dataset-of-50k-movie-reviews.zip

Saving kaggle.json to kaggle.json
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 97% 25.0M/25.7M [00:00<00:00, 29.5MB/s]
100% 25.7M/25.7M [00:00<00:00, 37.7MB/s]
Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


# Load Dependencies

In [31]:
import numpy as np
import pandas as pd
import string
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Bidirectional,Flatten,Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

# Pre-Process Data
Clear HTML tags, numbers, special characters

In [22]:
def remove_special_characters(text):
    '''Make text lowercase, remove text in square brackets, remove links, remove punctuation.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    #text = re.sub('\w*\d\w*', '', text)
    return text

def remove_num(text):
 return "".join(re.sub('([0–9]+)','',text))

data = pd.read_csv('/content/IMDB Dataset.csv')
data.review=data.review.apply(lambda x : remove_num(x))
data.review = data.review.apply(lambda x: remove_special_characters(x))
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


# Word Embeddings

In [29]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data["review"])
X = tokenizer.texts_to_sequences(data["review"])
X = pad_sequences(X,maxlen=500)
Y = data["sentiment"]
vocab_size = len(tokenizer.word_index) + 1

In [30]:
# We can then create our train and test sets:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state = 24)
#We store this tokenizer in a file to use later in web app
import pickle
# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [32]:
def prepare_targets(y_train, y_test):
 le = LabelEncoder()
 le.fit(y_train)
 y_train_enc = le.transform(y_train)
 y_test_enc = le.transform(y_test)
 return y_train_enc, y_test_enc
ytrain,ytest = prepare_targets(Y_train,Y_test)

# Model Architecture Build

In [36]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=500))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.5))
model.add(Dense(1,activation="sigmoid"))
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 50)           11111300  
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               183296    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 11,294,853
Trainable params: 11,294,853
Non-trainable params: 0
_________________________________________________________________


# Training the Model

In [41]:
from keras.callbacks import EarlyStopping
es = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=5)
history=model.fit(X_train, ytrain,
 batch_size=128,
 epochs=20,
 validation_data=[X_test, ytest],
 callbacks=[es])
#We save this model so that we can use in own web app
model.save('movie_sent.h5')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 00006: early stopping


# Assess Model Prediction Accuracy

In [42]:
string11="Between the Lovecraftian overtones and Liberato’s performance, The Beach House offers up beautifully shot terror and will make you think before opening your door."
x_1=tokenizer.texts_to_sequences([string11])
x_1 = pad_sequences(x_1,maxlen=500)
model.predict(x_1)
#Output (closer to 1 = positive review and closer to 0 = negative review)

array([[0.98013276]], dtype=float32)

# Deployment Using Streamlit

In [None]:
import streamlit as st
import pickle
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict(message):
    model=load_model("movie_sent.h5")
    with open('tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
        x_1 = tokenizer.texts_to_sequences([message])
        x_1 = pad_sequences(x_1, maxlen=500)
        predictions = model.predict(x_1)[0][0]
        return predictions

st.title("Movie Review Sentiment Analyzer")
message = st.text_area("Enter Review","Type Here ..")

if st.button("Analyze"):
    with st.spinner("Analyzing the text…"):
        prediction=predict(message)
        if prediction > 0.6:
            st.success("Positive review with {:.2f} confidence".format(prediction))
            st.balloons()
        elif prediction <0.4:
            st.error("Negative review with {:.2f} confidence".format(1-prediction))
        else:
            st.warning("Not sure! Try to add some more words/context")