## Movie_Review_Sentimental_Analysis

### <b><u> In this notebook we will do movie review sentimental analysis using simple ANN model. For this we use *Tensorflow, nltk* libraries.

<b><u>Importing Libraries and functions

In [1]:
# To remove punctuation
import string

# Array operation
import numpy as np

#For Dataframe
import pandas as pd

# Regex Operation
import re

import os

# Stopword remove
import nltk
from nltk.corpus import stopwords

# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer

# Deep Neural Network
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

2023-05-04 01:02:10.368000: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-04 01:02:10.459913: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-04 01:02:10.463426: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-05-04 01:02:10.463437: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

#### <u> function to read contents from text file

In [2]:
def load_doc(filename):
    
    # open file read only
    file = open(filename,'r')
    
    # read all text
    text = file.read()
    
    # close file
    file.close()
    
    # return text data
    return text

#### <u> function to clean the text

In [3]:
# turn a doc into clean tokens
def clean_doc(doc):
    
    # split into tokens by white space
    tokens = doc.split()
    
    # prepare regex for char filtering
    re_punc = re.compile("[%s]" % re.escape(string.punctuation))
    
    # remove punctuation from each word
    tokens = [re_punc.sub( "", w) for w in tokens]
    
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    
    # filter out stop words
    stop_words = set(stopwords.words("english"))
    tokens = [w for w in tokens if not w in stop_words]
    
    # filter out short tokens
    tokens = [word for word in tokens if len(word)>1]
    return tokens

#### <u> function to convert document to lines

In [4]:
# Load docs, clean and return line of tokens
def doc_to_line(filename,vocab):
    
    # load the doc
    doc = load_doc(filename)
    
    # clean doc
    tokens = clean_doc(doc)
    
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    
    return " ".join(tokens)

#### <u> file vocab file loaded

In [6]:
vocab_filename = '../dataset/vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

#### <u> function to read all files from directory

In [7]:
# load all docs from dictionary

def process_docs(directory,vocab):
    
    lines = list()
    
    #walk through all files and folders
    for filename in os.listdir(directory):
        
        #create the full path
        path = directory + "/" + filename
        
        #load and clean data
        line = doc_to_line(path,vocab)
        
        #add to list
        lines.append(line)
    
    return lines

#### <u> function to load and clean entire dataset

In [8]:
def load_clean_dataset(vocab):
    
    # load documents
    neg = process_docs('../dataset/movie_review/neg',vocab)
    pos = process_docs('../dataset/movie_review/pos',vocab)
    
    docs = neg + pos
    
    # prepare labels
    labels = [0 for _ in range(len(neg))] + [1 for _ in range(len(pos))]
    
    return docs,labels

#### <u> function to fit a tokenizer

In [9]:
def create_tokenizer(lines):
    
    tokenizer = Tokenizer()
    
    tokenizer.fit_on_texts(lines)
    
    return tokenizer

#### <u> function to define and create model

In [10]:
# define the model
def define_model(n_words):
    
    # define network
    model = Sequential()
    
    # Dense Layer 1
    model.add(Dense(50,
                   input_shape = (n_words,),
                   activation = 'relu'))
    
    # Dense Layer 2 (output layer)
    model.add(Dense(1,
                    activation = 'sigmoid'))
    
    # compilation
    model.compile(loss = 'binary_crossentropy',
                 optimizer = 'adam',
                 metrics = ['accuracy'])
    
    # summarize defined model
    model.summary()
    
    # plot model
    plot_model(model,
              to_file = 'model.png',
              show_shapes = True)
    
    return model

#### <u> load all reviews

In [11]:
# train dataset
train_docs , ytrain = load_clean_dataset(vocab)

# test dataset
test_docs , ytest = load_clean_dataset(vocab)

#### <u> create the tokenizer

In [12]:
# this is object
tokenizer = create_tokenizer(train_docs)

#### <u> encode data

In [13]:
# convert text doc to binary matrix
#(i.e. if word present 1 else 0)

X_train = tokenizer.texts_to_matrix(train_docs, mode = 'binary')

X_test = tokenizer.texts_to_matrix(test_docs, mode = 'binary')

#### <u> Define the network

In [14]:
n_words = X_train.shape[1]
model = define_model(n_words)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                1288450   
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 1,288,501
Trainable params: 1,288,501
Non-trainable params: 0
_________________________________________________________________


2023-05-04 01:03:16.581992: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-04 01:03:16.582524: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-05-04 01:03:16.582988: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2023-05-04 01:03:16.583375: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2023-05-04 01:03:16.583668: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Co

#### <u> fit the network

In [15]:
model.fit(X_train,np.array(ytrain),
          validation_data=[X_test,np.array(ytest)],
         epochs=10,
         batch_size = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f470f52ada0>

#### <u> classify the review as negative or positive

In [16]:
def predict_sentiment(review): # , vocab , tokenizer , model
    
    # clean
    tokens = clean_doc(review)
    
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    
    # convert to line
    line = ' '.join(tokens)
    
    # encode
    encoded = tokenizer.texts_to_matrix([line], mode = 'binary')
    
    # predict sentiments
    yhat = model.predict(encoded,verbose = 0)
    
    # retrieve predicted percentege and label
    percent_pos = yhat[0,0]
    
    if round(percent_pos)==0:
        
        return (1-percent_pos), 'NEGATIVE'
    
    return percent_pos, 'POSITIVE'            

In [17]:
text = 'best movie ever!! it was great. i recommend it.'

In [18]:
percent , sentiment = predict_sentiment(text)


print(f"Review: {text}\nSentiment: {sentiment} ({round(percent*100,2)}%)")

Review: best movie ever!! it was great. i recommend it.
Sentiment: POSITIVE (61.4%)
