#### <u> Import library

In [1]:
import string
import numpy as np
import pandas as pd
import re
import os
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPool1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten
from tensorflow.keras.utils import pad_sequences

2022-12-24 17:14:04.282346: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-24 17:14:04.405197: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-24 17:14:04.410304: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-24 17:14:04.410320: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

#### <u> function to read contents from text file

In [2]:
def load_doc(filename):
    """Read a text file and return its whole contents as a single string.

    Args:
        filename: Path of the file to read (opened in text mode, read only).

    Returns:
        The complete text of the file.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, 'r') as file:
        return file.read()

#### <u>Load the vocabulary file</u>

In [3]:
# location of the vocabulary file (whitespace-separated tokens)
vocab_filename = '/home/dai/Desktop/dai2022/modulewise/naturallanguageprocessing/datasets/vocab.txt'

# read the file and keep the distinct tokens as a set
vocab = set(load_doc(vocab_filename).split())

#### <u> function to clean the text

In [4]:
# reduce a document to the tokens that belong to the vocabulary
def clean_doc(doc,vocab):
    """Strip punctuation from a document and keep only in-vocabulary tokens.

    Args:
        doc: Raw document text.
        vocab: Container of accepted tokens; membership is tested exactly
            (no case folding is applied here).

    Returns:
        The surviving tokens joined by single spaces.
    """
    # character class matching every ASCII punctuation mark
    punctuation_pattern = re.compile("[%s]" % re.escape(string.punctuation))

    kept = []
    for raw_token in doc.split():
        # drop punctuation first, then test the stripped token against vocab
        stripped = punctuation_pattern.sub("", raw_token)
        if stripped in vocab:
            kept.append(stripped)

    return ' '.join(kept)

#### <u> load all docs in a directory

In [5]:
def process_docs(directory, vocab, is_train):
    """Load and clean every review of one split found in ``directory``.

    Files whose name starts with 'cv9' form the test split; all other files
    form the training split.

    Args:
        directory: Folder containing one review per file.
        vocab: Accepted tokens, forwarded to ``clean_doc``.
        is_train: Truthy to return the training split, falsy for the test split.

    Returns:
        A list with one cleaned, space-joined token string per selected file.
    """
    documents = list()

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document order (and everything derived from it) reproducible
    for filename in sorted(os.listdir(directory)):

        is_test_file = filename.startswith('cv9')

        # skip any reviews in the test set while building the training set
        if is_train and is_test_file:
            continue

        # skip any training reviews while building the test set
        if not is_train and not is_test_file:
            continue

        # create the full path of the file to open
        path = os.path.join(directory, filename)

        # load, clean and collect the document
        documents.append(clean_doc(load_doc(path), vocab))

    return documents

#### <u> function to load and clean entire dataset

In [6]:
def load_clean_dataset(vocab,is_train,
                       data_dir='/home/dai/Desktop/dai2022/modulewise/naturallanguageprocessing/datasets/review_polarity/txt_sentoken'):
    """Load the cleaned negative and positive reviews of one split.

    Args:
        vocab: Accepted tokens, forwarded to ``process_docs``.
        is_train: Truthy for the training split, falsy for the test split.
        data_dir: Folder containing the 'neg' and 'pos' sub-folders. Defaults
            to the location this notebook was originally written against, so
            existing two-argument calls behave as before.

    Returns:
        ``(docs, labels)`` where ``docs`` lists the negative reviews followed
        by the positive ones and ``labels`` holds 0 for each negative review
        and 1 for each positive review, in the same order.
    """
    # load documents
    neg = process_docs(os.path.join(data_dir, 'neg'), vocab, is_train)
    pos = process_docs(os.path.join(data_dir, 'pos'), vocab, is_train)

    docs = neg + pos

    # prepare labels: 0 = negative, 1 = positive
    labels = [0] * len(neg) + [1] * len(pos)

    return docs,labels

#### <u> function to fit a tokenizer

In [7]:
# build a tokenizer whose word index comes from the given texts
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer``, fit it on ``lines`` and return it.

    Args:
        lines: Texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

#### <u> function for integer encode and pad documents

In [8]:
def encode_docs(tokenizer, max_length, docs):
    """Integer-encode documents and pad them to a common length.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_length: Target sequence length handed to ``pad_sequences``.
        docs: Texts to encode.

    Returns:
        The result of ``pad_sequences`` with padding appended after the
        tokens (``padding='post'``).
    """
    return pad_sequences(tokenizer.texts_to_sequences(docs),
                         maxlen=max_length,
                         padding='post')

#### <u> function to define and create model

In [9]:
# define the model
def define_model(vocab_size,max_length):
    """Build, compile, summarise and plot the CNN sentiment classifier.

    Layers, in order: Embedding (100 dimensions) -> Conv1D (32 filters,
    kernel size 8, relu) -> MaxPool1D (pool size 2) -> Flatten ->
    Dense (50, relu) -> Dense (1, sigmoid).

    Args:
        vocab_size: Number of entries of the embedding table.
        max_length: Length of each input sequence.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss,
        adam optimizer, accuracy metric).

    Side effects:
        Prints the model summary and asks ``plot_model`` to write the
        diagram to 'model_cnn.png'.
    """
    # define network
    model = Sequential()

    # Embedding layer
    model.add(Embedding(vocab_size,100,
                        input_length = max_length))

    # Convolution layer
    model.add(Conv1D(filters=32,
                     kernel_size=8,
                     activation='relu'))

    # Pooling layer
    model.add(MaxPool1D(pool_size=2))

    # Flatten layer
    model.add(Flatten())

    # Dense layer 1; its input size is inferred from the Flatten output, so
    # no input_shape is given (the old (vocab_size,) value was wrong and unused)
    model.add(Dense(50,
                    activation = 'relu'))

    # Dense layer 2 (output layer): a single sigmoid unit
    model.add(Dense(1,
                    activation = 'sigmoid'))

    # compilation
    model.compile(loss = 'binary_crossentropy',
                  optimizer = 'adam',
                  metrics = ['accuracy'])

    # summarize defined model
    model.summary()

    # plot model
    plot_model(model,
               to_file = 'model_cnn.png',
               show_shapes = True)

    return model

#### <u> load all reviews

In [10]:
# training split: every review except the held-out 'cv9*' files
train_docs, ytrain = load_clean_dataset(vocab, is_train=True)

# test split: only the held-out 'cv9*' files
test_docs, ytest = load_clean_dataset(vocab, is_train=False)

#### <u> create the tokenizer

In [11]:
# fit the tokenizer on the training documents only
tokenizer = create_tokenizer(lines=train_docs)

#### <u>Define the vocabulary size</u>

In [12]:
# one more than the number of indexed words
# (presumably so index 0 stays free for padding; confirm against the Keras Tokenizer docs)
vocab_size = 1 + len(tokenizer.word_index)

print(f"vocabulary size: {vocab_size}")

vocabulary size: 25768


#### <u> calculate maximum sequence length

In [13]:
# longest document, in whitespace-separated tokens, of each split
max_l_train = max(len(doc.split()) for doc in train_docs)
max_l_test = max(len(doc.split()) for doc in test_docs)

# pad to the longer of the two
max_length = max(max_l_train, max_l_test)

print(f"Maximum Length = {max_length}")

Maximum Length = 1317


#### <u> encoded data

In [14]:
# turn each document into a sequence of word indices,
# padded to max_length (see encode_docs)

Xtrain = encode_docs(tokenizer, max_length, docs=train_docs)

Xtest = encode_docs(tokenizer, max_length, docs=test_docs)

In [15]:
# sanity check: the number of encoded rows should equal the number of labels
Xtrain.shape, len(ytrain)

((1810, 1317), 1810)

#### <u> Define the network

In [16]:
# build and compile the network (also prints the summary and saves the diagram)
model = define_model(vocab_size=vocab_size, max_length=max_length)

2022-12-24 17:14:40.243668: E tensorflow/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-12-24 17:14:40.243766: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (dai-Precision-7820-Tower): /proc/driver/nvidia/version does not exist
2022-12-24 17:14:40.254703: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1317, 100)         2576800   
                                                                 
 conv1d (Conv1D)             (None, 1310, 32)          25632     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 655, 32)          0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 20960)             0         
                                                                 
 dense (Dense)               (None, 50)                1048050   
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                        

#### <u> fit the network

In [17]:
# train for 10 epochs in batches of 10, scoring the held-out reviews after
# each epoch; validation_data is passed as the (x, y) tuple Keras documents
model.fit(Xtrain,np.array(ytrain),
          validation_data=(Xtest,np.array(ytest)),
          epochs=10,
          batch_size = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7ac27c3610>

#### <u> evaluate on train set

In [18]:
# displays [loss, accuracy] on the training data (accuracy is the compiled metric)
model.evaluate(Xtrain,np.array(ytrain))



[3.0148083169478923e-05, 1.0]

#### <u> evaluate on test set

In [19]:
# displays [loss, accuracy] on the held-out test data
model.evaluate(Xtest,np.array(ytest))



[0.6160975098609924, 0.8700000047683716]

#### <u> classify the review as negative or positive

In [20]:
def predict_sentiment(review): # , vocab , tokenizer , model
    """Classify one review with the notebook-level model.

    Relies on the notebook globals ``vocab``, ``tokenizer``, ``max_length``
    and ``model``.

    Args:
        review: Raw review text.

    Returns:
        ``(score, label)``: ``label`` is 'NEGATIVE' when the model output
        rounds to 0 and 'POSITIVE' otherwise; ``score`` is the model output
        for 'POSITIVE' and one minus the model output for 'NEGATIVE'.
    """
    # clean the review, then encode and pad it as a one-document batch
    cleaned = clean_doc(review, vocab)
    encoded = encode_docs(tokenizer,max_length,[cleaned])

    # model output for the single document in the batch
    prediction = model.predict(encoded,verbose = 0)
    percent_pos = prediction[0,0]

    rounds_to_negative = round(percent_pos)==0
    if not rounds_to_negative:
        return percent_pos, 'POSITIVE'

    # report the score of the predicted (negative) class
    return (1-percent_pos), 'NEGATIVE'

#### <u> positive review

In [21]:
text = 'Everyone will enjoy this film. I love it, recommended!'

In [22]:
# classify the review and report the score as a percentage
percent, sentiment = predict_sentiment(text)

print(
    f"Review: {text}\nSentiment: {sentiment} ({round(percent*100,2)}%)"
)

Review: Everyone will enjoy this film. I love it, recommended!
Sentiment: NEGATIVE (51.12%)


#### <u> negative review

In [23]:
text = 'this is bad movie. Do not watch it. It sucks.'

In [24]:
# classify the review and report the score as a percentage
percent, sentiment = predict_sentiment(text)

print(
    f"Review: {text}\nSentiment: {sentiment} ({round(percent*100,2)}%)"
)

Review: this is bad movie. Do not watch it. It sucks.
Sentiment: NEGATIVE (57.59%)


### <b><u>Save the classifier and tokenizer objects</u></b>

In [25]:
import joblib

In [26]:
# NOTE(review): this pickles the Keras model through joblib. Keras' own
# model.save() / load_model() is the documented way to persist a model;
# if this is changed, the matching joblib.load cell must change with it.
joblib.dump(model,'sentiment_model_27.model')



INFO:tensorflow:Assets written to: ram://bb907e3c-e785-4a38-976e-b4df3aacfc5b/assets


INFO:tensorflow:Assets written to: ram://bb907e3c-e785-4a38-976e-b4df3aacfc5b/assets


['sentiment_model_27.model']

In [27]:
# persist the fitted tokenizer so the same word index can be reused at inference time
joblib.dump(tokenizer,'sentiment_tokenizer_27.model')

['sentiment_tokenizer_27.model']

### <b><u> Creating UI using tkinter for above model

In [28]:
from tkinter import *

In [29]:
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artefacts produced by this notebook or another trusted source.
model = joblib.load('sentiment_model_27.model')

In [30]:
# NOTE(review): same pickle caveat as any joblib.load; load trusted files only.
tokenizer = joblib.load('sentiment_tokenizer_27.model')

In [33]:
top = Tk()

top.title('Sentiment Analysis')
top.geometry("500x350")

def show():
    """Classify the text in the entry box and display the result in the window."""

    # StringVar.get() already returns a str
    msg = text.get()

    # predict on model
    percent , sentiment = predict_sentiment(msg)

    # Show the typed review (msg), not the StringVar object, and update the
    # single result label in place so repeated clicks do not stack labels
    result_label.config(text = f"Review: {msg}\nSentiment: {sentiment} ({round(percent*100,2)}%)")

text = StringVar()  # holds the current contents of the entry box

# .place() returns None, so build each widget first and position it afterwards
l = Label(top, text = "Enter Sentence: ")
l.place(x=50,y=100)

e = Entry(top, textvariable=text)
e.place(x=200,y=100)

b = Button(top, text = "Submit", command=show)
b.place(x=180,y=150)

# empty until the first prediction; rewritten by show()
result_label = Label(top)
result_label.place(x=150,y=200)

top.mainloop()