In [None]:
import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt
import sklearn
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemma=WordNetLemmatizer()
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,SimpleRNN,Flatten,Embedding,Masking,Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.activations import relu,sigmoid,softmax
import pickle

In [None]:
# Fetch the NLTK resources this notebook relies on: the punkt tokenizer data,
# the WordNet database used by the lemmatizer, and the stop-word lists.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

# Globally silence Python warnings for the rest of the session.
import warnings
warnings.filterwarnings(action='ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
class sent_analysis:
    """Sentiment-classification pipeline for an IMDB-style review CSV.

    The stages are meant to be called in order, each one storing its result
    on the instance for the next stage to read:

    ``data_cleaning`` -> ``vectors_conversion`` -> ``data_splitting`` ->
    ``model_training`` -> ``model_save``

    Every stage reports a failure by printing the line number and message
    and then returns normally (the error is swallowed), so a failed stage
    leaves the attributes it was supposed to create unset.
    """

    # Size of the hashing space for `one_hot`; also the Embedding `input_dim`.
    # Kept as a class attribute so it is defined even without `__init__`.
    vocab_size = 10000

    def __init__(self, csv_path=r'/content/IMDB Dataset.csv', n_rows=10000, vocab_size=10000):
        """Load the CSV, keep the first ``n_rows`` rows and encode the labels.

        Args:
            csv_path: Location of the CSV with ``review`` and ``sentiment`` columns.
            n_rows: Number of leading rows to keep (``None`` keeps all rows).
            vocab_size: Hashing-space size used for word indices and the
                Embedding layer.
        """
        self.vocab_size = vocab_size
        # .copy() gives an independent frame, so the column assignment below
        # does not write into a slice of the frame returned by read_csv.
        self.df = pd.read_csv(csv_path).iloc[:n_rows].copy()
        print(f'shape{self.df.shape}')
        print(self.df.columns)
        print(self.df.isnull().sum())
        self.df['sentiment'] = self.df['sentiment'].map({'positive': 1, 'negative': 0})
        print(self.df.head(5))
        print(self.df['sentiment'].value_counts())

    @staticmethod
    def _report_error():
        """Print the line number and message of the exception being handled."""
        er_ty, er_msg, er_lin = sys.exc_info()
        print(f"Performance Error : {er_lin.tb_lineno} : due to {er_msg}")

    def data_cleaning(self):
        """Lower-case, strip punctuation, drop stop words and lemmatize reviews.

        Stores one cleaned string per review in ``self.cleaned_data``.
        """
        try:
            # Build these once: the original re-read the stop-word list for
            # every single token, which dominated the running time.
            stop_words = set(stopwords.words('english'))
            strip_punctuation = str.maketrans('', '', string.punctuation)

            # NOTE(review): HTML markup such as "<br />" is not removed before
            # punctuation stripping, so a stray "br" token survives in the
            # cleaned text. Any change here must be mirrored at inference time.
            self.cleaned_data = []
            for raw_review in self.df['review']:
                text = raw_review.lower().translate(strip_punctuation)
                tokens = [lemma.lemmatize(tok) for tok in text.split() if tok not in stop_words]
                self.cleaned_data.append(' '.join(tokens))
            print(f'data after cleaning:{self.cleaned_data[:10]}')

        except Exception:
            self._report_error()

    def vectors_conversion(self):
        """Hash each cleaned review to integer ids and pad to a common length.

        Sets ``self.vec_data`` (list of id lists), ``self.t`` (length of each
        review), ``self.a`` (longest length) and ``self.final_data`` (the
        post-padded 2-D array).
        """
        try:
            data_size = self.vocab_size
            self.vec_data = [one_hot(text, data_size) for text in self.cleaned_data]
            print(f'data after one hot{self.vec_data[:10]}')
            self.t = [len(ids) for ids in self.vec_data]
            self.a = max(self.t)
            self.final_data = pad_sequences(self.vec_data, maxlen=self.a, padding='post')
            print(f'cleaned data after preprocessing:{self.final_data}')

        except Exception:
            self._report_error()

    def data_splitting(self):
        """Split features and labels sequentially into train/validation/test.

        The first 70% of rows go to training, the next 20% to validation and
        the remaining 10% to test (7000 / 2000 / 1000 for 10000 rows). No
        shuffling is applied.
        """
        try:
            n_samples = len(self.final_data)
            # Integer arithmetic keeps the boundaries exact.
            train_end = n_samples * 7 // 10
            val_end = n_samples * 9 // 10
            targets = self.df['sentiment']

            self.train_inde_data = self.final_data[:train_end]
            self.train_dep_data = targets.iloc[:train_end]
            self.val_inde_data = self.final_data[train_end:val_end]
            self.val_dep_data = targets.iloc[train_end:val_end]
            self.test_inde_data = self.final_data[val_end:]
            self.test_dep_data = targets.iloc[val_end:]
            print(f'train_inde_data shape: {self.train_inde_data.shape}')
            print(f'self.train_dep_data shape: {self.train_dep_data.shape}')
            print(f'self.val_inde_data shape: {self.val_inde_data.shape}')
            print(f'self.val_dep_data shape: {self.val_dep_data.shape}')
            print(f'self.test_inde_data shape: {self.test_inde_data.shape}')
            print(f'self.test_dep_data shape: {self.test_dep_data.shape}')
        except Exception:
            self._report_error()

    def model_training(self, epochs=20, batch_size=100):
        """Build and fit a stacked bidirectional SimpleRNN binary classifier.

        Args:
            epochs: Number of training epochs.
            batch_size: Mini-batch size passed to ``fit``.

        The fitted model is stored in ``self.model``.
        """
        try:
            self.model = Sequential()
            # mask_zero=True makes the Embedding emit a mask for the padding
            # id 0. A separate Masking(mask_value=0.0) layer after the
            # Embedding would compare the *embedded vectors* with 0.0, which
            # does not identify padded steps.
            self.model.add(Embedding(input_dim=self.vocab_size, output_dim=5,
                                     input_length=self.a, mask_zero=True))
            self.model.add(Bidirectional(SimpleRNN(units=3, return_sequences=True, name='Hidden_layer_1')))
            self.model.add(Bidirectional(SimpleRNN(units=4, return_sequences=True, name='Hidden_layer_2')))
            self.model.add(Bidirectional(SimpleRNN(units=5, return_sequences=False, name='Hidden_layer_3')))
            # Single sigmoid unit: the output is one probability per review.
            self.model.add(Dense(units=1, activation='sigmoid', name='output_layer'))

            self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

            self.model.fit(self.train_inde_data, self.train_dep_data,
                           epochs=epochs, batch_size=batch_size,
                           validation_data=(self.val_inde_data, self.val_dep_data))
        except Exception:
            self._report_error()

    def model_save(self):
        """Pickle the trained model to ``analysis.pkl`` in the working directory."""
        try:
            # NOTE(review): pickle is kept so existing readers of
            # 'analysis.pkl' keep working; Keras' own model.save() is the
            # format-stable alternative if the file name can change.
            with open('analysis.pkl', 'wb') as f:
                pickle.dump(self.model, f)
        except Exception:
            self._report_error()

if __name__ == "__main__":
    # Run the whole pipeline; each stage reads what the previous one stored
    # on the instance, so the order below must be kept.
    obj = sent_analysis()
    for stage in (
        obj.data_cleaning,
        obj.vectors_conversion,
        obj.data_splitting,
        obj.model_training,
        obj.model_save,
    ):
        stage()


shape(10000, 2)
Index(['review', 'sentiment'], dtype='object')
review       0
sentiment    0
dtype: int64
                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. <br /><br />The...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family where a little boy ...          0
4  Petter Mattei's "Love in the Time of Money" is...          1
sentiment
1    5028
0    4972
Name: count, dtype: int64
data after cleaning:['one reviewer mentioned watching 1 oz episode youll hooked right exactly happened mebr br first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use wordbr br called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high

In [None]:
# Restore the model written to 'analysis.pkl' into `m`.
# pickle executes code embedded in the file, so only load files this
# notebook produced itself.
with open('analysis.pkl', mode='rb') as model_file:
    m = pickle.load(model_file)


In [None]:
# Display names for the two sentiments.
# NOTE(review): this order is NOT the class-id order used for training
# (the sentiment column is mapped negative -> 0, positive -> 1), so the list
# must not be indexed directly with a predicted class id.
labels = ['positive', 'negative']

In [None]:
# Classify one raw review with the reloaded model `m`, repeating the
# preprocessing applied to the training reviews (lower-case, strip
# punctuation, drop stop words, lemmatize, hash to ids, post-pad).
dic_size=10000  # hashing-space size; has to equal the value used for training
review=['I was very lucky to see this film as part of the Melbourne International Film Festival 2005 only a few days ago. I must admit that I am very partial to movies that focus on human relations and especially the ones which concentrate on the tragic side of life. I also love the majority of Scandinavian cinematic offerings, there is often a particular deep quality in the way the story unfolds and the characters are drawn. Character building in this film is extraordinary in its details and its depth. This is despite the fact that we do encounter quite a number of characters all with very particular personal situations and locations within their community. The audience at the end of the screening was very silent and pensive. I am still playing some of those scenes in my mind and I am still amazed at their power and meaningfulness.']
# Build the stop-word set once instead of re-reading the list for every token.
stop_words = set(stopwords.words('english'))
text = review[0].lower()
text = ''.join([i for i in text if i not in string.punctuation])
text = ' '.join([lemma.lemmatize(i) for i in text.split() if i not in stop_words])
# NOTE(review): one_hot hashes words with Python's built-in hash(), which is
# documented as not stable across runs; the ids only line up with training
# when both happen in the same interpreter session — confirm before reusing
# the pickled model in a fresh kernel.
v = [one_hot(i,dic_size) for i in [text]]
# NOTE(review): 953 is presumably the padded length found during training
# (the `a` attribute of the pipeline object) — verify after retraining.
p = pad_sequences(v,maxlen=953,padding='post')
# The network ends in a single sigmoid unit, so predict() returns one
# probability per review. np.argmax over that lone value is always 0, which
# made every review print as 'positive'. Threshold the probability instead;
# training encodes positive as 1 and negative as 0.
prob = float(np.ravel(m.predict(p))[0])
print('positive' if prob >= 0.5 else 'negative')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step
positive
