In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
import re,string,unicodedata
from bs4 import BeautifulSoup

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.text import Tokenizer

from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Read the review dataset from the CSV next to the notebook and preview it.
DATASET_CSV = 'imdb-dataset.csv'
dataset = pd.read_csv(DATASET_CSV)
dataset.head()

In [None]:
import warnings

# Blanket-silences every warning for the rest of the kernel session, not just this cell.
warnings.simplefilter('ignore')

# Class balance: occurrences of each sentiment label, as a table and as a bar chart.
display(dataset['sentiment'].value_counts())
# Name the column via keywords. A Series passed positionally lands in seaborn's
# `data` parameter on current releases rather than being used as the x variable.
sns.countplot(x='sentiment', data=dataset)

In [None]:
# Summary statistics for the columns of the loaded dataset.
dataset.describe()

In [None]:
# NLTK Toktok tokenizer; preprocess_data() reads this global to split reviews into tokens.
tokenizer = ToktokTokenizer()

# English stop-word list from the NLTK corpus; preprocess_data() filters tokens against it.
stopword_list = stopwords.words('english')

In [None]:
i = 0  # module-level count of reviews processed so far; drives the progress printout
def preprocess_data(text, is_lower_case=False):
    """Clean one raw review and return it as a space-joined string of stemmed tokens.

    Steps, in order:
      1. strip HTML markup with BeautifulSoup and drop ``[...]`` bracketed spans,
      2. delete every character that is not an ASCII letter, digit or whitespace,
      3. tokenize with the module-level ``tokenizer`` and drop tokens found in
         the module-level ``stopword_list``,
      4. Porter-stem the remaining tokens.

    Stop words are removed *before* stemming: the stemmer rewrites many of them
    (e.g. "this" -> "thi", "was" -> "wa"), after which they would no longer
    match ``stopword_list`` and would leak into the output.

    Args:
        text: Raw review text, possibly containing HTML.
        is_lower_case: If True, tokens are compared against the stop-word list
            as-is; otherwise each token is lower-cased for the comparison.

    Returns:
        The cleaned review as a single string.

    Side effects:
        Increments the global counter ``i`` and prints it every 10,000 calls.
    """
    global i

    # Remove noisy data: HTML markup and square-bracketed spans.
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'\[[^]]*\]', '', text)

    # Remove special characters. The upper-case range has to be A-Z: an A-z range
    # also covers [ \ ] ^ _ and the backtick, which would then be kept.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove stop words (set membership instead of scanning the list per token).
    stop_words = set(stopword_list)
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        kept_tokens = [token for token in tokens if token not in stop_words]
    else:
        kept_tokens = [token for token in tokens if token.lower() not in stop_words]

    # Stem the surviving tokens.
    pstemmer = PorterStemmer()
    filtered_text = ' '.join(pstemmer.stem(token) for token in kept_tokens)

    # Progress indicator for long .apply() runs.
    i += 1
    if i % 10000 == 0:
        print(i)
    return filtered_text


#### Apply preprocessing to every review
# NOTE(review): the cleaned text is written back over the raw 'review' column, so
# running this cell a second time would preprocess already-preprocessed text.

dataset['review']=dataset['review'].apply(preprocess_data)

In [None]:
# Inspect the 'review' column as it currently stands.
dataset['review']

In [None]:
# Hold out 20% of the reviews. The fixed seed makes the split, and everything
# derived from it, reproducible across kernel restarts.
training_reviews, testing_reviews, training_labels, testing_labels = train_test_split(
    dataset['review'].values,
    dataset['sentiment'].values,
    test_size=0.2,
    random_state=42,
)

# Learn the label -> integer mapping from the training labels only and reuse it
# for the test labels; refitting on the test split could yield a different mapping.
le = LabelEncoder()
training_labels = le.fit_transform(training_labels)
testing_labels = le.transform(testing_labels)

In [None]:
# Inspect the integer-encoded training labels.
training_labels

In [None]:
# Keras tokenizer that maps words to integer ids. It gets its own name so it does
# not replace the ToktokTokenizer bound to `tokenizer`, which preprocess_data()
# still calls (re-running the preprocessing cell would otherwise fail).
keras_tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
keras_tokenizer.fit_on_texts(training_reviews)  # vocabulary is built from the training reviews only
word_index = keras_tokenizer.word_index

# Convert both splits to id sequences using that vocabulary.
training_sequence = keras_tokenizer.texts_to_sequences(training_reviews)
testing_sequence = keras_tokenizer.texts_to_sequences(testing_reviews)

# Fix every review to 200 ids: short ones are padded at the front, long ones cut at the end.
train_pad_sequence = pad_sequences(training_sequence, maxlen=200, truncating='post', padding='pre')
test_pad_sequence = pad_sequences(testing_sequence, maxlen=200, truncating='post', padding='pre')
print('Total Unique Words : {}'.format(len(word_index)))

In [None]:
# Shape of the padded training matrix.
print(train_pad_sequence.shape)

# Print vocabulary entries longer than 15 characters, in vocabulary order.
long_words = [entry for entry in word_index if len(entry) > 15]
for entry in long_words:
    print(entry)

In [None]:
# Parse the GloVe text file into {token: vector}. Each line is split once on
# whitespace: the first field is the token, the remainder its coefficients.
with open('glove.6B.200d.txt', encoding='utf8') as glove_file:
    embedded_words = {
        token: np.array(values.split(), dtype=float)
        for token, values in (row.split(maxsplit=1) for row in glove_file)
    }

In [None]:
# One 200-d row per vocabulary id, plus one extra row so id len(word_index) is addressable.
# Words with no GloVe vector keep their all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, 200))
# The loop variable is deliberately not called `i`: that name is the global
# progress counter used by preprocess_data() and must not be overwritten here.
for word, word_id in word_index.items():
    embedding_vector = embedded_words.get(word)
    if embedding_vector is not None:
        embedding_matrix[word_id] = embedding_vector

In [None]:
# Sanity check: vocabulary size next to the embedding matrix shape.
len(word_index), embedding_matrix.shape

In [None]:
# Layer stack, in order: frozen embedding initialised from embedding_matrix,
# bidirectional LSTM, dropout, two L2-regularised ReLU layers, dropout, sigmoid output.
layer_stack = [
    tf.keras.layers.Embedding(
        len(word_index) + 1,
        200,
        weights=[embedding_matrix],
        input_length=200,
        trainable=False,
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(l2=0.1)),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(l2=0.01)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid),
]
model = tf.keras.Sequential(layer_stack)

In [None]:
# Binary cross-entropy with Adam. `metrics` is given as a list: that form is
# accepted by both older and current Keras, whereas a bare string is not.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer='Adam',
    metrics=['accuracy'],
)

# NOTE(review): the held-out test split is passed as validation data, so the
# validation metrics are not an independent estimate once they are used to pick
# an epoch or tune the model.
history = model.fit(
    train_pad_sequence,
    training_labels,
    batch_size=100,
    epochs=30,
    validation_data=(test_pad_sequence, testing_labels),
)


In [None]:
# Per-epoch curves recorded by model.fit().
acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
loss, val_loss = history.history['loss'], history.history['val_loss']
epochs = range(len(acc))


def _draw_history(train_curve, val_curve, train_label, val_label, title):
    """Draw one training/validation pair on the current pyplot figure."""
    plt.plot(epochs, train_curve, 'r', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(title)
    plt.legend(loc=0)


_draw_history(acc, val_acc, 'Training accuracy', 'Validation accuracy',
              'Training and validation accuracy')
plt.figure()  # start a second figure so the loss curves get their own plot
_draw_history(loss, val_loss, 'Training loss', 'Validation loss',
              'Training and validation loss')

plt.show()

In [None]:
# Best accuracy reached in any epoch, for each curve.
best_training_acc = max(acc)
best_validation_acc = max(val_acc)
print('Training Accuracy: {}'.format(best_training_acc))
print('Validation Accuracy: {}'.format(best_validation_acc))