In [1]:
!pip install tensorflow-datasets > /dev/null

In [2]:
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = set(nltk.corpus.stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
(ds_train,ds_test),ds_info = tfds.load(
    name="imdb_reviews",
    split=["train","test"],
    shuffle_files=True,
    as_supervised=True,
    with_info=True
)

In [4]:
df_train = tfds.as_dataframe(ds_train, ds_info)
df_test = tfds.as_dataframe(ds_test, ds_info)

In [5]:
def clean_entry(text_list):
  str_list = []
  for text in text_list:
    str_text = str(text)
    str_text_lim = len(str_text)-1
    str_text = str_text[1:str_text_lim]
    str_list.append(str_text)
  return html_term_remover(str_list)

def prepare_for_ai(df_col):
  list_to_return = df_col.tolist()
  return clean_entry(list_to_return)

def html_term_remover(df_list: list):
  return_list = []
  for i in df_list:
    b_soup = BeautifulSoup(i, 'html.parser')
    return_list.append(b_soup.get_text())
  return apply_re(return_list)

def apply_re(str_list):
  re_list = []
  for text in str_list:
    text = re.sub("[^0-9A-Za-z ]", "", text)
    re_list.append(text)
  return remove_integer(re_list)

def remove_integer(str_list):
  re_list = []
  int_list = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
  for text in str_list:
    sentence_list = []
    new_text = text.split()
    for word in new_text:
      if word not in int_list:
        sentence_list.append(word)
    re_list.append(' '.join(sentence_list))
  return return_lower_text(re_list)

def return_lower_text(str_list):
  re_list = []
  for word in str_list:
    re_list.append(word.lower())
  return apply_nltk(re_list)

def apply_nltk(text):
  return_list = []
  for elem in text:
    tokens = word_tokenize(elem)
    working_list = []
    for word in tokens:
      if len(word) > 3 and word not in stopwords:
        working_list.append(stemmer.lemmatize(word))
    return_list.append(' '.join(working_list))
  return return_list

In [6]:
x_train = prepare_for_ai(df_train['text'])
x_test = prepare_for_ai(df_test['text'])
y_train = df_train['label'].values.tolist()
y_test = df_test['label'].values.tolist()

In [7]:
x_all = x_train.copy()
x_all.extend(x_test)

y_all = y_train.copy()
y_all.extend(y_test)

assert len(y_all) == len(x_all)

In [8]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [9]:
tokenizer = Tokenizer(num_words=750, split=' ')
tokenizer.fit_on_texts(x_all)


In [10]:
x_as_sequence = tokenizer.texts_to_sequences(x_all)
x_as_sequence = pad_sequences(x_as_sequence)

y_all_array = np.asarray(y_all)

In [11]:
print(x_as_sequence.shape)
print(y_all_array.shape)
print(type(x_as_sequence))
print(type(y_all_array))

(50000, 514)
(50000,)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [12]:
x_train_padded, x_test_padded, y_train_new, y_test_new = train_test_split(x_as_sequence, y_all_array, test_size=0.2, random_state=35)

In [18]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Embedding,LSTM, SpatialDropout1D, Input, Dropout

In [19]:
inputs = Input(shape=(514,))
embedding_layer = Embedding(751, 150, input_length=553)(inputs)
dropout_1 = SpatialDropout1D(0.35)(embedding_layer)
lstm_layer = LSTM(75)(dropout_1)
dropout_2 = Dropout(0.15)(lstm_layer)
d_layer = Dense(10, activation="relu")(dropout_2)
out_layer = Dense(1, activation='sigmoid')(d_layer)

model = Model(inputs=inputs, outputs=out_layer, name="lstm_classifier")
model.summary()

Model: "lstm_classifier"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 514)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 514, 150)          112650    
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 514, 150)         0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 75)                67800     
                                                                 
 dropout (Dropout)           (None, 75)                0         
                                                                 
 dense_1 (Dense)             (None, 10)                760       
                                                   

In [20]:
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.optimizers import Adam

In [21]:
loss = BinaryCrossentropy(from_logits=False)
metric = BinaryAccuracy(name='accuracy')
optimizer = Adam(learning_rate=0.001)

model.compile(optimizer=optimizer, loss=loss, metrics=metric)

In [22]:
model.fit(x_train_padded, y_train_new, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2a8d445cd0>

In [23]:
model.evaluate(x_test_padded, y_test_new)



[0.39689555764198303, 0.8438000082969666]