In [1]:
!pip install tensorflow-datasets > /dev/null

In [2]:
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stopwords = set(nltk.corpus.stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
(ds_train,ds_test),ds_info = tfds.load(
    name="imdb_reviews",
    split=["train","test"],
    shuffle_files=True,
    as_supervised=True,
    with_info=True
)

In [4]:
df_train = tfds.as_dataframe(ds_train, ds_info)
df_test = tfds.as_dataframe(ds_test, ds_info)

In [5]:
def clean_entry(text_list):
  str_list = []
  for text in text_list:
    str_text = str(text)
    str_text_lim = len(str_text)-1
    str_text = str_text[1:str_text_lim]
    str_list.append(str_text)
  return html_term_remover(str_list)

def prepare_for_ai(df_col):
  list_to_return = df_col.tolist()
  return clean_entry(list_to_return)

def html_term_remover(df_list: list):
  return_list = []
  for i in df_list:
    b_soup = BeautifulSoup(i, 'html.parser')
    return_list.append(b_soup.get_text())
  return apply_re(return_list)

def apply_re(str_list):
  re_list = []
  for text in str_list:
    text = re.sub("[^0-9A-Za-z ]", "", text)
    re_list.append(text)
  return remove_integer(re_list)

def remove_integer(str_list):
  re_list = []
  int_list = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
  for text in str_list:
    sentence_list = []
    new_text = text.split()
    for word in new_text:
      if word not in int_list:
        sentence_list.append(word)
    re_list.append(' '.join(sentence_list))
  return return_lower_text(re_list)

def return_lower_text(str_list):
  re_list = []
  for word in str_list:
    re_list.append(word.lower())
  return apply_nltk(re_list)

def apply_nltk(text):
  return_list = []
  for elem in text:
    tokens = word_tokenize(elem)
    working_list = []
    for word in tokens:
      if len(word) > 3 and word not in stopwords:
        working_list.append(stemmer.lemmatize(word))
    return_list.append(' '.join(working_list))
  return return_list

In [6]:
x_train = prepare_for_ai(df_train['text'])
x_test = prepare_for_ai(df_test['text'])
y_train = df_train['label'].values.tolist()
y_test = df_test['label'].values.tolist()

In [7]:
x_all = x_train.copy()
x_all.extend(x_test)

y_all = y_train.copy()
y_all.extend(y_test)

assert len(y_all) == len(x_all)

In [8]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [9]:
from tensorflow.keras.layers import TextVectorization

vectorizer = TextVectorization(max_tokens=750, output_sequence_length=200, standardize=None)
vectorizer.adapt(x_all)

In [10]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [11]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip -q glove.6B.zip

In [12]:

embeddings_index = {}
with open("/content/glove.6B.200d.txt") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [13]:
num_tokens = len(voc) + 2
embedding_dim = 200
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 746 words (4 misses)


In [14]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Embedding,LSTM, SpatialDropout1D, Input, Dropout
from tensorflow.keras.initializers import Constant

In [15]:
inputs = Input(shape=(200,))
embedding_layer = Embedding(num_tokens, embedding_dim, embeddings_initializer=Constant(embedding_matrix))(inputs)
dropout_1 = SpatialDropout1D(0.50)(embedding_layer)
lstm_layer = LSTM(75)(dropout_1)
dropout_2 = Dropout(0.05)(lstm_layer)
d_layer = Dense(10, activation="relu")(dropout_2)
out_layer = Dense(1, activation='sigmoid')(d_layer)

model = Model(inputs=inputs, outputs=out_layer)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 200)]             0         
                                                                 
 embedding (Embedding)       (None, 200, 200)          150400    
                                                                 
 spatial_dropout1d (SpatialD  (None, 200, 200)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 75)                82800     
                                                                 
 dropout (Dropout)           (None, 75)                0         
                                                                 
 dense (Dense)               (None, 10)                760       
                                                             

In [16]:
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.optimizers import Adam

In [17]:
loss = BinaryCrossentropy(from_logits=False)
metric = BinaryAccuracy(name='accuracy')
optimizer = Adam(learning_rate=0.001)

model.compile(optimizer=optimizer, loss=loss, metrics=metric)

In [18]:
x_all_vectorized = vectorizer(np.array([[s] for s in x_all])).numpy()
y_all_vectorized = np.asarray(y_all)

In [19]:
x_train_model, x_test_model, y_train_model, y_test_model = train_test_split(x_all_vectorized, y_all_vectorized, test_size=0.2, random_state=35)

In [20]:
model.fit(x_train_model, y_train_model, epochs=15, batch_size=32)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fe48e7c3a50>

In [21]:
model.evaluate(x_test_model, y_test_model)



[0.3689700663089752, 0.8475000262260437]