In [47]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [46]:
# Hyperparameters shared by the cells below.
training_size = 30_000  # rows [0, training_size) form the training split
vocabs_size = 10_000    # passed to the Embedding layer as its input dimension
embedding_dim = 16      # width of each embedding vector
max_length = 200        # sequence length after padding/truncation
trunc_type = "post"     # forwarded to pad_sequences(truncating=...)
oov_tok = "<OOV>"       # forwarded to Tokenizer(oov_token=...)

In [42]:
# Load the IMDB CSV; later cells read its 'review' and 'sentiment' columns.
imdb_dataset = pd.read_csv(
    filepath_or_buffer='../../../../mnt/c/NN/NLP/IMDB Dataset.csv',
)

In [43]:
def clean_text(text):
    """Normalise a raw review string for tokenisation.

    The text is lower-cased, then HTML tags, URLs, ``[...]`` spans, hyphens,
    double quotes, newlines and any word containing a digit are replaced by a
    space. Finally whitespace is collapsed and tokens starting with ``@`` are
    dropped.

    Removed spans are replaced by a space rather than an empty string so that
    the words on either side of a tag or newline are not fused into one token
    (e.g. ``"good.<br /><br />bad"`` -> ``"good. bad"``).

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Cleaned text with tokens separated by single spaces.
    """
    text = text.lower()
    # Tags go first so a URL inside an attribute or next to a tag is isolated.
    text = re.sub(r"<.*?>+", " ", text)
    # URLs must be stripped before hyphens are turned into spaces, otherwise
    # "http://my-site.com" is split and only partly removed.
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)
    text = re.sub(r"\[.*?\]", " ", text)
    text = re.sub(r'[-"\n]', " ", text)
    # Drop every word that contains at least one digit.
    text = re.sub(r"\w*\d\w*", " ", text)
    # split() never yields empty tokens, so startswith is safe here.
    return " ".join(word for word in text.split() if not word.startswith("@"))

In [45]:
# Clean every review and turn the string sentiment into a 0/1 label.
# NOTE: this overwrites the columns in place, so re-running the cell without
# reloading the CSV maps the already-numeric labels to NaN.
sentiment_to_label = {'negative': 0, 'positive': 1}
imdb_dataset['review'] = imdb_dataset['review'].apply(clean_text)
imdb_dataset['sentiment'] = imdb_dataset['sentiment'].map(sentiment_to_label)

In [48]:
# Sequential hold-out split: the first `training_size` rows are used for
# training and every remaining row for validation.
reviews = imdb_dataset['review']
sentiments = imdb_dataset['sentiment']

training_data, validation_data = reviews[:training_size], reviews[training_size:]
training_labels, validation_labels = sentiments[:training_size], sentiments[training_size:]

In [49]:
# Fit the vocabulary on the training reviews only.
# num_words caps the ids emitted by texts_to_sequences below vocabs_size, and
# rarer words are mapped to the OOV token. Without the cap, ids can exceed the
# Embedding layer's input dimension (vocabs_size) and the lookup goes out of range.
tokenizer = Tokenizer(num_words=vocabs_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_data)

# Integer-encode the training reviews and pad/truncate them to max_length.
sequences = tokenizer.texts_to_sequences(training_data)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

In [50]:
# Training arrays: the padded matrix built from training_data and its labels.
training_pad = np.array(padded)
training_labels = np.array(training_labels)

# Validation arrays come from the held-out split, encoded with the tokenizer
# fitted on the training reviews. The previous version copied the training
# arrays here, so validation metrics only measured performance on training data.
validation_sequences = tokenizer.texts_to_sequences(validation_data)
validation_pad = np.array(
    pad_sequences(validation_sequences, maxlen=max_length, truncating=trunc_type)
)
validation_labels = np.array(validation_labels)

In [51]:
# Binary sentiment classifier: embed each token id, average the embeddings
# over the sequence, then apply a small dense head with a sigmoid output.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocabs_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


2022-08-28 17:08:04.328460: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-08-28 17:08:04.329498: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-08-28 17:08:04.335966: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [None]:
# Train for 10 epochs, evaluating on the validation arrays after each epoch.
# verbose=2 prints one summary line per epoch.
history = model.fit(
    x=training_pad,
    y=training_labels,
    epochs=10,
    validation_data=(validation_pad, validation_labels),
    verbose=2,
)