In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LinearRegression
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Preparing our Data
First we read in the data, encode the sentiment labels as integers (positive = 1, negative = 0), and separate the review and sentiment columns.

In [2]:
# Load the IMDB reviews. The rest of this cell relies on a 'review' column and a
# 'sentiment' column holding the strings 'positive' / 'negative'.
imdb_data = pd.read_csv('../input/imdb-dataset/imdbdataset.csv')

# Encode the labels as integers (positive -> 1, negative -> 0). Mapping the whole
# column yields a real integer column instead of an object column that merely
# contains ints, and lets us fail early on any label we do not recognise.
label_codes = imdb_data['sentiment'].map({'positive': 1, 'negative': 0})
if label_codes.isna().any():
    raise ValueError("'sentiment' contains values other than 'positive'/'negative'")
imdb_data['sentiment'] = label_codes.astype(int)

reviews, sentiments = imdb_data['review'], imdb_data['sentiment']

# Only the last expression of a cell is displayed, so the preview goes at the end.
imdb_data.head()

We now split our dataset into training, testing, and development sets (rows 0–29,999, 30,000–39,999, and 40,000–49,999, respectively).

In [3]:
# Split by row position: [0, 30000) for training, [30000, 40000) for testing and
# [40000, 50000) for development.
# Labels are cast with the builtin `int`: the `np.int` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
train_reviews = np.array(reviews[0:30000])
train_sentiments = np.array(sentiments[0:30000]).astype(int)
test_reviews = np.array(reviews[30000:40000])
test_sentiments = np.array(sentiments[30000:40000]).astype(int)
dev_reviews = np.array(reviews[40000:50000])
dev_sentiments = np.array(sentiments[40000:50000]).astype(int)

# Tokenization
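We fit a tokenizer on the training reviews only, limiting the vocabulary to the most frequent words (`num_words = input_size`) and mapping every other word to the `<OOV>` token. We then convert the training and testing reviews into sequences of integer word indices.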

In [4]:
# Hyper-parameters shared by the tokenizer, the padding step and the model.
input_size, output_size = 1000, 64  # Tokenizer num_words / Embedding input dim, Embedding output dim
max_length = 120                    # length every sequence is padded or truncated to
truncation = 'post'                 # passed to pad_sequences as `truncating`
token = "<OOV>"                     # passed to the Tokenizer as `oov_token`

In [5]:
# Build the vocabulary from the training reviews only; the same fitted tokenizer
# is then used to encode both the training and the testing reviews.
tokenizer = Tokenizer(oov_token=token, num_words=input_size)
tokenizer.fit_on_texts(train_reviews)

# Encode each split as lists of integer word indices.
sequences, testing_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_reviews, test_reviews)
)

# Word -> index mapping learned by fit_on_texts.
word_index = tokenizer.word_index

# Padding
We pad or truncate every sequence to `max_length` (120) tokens so that all inputs to the model have the same length.

In [6]:
# Pad or truncate every sequence to max_length tokens.
# Both splits must be truncated on the same side: pad_sequences defaults to
# truncating='pre', so omitting the argument for the testing data would cut long
# reviews at the opposite end from the training data.
padded = pad_sequences(sequences, maxlen=max_length, truncating=truncation)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=truncation)

# Building the Model
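We build a Sequential model: an embedding layer, a flatten layer, a stack of Dense + Dropout layers whose width halves from 128 down to 8, and a single-unit output layer.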

In [7]:
# Embedding over the tokenizer vocabulary, flattened into one feature vector per
# review.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_size, output_size, input_length=max_length),
    tf.keras.layers.Flatten(),
])

# Stack Dense + Dropout blocks, halving the width each time: 128, 64, 32, 16, 8.
in_size = 128

while in_size > 7:
    model.add(tf.keras.layers.Dense(in_size, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.1))
    # Integer division keeps the layer width an int (plain `/` would produce 64.0, 32.0, ...).
    in_size //= 2

# Single-unit output for the 0/1 sentiment label. Sigmoid keeps the prediction in
# (0, 1), which is what the binary cross-entropy loss used at compile time
# expects; a ReLU output is unbounded above and has zero gradient whenever it
# outputs 0.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))


User settings:

   KMP_AFFINITY=granularity=fine,verbose,compact,1,0
   KMP_BLOCKTIME=0
   KMP_DUPLICATE_LIB_OK=True
   KMP_INIT_AT_FORK=FALSE
   KMP_SETTINGS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=128
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=true
   KMP_ENABLE_TASK_THROTTLING=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_MALLOC_POOL_INCR=1M
   KMP_NUM_LOCKS_IN_BLOCK=1
   KMP_PLAIN_BARRIER='2,2'
   KMP_PLAIN_BARRIER_PATTERN='hyper,hype

In [8]:
# Configure training: SGD optimizer, binary cross-entropy loss, accuracy reported
# as the metric. Then print the layer-by-layer summary.
model.compile(
    optimizer='sgd',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 64)           64000     
_________________________________________________________________
flatten (Flatten)            (None, 7680)              0         
_________________________________________________________________
dense (Dense)                (None, 128)               983168    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2

# Running the Model
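We train the model for 10 epochs on the padded training sequences, evaluating it on the testing set at the end of every epoch.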

In [9]:
# Train on the padded training sequences and evaluate on the testing split after
# every epoch.
# NOTE(review): the dev_* split is not used anywhere in the visible cells --
# consider validating on it and keeping the testing split for a final evaluation.
num_epochs = 10
validation_pair = (testing_padded, test_sentiments)

model.fit(
    x=padded,
    y=train_sentiments,
    validation_data=validation_pair,
    epochs=num_epochs,
)

2021-12-20 18:29:36.144280: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc3c05b3290>

# Results
Here is what our results and findings tell us