In [10]:
import os
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

In [2]:
# Directory that holds the review CSV splits (reviews_train.csv,
# reviews_val.csv, reviews_test.csv); it is joined with each file name via
# os.path.join when the data is loaded.
# NOTE(review): the trailing underscore looks like a truncated directory
# name — confirm this path matches the real data location.
INP_DIR = "data/data_"

In [6]:
# Read the three pre-made splits (train / validation / test) from INP_DIR.
split_files = ("reviews_train.csv", "reviews_val.csv", "reviews_test.csv")
reviews_train, reviews_val, reviews_test = [
    pd.read_csv(os.path.join(INP_DIR, file_name)) for file_name in split_files
]

# Pull the label column out of each split as a plain NumPy array.
y_train, y_val, y_test = [
    split["sentiment"].values
    for split in (reviews_train, reviews_val, reviews_test)
]

# Last expression of the cell: show (rows, columns) for every split.
reviews_train.shape, reviews_val.shape, reviews_test.shape

((32000, 2), (8000, 2), (10000, 2))

In [33]:
# Hyper-parameters shared by the tokenisation, padding and model cells.

# Passed to the Tokenizer as `num_words`; the Embedding layer is sized
# MAX_VOCAB + 1.
MAX_VOCAB = 20000

# Fixed length of every encoded review: used as `maxlen` for pad_sequences
# and as `input_length` of the Embedding layer.
MAX_WORDS_PER_DOC = 200

# Output dimension of the Embedding layer (size of each word vector).
EMBEDDING_DIM = 100

In [34]:
# Build the vocabulary from the training reviews only; validation and test
# texts are encoded with that same fitted tokenizer.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_VOCAB)
train_texts = reviews_train["review"].values
tokenizer.fit_on_texts(train_texts)

# Word -> integer index mapping learned during fitting.
word_index = tokenizer.word_index

# Encode every split as lists of integer word indices.
seq_train, seq_val, seq_test = [
    tokenizer.texts_to_sequences(split["review"].values)
    for split in (reviews_train, reviews_val, reviews_test)
]

In [35]:
def _pad_to_doc_length(sequences):
    """Return `sequences` as an array exactly MAX_WORDS_PER_DOC wide, padded at the end.

    The truncation side is left at the Keras default (`truncating="pre"`
    per the pad_sequences documentation).
    """
    return keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=MAX_WORDS_PER_DOC, padding="post"
    )


X_train = _pad_to_doc_length(seq_train)
X_val = _pad_to_doc_length(seq_val)
X_test = _pad_to_doc_length(seq_test)

# Last expression of the cell: show the padded matrix shape for every split.
X_train.shape, X_val.shape, X_test.shape

((32000, 200), (8000, 200), (10000, 200))

# Word embedding + a dense layer

In [41]:
# Baseline sentiment classifier: learned word embeddings, flattened and fed
# into a single sigmoid unit.

# Reset Keras' global state so that re-running this cell does not accumulate
# state from previously built models.
keras.backend.clear_session()

model = keras.models.Sequential()
# input_dim = MAX_VOCAB + 1 covers the 0 used for padding plus every index the
# tokenizer can emit under num_words=MAX_VOCAB.
model.add(keras.layers.Embedding(MAX_VOCAB + 1, EMBEDDING_DIM, input_length=MAX_WORDS_PER_DOC))
# Concatenate the per-word vectors into one
# MAX_WORDS_PER_DOC * EMBEDDING_DIM feature vector per review.
model.add(keras.layers.Flatten())
# One sigmoid output for the binary sentiment label.
model.add(keras.layers.Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["acc"])

# Model.summary() prints the table itself and returns None; wrapping it in
# print() emitted a stray "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 100)          2000100   
_________________________________________________________________
flatten (Flatten)            (None, 20000)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 20001     
Total params: 2,020,101
Trainable params: 2,020,101
Non-trainable params: 0
_________________________________________________________________
None
