In [0]:
!pip install keras-tuner
import sys
import pickle
import argparse

import pandas as pd
import numpy as np
from tensorflow import keras
# Explicit alias for the tf.keras metrics: the name `keras` is rebound to the
# standalone Keras package by `import keras.metrics` at the end of this block.
from tensorflow.keras import metrics as tf_metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Dropout
from keras.utils.np_utils import to_categorical
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
import kerastuner
from kerastuner.tuners import RandomSearch, Hyperband
from kerastuner import HyperModel
import keras.metrics

Collecting keras-tuner
[?25l  Downloading https://files.pythonhosted.org/packages/a7/f7/4b41b6832abf4c9bef71a664dc563adb25afc5812831667c6db572b1a261/keras-tuner-1.0.1.tar.gz (54kB)
[K     |██████                          | 10kB 28.3MB/s eta 0:00:01[K     |████████████                    | 20kB 2.1MB/s eta 0:00:01[K     |██████████████████              | 30kB 2.8MB/s eta 0:00:01[K     |████████████████████████        | 40kB 3.1MB/s eta 0:00:01[K     |██████████████████████████████  | 51kB 2.5MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 2.3MB/s 
Collecting terminaltables
  Downloading https://files.pythonhosted.org/packages/9b/c4/4a21174f32f8a7e1104798c445dacdc1d4df86f2f26722767034e4de4bff/terminaltables-3.1.0.tar.gz
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/c9/dc/45cdef1b4d119eb96316b3117e6d5708a08029992b2fee2c143c7a0a5cc5/colorama-0.4.3-py2.py3-none-any.whl
Building wheels for collected packages: keras-tuner, terminaltables

Using TensorFlow backend.


In [0]:
# Load the pre-processed reviews and the vocabulary from disk.
# The files are opened in `with` blocks so the handles are closed right after
# unpickling instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code - only load files that
# were produced by this project.
with open('reviews_50000_processed.p', 'rb') as reviews_file:
  df = pickle.load(reviews_file)
with open('reviews_50000_vocab.p', 'rb') as vocab_file:
  vocab = pickle.load(vocab_file)

In [0]:
model_fp = 'reviews_50000_tuned_cnn.p'
NAME = 'tune_50000'

CAT_COL = 'stars'
# Stars are in [1-5]; to_categorical() one-hot encodes labels 0..max, so the
# label vectors need 6 columns (column 0 stays unused). Fixing the width here
# keeps y_train / y_validate the same shape even if one split happens to miss
# the highest rating, and matches the 6-unit softmax output of the model.
NUM_CLASSES = 6

num_reviews = len(df.index)

train_num = int(num_reviews * .8)

# Split into train / validate (first 80% of the rows / remaining 20%)
train = df.iloc[:train_num]
validate = df.iloc[train_num:]

X_train = train['text'].values
# Convert [1-5] stars to a vector representation
y_train = to_categorical(train[CAT_COL].values, num_classes=NUM_CLASSES)
print(y_train.shape)

X_validate = validate['text'].values
y_validate = to_categorical(validate[CAT_COL].values, num_classes=NUM_CLASSES)
print(y_validate.shape)

# Tokenizer() is a Keras object that tokenizes texts for pre-processing
# Further processing is necessary before we can input the text into the CNN
tokenizer = Tokenizer()

# The word index is built from the training reviews only
tokenizer.fit_on_texts(X_train)

# +1 because index 0 is reserved for padding
num_words = len(tokenizer.word_index) + 1

# Tokenize the words -> assign an integer to each word and represent the review
# as a sequence of integers
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_validate_tokens = tokenizer.texts_to_sequences(X_validate)

# Longest review measured in tokens. This is computed from the tokenized
# sequences rather than len(review) on the raw text, which would count
# characters if the reviews are plain strings and pad far more than needed.
max_length = max(max(len(tokens) for tokens in X_train_tokens),
                 max(len(tokens) for tokens in X_validate_tokens))

# Pad the vectors with zeros so that they're all the same length
# Necessary for efficient computation
X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')
X_validate_pad = pad_sequences(X_validate_tokens, maxlen=max_length, padding='post')

# Experiment design from "A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional
# Neural Networks for Sentence Classification", Zhang and Wallace 2015, accessed via
# the article found https://machinelearningmastery.com/best-practices-document-classification-deep-learning/
# Tuner guidelines from https://keras-team.github.io/keras-tuner/ and https://www.sicara.ai/blog/hyperparameter-tuning-keras-tuner 
def build_model(hp):
  """Build and compile one candidate CNN for the keras-tuner search.

  The vocabulary size (`num_words`) and the padded sequence length
  (`max_length`) are read from module scope.

  Args:
    hp: keras-tuner hyperparameter container; used to declare and sample the
      embedding size, number of filters, kernel size, convolution activation
      and dropout rate.

  Returns:
    A compiled `Sequential` model that reports `precision` and `recall`.
  """
  model = Sequential()
  # Embedding layer - embed the word indices into dense vectors
  # Parameter to tune: how many features the resulting dense vector has
  model.add(Embedding(num_words,
                      output_dim=hp.Choice('embedding_output_dimension', values=[50, 100, 150], default=100),
                      input_length=max_length))
  # Convolution layer - tune the number of filters, kernel size, and activation fxn
  model.add(Conv1D(filters=hp.Choice('num_filters', values=[50, 100, 200, 400, 600], default=50),
                   kernel_size=hp.Int('conv_kernel', min_value=1, max_value=10, step=1),
                   activation=hp.Choice('conv_activ', values=['relu', 'tanh'], default='relu')))
  # Pooling layer to reduce convolution results
  model.add(MaxPooling1D(pool_size=2))
  # Flatten to combine pooling results
  model.add(Flatten())
  # Dropout layer - tune the proportion
  model.add(Dropout(rate=hp.Float('drop_rate', min_value=0, max_value=.5, step=.1)))
  # 6 outputs: presumably one per column of the one-hot labels that
  # to_categorical() produces for the [1-5] star ratings (index 0 unused)
  model.add(Dense(6, activation='softmax'))
  # Use the tf.keras metric classes: the model is a tf.keras model, while the
  # bare name `keras` refers to the standalone Keras package in this notebook.
  model.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=[tf_metrics.Precision(name='precision'),
                         tf_metrics.Recall(name='recall')])
  return model

# Random search over the hyperparameters declared in build_model().
# Each of the 5 trials is trained 3 times; the objective is the validation
# precision (the 'precision' metric evaluated on the validation data).
tuner = RandomSearch(
          build_model,
          # tf.keras metrics, matching the tf.keras model built by build_model()
          metrics=[tf_metrics.Precision(name='precision'),
                   tf_metrics.Recall(name='recall')],
          objective=kerastuner.Objective('val_precision', direction='max'),
          max_trials=5,
          executions_per_trial=3,
          seed=10,
          directory='tuning_out',
          project_name=NAME)

tuner.search_space_summary()

# Tune the model
tuner.search(X_train_pad, y_train, epochs=10, validation_data=(X_validate_pad, y_validate))

tuner.results_summary()

# Grab the best model
best_model = tuner.get_best_models(num_models=1)[0]
model_json = best_model.to_json()
# NOTE(review): the output names say "10000" while the rest of this notebook
# uses "50000" - confirm these are the intended file names.
# Write the architecture inside a `with` block so the file is flushed and closed.
with open('reviews_10000_trained_cnn.json', 'w') as model_file:
  model_file.write(model_json)
best_model.save_weights('reviews_10000_trained_cnn_weights.h5')

In [0]:
# Save the training data and create testing data
# Save the padded training data
with open('reviews_50000_X_train_pad.p', 'wb') as train_pad_file:
  pickle.dump(X_train_pad, train_pad_file)

# Load the held-out test reviews
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('reviews_10000_test_processed.p', 'rb') as test_file:
  test = pickle.load(test_file)

# Encode the test reviews with the tokenizer fitted on the training data and
# pad them to the same length as the training sequences
X_test_tokens = tokenizer.texts_to_sequences(test['text'].values)
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')

# Save the padded test data
with open('reviews_50000_train_10000_test_pad.p', 'wb') as test_pad_file:
  pickle.dump(X_test_pad, test_pad_file)

In [0]:
# Generate reports and save the prediction data
from sklearn.metrics import classification_report

# Predicted class = index of the largest softmax output. This replaces the
# deprecated Sequential.predict_classes(), as recommended by its deprecation
# message for multi-class (softmax) models.
test_predictions = np.argmax(best_model.predict(X_test_pad), axis=-1)
train_predictions = np.argmax(best_model.predict(X_train_pad), axis=-1)

test_report = classification_report(test['stars'], test_predictions, output_dict=True)
train_report = classification_report(train['stars'], train_predictions, output_dict=True)

# Overwrite the report file with the test report followed by the train report,
# one per line. A single `with` block guarantees the file is flushed and closed.
# NOTE(review): the reports are written with print(), i.e. as Python dict
# reprs, so the file is not strict JSON despite its extension.
with open('cnn_50000_report.json', 'w') as report_file:
  print(test_report, file=report_file)
  print(train_report, file=report_file)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


  _warn_prf(average, modifier, msg_start, len(result))
