In [1]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import os

from nltk.stem.snowball import SnowballStemmer

from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam, SGD

# from keras import layers
# from keras.utils import to_categorical
# from keras.models import Sequential, Model, load_model
# from keras.preprocessing.sequence import pad_sequences
# from keras.preprocessing.text import Tokenizer
# from keras.optimizers import Adam, SGD


from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

In [2]:
%matplotlib inline
np.random.seed(99)
plt.style.use('ggplot')

# Auxiliary functions

In [3]:
def plot_history(history, plot_name):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.savefig(f'{plot_name}.png')

# Download the dataset

In [4]:
train_set = fetch_20newsgroups(subset='train', shuffle=True)
test_set = fetch_20newsgroups(subset='test', shuffle=True)

In [5]:
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

stemmer = SnowballStemmer("english", ignore_stopwords=True)
vectorizer = StemmedCountVectorizer(stop_words='english')
# vectorizer = CountVectorizer()

In [6]:
vectorizer.fit(test_set.data)

X_train = vectorizer.transform(train_set.data)
X_test  = vectorizer.transform(test_set.data)

num_classes = np.max(train_set["target"]) + 1

y_train = to_categorical(train_set.target, num_classes)
y_test = to_categorical(test_set.target, num_classes)

# Simplest Neural Network with Simple Preprocessing

In [7]:
def train_simple_network(parameters):
    print(f'Performing training on the following parameters {parameters}')
    
    input_dim = X_train.shape[1]
    output_dim = y_train.shape[1]

    model = Sequential()
    model.add(layers.Dense(
        parameters["dense_layer_neurons"], 
        input_dim=input_dim,
        activation='relu'
    ))
    
    if parameters["use_deeper_network"]:
        model.add(layers.Dense(
            32, 
            activation='relu'
        ))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(output_dim, activation='softmax'))

    model.compile(
        optimizer=parameters["optimizer"],
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    history = model.fit(
        X_train,
        y_train,
        epochs=parameters["epochs"],
        verbose=2,
        validation_data=(X_test, y_test),
        batch_size=parameters["batch_size"],
    )
    val_accuracy = history.history['val_accuracy'][-1]
    return {'loss': -val_accuracy, 'status': STATUS_OK, 'model': model, 'history': history} 

In [8]:
parameters = {
    "dense_layer_neurons": hp.choice("dense_layer_neurons", [64, 96, 128]),
    "optimizer": hp.choice("optimizer", [Adam(), SGD()]),
    "batch_size": hp.choice("batch_size", [10, 32]),
    "epochs": hp.choice("epochs", [25, 50]),
    "use_deeper_network": hp.choice("use_deeper_network", [True, False])
}

trials = Trials()

max_evals = 3 * 2 * 2 * 2 * 2

best = fmin(
    fn=train_simple_network,
    space=parameters,
    algo=tpe.suggest,
    trials=trials,
    max_evals=max_evals
)

best_model = trials.best_trial['result']['model']
history = trials.best_trial['result']['history']

best_model.save('simple_model.hd5')
plot_history(history, "simple_model")

Performing training on the following parameters {'batch_size': 10, 'dense_layer_neurons': 128, 'epochs': 25, 'optimizer': <tensorflow.python.keras.optimizer_v2.gradient_descent.SGD object at 0x7f10d78cf450>, 'use_deeper_network': True}
  0%|          | 0/48 [00:00<?, ?it/s, best loss: ?]


MemoryError: Unable to allocate array with shape (11314, 77643) and data type int64

# Simplest Neural Network with More Complex Preprocessing

In [None]:
def train_network_with_preprocessing(parameters):
    embedding_dim = 50

    model = Sequential()
    model.add(layers.Embedding(input_dim=vocab_size, 
                               output_dim=embedding_dim, 
                               input_length=maxlen))
    model.add(layers.Flatten())
    model.add(layers.Dense(parameters["dense_layer_neurons"], activation='relu'))
    model.add(layers.Dense(output_dim, activation='softmax'))

    model.compile(
        optimizer=parameters["optimizer"],
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    history = model.fit(
        X_train, 
        y_train,
        epochs=parameters["epochs"],
        validation_data=(X_test, y_test),
        batch_size=parameters["batch_size"]
    )
    val_accuracy = history.history['val_accuracy'][-1]
    return {'loss': -val_accuracy, 'status': STATUS_OK, 'model': model, 'history': history}

In [None]:
tokenizer = Tokenizer(num_words=3000)
tokenizer.fit_on_texts(train_set.data)

X_train = tokenizer.texts_to_sequences(train_set.data)
X_test = tokenizer.texts_to_sequences(test_set.data)

vocab_size = len(tokenizer.word_index) + 1
maxlen = 300

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

parameters = {
    "dense_layer_neurons": hp.choice("dense_layer_neurons", [32, 64, 128]),
    "optimizer": hp.choice("optimizer", [Adam(), SGD()]),
    "batch_size": hp.choice("batch_size", [10, 32]),
    "epochs": hp.choice("epochs", [25, 50]),
}

input_dim = X_train.shape[1]
output_dim = y_train.shape[1]

trials = Trials()

max_evals = 3 * 2 * 2 * 2

best = fmin(
    fn=train_network_with_preprocessing,
    space=parameters,
    algo=tpe.suggest,
    trials=trials,
    max_evals=max_evals
)

best_model = trials.best_trial['result']['model']
history = trials.best_trial['result']['history']

best_model.save('simple_model_preprocessing.hd5')
plot_history(history, "simple_model_preprocessing")

# Perform test

In [6]:
import pdb

def perform_test(model):
#     pdb.set_trace()
#     X_test, y_test = test_set.data, test_set.target
    correct_predictions = 0
    number_of_samples = len(y_test)
    
    for x, y in zip(X_test, y_test): 
        prediction = model.predict(x)
#         pdb.set_trace()
        if prediction.argmax() == y.argmax():
            correct_predictions += 1
    return correct_predictions / number_of_samples
#         print(x)

In [7]:
path_to_models = os.getcwd() + os.sep + "models" + os.sep
model_names = [
    "simple_model.hd5", 
    "simple_model_preprocessing.hd5"
]

for model_name in model_names:
    model = load_model(path_to_models + model_name)
    test_results = perform_test(model)
    print(f"Model {model_name}: {test_results}")

























































































































































































































































































































Model simple_model.hd5: 0.8489113117365905


ValueError: Error when checking input: expected embedding_12_input to have shape (300,) but got array with shape (77643,)