In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from nltk.stem.snowball import SnowballStemmer

from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam, SGD

# from keras import layers
# from keras.utils import to_categorical
# from keras.models import Sequential, Model, load_model
# from keras.preprocessing.sequence import pad_sequences
# from keras.preprocessing.text import Tokenizer
# from keras.optimizers import Adam, SGD


from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

In [None]:
%matplotlib inline
np.random.seed(99)
plt.style.use('ggplot')

# Auxiliary functions

In [None]:
def plot_history(history, plot_name):
    """Plot training vs. validation curves and save them as ``<plot_name>.png``.

    The figure has two side-by-side panels: accuracy on the left, loss on
    the right. Training curves are blue, validation curves are red.

    Args:
        history: object whose ``history`` attribute maps the keys
            'accuracy', 'val_accuracy', 'loss' and 'val_loss' to per-epoch
            sequences (in this notebook, the value returned by ``model.fit``).
        plot_name: output path without the ``.png`` suffix.
    """
    metrics = history.history
    accuracy_curves = (metrics['accuracy'], metrics['val_accuracy'])
    loss_curves = (metrics['loss'], metrics['val_loss'])

    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(accuracy_curves[0]) + 1)

    # (train/validation series, legend labels, panel title) per panel.
    panels = (
        (accuracy_curves,
         ('Training acc', 'Validation acc'),
         'Training and validation accuracy'),
        (loss_curves,
         ('Training loss', 'Validation loss'),
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for column, (curves, labels, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, column)
        for series, colour, label in zip(curves, ('b', 'r'), labels):
            plt.plot(epochs, series, colour, label=label)
        plt.title(title)
        plt.legend()
    plt.savefig(f'{plot_name}.png')

# Download the dataset

In [None]:
# Load the shuffled 20 Newsgroups training and test splits (train first).
train_set, test_set = (
    fetch_20newsgroups(subset=split, shuffle=True)
    for split in ('train', 'test')
)

In [None]:
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Wrap the parent analyzer so that each emitted token is stemmed.

        The returned callable looks up the module-level ``stemmer`` when it
        is invoked, so ``stemmer`` must exist before the vectorizer is used.

        Returns:
            A callable mapping a document to a list of stemmed tokens.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

# English Snowball stemmer looked up by StemmedCountVectorizer's analyzer.
stemmer = SnowballStemmer(language="english", ignore_stopwords=True)

# Bag-of-words vectorizer with English stop-word filtering and stemming.
# For an unstemmed baseline use instead: vectorizer = CountVectorizer()
vectorizer = StemmedCountVectorizer(stop_words="english")

In [None]:
# Learn the vocabulary from the TRAINING documents only. Fitting on the test
# split leaks test-set information into the features and silently drops every
# word that appears only in the training data.
vectorizer.fit(train_set.data)

# Count matrices; test documents are projected onto the training vocabulary.
X_train = vectorizer.transform(train_set.data)
X_test = vectorizer.transform(test_set.data)

# Assumes the targets are 0-based integer class ids, so the number of classes
# is the largest id plus one.
num_classes = np.max(train_set.target) + 1

# One-hot encode the labels for the categorical cross-entropy loss.
y_train = to_categorical(train_set.target, num_classes)
y_test = to_categorical(test_set.target, num_classes)

# Simplest Neural Network with Simple Preprocessing

In [None]:
def train_simple_network(parameters):
    """Train a small dense classifier for one hyperopt trial.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` that are current at call time.

    Args:
        parameters: dict with the keys
            "dense_layer_neurons" (units of the first Dense layer),
            "use_deeper_network" (truthy adds a second, 32-unit Dense layer),
            "optimizer" (Keras optimizer instance or identifier string),
            "epochs" and "batch_size" (forwarded to ``model.fit``).

    Returns:
        dict with 'loss' (negated validation accuracy of the last epoch, so
        that minimising it maximises accuracy), 'status' (STATUS_OK), 'model'
        (the trained model) and 'history' (the object returned by
        ``model.fit``).

    NOTE(review): validation uses the test split, so hyper-parameters are
    selected on test data.
    """
    print(f'Performing training on the following parameters {parameters}')

    input_dim = X_train.shape[1]
    output_dim = y_train.shape[1]

    # The search space may hold optimizer *instances* that were created once
    # and are therefore handed to several trials. An optimizer keeps state
    # tied to the model it trained, so give every model a fresh one with the
    # same configuration. Identifier strings have no get_config and are
    # passed through untouched.
    optimizer = parameters["optimizer"]
    if hasattr(optimizer, "get_config"):
        optimizer = type(optimizer).from_config(optimizer.get_config())

    model = Sequential()
    model.add(layers.Dense(
        parameters["dense_layer_neurons"],
        input_dim=input_dim,
        activation='relu'
    ))

    if parameters["use_deeper_network"]:
        model.add(layers.Dense(
            32,
            activation='relu'
        ))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(output_dim, activation='softmax'))

    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    history = model.fit(
        X_train,
        y_train,
        epochs=parameters["epochs"],
        verbose=2,
        validation_data=(X_test, y_test),
        batch_size=parameters["batch_size"],
    )
    # Score the trial by the final epoch, not by the best epoch seen.
    val_accuracy = history.history['val_accuracy'][-1]
    return {'loss': -val_accuracy, 'status': STATUS_OK, 'model': model, 'history': history}

In [None]:
# Search space for train_simple_network: every entry is a categorical choice.
parameters = {
    "dense_layer_neurons": hp.choice("dense_layer_neurons", [64, 96, 128]),
    # Optimizers are given by name so that each trial compiles its model with
    # a brand-new optimizer. A single Adam()/SGD() instance placed here would
    # be shared by every trial that selects it, carrying optimizer state from
    # one model to the next.
    "optimizer": hp.choice("optimizer", ["adam", "sgd"]),
    "batch_size": hp.choice("batch_size", [10, 32]),
    "epochs": hp.choice("epochs", [25, 50]),
    "use_deeper_network": hp.choice("use_deeper_network", [True, False])
}

trials = Trials()

# Budget equal to the number of combinations in the space (3*2*2*2*2 = 48).
# NOTE(review): tpe.suggest samples the space, so this is an evaluation
# budget, not a guarantee that every combination is tried exactly once.
max_evals = 3 * 2 * 2 * 2 * 2

best = fmin(
    fn=train_simple_network,
    space=parameters,
    algo=tpe.suggest,
    trials=trials,
    max_evals=max_evals
)

# The objective stores the trained model and its fit history in each result.
best_model = trials.best_trial['result']['model']
history = trials.best_trial['result']['history']

# '.hd5' is not a file extension Keras recognises, so the previous name did
# not produce an HDF5 file; '.h5' selects the HDF5 format explicitly.
best_model.save('simple_model.h5')
plot_history(history, "simple_model")

# Simplest Neural Network with More Complex Preprocessing

In [None]:
def train_network_with_preprocessing(parameters):
    """Train an Embedding -> Flatten -> Dense classifier for one hyperopt trial.

    Reads the module-level ``vocab_size``, ``maxlen``, ``X_train``,
    ``y_train``, ``X_test`` and ``y_test`` that are current at call time, so
    the tokenising cell must have been run first.

    Args:
        parameters: dict with the keys
            "dense_layer_neurons" (units of the hidden Dense layer),
            "optimizer" (Keras optimizer instance or identifier string),
            "epochs" and "batch_size" (forwarded to ``model.fit``).

    Returns:
        dict with 'loss' (negated validation accuracy of the last epoch, so
        that minimising it maximises accuracy), 'status' (STATUS_OK), 'model'
        (the trained model) and 'history' (the object returned by
        ``model.fit``).

    NOTE(review): validation uses the test split, so hyper-parameters are
    selected on test data.
    """
    embedding_dim = 50
    # Derived from the labels here, as train_simple_network does, instead of
    # depending on a global that is only assigned in a later cell.
    output_dim = y_train.shape[1]

    # The search space may hold optimizer *instances* that were created once
    # and are therefore handed to several trials. An optimizer keeps state
    # tied to the model it trained, so give every model a fresh one with the
    # same configuration. Identifier strings have no get_config and are
    # passed through untouched.
    optimizer = parameters["optimizer"]
    if hasattr(optimizer, "get_config"):
        optimizer = type(optimizer).from_config(optimizer.get_config())

    model = Sequential()
    model.add(layers.Embedding(input_dim=vocab_size,
                               output_dim=embedding_dim,
                               input_length=maxlen))
    model.add(layers.Flatten())
    model.add(layers.Dense(parameters["dense_layer_neurons"], activation='relu'))
    model.add(layers.Dense(output_dim, activation='softmax'))

    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    history = model.fit(
        X_train,
        y_train,
        epochs=parameters["epochs"],
        validation_data=(X_test, y_test),
        batch_size=parameters["batch_size"]
    )
    # Score the trial by the final epoch, not by the best epoch seen.
    val_accuracy = history.history['val_accuracy'][-1]
    return {'loss': -val_accuracy, 'status': STATUS_OK, 'model': model, 'history': history}

In [None]:
# Keep only the most frequent words when converting text to index sequences.
num_words = 3000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_set.data)

# NOTE: X_train / X_test are rebound from the bag-of-words matrices to index
# sequences here. train_simple_network reads the same globals, so it cannot
# be re-run after this cell without re-running the vectorizer cell first.
X_train = tokenizer.texts_to_sequences(train_set.data)
X_test = tokenizer.texts_to_sequences(test_set.data)

# word_index lists every word seen during fitting, but the tokenizer only
# emits indices below num_words. Sizing the embedding from the full
# word_index would allocate rows that can never be looked up.
vocab_size = min(num_words, len(tokenizer.word_index) + 1)
maxlen = 300

# Pad (or truncate) every document to exactly maxlen indices.
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Search space for train_network_with_preprocessing.
parameters = {
    "dense_layer_neurons": hp.choice("dense_layer_neurons", [32, 64, 128]),
    # Optimizers are given by name so that each trial compiles its model with
    # a brand-new optimizer. A single Adam()/SGD() instance placed here would
    # be shared by every trial that selects it, carrying optimizer state from
    # one model to the next.
    "optimizer": hp.choice("optimizer", ["adam", "sgd"]),
    "batch_size": hp.choice("batch_size", [10, 32]),
    "epochs": hp.choice("epochs", [25, 50]),
}

input_dim = X_train.shape[1]
output_dim = y_train.shape[1]

trials = Trials()

# Budget equal to the number of combinations in the space (3*2*2*2 = 24).
# NOTE(review): tpe.suggest samples the space, so this is an evaluation
# budget, not a guarantee that every combination is tried exactly once.
max_evals = 3 * 2 * 2 * 2

best = fmin(
    fn=train_network_with_preprocessing,
    space=parameters,
    algo=tpe.suggest,
    trials=trials,
    max_evals=max_evals
)

# The objective stores the trained model and its fit history in each result.
best_model = trials.best_trial['result']['model']
history = trials.best_trial['result']['history']

# '.hd5' is not a file extension Keras recognises, so the previous name did
# not produce an HDF5 file; '.h5' selects the HDF5 format explicitly.
best_model.save('simple_model_preprocessing.h5')
plot_history(history, "simple_model_preprocessing")

# Overkill


TODO:

1) fastText-style classifier

2) BiLSTM

In [None]:
# # coding=utf-8

# import numpy as np
# from keras.callbacks import EarlyStopping
# from keras.datasets import imdb
# from keras.preprocessing import sequence

# # from fast_text import FastText

# from keras import Input, Model
# from keras.layers import Embedding, GlobalAveragePooling1D, Dense


# class FastText(object):
#     def __init__(self, maxlen, max_features, embedding_dims,
#                  class_num=1,
#                  last_activation='sigmoid'):
#         self.maxlen = maxlen
#         self.max_features = max_features
#         self.embedding_dims = embedding_dims
#         self.class_num = class_num
#         self.last_activation = last_activation

#     def get_model(self):
#         input = Input((self.maxlen,))

#         embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)(input)
#         x = GlobalAveragePooling1D()(embedding)

#         output = Dense(self.class_num, activation=self.last_activation)(x)
#         model = Model(inputs=input, outputs=output)
#         return model
    
# def create_ngram_set(input_list, ngram_value=2):
#     """
#     Extract a set of n-grams from a list of integers.
#     # >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
#     {(4, 9), (4, 1), (1, 4), (9, 4)}
#     # >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
#     [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
#     """
#     return set(zip(*[input_list[i:] for i in range(ngram_value)]))


# def add_ngram(sequences, token_indice, ngram_range=2):
#     """
#     Augment the input list of list (sequences) by appending n-grams values.
#     Example: adding bi-gram
#     # >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
#     # >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
#     # >>> add_ngram(sequences, token_indice, ngram_range=2)
#     [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
#     Example: adding tri-gram
#     # >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
#     # >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
#     # >>> add_ngram(sequences, token_indice, ngram_range=3)
#     [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
#     """
#     new_sequences = []
#     for input_list in sequences:
#         new_list = input_list[:]
#         for ngram_value in range(2, ngram_range + 1):
#             for i in range(len(new_list) - ngram_value + 1):
#                 ngram = tuple(new_list[i:i + ngram_value])
#                 if ngram in token_indice:
#                     new_list.append(token_indice[ngram])
#         new_sequences.append(new_list)

#     return new_sequences


# # Set parameters:
# # ngram_range = 2 will add bi-grams features
# ngram_range = 1
# max_features = 5000
# maxlen = 400
# batch_size = 32
# embedding_dims = 50
# epochs = 10
# print('Loading data...')
# (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

In [None]:
# X_train

In [None]:
# print(len(x_train), 'train sequences')
# print(len(x_test), 'test sequences')
# print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
# print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

# if ngram_range > 1:
#     print('Adding {}-gram features'.format(ngram_range))
#     # Create set of unique n-gram from the training set.
#     ngram_set = set()
#     for input_list in x_train:
#         for i in range(2, ngram_range + 1):
#             set_of_ngram = create_ngram_set(input_list, ngram_value=i)
#             ngram_set.update(set_of_ngram)

#     # Dictionary mapping n-gram token to a unique integer.
#     # Integer values are greater than max_features in order
#     # to avoid collision with existing features.
#     start_index = max_features + 1
#     token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
#     indice_token = {token_indice[k]: k for k in token_indice}

#     # max_features is the highest integer that could be found in the dataset.
#     max_features = np.max(list(indice_token.keys())) + 1

#     # Augmenting x_train and x_test with n-grams features
#     x_train = add_ngram(x_train, token_indice, ngram_range)
#     x_test = add_ngram(x_test, token_indice, ngram_range)
#     print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
#     print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

# print('Pad sequences (samples x time)...')
# x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
# x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
# print('x_train shape:', x_train.shape)
# print('x_test shape:', x_test.shape)

# print('Build model...')
# model = FastText(maxlen, max_features, embedding_dims).get_model()
# model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

# print('Train...')
# early_stopping = EarlyStopping(monitor='val_acc', patience=3, mode='max')
# model.fit(x_train, y_train,
#           batch_size=batch_size,
#           epochs=epochs,
#           callbacks=[early_stopping],
#           validation_data=(x_test, y_test))

# print('Test...')
# result = model.predict(x_test)