# Better Model

#### Use hyperas for hyperparameter tuning

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn import preprocessing

In [3]:
import polyglot
from polyglot.text import Text, Word
from polyglot.mapping import Embedding
from keras.preprocessing import sequence

Using TensorFlow backend.


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [5]:
from keras.utils import np_utils
from keras.layers import Input, Dense, LSTM, Activation
from keras.models import Model, Sequential
from keras import optimizers

In [6]:
from hyperopt import Trials, STATUS_OK, tpe, rand
from hyperas import optim
from hyperas.distributions import choice, uniform, conditional

In [7]:
def data():
    """Build the train/validation arrays from the full training CSV.

    Reads '../data/offsite-tagging-training-set (1).csv', turns every article
    into a fixed-length sequence of word-embedding vectors, one-hot encodes the
    tags and splits the result 80/20 with ``random_state=42``.

    Returns:
        tuple: ``(X_train, y_train, X_validate, y_validate)``.

    NOTE(review): the helper class is nested inside this function, presumably so
    the function can be handed to hyperas the same way ``data_first500`` is
    (hyperas seems to re-run the function's source text on its own) -- confirm
    before hoisting the class to module level or reshaping the signature.
    """
    MAX_WORD_COUNT = 150

    class TextClassificationDataSet(object):
        """CSV-backed data set exposing embedded text features and one-hot labels."""

        def __init__(self,
                     file_path,
                     word_embedding='./polyglot/embeddings2/zh/embeddings_pkl.tar.bz2',
                     MAX_WORD_COUNT=MAX_WORD_COUNT,
                     text_col_name='text',
                     label_col_name='tags',
                     one_hot_encoder=None):
            """Load the CSV, fit the label encoders and load the word embeddings.

            ``word_embedding`` is either a polyglot ``Embedding`` or a path that
            ``Embedding.load`` understands. When ``one_hot_encoder`` is given it
            is used by ``get_labels`` instead of a freshly fitted encoder.
            """
            self.MAX_WORD_COUNT = MAX_WORD_COUNT

            self.df = self._load_data_from_csv(file_path)
            self.text_col_name = text_col_name
            self.label_col_name = label_col_name

            if label_col_name is not None:
                self.label_encoder = self._fit_label_encoder(label_col_name)
                if one_hot_encoder is None:
                    self.onehot_encoder = self._fit_onehot_encoder()

            if one_hot_encoder is not None:
                # Fix: the supplied encoder used to be stored only under
                # `one_hot_encoder`, an attribute nothing reads, so it was
                # silently ignored. Store it where get_labels() looks for it
                # and keep the old attribute name as an alias.
                self.onehot_encoder = one_hot_encoder
                self.one_hot_encoder = one_hot_encoder

            self.embeddings = self._load_word_embeddings(word_embedding)

            # Lazily filled caches for get_features() / get_labels().
            self.features = None
            self.labels = None

        def get_features(self, use_cache=True):
            """Return one padded embedding sequence per row, stacked into one array."""
            if use_cache and self.features is not None:
                return self.features
            clean_text_col = self._get_clean_text_col(self.text_col_name)
            article_vectors = [
                # [0] removes only the batch axis that pad_sequences adds;
                # np.squeeze would also collapse any other length-1 axis.
                self._article2vecs_simple(text, embeddings=self.embeddings, max_word_count=self.MAX_WORD_COUNT)[0]
                for text in clean_text_col
            ]
            self.features = np.array(article_vectors)
            return self.features

        def get_labels(self, use_cache=True):
            """Return the dense one-hot label matrix; raises KeyError without a label column."""
            if self.label_col_name is None:
                raise KeyError('label_col_name is None, unable to get labels from the input data.')
            if use_cache and self.labels is not None:
                return self.labels
            label_indices = self.df['label_index'].values.reshape(-1, 1)
            self.labels = self.onehot_encoder.transform(label_indices).toarray()
            return self.labels

        def _parse_text(self, text):
            """Wrap plain strings in a polyglot ``Text``; pass anything else through."""
            return Text(text) if isinstance(text, str) else text

        def _article2vecs_simple(self, article_text, embeddings, max_word_count):
            """Embed the words of one article and pad/truncate to ``max_word_count``.

            Words without an embedding are skipped. The result keeps the leading
            batch axis of size 1 produced by ``pad_sequences``.
            """
            # Fix: always parse. The original bound `article_parsed` only for
            # str input and raised UnboundLocalError for everything else.
            article_parsed = self._parse_text(article_text)
            # Look every word up once instead of twice.
            looked_up = (embeddings.get(word) for word in article_parsed.words)
            word_vectors = [vector for vector in looked_up if vector is not None]
            # NOTE(review): an article with no known words yields an empty list
            # here; the padded result then presumably lacks the embedding axis.
            return sequence.pad_sequences([word_vectors], maxlen=max_word_count, truncating='post', dtype='float32')

        def _load_word_embeddings(self, word_embedding):
            """Return ``word_embedding`` itself if already an ``Embedding``, else load it."""
            if isinstance(word_embedding, Embedding):
                return word_embedding
            return Embedding.load(word_embedding)

        def _load_data_from_csv(self, file_path):
            """Read the CSV at ``file_path`` into a DataFrame."""
            return pd.read_csv(file_path)

        def _get_clean_text_col(self, text_col):
            """Return the text column with HTML tags removed."""
            raw_text = self.df[text_col]
            return raw_text.apply(lambda html: BeautifulSoup(html, "html5lib").text)

        def _fit_label_encoder(self, label_col):
            """Fit a LabelEncoder on the label column and add ``label_index`` to the frame."""
            label_encoder = preprocessing.LabelEncoder()
            # A single fit_transform replaces the former fit + fit_transform pair.
            self.df['label_index'] = label_encoder.fit_transform(self.df[label_col])
            self.label_encoder = label_encoder
            return label_encoder

        def _fit_onehot_encoder(self):
            """Fit a OneHotEncoder on the integer ``label_index`` column."""
            onehot_encoder = preprocessing.OneHotEncoder()
            onehot_encoder.fit(self.df['label_index'].values.reshape(-1, 1))
            self.onehot_encoder = onehot_encoder
            return onehot_encoder

    dataset_train = TextClassificationDataSet(file_path='../data/offsite-tagging-training-set (1).csv')
    X_train, X_validate, y_train, y_validate = train_test_split(dataset_train.get_features(), dataset_train.get_labels(), test_size=0.2, random_state=42)
    return X_train, y_train, X_validate, y_validate


In [8]:
def data_first500():
    """Build train/validation arrays from the first 500 rows of the training CSV.

    Same pipeline as the full data function, restricted to ``first_n_records``
    rows so that every hyper-parameter trial stays cheap. Articles become
    fixed-length sequences of word-embedding vectors, tags are one-hot encoded
    and the result is split 80/20 with ``random_state=42``.

    Returns:
        tuple: ``(X_train, y_train, X_validate, y_validate)``.

    NOTE(review): the helper class is nested (and duplicated) here, presumably
    because hyperas re-runs this function's source text on its own -- confirm
    before sharing the class with other cells or reshaping the signature.
    """
    MAX_WORD_COUNT = 150
    first_n_records = 500

    class TextClassificationDataSet(object):
        """CSV-backed data set exposing embedded text features and one-hot labels."""

        def __init__(self,
                     file_path,
                     word_embedding='./polyglot/embeddings2/zh/embeddings_pkl.tar.bz2',
                     MAX_WORD_COUNT=MAX_WORD_COUNT,
                     text_col_name='text',
                     label_col_name='tags',
                     one_hot_encoder=None):
            """Load the CSV head, fit the label encoders and load the word embeddings.

            ``word_embedding`` is either a polyglot ``Embedding`` or a path that
            ``Embedding.load`` understands. When ``one_hot_encoder`` is given it
            is used by ``get_labels`` instead of a freshly fitted encoder.
            """
            self.MAX_WORD_COUNT = MAX_WORD_COUNT

            self.df = self._load_data_from_csv(file_path)
            self.text_col_name = text_col_name
            self.label_col_name = label_col_name

            if label_col_name is not None:
                self.label_encoder = self._fit_label_encoder(label_col_name)
                if one_hot_encoder is None:
                    self.onehot_encoder = self._fit_onehot_encoder()

            if one_hot_encoder is not None:
                # Fix: the supplied encoder used to be stored only under
                # `one_hot_encoder`, an attribute nothing reads, so it was
                # silently ignored. Store it where get_labels() looks for it
                # and keep the old attribute name as an alias.
                self.onehot_encoder = one_hot_encoder
                self.one_hot_encoder = one_hot_encoder

            self.embeddings = self._load_word_embeddings(word_embedding)

            # Lazily filled caches for get_features() / get_labels().
            self.features = None
            self.labels = None

        def get_features(self, use_cache=True):
            """Return one padded embedding sequence per row, stacked into one array."""
            if use_cache and self.features is not None:
                return self.features
            clean_text_col = self._get_clean_text_col(self.text_col_name)
            article_vectors = [
                # [0] removes only the batch axis that pad_sequences adds;
                # np.squeeze would also collapse any other length-1 axis.
                self._article2vecs_simple(text, embeddings=self.embeddings, max_word_count=self.MAX_WORD_COUNT)[0]
                for text in clean_text_col
            ]
            self.features = np.array(article_vectors)
            return self.features

        def get_labels(self, use_cache=True):
            """Return the dense one-hot label matrix; raises KeyError without a label column."""
            if self.label_col_name is None:
                raise KeyError('label_col_name is None, unable to get labels from the input data.')
            if use_cache and self.labels is not None:
                return self.labels
            label_indices = self.df['label_index'].values.reshape(-1, 1)
            self.labels = self.onehot_encoder.transform(label_indices).toarray()
            return self.labels

        def _parse_text(self, text):
            """Wrap plain strings in a polyglot ``Text``; pass anything else through."""
            return Text(text) if isinstance(text, str) else text

        def _article2vecs_simple(self, article_text, embeddings, max_word_count):
            """Embed the words of one article and pad/truncate to ``max_word_count``.

            Words without an embedding are skipped. The result keeps the leading
            batch axis of size 1 produced by ``pad_sequences``.
            """
            # Fix: always parse. The original bound `article_parsed` only for
            # str input and raised UnboundLocalError for everything else.
            article_parsed = self._parse_text(article_text)
            # Look every word up once instead of twice.
            looked_up = (embeddings.get(word) for word in article_parsed.words)
            word_vectors = [vector for vector in looked_up if vector is not None]
            # NOTE(review): an article with no known words yields an empty list
            # here; the padded result then presumably lacks the embedding axis.
            return sequence.pad_sequences([word_vectors], maxlen=max_word_count, truncating='post', dtype='float32')

        def _load_word_embeddings(self, word_embedding):
            """Return ``word_embedding`` itself if already an ``Embedding``, else load it."""
            if isinstance(word_embedding, Embedding):
                return word_embedding
            return Embedding.load(word_embedding)

        def _load_data_from_csv(self, file_path):
            """Read only the first ``first_n_records`` rows of the CSV.

            ``nrows`` yields the same rows as reading everything and calling
            ``head``, but as an independent frame (the ``label_index`` column
            is added to it later) and without parsing the rest of the file.
            """
            return pd.read_csv(file_path, nrows=first_n_records)

        def _get_clean_text_col(self, text_col):
            """Return the text column with HTML tags removed."""
            raw_text = self.df[text_col]
            return raw_text.apply(lambda html: BeautifulSoup(html, "html5lib").text)

        def _fit_label_encoder(self, label_col):
            """Fit a LabelEncoder on the label column and add ``label_index`` to the frame."""
            label_encoder = preprocessing.LabelEncoder()
            # A single fit_transform replaces the former fit + fit_transform pair.
            self.df['label_index'] = label_encoder.fit_transform(self.df[label_col])
            self.label_encoder = label_encoder
            return label_encoder

        def _fit_onehot_encoder(self):
            """Fit a OneHotEncoder on the integer ``label_index`` column."""
            onehot_encoder = preprocessing.OneHotEncoder()
            onehot_encoder.fit(self.df['label_index'].values.reshape(-1, 1))
            self.onehot_encoder = onehot_encoder
            return onehot_encoder

    dataset_train = TextClassificationDataSet(file_path='../data/offsite-tagging-training-set (1).csv')
    X_train, X_validate, y_train, y_validate = train_test_split(dataset_train.get_features(), dataset_train.get_labels(), test_size=0.2, random_state=42)
    return X_train, y_train, X_validate, y_validate


In [9]:
def search_model_seq(X_train, y_train, X_validate, y_validate):
    """Build, briefly train and score one LSTM classifier for the hyperas search.

    The double-brace ``choice`` templates below are the searched settings
    (batch size, LSTM width, width of the three dense layers, optimizer).
    The model is one LSTM layer, three equally sized ReLU dense layers and a
    softmax output; it is trained for a single epoch and evaluated on the
    validation arrays.

    Returns:
        dict: ``loss`` (negated validation accuracy, so that minimising it
        maximises accuracy), ``status`` (``STATUS_OK``) and the trained ``model``.

    NOTE(review): the recorded best-run output lists ``lstm_units_1`` and no
    ``dense_units`` key; presumably hyperas names the second of two textually
    identical templates after the first one -- verify before reading
    ``dense_units`` from the best run.
    """
    # Take the layer sizes from the data instead of repeating (150, 64) and 3
    # here, so the network always matches what the data function produced.
    input_shape = X_train.shape[1:]
    tag_classes_count = y_train.shape[1]

    batch_size = {{choice([128, 256, 512])}}
    lstm_units = {{choice([64, 128, 256, 512])}}
    dense_units = {{choice([64, 128, 256, 512])}}

    model = Sequential()
    model.add(LSTM(lstm_units, input_shape=input_shape, name='LSTM'))

    # Three identical hidden layers named Dense_1 .. Dense_3.
    for layer_index in (1, 2, 3):
        model.add(Dense(dense_units, activation='relu', name='Dense_{}'.format(layer_index)))

    model.add(Dense(tag_classes_count, activation='softmax', name='main_output'))
    model.compile(optimizer={{choice(['rmsprop', 'adam', 'adagrad', 'nadam', 'adadelta'])}},
                  loss={'main_output': 'categorical_crossentropy'},
                  metrics=['accuracy'])

    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=1,
              validation_data=(X_validate, y_validate))

    # evaluate() returns [loss, accuracy]; only the accuracy drives the search.
    _, acc = model.evaluate(X_validate, y_validate, batch_size=batch_size, verbose=0)
    # The message says "Test", but the figure is the validation accuracy.
    print('Test Accuracy:{}'.format(acc))
    return {'loss': -acc, 'status': STATUS_OK, 'model': model}


In [10]:
# Load the full data set and keep its 80/20 split in the notebook namespace.
# The search cell below passes `data_first500` to hyperas, so these arrays are
# only used by the later evaluation and fine-tuning cells.
X_train, y_train, X_validate, y_validate = data()

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.


In [11]:
# Run the TPE search: 50 evaluations of `search_model_seq` on the 500-row subset.
trials = Trials()
best_run, best_model, space = optim.minimize(
    model=search_model_seq,
    data=data_first500,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    notebook_name='draft',
    eval_space=True,    # report real values (not choice indices) in `best_run`
    return_space=True,  # also hand back the search space for later evaluations
)

>>> Imports:
#coding=utf-8

try:
    import pandas as pd
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    from bs4 import BeautifulSoup
except:
    pass

try:
    from sklearn import preprocessing
except:
    pass

try:
    import polyglot
except:
    pass

try:
    from polyglot.text import Text, Word
except:
    pass

try:
    from polyglot.mapping import Embedding
except:
    pass

try:
    from keras.preprocessing import sequence
except:
    pass

try:
    from sklearn.model_selection import train_test_split
except:
    pass

try:
    from keras.layers import Input, Dense, LSTM, Activation
except:
    pass

try:
    from keras.models import Model, Sequential
except:
    pass

try:
    from keras import optimizers
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    from bs4 import BeautifulSoup
except:
    pass

try:
    from sklearn import preprocessing
except:
    pass

try:
    import poly

Detector is not able to detect the language reliably.


Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6600000262260437
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6600000262260437
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6399999856948853
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6600000262260437
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6600000262260437
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6600000262260437
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6600000262260437
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.28999999165534973
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6600000262260437
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6000000238418579
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6600000262260437
Train on 400 samples

Test Accuracy:0.6600000262260437
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6600000262260437
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6499999761581421
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6600000262260437
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6499999761581421
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6600000262260437
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.5199999809265137
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6600000262260437
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.11999999731779099
Train on 400 samples, validate on 100 samples
Epoch 1/1
Test Accuracy:0.6600000262260437


In [12]:

# Score the model returned by the search on the validation split of the FULL
# data set. At this point `best_model` has only been trained for one epoch on
# the 500-row subset, so this is a baseline before the fine-tuning cell.
print("Evalutation of best performing model:")
print(best_model.evaluate(X_validate, y_validate))
# `best_run` holds the hyper-parameter values of the winning trial.
print("Best performing model chosen hyper-parameters:")
print(best_run)

Evalutation of best performing model:
[1.0598476840534627, 0.5430038513206854]
Best performing model chosen hyper-parameters:
{'batch_size': 256, 'lstm_units': 256, 'lstm_units_1': 64, 'optimizer': 'adam'}


In [13]:
# Echo the winning hyper-parameters again (same dict as printed in the previous cell).
best_run

{'batch_size': 256, 'lstm_units': 256, 'lstm_units_1': 64, 'optimizer': 'adam'}

In [15]:
# Continue training the selected model on the full training split for 10 epochs.
# Keeping the returned History makes the per-epoch metrics available afterwards
# (history.history) and stops the cell from echoing the object's repr.
# NOTE(review): `best_model` was already trained on rows from the first 500
# records, and some of those rows may sit in this validation split, so the
# validation accuracy can be slightly optimistic -- consider rebuilding the
# model from `best_run` before the final fit.
history = best_model.fit(X_train, y_train,
                         batch_size=best_run['batch_size'],
                         epochs=10,
                         validation_data=(X_validate, y_validate))

Train on 3115 samples, validate on 779 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x127dab198>

In [16]:
# Final [loss, accuracy] of the fine-tuned model on the full validation split.
best_model.evaluate(X_validate, y_validate)



[0.31981858569024002, 0.90372272174379797]