In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

#### Use polyglot for tokenizing and word embedding

In [2]:
import polyglot
from polyglot.text import Text, Word
from polyglot.mapping import Embedding

#### Use sklearn for the utils

In [3]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#### Use keras with tensorflow backend

In [23]:
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.layers import Input, Dense, LSTM, Activation
from keras.models import Model, Sequential
from keras import optimizers

#### Use hyperas for hyperparam tuning

In [5]:
from hyperopt import Trials, STATUS_OK, tpe, rand
from hyperas import optim
from hyperas.distributions import choice, uniform, conditional

# Prepare Data

In [6]:
def data():
    """
    Load the full training CSV and split it 80/20 into train and validation arrays.

    Everything the loader needs (the constant and the dataset class) is defined
    inside this function on purpose: hyperas has some weird error if the
    first_n_records or TextClassificationDataSet is parametrized... should refactor.
    The local names X_train, y_train, X_validate and y_validate are the same names
    that search_model_seq takes as parameters, so they are kept unchanged.

    Output: the tuple (X_train, y_train, X_validate, y_validate).
    """
    MAX_WORD_COUNT = 150

    class TextClassificationDataSet(object):
        """
        Turn a CSV of articles into padded word-embedding features and one-hot labels.

        Each text is stripped of HTML, split into words by polyglot, mapped to word
        vectors and padded / truncated to MAX_WORD_COUNT vectors by keras pad_sequences.
        """

        def __init__(self,
                     file_path,
                     word_embedding='./polyglot/embeddings2/zh/embeddings_pkl.tar.bz2',
                     MAX_WORD_COUNT=MAX_WORD_COUNT,
                     text_col_name='text',
                     label_col_name='tags',
                     one_hot_encoder=None):
            """
            Args:
                file_path: path of the CSV file to read.
                word_embedding: a polyglot Embedding instance, or a path given to Embedding.load.
                MAX_WORD_COUNT: number of word vectors kept per article.
                text_col_name: CSV column holding the article text.
                label_col_name: CSV column holding the label, or None for unlabelled data.
                one_hot_encoder: optional already-fitted encoder used instead of fitting a new one.
            """
            self.MAX_WORD_COUNT = MAX_WORD_COUNT

            self.df = self._load_data_from_csv(file_path)
            self.text_col_name = text_col_name
            self.label_col_name = label_col_name

            # Always define both encoder attributes so unlabelled datasets expose None
            # instead of raising AttributeError on access.
            self.label_encoder = None
            self.onehot_encoder = None
            if label_col_name is not None:
                self.label_encoder = self._fit_label_encoder(label_col_name)
                if one_hot_encoder is None:
                    self.onehot_encoder = self._fit_onehot_encoder()

            if one_hot_encoder is not None:
                # The supplied encoder used to be stored only under the misspelt name
                # `one_hot_encoder`, which get_labels never reads; use it for real and
                # keep the old attribute as an alias.
                self.onehot_encoder = one_hot_encoder
                self.one_hot_encoder = one_hot_encoder

            self.embeddings = self._load_word_embeddings(word_embedding)

            self.features = None
            self.labels = None

        def get_features(self, use_cache=True):
            """Build (or fetch from cache) the array holding one padded vector sequence per row."""
            if use_cache and self.features is not None:
                return self.features
            clean_texts = self._get_clean_text_col(self.text_col_name)
            # pad_sequences yields a batch of one; [0] removes exactly that batch axis,
            # whereas np.squeeze would also drop any other axis of length 1.
            # NOTE(review): a text with no known word still gives a differently shaped
            # row, as it did before - confirm whether such rows can occur.
            self.features = np.array([
                self._article2vecs_simple(text,
                                          embeddings=self.embeddings,
                                          max_word_count=self.MAX_WORD_COUNT)[0]
                for text in clean_texts
            ])
            return self.features

        def get_labels(self, use_cache=True):
            """
            Build (or fetch from cache) the one-hot encoded labels.

            Raises:
                KeyError: when the dataset was created with label_col_name=None.
            """
            if self.label_col_name is None:
                raise KeyError('label_col_name is None, unable to get labels from the input data.')
            if use_cache and self.labels is not None:
                return self.labels
            encoded = self.onehot_encoder.transform(self.df['label_index'].values.reshape(-1, 1))
            # A supplied encoder may already produce a dense array.
            self.labels = encoded.toarray() if hasattr(encoded, 'toarray') else np.asarray(encoded)
            return self.labels

        def _parse_text(self, text):
            """Wrap a plain string in a polyglot Text; anything else is passed through unchanged."""
            if isinstance(text, str):
                return Text(text)
            return text

        def _article2vecs_simple(self, article_text, embeddings, max_word_count):
            """
            Map one article to a padded batch holding a single sequence of word vectors.

            Words without an embedding are skipped. Sequences longer than
            max_word_count are cut at the end (truncating='post').
            """
            # _parse_text already handles non-str input; the former isinstance guard
            # left `article_parsed` unbound for such input.
            article_parsed = self._parse_text(article_text)
            # Look every word up once instead of twice.
            looked_up = (embeddings.get(word) for word in article_parsed.words)
            word_vectors = [vector for vector in looked_up if vector is not None]
            return sequence.pad_sequences([word_vectors],
                                          maxlen=max_word_count,
                                          truncating='post',
                                          dtype='float32')

        def _load_word_embeddings(self, word_embedding):
            """Accept a ready Embedding instance, otherwise load one from the given path."""
            if isinstance(word_embedding, Embedding):
                return word_embedding
            return Embedding.load(word_embedding)

        def _load_data_from_csv(self, file_path):
            """Read the CSV at file_path into a DataFrame."""
            return pd.read_csv(file_path)

        def _get_clean_text_col(self, text_col):
            """remove html tags in text"""
            raw_texts = self.df[text_col]
            return raw_texts.apply(lambda x: BeautifulSoup(x, "html5lib").text)

        def _fit_label_encoder(self, label_col):
            """Fit a LabelEncoder on label_col and store the integer codes in df['label_index']."""
            label_encoder = preprocessing.LabelEncoder()
            # fit_transform fits the encoder itself; the separate fit() call was redundant.
            self.df['label_index'] = label_encoder.fit_transform(self.df[label_col])
            self.label_encoder = label_encoder
            return label_encoder

        def _fit_onehot_encoder(self):
            """Fit a OneHotEncoder on the integer codes in df['label_index']."""
            onehot_encoder = preprocessing.OneHotEncoder()
            onehot_encoder.fit(self.df['label_index'].values.reshape(-1, 1))
            self.onehot_encoder = onehot_encoder
            return onehot_encoder

    dataset_train = TextClassificationDataSet(file_path='../data/offsite-tagging-training-set (1).csv')
    X_train, X_validate, y_train, y_validate = train_test_split(
        dataset_train.get_features(),
        dataset_train.get_labels(),
        test_size=0.2,
        random_state=42)
    return X_train, y_train, X_validate, y_validate


In [7]:
def data_first1024():
    """
    same as data(), just that it only uses the first 1024 rows of the input
    => faster for searching hyper params

    Everything the loader needs (the constants and the dataset class) is defined
    inside this function on purpose: hyperas has some weird error if the
    first_n_records or TextClassificationDataSet is parametrized... should refactor.
    The local names X_train, y_train, X_validate and y_validate are the same names
    that search_model_seq takes as parameters, so they are kept unchanged.

    Output: the tuple (X_train, y_train, X_validate, y_validate).
    """
    MAX_WORD_COUNT = 150
    first_n_records = 1024

    class TextClassificationDataSet(object):
        """
        Turn the first rows of a CSV of articles into padded word-embedding features
        and one-hot labels.

        Each text is stripped of HTML, split into words by polyglot, mapped to word
        vectors and padded / truncated to MAX_WORD_COUNT vectors by keras pad_sequences.
        """

        def __init__(self,
                     file_path,
                     word_embedding='./polyglot/embeddings2/zh/embeddings_pkl.tar.bz2',
                     MAX_WORD_COUNT=MAX_WORD_COUNT,
                     text_col_name='text',
                     label_col_name='tags',
                     one_hot_encoder=None):
            """
            Args:
                file_path: path of the CSV file to read (only the first rows are used).
                word_embedding: a polyglot Embedding instance, or a path given to Embedding.load.
                MAX_WORD_COUNT: number of word vectors kept per article.
                text_col_name: CSV column holding the article text.
                label_col_name: CSV column holding the label, or None for unlabelled data.
                one_hot_encoder: optional already-fitted encoder used instead of fitting a new one.
            """
            self.MAX_WORD_COUNT = MAX_WORD_COUNT

            self.df = self._load_data_from_csv(file_path)
            self.text_col_name = text_col_name
            self.label_col_name = label_col_name

            # Always define both encoder attributes so unlabelled datasets expose None
            # instead of raising AttributeError on access.
            self.label_encoder = None
            self.onehot_encoder = None
            if label_col_name is not None:
                self.label_encoder = self._fit_label_encoder(label_col_name)
                if one_hot_encoder is None:
                    self.onehot_encoder = self._fit_onehot_encoder()

            if one_hot_encoder is not None:
                # The supplied encoder used to be stored only under the misspelt name
                # `one_hot_encoder`, which get_labels never reads; use it for real and
                # keep the old attribute as an alias.
                self.onehot_encoder = one_hot_encoder
                self.one_hot_encoder = one_hot_encoder

            self.embeddings = self._load_word_embeddings(word_embedding)

            self.features = None
            self.labels = None

        def get_features(self, use_cache=True):
            """Build (or fetch from cache) the array holding one padded vector sequence per row."""
            if use_cache and self.features is not None:
                return self.features
            clean_texts = self._get_clean_text_col(self.text_col_name)
            # pad_sequences yields a batch of one; [0] removes exactly that batch axis,
            # whereas np.squeeze would also drop any other axis of length 1.
            # NOTE(review): a text with no known word still gives a differently shaped
            # row, as it did before - confirm whether such rows can occur.
            self.features = np.array([
                self._article2vecs_simple(text,
                                          embeddings=self.embeddings,
                                          max_word_count=self.MAX_WORD_COUNT)[0]
                for text in clean_texts
            ])
            return self.features

        def get_labels(self, use_cache=True):
            """
            Build (or fetch from cache) the one-hot encoded labels.

            Raises:
                KeyError: when the dataset was created with label_col_name=None.
            """
            if self.label_col_name is None:
                raise KeyError('label_col_name is None, unable to get labels from the input data.')
            if use_cache and self.labels is not None:
                return self.labels
            encoded = self.onehot_encoder.transform(self.df['label_index'].values.reshape(-1, 1))
            # A supplied encoder may already produce a dense array.
            self.labels = encoded.toarray() if hasattr(encoded, 'toarray') else np.asarray(encoded)
            return self.labels

        def _parse_text(self, text):
            """Wrap a plain string in a polyglot Text; anything else is passed through unchanged."""
            if isinstance(text, str):
                return Text(text)
            return text

        def _article2vecs_simple(self, article_text, embeddings, max_word_count):
            """
            Map one article to a padded batch holding a single sequence of word vectors.

            Words without an embedding are skipped. Sequences longer than
            max_word_count are cut at the end (truncating='post').
            """
            # _parse_text already handles non-str input; the former isinstance guard
            # left `article_parsed` unbound for such input.
            article_parsed = self._parse_text(article_text)
            # Look every word up once instead of twice.
            looked_up = (embeddings.get(word) for word in article_parsed.words)
            word_vectors = [vector for vector in looked_up if vector is not None]
            return sequence.pad_sequences([word_vectors],
                                          maxlen=max_word_count,
                                          truncating='post',
                                          dtype='float32')

        def _load_word_embeddings(self, word_embedding):
            """Accept a ready Embedding instance, otherwise load one from the given path."""
            if isinstance(word_embedding, Embedding):
                return word_embedding
            return Embedding.load(word_embedding)

        def _load_data_from_csv(self, file_path):
            """Read only the first `first_n_records` data rows of the CSV at file_path."""
            # nrows stops parsing early and gives an independent frame, so adding the
            # label_index column later does not write into a slice of a larger frame.
            return pd.read_csv(file_path, nrows=first_n_records)

        def _get_clean_text_col(self, text_col):
            """remove html tags in text"""
            raw_texts = self.df[text_col]
            return raw_texts.apply(lambda x: BeautifulSoup(x, "html5lib").text)

        def _fit_label_encoder(self, label_col):
            """Fit a LabelEncoder on label_col and store the integer codes in df['label_index']."""
            label_encoder = preprocessing.LabelEncoder()
            # fit_transform fits the encoder itself; the separate fit() call was redundant.
            self.df['label_index'] = label_encoder.fit_transform(self.df[label_col])
            self.label_encoder = label_encoder
            return label_encoder

        def _fit_onehot_encoder(self):
            """Fit a OneHotEncoder on the integer codes in df['label_index']."""
            onehot_encoder = preprocessing.OneHotEncoder()
            onehot_encoder.fit(self.df['label_index'].values.reshape(-1, 1))
            self.onehot_encoder = onehot_encoder
            return onehot_encoder

    dataset_train = TextClassificationDataSet(file_path='../data/offsite-tagging-training-set (1).csv')
    X_train, X_validate, y_train, y_validate = train_test_split(
        dataset_train.get_features(),
        dataset_train.get_labels(),
        test_size=0.2,
        random_state=42)
    return X_train, y_train, X_validate, y_validate


In [8]:
def search_model_seq(X_train, y_train, X_validate, y_validate):
    """
    Hyperas model template: build an LSTM classifier, train it for one epoch and score it.

    The double-curly-brace expressions below are hyperas search-space placeholders,
    so this function is handed to optim.minimize as a template and is not meant
    to be called directly.

    Args:
        X_train, y_train: training features and labels.
        X_validate, y_validate: held-out features and labels used for scoring.
        Each feature sample must have shape (MAX_WORD_COUNT, embedding_size) and
        each label row tag_classes_count columns, as fixed by the constants below.

    Result: a dict with 'loss' (the negated validation accuracy, because the
    search minimises), 'status' and the trained 'model'.
    """
    # These three constants must agree with the arrays produced by the data function.
    # NOTE(review): 64 is presumably the width of the polyglot word vectors - confirm.
    MAX_WORD_COUNT = 150
    embedding_size = 64
    tag_classes_count = 3
    
    # Search space: one value of each list is picked per trial.
    batch_size = {{choice([128, 256, 512])}}
    lstm_units = {{choice([64, 128, 256, 512])}}
    dense_units = {{choice([64, 128, 256, 512])}}
    
    model = Sequential()

    model.add(LSTM(lstm_units, input_shape=(MAX_WORD_COUNT, embedding_size), name='LSTM'))
    
    # Three equally sized hidden layers share the single dense_units choice.
    model.add(Dense(dense_units, activation='relu', name='Dense_1'))
    model.add(Dense(dense_units, activation='relu', name='Dense_2'))
    model.add(Dense(dense_units, activation='relu', name='Dense_3'))

    model.add(Dense(tag_classes_count, activation='softmax', name='main_output'))
    # The optimizer is part of the search space as well.
    model.compile(optimizer={{choice(['rmsprop', 'adam', 'adagrad', 'nadam', 'adadelta'])}}, 
              loss={'main_output': 'categorical_crossentropy'}, 
              metrics=['accuracy'])
    
    # A single epoch per trial keeps the search cheap.
    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=1,
              validation_data=(X_validate, y_validate))
    
    # The printed label says Test, but the score is computed on the validation split.
    score, acc = model.evaluate(X_validate, y_validate, batch_size=batch_size, verbose=0)
    print('Test Accuracy:{}'.format(acc))
    return {'loss': -acc, 'status': STATUS_OK, 'model': model}


In [9]:
# Build the full train/validation split (not the 1024-row subset used by the search);
# the cells below use it to evaluate and further train the best model.
X_train, y_train, X_validate, y_validate = data()

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.


In [11]:
# import gc; gc.collect()
# Run the hyperas search: 70 TPE trials of search_model_seq on the 1024-row subset.
trials = Trials()
best_run, best_model, space = optim.minimize(
    model=search_model_seq,
    data=data_first1024,
    algo=tpe.suggest,
    max_evals=70,
    trials=trials,
    notebook_name='model_final',
    eval_space=True,    # <-- this is the line that puts real values into 'best_run'
    return_space=True,  # <-- this allows you to save the space for later evaluations
)

>>> Imports:
#coding=utf-8

try:
    import pandas as pd
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    from bs4 import BeautifulSoup
except:
    pass

try:
    import polyglot
except:
    pass

try:
    from polyglot.text import Text, Word
except:
    pass

try:
    from polyglot.mapping import Embedding
except:
    pass

try:
    from sklearn import preprocessing
except:
    pass

try:
    from sklearn.model_selection import train_test_split
except:
    pass

try:
    from sklearn.metrics import accuracy_score
except:
    pass

try:
    from keras.preprocessing import sequence
except:
    pass

try:
    from keras.utils import np_utils
except:
    pass

try:
    from keras.layers import Input, Dense, LSTM, Activation
except:
    pass

try:
    from keras.models import Model, Sequential
except:
    pass

try:
    from keras import optimizers
except:
    pass

try:
    from hyperopt import Trials, STATUS_OK, tpe, rand
except:
    pass

try:
    from hyperas im

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.


Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536553382874
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536611533747
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536553382874
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536611533747
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536553382874
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536553382874
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536611533747
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536553382874
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536553382874
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536553382874
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536553382874
Train on 819 samples,

Test Accuracy:0.5658536553382874
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536553382874
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536611533747
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536553382874
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536553382874
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536611533747
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536553382874
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536611533747
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536553382874
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536553382874
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.5658536611533747
Train on 819 samples, validate on 205 samples
Epoch 1/1
Test Accuracy:0.56585

In [12]:

# Score the search winner on the full validation split; the search itself only
# used the data returned by data_first1024.
print("Evaluation of best performing model:")
print(best_model.evaluate(X_validate, y_validate))
print("Best performing model chosen hyper-parameters:")
print(best_run)

Evalutation of best performing model:
[0.94468809658510844, 0.59563543026805388]
Best performing model chosen hyper-parameters:
{'batch_size': 128, 'lstm_units': 256, 'lstm_units_1': 256, 'optimizer': 'rmsprop'}


In [13]:
# Show the chosen hyper-parameters as rich cell output (same dict as printed above).
best_run

{'batch_size': 128,
 'lstm_units': 256,
 'lstm_units_1': 256,
 'optimizer': 'rmsprop'}

In [None]:
# Continue training the search winner on the full training split for 20 epochs,
# using the batch size selected by the search.
best_model.fit(
    X_train,
    y_train,
    batch_size=best_run['batch_size'],
    epochs=20,
    # verbose=2,
    validation_data=(X_validate, y_validate),
)

In [15]:
# Re-score on the validation split after the additional training above.
best_model.evaluate(X_validate, y_validate)



[0.27436906497193853, 0.91014120659871034]

In [35]:
# NOTE(review): keras.models is reloaded right before best_model.save in the next cell;
# presumably a workaround for a stale module in this kernel - confirm whether it is still needed.
from importlib import reload
import keras.models
reload(keras.models)

<module 'keras.models' from '/Users/ericng/Workspace/hk01_test/q3b_proj/model/tag-clf/lib/python3.6/site-packages/keras/models.py'>

In [36]:
# Persist the trained model to model_final.h5, resolved against the current working directory.
best_model.save('model_final.h5')

# Predict the test set

In [29]:
# Number of word vectors kept per article. Used as the default of
# TextClassificationDataSet below and must equal the value hard-coded in
# data(), data_first1024() and search_model_seq.
MAX_WORD_COUNT = 150

In [30]:
class TextClassificationDataSet(object):
    """
    Turn a CSV of articles into padded word-embedding features and one-hot labels.

    Each text is stripped of HTML, split into words by polyglot, mapped to word
    vectors and padded / truncated to MAX_WORD_COUNT vectors by keras pad_sequences.
    """

    def __init__(self,
                 file_path,
                 word_embedding='./polyglot/embeddings2/zh/embeddings_pkl.tar.bz2',
                 MAX_WORD_COUNT=MAX_WORD_COUNT,
                 text_col_name='text',
                 label_col_name='tags',
                 one_hot_encoder=None):
        """
        Args:
            file_path: path of the CSV file to read.
            word_embedding: a polyglot Embedding instance, or a path given to Embedding.load.
            MAX_WORD_COUNT: number of word vectors kept per article.
            text_col_name: CSV column holding the article text.
            label_col_name: CSV column holding the label, or None for unlabelled data.
            one_hot_encoder: optional already-fitted encoder used instead of fitting a new one.
        """
        self.MAX_WORD_COUNT = MAX_WORD_COUNT

        self.df = self._load_data_from_csv(file_path)
        self.text_col_name = text_col_name
        self.label_col_name = label_col_name

        # Always define both encoder attributes so unlabelled datasets expose None
        # instead of raising AttributeError on access.
        self.label_encoder = None
        self.onehot_encoder = None
        if label_col_name is not None:
            self.label_encoder = self._fit_label_encoder(label_col_name)
            if one_hot_encoder is None:
                self.onehot_encoder = self._fit_onehot_encoder()

        if one_hot_encoder is not None:
            # The supplied encoder used to be stored only under the misspelt name
            # `one_hot_encoder`, which get_labels never reads; use it for real and
            # keep the old attribute as an alias.
            self.onehot_encoder = one_hot_encoder
            self.one_hot_encoder = one_hot_encoder

        self.embeddings = self._load_word_embeddings(word_embedding)

        self.features = None
        self.labels = None

    def get_features(self, use_cache=True):
        """Build (or fetch from cache) the array holding one padded vector sequence per row."""
        if use_cache and self.features is not None:
            return self.features
        clean_texts = self._get_clean_text_col(self.text_col_name)
        # pad_sequences yields a batch of one; [0] removes exactly that batch axis,
        # whereas np.squeeze would also drop any other axis of length 1.
        # NOTE(review): a text with no known word still gives a differently shaped
        # row, as it did before - confirm whether such rows can occur.
        self.features = np.array([
            self._article2vecs_simple(text,
                                      embeddings=self.embeddings,
                                      max_word_count=self.MAX_WORD_COUNT)[0]
            for text in clean_texts
        ])
        return self.features

    def get_labels(self, use_cache=True):
        """
        Build (or fetch from cache) the one-hot encoded labels.

        Raises:
            KeyError: when the dataset was created with label_col_name=None.
        """
        if self.label_col_name is None:
            raise KeyError('label_col_name is None, unable to get labels from the input data.')
        if use_cache and self.labels is not None:
            return self.labels
        encoded = self.onehot_encoder.transform(self.df['label_index'].values.reshape(-1, 1))
        # A supplied encoder may already produce a dense array.
        self.labels = encoded.toarray() if hasattr(encoded, 'toarray') else np.asarray(encoded)
        return self.labels

    def _parse_text(self, text):
        """Wrap a plain string in a polyglot Text; anything else is passed through unchanged."""
        if isinstance(text, str):
            return Text(text)
        return text

    def _article2vecs_simple(self, article_text, embeddings, max_word_count):
        """
        Map one article to a padded batch holding a single sequence of word vectors.

        Words without an embedding are skipped. Sequences longer than
        max_word_count are cut at the end (truncating='post').
        """
        # _parse_text already handles non-str input; the former isinstance guard
        # left `article_parsed` unbound for such input.
        article_parsed = self._parse_text(article_text)
        # Look every word up once instead of twice.
        looked_up = (embeddings.get(word) for word in article_parsed.words)
        word_vectors = [vector for vector in looked_up if vector is not None]
        return sequence.pad_sequences([word_vectors],
                                      maxlen=max_word_count,
                                      truncating='post',
                                      dtype='float32')

    def _load_word_embeddings(self, word_embedding):
        """Accept a ready Embedding instance, otherwise load one from the given path."""
        if isinstance(word_embedding, Embedding):
            return word_embedding
        return Embedding.load(word_embedding)

    def _load_data_from_csv(self, file_path):
        """Read the CSV at file_path into a DataFrame."""
        return pd.read_csv(file_path)

    def _get_clean_text_col(self, text_col):
        """remove html tags in text"""
        raw_texts = self.df[text_col]
        return raw_texts.apply(lambda x: BeautifulSoup(x, "html5lib").text)

    def _fit_label_encoder(self, label_col):
        """Fit a LabelEncoder on label_col and store the integer codes in df['label_index']."""
        label_encoder = preprocessing.LabelEncoder()
        # fit_transform fits the encoder itself; the separate fit() call was redundant.
        self.df['label_index'] = label_encoder.fit_transform(self.df[label_col])
        self.label_encoder = label_encoder
        return label_encoder

    def _fit_onehot_encoder(self):
        """Fit a OneHotEncoder on the integer codes in df['label_index']."""
        onehot_encoder = preprocessing.OneHotEncoder()
        onehot_encoder.fit(self.df['label_index'].values.reshape(-1, 1))
        self.onehot_encoder = onehot_encoder
        return onehot_encoder

In [31]:
# label_col_name=None skips fitting the label / one-hot encoders, so only features
# can be requested from this dataset (get_labels would raise KeyError).
dataset_test = TextClassificationDataSet(file_path='../data/offsite-tagging-test-set (1).csv', label_col_name=None)

In [51]:
# Softmax output of the model: one row per test article, one column per tag class.
pred = best_model.predict(dataset_test.get_features())

In [60]:
# Index of the most probable class per article. Sequential.predict_classes was removed
# from later Keras releases; the argmax over the probabilities computed in the previous
# cell gives the same indices and avoids a second forward pass.
pred_classes = np.argmax(pred, axis=-1)



In [52]:
# Inspect the predicted class probabilities.
pred

array([[ 0.00779445,  0.03504602,  0.95715952],
       [ 0.72062439,  0.16627042,  0.11310524],
       [ 0.00546045,  0.02126656,  0.97327304],
       ..., 
       [ 0.10732042,  0.17002803,  0.72265148],
       [ 0.00891487,  0.04745243,  0.94363272],
       [ 0.02163052,  0.07743791,  0.90093154]], dtype=float32)

In [54]:
# Only needed for its fitted label encoder. Pass the embeddings already loaded for
# dataset_test so the embedding archive is not read a second time
# (_load_word_embeddings accepts a ready Embedding instance).
dataset_train = TextClassificationDataSet(
    file_path='../data/offsite-tagging-training-set (1).csv',
    word_embedding=dataset_test.embeddings,
)

In [55]:
# Work on a copy so the frame held by dataset_test is not changed when the
# prediction column is added below.
df_test = dataset_test.df.copy()

In [57]:
# Encoder fitted on the tags of the full training CSV; used to turn class indices back
# into tag strings.
# NOTE(review): this assumes the same index-to-tag order as the encoders fitted inside
# data() / data_first1024() - confirm the first 1024 rows contain every tag.
label_encoder = dataset_train.label_encoder

In [62]:
# Translate the predicted class indices back into the original tag strings.
df_test['predicted_tags'] = label_encoder.inverse_transform(pred_classes)

In [63]:
# Preview the first rows only instead of dumping the whole frame into the notebook.
df_test.head(10)

Unnamed: 0,id,text,predicted_tags
0,6,南華添鋒力　簽前厄瓜多爾國腳保耶 港超勁旅南華宣布羅致前厄瓜多爾國腳菲力斯保耶（Felix ...,足球
1,128,如果大學$0捐款　科大嶺南將蝕過千萬元 據now新聞台報道，身兼8大校監的特首梁振英曾以大學...,梁振英
2,136,英超最強火力對碰　雙城爭冠靠鋒霸 英超今季風起雲湧，傳統「Big 5」只剩兩隊名列積分榜前5...,足球
3,213,【01球評】膺半程冠軍　阿仙奴今季不奪標更待何時？ 近年「兵工廠」每季的起落都離不開一個循環...,足球
4,658,【書商失蹤】梁振英：希望失蹤的李波本人提供資料 行政長官梁振英出席行政會議前見記者，被問及李...,梁振英
5,700,【施政盤點】三份施政報告　僅一半政策達標 行政長官梁振英即將公布任內第四份施政報告，《香港0...,梁振英
6,729,【施政盤點】「治港絕招」　設19委員會　空談多實務少 行政長官梁振英上任3年多，先後成立多個...,梁振英
7,837,高普首簽　「新馬迪」來季投紅軍 利物浦傷兵滿營及戰績不穩，主帥高普仍不忘投資未來，昨以510...,足球
8,1037,「最潮主帥」鬥利物浦：我已領先1：0 英乙球隊埃克塞特在明晨的足總盃於主場迎戰利物浦，雖然越...,足球
9,1095,紅軍超殘陣逼和英乙隊　高普：負擔不起重賽 逾十名球員受傷的利物浦，今晨在足總盃第三圈以大部份...,足球


In [64]:
# Write the test set together with its predicted tags.
# NOTE(review): the frame index is written as an extra first column (to_csv default) -
# confirm whether downstream consumers expect it.
df_test.to_csv('../output/testset_with_tags.csv')