# DataSet: Experimental Data for Question Classification
    
webpage:    http://cogcomp.org/Data/QA/QC/
        

In [1]:
import os
import collections
import re
import numpy as np
datadir = '/data/question'
os.listdir(datadir)

['train_1000.label',
 'train_2000.label',
 'train_3000.label',
 'train_4000.label',
 'train_5500.label',
 'TREC_10.label']

### Load the dataset

In [2]:
def clean_text(string):
    string = string.lower()
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"i 'm", "i am", string)
    string = re.sub(r"he 's", "he is", string)
    string = re.sub(r"she 's", "she is", string)
    string = re.sub(r"it 's", "it is", string)
    string = re.sub(r"that's", "that is", string)
    string = re.sub(r"what 's", "what is", string)
    string = re.sub(r"where 's", "where is", string)
    string = re.sub(r"how 's", "how is", string)
    string = re.sub(r"won't", "will not", string)
    string = re.sub(r"can't", "cannot", string)
    string = re.sub(r"n't", " not", string)
    
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

def load_data(filename):
    file = os.path.join(datadir,filename)
    f = open(file, 'rb')

    labels_low = []
    labels_high = []
    sentences = []
    for line in f.readlines():
        label, sentence = line.decode('windows-1252').split(maxsplit=1)
        label_h = label.split(":")[0]
        labels_low.append(label)
        labels_high.append(label_h)
        sentences.append(clean_text(sentence))
    
    return sentences, labels_low, labels_high


sentences_train, labels_low_train, labels_high_train = load_data('train_5500.label')
sentences_test, labels_low_test, labels_high_test = load_data('TREC_10.label')

print("train set size: ", len(sentences_train))
print("test set size: ", len(sentences_test))

train set size:  5452
test set size:  500


In [3]:
# Count the high level labels
counter=collections.Counter(labels_high_train)
print(counter)

Counter({'ENTY': 1250, 'HUM': 1223, 'DESC': 1162, 'NUM': 896, 'LOC': 835, 'ABBR': 86})


In [4]:
# Count the low level labels
counter=collections.Counter(labels_low_train)
print(counter)

Counter({'HUM:ind': 962, 'LOC:other': 464, 'DESC:def': 421, 'NUM:count': 363, 'DESC:manner': 276, 'DESC:desc': 274, 'NUM:date': 218, 'ENTY:other': 217, 'ENTY:cremat': 207, 'DESC:reason': 191, 'HUM:gr': 189, 'LOC:country': 155, 'LOC:city': 129, 'ENTY:animal': 112, 'ENTY:food': 103, 'ENTY:dismed': 103, 'ENTY:termeq': 93, 'NUM:period': 75, 'NUM:money': 71, 'ABBR:exp': 70, 'LOC:state': 66, 'ENTY:sport': 62, 'ENTY:event': 56, 'NUM:other': 52, 'HUM:desc': 47, 'ENTY:product': 42, 'ENTY:substance': 41, 'ENTY:color': 40, 'ENTY:techmeth': 38, 'NUM:dist': 34, 'NUM:perc': 27, 'ENTY:veh': 27, 'ENTY:word': 26, 'HUM:title': 25, 'LOC:mount': 21, 'ABBR:abb': 16, 'ENTY:lang': 16, 'ENTY:body': 16, 'NUM:volsize': 13, 'ENTY:plant': 13, 'ENTY:symbol': 11, 'NUM:weight': 11, 'ENTY:instru': 10, 'NUM:code': 9, 'ENTY:letter': 9, 'NUM:speed': 9, 'NUM:temp': 8, 'NUM:ord': 6, 'ENTY:currency': 4, 'ENTY:religion': 4})


### preprocessing -- tokenizing and pading

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

maxlen = 40
max_features = 2000

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(sentences_test)

x_train = pad_sequences(list_tokenized_train, maxlen=maxlen)
x_test = pad_sequences(list_tokenized_test, maxlen=maxlen)


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
# One-hot encode for the labels

from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
encoder.fit(labels_high_test)
y_train = encoder.transform(labels_high_train)
y_test = encoder.transform(labels_high_test)

# Models for queries dataset

### bi-directional LSTM models
Given the nature of search engine queries, the order of the words is not as organized as speech, so the model that can carry information both forward and backward could be beneficial. 

### CNN models
Given the property of search engine queries, semi-random word orders and limited word length per classification task, the mechanism of Convolutional neural network seems to be a good fit.



# performance measures


$Accuracy =\frac{ \text{#  of correctly predicted query intents}}{\text{Total # of queries}}$


$Precision_i =
\frac{\text{# of correctly predicted intents with the intent i}}
{\text{Total # of predicted intents with the intent i}}$

$Recall_i =
\frac{\text{# of correctly predicted intents with the intent i}}
{\text{Total # of human annotated queries with the intent i}} $

$F\text{-}score_i = 2 ×
\frac{Precision_i × Recall_i}{Precision_i + Recall_i}$





In [7]:
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """
        Recall metric for multi-label classification 
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """
        Precision metric for multi-label classification of
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall))

# Embedding:

use vectors from GloVe as initial values for the matched query words. For query words that do not exist in GloVe vocabulary, the vectors will be initialized with random values using Xavier initializer.

# MODEL 1: CNN Model (1d convolution)

<img style="float: left;" src="img_files/cnn.png">

In [8]:
# Build model
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Conv1D, Embedding
from keras.layers.merge import Concatenate

sequence_length = maxlen
embedding_dim = 50
dropout_prob = 0.5
num_filters = 20
hidden_dims = 50

input_shape = (sequence_length,)
model_input = Input(shape=input_shape)

z = Embedding(max_features, embedding_dim, input_length=sequence_length, name="embedding")(model_input)

z = Dropout(dropout_prob)(z)

# Convolutional block
filter_sizes = [3,4,5]

conv_blocks = []
for sz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(z)
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)
    
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

z = Dropout(dropout_prob)(z)
z = Dense(hidden_dims, activation="relu")(z)
model_output = Dense(6, activation="softmax")(z)

model = Model(model_input, model_output)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["categorical_accuracy",f1])

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 40)            0                                            
____________________________________________________________________________________________________
embedding (Embedding)            (None, 40, 50)        100000      input_1[0][0]                    
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 40, 50)        0           embedding[0][0]                  
____________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, 38, 20)        3020        dropout_1[0][0]                  
___________________________________________________________________________________________

In [9]:
from keras.callbacks import TensorBoard

batch_size = 64
num_epochs = 50

tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0,
                          write_graph=True, write_images=False)

model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test),  verbose =1, shuffle=True,
          callbacks=[tensorboard])

Train on 5452 samples, validate on 500 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50


Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f31440d69b0>

# Model 2: BLSTM
<img style="float: left;" src="img_files/blstm.png">

In [10]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Conv1D, Embedding, Bidirectional, LSTM
from keras.layers.merge import Concatenate

sequence_length = maxlen
embedding_dim = 50
dropout_prob = 0.5
unit_size = 32

input_shape = (sequence_length,)
model_input = Input(shape=input_shape)

z = Embedding(max_features, embedding_dim, input_length=sequence_length, name="embedding")(model_input)

z = Dropout(dropout_prob)(z)

z = Bidirectional(LSTM(unit_size, return_sequences=True))(z)
z = Bidirectional(LSTM(unit_size))(z)

model_output = Dense(6, activation="softmax")(z)

model = Model(model_input, model_output)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["categorical_accuracy",f1])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 40)                0         
_________________________________________________________________
embedding (Embedding)        (None, 40, 50)            100000    
_________________________________________________________________
dropout_3 (Dropout)          (None, 40, 50)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 40, 64)            21248     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                24832     
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 390       
Total params: 146,470
Trainable params: 146,470
Non-trainable params: 0
_________________________________________________________________


In [11]:
from keras.callbacks import TensorBoard

batch_size = 64
num_epochs = 50

tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0,
                          write_graph=True, write_images=False)

model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test),  verbose =1, shuffle=True,
          callbacks=[tensorboard])

Train on 5452 samples, validate on 500 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50


Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f312e8f2ef0>

# MODEL: C-BLSTM

<img style="float: left;" src="img_files/cblstm.png">

In [12]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Conv1D, Embedding, Bidirectional, LSTM
from keras.layers.merge import Concatenate

sequence_length = maxlen
embedding_dim = 50
num_filters = 20
unit_size = 32
dropout_prob = 0.5

input_shape = (sequence_length,)
model_input = Input(shape=input_shape)

z = Embedding(max_features, embedding_dim, input_length=sequence_length, name="embedding")(model_input)

z = Dropout(dropout_prob)(z)

# Convolutional block
filter_sizes = [3,4,5]

conv_blocks = []
for sz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(z)
    conv = MaxPooling1D(pool_size=2)(conv)
#     conv = Flatten()(conv)
    conv_blocks.append(conv)
    print(conv)
    
z = Concatenate(axis=1)(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
print(z.get_shape())
z = Bidirectional(LSTM(unit_size))(z)

model_output = Dense(6, activation="softmax")(z)

model = Model(model_input, model_output)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["categorical_accuracy",f1])

model.summary()

Tensor("max_pooling1d_4/Squeeze:0", shape=(?, 19, 20), dtype=float32)
Tensor("max_pooling1d_5/Squeeze:0", shape=(?, 18, 20), dtype=float32)
Tensor("max_pooling1d_6/Squeeze:0", shape=(?, 18, 20), dtype=float32)
(?, 55, 20)
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_3 (InputLayer)             (None, 40)            0                                            
____________________________________________________________________________________________________
embedding (Embedding)            (None, 40, 50)        100000      input_3[0][0]                    
____________________________________________________________________________________________________
dropout_4 (Dropout)              (None, 40, 50)        0           embedding[0][0]                  
_______________________________________________________________________

In [13]:
from keras.callbacks import TensorBoard

batch_size = 64
num_epochs = 50

tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0,
                          write_graph=True, write_images=False)

model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test),  verbose =1, shuffle=True,
          callbacks=[tensorboard])

Train on 5452 samples, validate on 500 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50


Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f312d0afa20>

# Model: BLSTM-2DCNN

<img style="float: left;" src="img_files/blstm2Dcnn.png">

In [14]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPool2D, Conv2D, Embedding, Bidirectional, LSTM, Reshape

units = 32
conv_filters = 32

sequence_length = maxlen
embedding_dim = 50
# dropout_prob = 0.5
num_filters = 100
hidden_dims = 50

input_shape = (sequence_length,)
model_input = Input(shape=input_shape)

z = Embedding(max_features, embedding_dim, input_length=sequence_length, name="embedding")(model_input)

z = Dropout(0.2)(z)

z = Bidirectional(LSTM(
    units,
    dropout=0.2,
    recurrent_dropout=0.2,
    return_sequences=True))(z)
z = Reshape((2 * sequence_length, units, 1))(z)
z = Conv2D(conv_filters, (3, 3))(z)
z = MaxPool2D(pool_size=(2, 2))(z)
z = Flatten()(z)
model_output = Dense(6, activation="softmax")(z)

model = Model(model_input, model_output)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["categorical_accuracy",f1])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 40)                0         
_________________________________________________________________
embedding (Embedding)        (None, 40, 50)            100000    
_________________________________________________________________
dropout_5 (Dropout)          (None, 40, 50)            0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 40, 64)            21248     
_________________________________________________________________
reshape_1 (Reshape)          (None, 80, 32, 1)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 78, 30, 32)        320       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 39, 15, 32)        0         
__________

In [15]:
from keras.callbacks import TensorBoard

batch_size = 64
num_epochs = 50

tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0,
                          write_graph=True, write_images=False)

model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test),  verbose =1, shuffle=True,
          callbacks=[tensorboard])

Train on 5452 samples, validate on 500 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50


Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f312cd58eb8>

# MODEL: BLSTM-att

<img style="float: left;" src="img_files/blstm-att.png">
    

In [16]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPool2D, Conv2D, Embedding, Bidirectional, LSTM, Reshape, merge, Permute, Multiply

units = 32
sequence_length = maxlen
embedding_dim = 50

input_shape = (sequence_length,)
model_input = Input(shape=input_shape)

z = Embedding(max_features, embedding_dim, input_length=sequence_length, name="embedding")(model_input)

z = Dropout(0.5)(z)

z = Bidirectional(LSTM(units,return_sequences=True))(z)

a = Permute((2,1))(z)

a = Dense(sequence_length, activation='softmax')(a)
a_probs = Permute((2,1))(a)

a_mul = Multiply()([z, a_probs])
a_mul = Flatten()(a_mul)

model_output = Dense(6, activation="softmax")(a_mul)

model = Model(model_input, model_output)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["categorical_accuracy",f1])

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_5 (InputLayer)             (None, 40)            0                                            
____________________________________________________________________________________________________
embedding (Embedding)            (None, 40, 50)        100000      input_5[0][0]                    
____________________________________________________________________________________________________
dropout_6 (Dropout)              (None, 40, 50)        0           embedding[0][0]                  
____________________________________________________________________________________________________
bidirectional_5 (Bidirectional)  (None, 40, 64)        21248       dropout_6[0][0]                  
___________________________________________________________________________________________

In [17]:
from keras.callbacks import TensorBoard

batch_size = 64
num_epochs = 50


tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0,
                          write_graph=True, write_images=False)

model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test),  verbose =1, shuffle=True,
          callbacks=[tensorboard])

Train on 5452 samples, validate on 500 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50


Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f31256eac18>

# MODEL: Character-level Convolutional Networks

<img style="float: left;" src="img_files/char_cnn.png">

In [18]:
print("max sentences length of the dataset: ", max([len(s) for s in sentences_train]))

max sentences length of the dataset:  197


In [19]:
sequence_length = 200

alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}\n"

def pad_char(char_seq, padding_char=""):
    num_padding = sequence_length - len(char_seq)
    new_char_seq = char_seq + [padding_char] * num_padding
    return new_char_seq
    
def string_to_int8_conversion(char_seq, alphabet):
    x = np.array([alphabet.find(char) for char in char_seq], dtype=np.int8)
    return x    

def pad_to_ind(sentences):
    sentence_ind =[]
    for s in sentences:
        chars = pad_char(list(s))
        sentence_ind.append( string_to_int8_conversion(chars, alphabet))
    return sentence_ind

sentences_train_ind =  pad_to_ind(sentences_train)
sentences_test_ind =  pad_to_ind(sentences_test)


def batch_generator(x, y, batch_size=16):
    while True:
        # choose batch_size random sentences / labels from the data
        idx = np.random.randint(0, len(x), batch_size)
        
        feat = np.zeros(shape=[len(idx), len(alphabet), len(x[0]), 1])
        for i, id in enumerate(idx):
            for pos, word in enumerate(x[id]):
                feat[i][word][pos][0] = 1

        label = y[idx,:]
        yield feat, label
        

train_gen = batch_generator(sentences_train_ind, y_train)
test_gen = batch_generator(sentences_test_ind, y_test)

In [20]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling2D, Conv2D, Embedding
from keras.layers.merge import Concatenate

sequence_length = 200
num_quantized_chars=len(alphabet)
kernel_sizes = (4, 4, 3, 3, 3, 3)
num_filters1 = 64
num_filters2 = 128
dropout_prob = 0.5
hidden_dims = 256


model_input = Input(shape=[ num_quantized_chars, sequence_length, 1])

conv1 = Conv2D(num_filters1, 
               kernel_size = (num_quantized_chars, kernel_sizes[0]),
               strides=1, padding='Valid',activation='relu', use_bias=True)(model_input)
pool1 = MaxPooling2D(pool_size=(1,3))(conv1)


conv2 = Conv2D(num_filters1, 
               kernel_size =(1, kernel_sizes[1]),
               strides=1, padding='Valid',activation='relu', use_bias=True)(pool1)
pool2 = MaxPooling2D(pool_size=(1,3))(conv2)

conv3 = Conv2D( num_filters1,
                kernel_size = (1, kernel_sizes[2]),
               strides=1, padding='Valid',activation='relu', use_bias=True)(pool2)

conv4 = Conv2D(num_filters2,
               kernel_size = (1, kernel_sizes[3]),
               strides=1, padding='Valid',activation='relu', use_bias=True)(conv3)

conv5 = Conv2D(num_filters2,
               kernel_size =(1, kernel_sizes[4]),
               strides=1, padding='Valid',activation='relu', use_bias=True)(conv4)

conv6 = Conv2D(num_filters2,
               kernel_size = (1, kernel_sizes[5]),
               strides=1, padding='Valid',activation='relu', use_bias=True)(conv5)
pool6 = MaxPooling2D(pool_size=(1,3))(conv6)

flat = Flatten()(pool6)
drop1 = Dropout(dropout_prob)(flat)

fc1 = Dense(hidden_dims, activation="relu")(drop1)
drop2 = Dropout(dropout_prob)(fc1)

fc2 = Dense(hidden_dims, activation="relu")(drop2)

model_output = Dense(6, activation="softmax")(fc2)


model = Model(model_input, model_output)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["categorical_accuracy",f1])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 70, 200, 1)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 1, 197, 64)        17984     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 1, 65, 64)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 1, 62, 64)         16448     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 1, 20, 64)         0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 1, 18, 64)         12352     
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 1, 16, 128)        24704     
__________

In [21]:
from keras.callbacks import TensorBoard

batch_size = 64
num_epochs = 50

tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0,
                          write_graph=True, write_images=False)

model.fit_generator(generator=train_gen,  
                    steps_per_epoch=y_train.shape[0] // batch_size,
                    epochs=num_epochs,
                    validation_data=test_gen,  
                    validation_steps=y_test.shape[0] // batch_size,
                    verbose =1,
                    callbacks=[tensorboard])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50


Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f312544a940>