## Question Classification

In [1]:
#read data
texts = []
labels = []

with open('data/LabelledData.txt','r') as f:
    for line in f:
        text, label = map(str,line.split(",,,"))
        texts.append(text.strip())
        labels.append(label.strip())

In [2]:
#for text, label in zip(texts[:10], labels[:10]):
#    print(text," -->", label)

In [3]:
import re
def pre_process(text):
    text = re.sub(r"\b's\b","is",text)
    text = re.sub(r"[^a-z?\.]"," ",text.lower())
    return text

In [4]:
processed_texts = [pre_process(text) for text in texts]

In [5]:
for text, label in zip(processed_texts[:10], labels[:10]):
    print(text," -->", label)

how did serfdom develop in and then leave russia ?  --> unknown
what films featured the character popeye doyle ?  --> what
how can i find a list of celebrities   real names ?  --> unknown
what fowl grabs the spotlight after the chinese year of the monkey ?  --> what
what is the full form of .com ?  --> what
what contemptible scoundrel stole the cork from my lunch ?  --> what
what team did baseball  s st. louis browns become ?  --> what
what is the oldest profession ?  --> what
what are liver enzymes ?  --> what
name the scar faced bounty hunter of the old west .  --> unknown


In [7]:
import numpy as np

X = np.array(texts)
y = np.array(labels, dtype='str')

In [8]:
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
y = encoder.fit_transform(y)

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os, sys

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [10]:
MAX_SEQUENCE_LENGTH = 45
MAX_NUM_WORDS = 50
VALIDATION_SPLIT = 0.2

In [11]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

In [12]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 3675 unique tokens.


In [13]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = y

In [14]:
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (1483, 45)
Shape of label tensor: (1483, 5)


### ML

In [15]:
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import concatenate, Activation

In [16]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [17]:
x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

In [24]:
x_train.shape, x_val.shape

((1187, 45), (296, 45))

In [19]:
vocab_size = len(word_index)+1

In [31]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = Embedding(vocab_size, 100)(sequence_input)

x1 = Conv1D(filters=50, kernel_size=2, padding='valid', activation='relu', strides=1)(embedded_sequences)
x1 = GlobalMaxPooling1D()(x1)

x2 = Conv1D(filters=50, kernel_size=3, padding='valid', activation='relu', strides=1)(embedded_sequences)
x2 = GlobalMaxPooling1D()(x2)

merged = concatenate([x1, x2], axis=1)
merged = Dense(256, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = Dense(5)(merged)
output = Activation('sigmoid')(merged)

model = Model(inputs=[sequence_input], outputs=[output])
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 45)           0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 45, 100)      367600      input_5[0][0]                    
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 44, 50)       10050       embedding_4[0][0]                
__________________________________________________________________________________________________
conv1d_8 (Conv1D)               (None, 43, 50)       15050       embedding_4[0][0]                
__________________________________________________________________________________________________
global_max

In [32]:
from keras.callbacks import ModelCheckpoint
filepath="weights/CNN_best_weights.{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

In [33]:
model.fit(x_train, y_train,
          batch_size=64,
          epochs=5,
          validation_split=0.1, callbacks=[checkpoint])

Train on 1068 samples, validate on 119 samples
Epoch 1/5

Epoch 00001: val_acc improved from -inf to 0.80000, saving model to weights/CNN_best_weights.01-0.8000.hdf5
Epoch 2/5

Epoch 00002: val_acc improved from 0.80000 to 0.86555, saving model to weights/CNN_best_weights.02-0.8655.hdf5
Epoch 3/5

Epoch 00003: val_acc improved from 0.86555 to 0.91765, saving model to weights/CNN_best_weights.03-0.9176.hdf5
Epoch 4/5

Epoch 00004: val_acc improved from 0.91765 to 0.94622, saving model to weights/CNN_best_weights.04-0.9462.hdf5
Epoch 5/5

Epoch 00005: val_acc improved from 0.94622 to 0.95462, saving model to weights/CNN_best_weights.05-0.9546.hdf5


<keras.callbacks.History at 0x7f6faca86588>

In [34]:
score = model.evaluate(x_val, y_val,
                       batch_size=64, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.07537008389025121
Test accuracy: 0.9743243165918298


In [35]:
text_labels = encoder.classes_

In [None]:
for i in range(10):
    prediction = model.predict(np.array([x_val[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print(test_posts.iloc[i][:50], "...")
    print('Actual label:' + test_tags.iloc[i])
    print("Predicted label: " + predicted_label + "\n")