## Question Classification

In [1]:
# Read the labelled questions. Each non-empty line is split on ",,," into a
# question text and its label; both are stored with surrounding whitespace
# stripped.
texts = []
labels = []

with open('data/LabelledData.txt','r') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at the end of the file):
        # they cannot be unpacked into (text, label) and would raise ValueError.
        if not line.strip():
            continue
        # A line without exactly one ",,," separator still raises, so
        # malformed records are not silently dropped.
        text, label = line.split(",,,")
        texts.append(text.strip())
        labels.append(label.strip())

In [2]:
#for text, label in zip(texts[:10], labels[:10]):
#    print(text," -->", label)

In [3]:
import re
def pre_process(text):
    """Normalise a question string for display/modelling.

    Steps:
      1. Lower-case the text.
      2. Expand a contracted "'s" that is attached to a word into " is"
         (e.g. "what's" -> "what is"). A detached "'s" (preceded by a space)
         is not matched by the pattern and is left for step 3.
      3. Replace every character other than a-z, '?' and '.' with a space.

    Parameters
    ----------
    text : str
        Raw question text.

    Returns
    -------
    str
        The cleaned text; its length may differ from the input only because
        of the "'s" expansion.
    """
    # Lower-case first so that an upper-case "'S" is expanded as well.
    text = text.lower()
    # The replacement carries a leading space; without it "what's" collapsed
    # into the single token "whatis".
    text = re.sub(r"\b's\b"," is",text)
    text = re.sub(r"[^a-z?\.]"," ",text)
    return text

In [4]:
# Apply pre_process to every raw question, keeping the original order.
processed_texts = list(map(pre_process, texts))

In [5]:
# Show the first ten cleaned questions next to their labels as a sanity check.
preview = list(zip(processed_texts, labels))[:10]
for text, label in preview:
    print(text," -->", label)

how did serfdom develop in and then leave russia ?  --> unknown
what films featured the character popeye doyle ?  --> what
how can i find a list of celebrities   real names ?  --> unknown
what fowl grabs the spotlight after the chinese year of the monkey ?  --> what
what is the full form of .com ?  --> what
what contemptible scoundrel stole the cork from my lunch ?  --> what
what team did baseball  s st. louis browns become ?  --> what
what is the oldest profession ?  --> what
what are liver enzymes ?  --> what
name the scar faced bounty hunter of the old west .  --> unknown


In [6]:
import numpy as np

# Convert the question texts and their labels to NumPy arrays so they can be
# indexed with a shuffled index array later on.
# NOTE(review): X is built from the raw `texts`, not from `processed_texts`;
# confirm that skipping the custom pre-processing here is intentional.
X = np.asarray(texts)
y = np.asarray(labels, dtype=str)

In [9]:
# Count how many samples carry each label to inspect the class balance.
unique, counts = np.unique(y, return_counts=True)
{label_name: label_count for label_name, label_count in zip(unique, counts)}

{'affirmation': 104, 'unknown': 272, 'what': 609, 'when': 96, 'who': 402}

In [47]:
from sklearn.preprocessing import LabelBinarizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os, sys

from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model, Sequential
from keras import utils
from keras.layers import concatenate, Activation
from keras.callbacks import ModelCheckpoint

In [20]:
# NOTE(review): not referenced by any cell visible in this notebook -- confirm
# whether it is still needed.
MAX_SEQUENCE_LENGTH = 45
# Passed to Tokenizer(num_words=...) and used as the model's input width.
MAX_NUM_WORDS = 50
# Fraction of all samples held out as the test set when splitting the data.
VALIDATION_SPLIT = 0.1

In [32]:
# Shuffle the samples and compute how many to hold out for testing.
# A dedicated, seeded generator is used so the train/test split is the same
# on every Restart & Run All (the unseeded global shuffle made it vary).
rng = np.random.RandomState(42)
indices = rng.permutation(X.shape[0])
data = X[indices]
# NOTE: this rebinds `labels` (the list filled while reading the file) to the
# shuffled label array. Re-running the cell that builds `y` from `labels`
# after this one would pair shuffled labels with unshuffled texts.
labels = y[indices]
num_validation_samples = int(VALIDATION_SPLIT * X.shape[0])

In [36]:
# Hold out the last `num_validation_samples` shuffled samples as the test set.
# Slice at an explicit non-negative index: with a hold-out count of 0 the
# negative-index form breaks (`data[:-0]` is empty and `data[-0:]` is the
# whole array), which would swap the roles of the two sets.
split_at = len(data) - num_validation_samples
train_x = data[:split_at]
train_y = labels[:split_at]
test_x = data[split_at:]
test_y = labels[split_at:]

In [37]:
# Binarize the string labels. The encoder is fitted on the training labels
# only and then applied to both splits.
encoder = LabelBinarizer().fit(train_y)
y_train, y_test = (encoder.transform(split_labels) for split_labels in (train_y, test_y))

In [38]:
# Fit the tokenizer on the training questions only, then turn each split into
# a fixed-width matrix (one row per question) using the tokenizer's default
# texts_to_matrix mode.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_x)
x_train, x_test = (tokenizer.texts_to_matrix(split_texts) for split_texts in (train_x, test_x))

In [39]:
# Report the shape of every feature/target array as a quick consistency check.
shape_report = (
    ('x_train shape:', x_train),
    ('x_test shape:', x_test),
    ('y_train shape:', y_train),
    ('y_test shape:', y_test),
)
for caption, array in shape_report:
    print(caption, array.shape)

x_train shape: (1335, 50)
x_test shape: (148, 50)
y_train shape: (1335, 5)
y_test shape: (148, 5)


In [40]:
# Mini-batch size passed to both model.fit and model.evaluate.
batch_size = 32
# Number of training epochs passed to model.fit.
epochs = 5

In [41]:
# The tokenizer's full word -> index mapping; report how many distinct tokens
# it contains.
word_index = tokenizer.word_index
num_unique_tokens = len(word_index)
print('Found %s unique tokens.' % num_unique_tokens)

Found 3414 unique tokens.


### Model: feed-forward classifier

In [49]:
# Feed-forward classifier over the MAX_NUM_WORDS-wide text vectors:
#   Dense(512) -> ReLU -> Dropout(0.5) -> Dense(num_classes) -> softmax
#
# The output width is read from the encoded targets instead of being
# hard-coded, so the model keeps matching y_train if the label set changes.
num_classes = y_train.shape[1]

model = Sequential()
model.add(Dense(512, input_shape=(MAX_NUM_WORDS,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# Multi-class setup: softmax output trained with categorical cross-entropy,
# tracking accuracy during training and evaluation.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 512)               26112     
_________________________________________________________________
activation_4 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 2565      
_________________________________________________________________
activation_5 (Activation)    (None, 5)                 0         
Total params: 28,677
Trainable params: 28,677
Non-trainable params: 0
_________________________________________________________________


In [50]:
# Checkpoint callback: the file name embeds the epoch number and the monitored
# 'val_acc' value; with save_best_only=True and mode='max' a file is written
# only when 'val_acc' improves.
filepath = "weights/best_weights.{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)

In [51]:
# Train the model, holding out 10% of the training data for validation.
#
# The `checkpoint` callback was created but never handed to fit(), so no
# weights were ever saved; pass it here. The directory named in `filepath`
# is created first because saving fails if it does not exist.
checkpoint_dir = os.path.dirname(filepath)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

# Kept as the cell's last expression so the returned History object is still
# shown as the cell output.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.1,
          callbacks=[checkpoint])

Train on 1201 samples, validate on 134 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f1d49961e10>

In [52]:
# Evaluate on the held-out test set. `score` holds the loss followed by the
# single metric (accuracy) the model was compiled with.
score = model.evaluate(
    x_test,
    y_test,
    batch_size=batch_size,
    verbose=1,
)
test_loss, test_accuracy = score
print('Test score:', test_loss)
print('Test accuracy:', test_accuracy)

Test score: 0.17618592687555262
Test accuracy: 0.9594594562375868


In [53]:
# Class names known to the fitted label encoder; indexed with the argmax of a
# model prediction to turn it back into a label string.
text_labels = encoder.classes_

In [58]:
# Print every test question the model gets wrong, with its actual and
# predicted label.
#
# The whole test set is predicted in one batched call instead of one
# model.predict() call per sample, and the loop covers the actual test-set
# size rather than a hard-coded 148.
predictions = model.predict(x_test)
predicted_labels = text_labels[np.argmax(predictions, axis=1)]

for question, actual_label, predicted_label in zip(test_x, test_y, predicted_labels):
    if actual_label != predicted_label:
        # Only the first 100 characters of the question are shown.
        print(question[:100], "...")
        print('Actual label:' + actual_label)
        print("Predicted label: " + predicted_label + "\n")

what time do you go to the school ? ...
Actual label:when
Predicted label: what

when is boxing day ? ...
Actual label:what
Predicted label: when

when reading classified ads , what does eenty : other stand for ? ...
Actual label:when
Predicted label: what

are these lead free ? ...
Actual label:affirmation
Predicted label: unknown

what time of day did emperor hirohito die ? ...
Actual label:when
Predicted label: what

what time does the flight leave ? ...
Actual label:when
Predicted label: what

