In [2]:
import pandas as pd
import numpy
import os
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from keras.utils import to_categorical
from keras.models import model_from_json
from sklearn.metrics import f1_score
import numpy as np

### Load the prepared data

In [13]:
# Number of target classes; used for the one-hot encoding and the output layer
num_classes = 2

# Training set: sentence text comes from the 'data' column, target from 'label'
X1 = pd.read_csv('train.csv')
train_sentences, labels_train = X1['data'], X1['label']

# Test set: only the sentence text is pulled out here
X2 = pd.read_csv('test.csv')
test_sentences = X2['data']

### Convert labels to categorical

In [16]:
# Map the raw labels onto class indices: 1 stays 1, 2 becomes 0.
# Work on a copy: the array returned by `.values` may be a view onto
# X1['label'] (so writing to it would silently alter the loaded data) or a
# read-only array under pandas copy-on-write (so writing to it would raise).
# Copying also means re-running this cell always starts from the raw labels.
labels = labels_train.values.copy()
labels[labels == 2] = 0

# One-hot encode the class indices: one row per sample, one column per class
y_train = to_categorical(labels, num_classes)

### Vectorize sentences using pre-trained word embedding

In [17]:
# Dimensionality of the pre-trained vectors and padded sequence length
embedding_dim = 300
max_len = 70

# load the pre-trained word-embedding vectors
embeddings_index = {}
# `with` guarantees the (large) vector file is closed once it has been read
with open('wiki-news-300d-1M.vec', encoding='utf8') as vec_file:
    for line in vec_file:
        # Split from the right so the last `embedding_dim` fields are always the
        # vector, even if the token itself contains whitespace.
        values = line.rstrip().rsplit(' ', embedding_dim)
        # Skip anything that is not "<token> <embedding_dim floats>"; this drops
        # the "<vocab size> <dim>" header line that starts a fastText .vec file.
        if len(values) != embedding_dim + 1:
            continue
        embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

# create a tokenizer (fitted on the training sentences only)
token = text.Tokenizer()
token.fit_on_texts(train_sentences)
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_sentences), maxlen=max_len)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(test_sentences), maxlen=max_len)

# create token-embedding mapping: row i holds the vector of the word with index i.
# Words without a pre-trained vector keep an all-zero row. The extra row (+1)
# covers index 0 -- NOTE(review): presumably the padding index, since the Keras
# Tokenizer does not assign 0 to any word; confirm against the Keras docs.
embedding_matrix = numpy.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

### Train a simple LSTM model 

In [19]:
# Add an Input Layer; its length is taken from the padded training sequences
# so it cannot drift from the padding length used when vectorizing.
input_layer = layers.Input((train_seq_x.shape[1], ))

# Add the word embedding Layer. Its dimensions come from the embedding matrix
# itself, and the pre-trained vectors are frozen (trainable=False).
vocab_size, embedding_dim = embedding_matrix.shape
embedding_layer = layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(input_layer)
embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

# Add the LSTM Layer
lstm_layer = layers.LSTM(100)(embedding_layer)

# Add the output Layers: a small dense head ending in a softmax over the classes
output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
output_layer1 = layers.Dropout(0.25)(output_layer1)
output_layer2 = layers.Dense(num_classes, activation="softmax")(output_layer1)

# Compile the model.
# The targets are one-hot rows and the output is a softmax, so the matching loss
# is categorical cross-entropy. For two classes it is numerically the same as
# the binary cross-entropy, but it stays correct if num_classes is increased.
model = models.Model(inputs=input_layer, outputs=output_layer2)
model.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

#Train the model
model.fit(train_seq_x, y_train, epochs=5, batch_size=8)

#Save the trained model

# serialize model architecture to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Saved model to disk


### Evaluate the model on the training data and predict labels for the test data

In [20]:
#Load the trained model

# Read the architecture back from JSON; `with` closes the file even on error
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

# All predictions below use the model restored from disk, so this cell actually
# exercises the saved artefacts and does not depend on the in-memory `model`.

#get predictions on train data and calculate F1-score with weighted mean
y_pred_train = loaded_model.predict(train_seq_x, batch_size=8, verbose=1)

# Compare class indices (argmax) rather than rounded probabilities: rounding a
# softmax row can produce an all-zero row when both probabilities are exactly 0.5.
print('F1-score with weighted average for training data is:',
      f1_score(numpy.argmax(y_train, axis=1), numpy.argmax(y_pred_train, axis=1), average='weighted'))

# get prediction on test data (class index per sentence)
y_pred = loaded_model.predict(valid_seq_x, batch_size=8, verbose=1)
y_pred = numpy.argmax(y_pred, axis=1)

# get the id column of test data
qid = X2['id'].values

#get the test questions
test = X2['data']
data = test.values

#convert y_pred to real labels
# Position in this array == class index: 0 -> 'other_questions', 1 -> '1-mark_questions'.
# Indexing with y_pred maps every prediction in one step and avoids writing
# strings into an integer DataFrame column.
class_names = numpy.array(['other_questions', '1-mark_questions'], dtype=object)
preds = class_names[y_pred]

# Assemble the submission table and write it without the index column
result = {'id':qid, 'test-questions':data, 'preds': preds}
df = pd.DataFrame(result)
df.to_csv('questions-predicted.csv', index=False)

Loaded model from disk
F1-score with weighted average for training data is: 0.989303282501812
