In [1]:
"""
Train convolutional network for review spam detection. Based on
https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras
which is based on
"Convolutional Neural Networks for Sentence Classification" by Yoon Kim
http://arxiv.org/pdf/1408.5882v2.pdf
(https://github.com/yoonkim/CNN_sentence)

"""

import numpy as np
execfile("data_helpers.py")
# from w2v import train_word2vec ## not needed with pretrained vectors

from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D, GlobalMaxPooling1D

from sklearn import metrics
import csv
import cPickle as cpickle

# Seed NumPy's global RNG up front so the data shuffle further down is repeatable.
np.random.seed(2)

# Parameters
# ==================================================

# Network flavour to build: CNN-rand | CNN-non-static | CNN-static
model_variation = 'CNN-non-static'
print('Model variation is %s' % model_variation)

# Model hyperparameters
embedding_dim = 300         # width of one word vector
filter_sizes = (3, 4, 5)    # one parallel convolution branch per filter length
num_filters = 50            # filters in each convolution branch
dropout_prob = (0.25, 0.5)  # (dropout before the conv sub-network, dropout after the hidden layer)
hidden_dims = 50            # units in the hidden dense layer

# Training parameters
batch_size = 32
num_epochs = 40             # intended number of training epochs
val_split = 0.1             # fraction of the training data held out for validation

# Input files; data_name selects which dataset variant is read
data_name = "100_v2"
pos_path = "../CNN_sentence/spam_" + data_name + ".txt"
neg_path = "../CNN_sentence/ham_" + data_name + ".txt"
test_path = "../CNN_sentence/test_" + data_name + ".txt"
test_labels_path = "../CNN_sentence/test_results_" + data_name + ".txt"
w2v_path = "../Electronics_vectors300.bin"

# Output files: pickled training history and the predictions table
history_name = "history_100_f345_elec.p"
preds_name = "preds_100_f345_elec.txt"


# Data Preparation
# ==================================================
#
# Read the training texts, the test texts and the pretrained word vectors.
# load_data comes from data_helpers.py (pulled in via execfile).
print("Loading data...")
x, y, x_test, vocabulary, embeddings, sequence_length = load_data(
    pos_path, neg_path, test_path, w2v_path)

# Refuse anything that is not one of the three supported variations.
if model_variation not in ('CNN-rand', 'CNN-non-static', 'CNN-static'):
    raise ValueError('Unknown model variation')

if model_variation == 'CNN-rand':
    # No pretrained weights are handed to the Embedding layer.
    embedding_weights = None
else:
    # Pretrained vectors are handed to the Embedding layer as its weights.
    embedding_weights = [embeddings]
    if model_variation == 'CNN-static':
        # Static variant: replace the inputs by their embedding rows here;
        # the model is then built without an Embedding layer.
        # NOTE(review): assumes x / x_test hold integer row indices into
        # `embeddings` -- confirm against data_helpers.load_data.
        x = embeddings[x]
        x_test = embeddings[x_test]

# Shuffle data
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
# NOTE(review): argmax over axis 1 presumably turns one-hot label rows into
# class indices -- confirm the label layout returned by load_data.
y_shuffled = np.argmax(y[shuffle_indices], axis=1)

print("Vocabulary Size: {:d}".format(len(vocabulary)))

Using Theano backend.
Using gpu device 0: GRID K520 (CNMeM is disabled, cuDNN not available)


Model variation is CNN-non-static
Loading data...
29235 words occure in pre-trained embeddings.
Vocabulary Size: 33187


In [2]:
# Building model
# ==================================================
#
# Graph sub-network with one input and one output: one convolution branch per
# filter size, all reading the same input, with their pooled outputs
# concatenated.
graph_in = Input(shape=(sequence_length, embedding_dim))
convs = []
for fsz in filter_sizes:
    feature_maps = Convolution1D(nb_filter=num_filters,
                                 filter_length=fsz,
                                 border_mode='valid',
                                 activation='relu',
                                 subsample_length=1)(graph_in)
    # Global max pooling output is appended directly; no Flatten layer is applied.
    pooled = GlobalMaxPooling1D()(feature_maps)
    convs.append(pooled)

# A single branch needs no merge layer.
out = Merge(mode='concat')(convs) if len(filter_sizes) > 1 else convs[0]

graph = Model(input=graph_in, output=out)

# Main sequential model
model = Sequential()
if model_variation != 'CNN-static':
    # CNN-static feeds already-embedded inputs, so only the other variations
    # get an Embedding layer (weights is None for CNN-rand).
    model.add(Embedding(len(vocabulary) + 1, embedding_dim,
                        input_length=sequence_length,
                        weights=embedding_weights))
# input_shape lets this Dropout act as the first layer when there is no Embedding.
model.add(Dropout(dropout_prob[0], input_shape=(sequence_length, embedding_dim)))
model.add(graph)
model.add(Dense(hidden_dims))
model.add(Dropout(dropout_prob[1]))
model.add(Activation('relu'))
# Single sigmoid unit for the binary decision.
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [None]:
# print(x_test.shape)
# print(x.shape)
# print(embeddings.shape)
# print(sequence_length)

In [None]:
# Training model
# ==================================================
# Fit on the shuffled training data. The epoch count comes from the
# `num_epochs` setting in the parameters section (previously a literal 40 was
# repeated here, so editing the setting had no effect). `val_split` is the
# fraction of the samples Keras holds out for validation; the returned
# History object is kept for saving and plotting later.
history = model.fit(x_shuffled, y_shuffled, batch_size=batch_size,
                    nb_epoch=num_epochs, validation_split=val_split, verbose=2)

Train on 45266 samples, validate on 5030 samples
Epoch 1/40


In [None]:
# Predict and get performance
# ==================================================
# Hard class predictions for the test set.
y_test_pred = model.predict_classes(x_test, verbose=0)

# Ground-truth test labels: the first field of every row is parsed as an int.
with open(test_labels_path, 'r') as csvfile:
    reader = csv.reader(csvfile)
    # csv.reader yields an empty list for a blank line; skip those rows so
    # r[0] cannot raise IndexError. A non-numeric label still raises ValueError.
    y_test = np.asarray([int(r[0]) for r in reader if r])


# Confusion matrix
cm = metrics.confusion_matrix(y_test, y_test_pred)
print(cm)
# Accuracy
acc = metrics.accuracy_score(y_test, y_test_pred)
print(acc)
# F1 score
f1 = metrics.f1_score(y_test, y_test_pred)
print(f1)
# AUC is computed from the predicted probabilities rather than the hard classes.
y_test_probs = model.predict_proba(x_test, verbose=0)
auc = metrics.roc_auc_score(y_test, y_test_probs)
print(auc)

In [7]:
# Save training history and predictions
# ==================================================
# This notebook runs on Python 2 (execfile / cPickle), where both pickle and
# csv output files should be opened in binary mode: text mode on Windows
# rewrites line endings, which yields "\r\r\n" row terminators in the csv and
# is unsafe for pickled data.
with open(history_name, "wb") as f:
    cpickle.dump(history.history, f)

# Put predicted classes and predicted probabilities side by side.
# NOTE(review): axis=1 requires both arrays to be 2-D with one row per test
# sample -- confirm the shapes returned by predict_classes / predict_proba.
all_preds = np.concatenate([y_test_pred, y_test_probs], axis=1)

with open(preds_name, "wb") as f:
    writer = csv.writer(f, delimiter=";")
    # One ';'-separated line per row of all_preds.
    writer.writerows(all_preds)

In [None]:
# Print training history
# ==================================================
# NOTE(review): `plt` is not imported in any visible cell; presumably
# matplotlib.pyplot is brought into scope by data_helpers.py through execfile.
# Confirm, otherwise this cell raises NameError on a fresh kernel.

print(history.history.keys())

# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# 'val_acc' is measured on the validation split taken during fit, not on the
# separate test set, so the second curve is labelled accordingly.
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

## summarize history for loss
#plt.plot(history.history['loss'])
#plt.plot(history.history['val_loss'])
#plt.title('model loss')
#plt.ylabel('loss')
#plt.xlabel('epoch')
#plt.legend(['train', 'validation'], loc='upper left')
#plt.show()