# Spooky Author Identification: GloVe Models

## Implementation

### Import packages

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from packages import *

Using TensorFlow backend.


In [3]:
INPUT_DIR = '../input/'
EMBEDDINGS_DIR = f'{INPUT_DIR}embeddings/'
TRAIN_FILE_PATH = f'{INPUT_DIR}train.csv'
TEST_FILE_PATH = f'{INPUT_DIR}test.csv'
SAMPLE_SUBMISSION_FILE_PATH = f'{INPUT_DIR}sample_submission.csv'
GLOVE_EMBEDDINGS_FILE_PATH = f'{EMBEDDINGS_DIR}glove.6B.300d.txt' # Try `glove.840B.300d.txt`
EMBEDDING_DIM = 300
MAX_FEATURES = 12000 # The top most common words
MAX_SEQUENCE_LENGTH = 900 # Since max number of words in a sentence is 861; try 34 as well

### Load data

In [4]:
train, test, submission = load_data(TRAIN_FILE_PATH, 
                                    TEST_FILE_PATH, 
                                    SAMPLE_SUBMISSION_FILE_PATH)

### Preprocess text

In [5]:
# Todo: Decide whether or not to perform custom text preprocessing beforehand
# X_train_sequences = list(train['text'].apply(lambda x: process_text(x)).values)
# X_test_sequences = list(test['text'].apply(lambda x: process_text(x)).values)
X_train_sequences = list(train['text'].values)
X_test_sequences = list(test['text'].values)

# Tokenize and pad the sentences
X_train_tokenized, X_test_tokenized, word_index = compute_word_index(X_train_sequences,
                                                                     X_test_sequences,
                                                                     MAX_FEATURES,
                                                                     MAX_SEQUENCE_LENGTH)

Found 25943 unique tokens.


In [6]:
# The target classes need to be one-hot encoded so that
# [1, 0, 0] --> EAP
# [0, 1, 0] --> HPL
# [0, 0, 1] --> MWS
y_train_encoded = one_hot_encode_classes(train['author'].values)

Original class labels: ['EAP' 'HPL' 'MWS']


### Prepare the embedding layer

In [7]:
embeddings_index = load_glove_embeddings(GLOVE_EMBEDDINGS_FILE_PATH)

400000it [00:43, 9170.46it/s]


Found 400000 word vectors.


In [8]:
embedding_matrix, vocab_size = construct_embedding_matrix(word_index, 
                                                          embeddings_index, 
                                                          EMBEDDING_DIM)

In [9]:
# Input: Sequences of integers with input shape: (samples, indices)
# Output: A 3D tensor of shape (samples, sequence_length, embedding_dim)
#
# Layer is frozen so that its weights (the embedding vectors)
# will not be updated during training.
#
# Note: You can remove `weights` and `trainable` to train the embedding.
embedding_layer = Embedding(vocab_size,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            weights=[embedding_matrix],
                            trainable=False)

### Build the model

In [14]:
# We load this embedding matrix into an Embedding layer.
# Note that we set trainable=False to prevent the weights from being updated during training.

# Custom metrics can be passed at the compilation step.
# The function would need to take (y_true, y_pred) as arguments and return 
# a single tensor value.
# def mean_pred(y_true, y_pred):
#     return K.mean(y_pred)

def create_cnn_model(embedding_layer):
    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='input_layer')
    x = embedding_layer(input_layer)
    
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
#     x = Conv1D(128, 5, activation='relu')(x)
#     x = MaxPooling1D(5)(x)
#     x = Conv1D(128, 5, activation='relu')(x)
#     x = MaxPooling1D(5)(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)

    output_layer = Dense(3, activation='softmax')(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', # Try 'rmsprop'
                  metrics=['accuracy'])
#                   metrics=['accuracy', mean_pred])
    model.summary()

    return model

In [16]:
model = create_cnn_model(embedding_layer)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     (None, 900)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 900, 300)          7103400   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 896, 128)          192128    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 179, 128)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 22912)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               2932864   
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 387       
Total para

### Train the model

In [17]:
# stopper = EarlyStopping(monitor='val_loss',
#                         min_delta=0,
#                         patience=3,
#                         verbose=0,
#                         mode='auto')

history = model.fit(X_train_padded,
#                     X_train_scaled, # Or should it be scaled?
                    y_train_encoded,
                    batch_size=64, # 32, 128, 512
                    epochs=2, # 2, 5, 100
                    verbose=1,
#                     callbacks=[stopper],
                    validation_split=0.2,
#                     validation_data=[X_test_scaled, y_test_encoded], # Overrides split
                    shuffle=True)
model.save_weights('./models/cnn_glove_model.h5')

Train on 12530 samples, validate on 3133 samples
Epoch 1/2
Epoch 2/2


### Display the results

In [None]:
history.history

In [None]:
# history.history
"""
{'acc': [0.9771225346821195, 0.982747603137452], # Score per epoch
 'loss': [0.06964566658561674, 0.04653985751459787],
 'val_acc': [0.9809708705834689, 0.981837727556194],
 'val_loss': [0.05274372364098244, 0.0492371146594362]}
"""

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1) # range(1, 3)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

In [None]:
# seed = 7123478
# estimator = KerasClassifier(build_fn=create_lstm_model, epochs=2, batch_size=32)
# kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# results = cross_val_score(estimator, X_train, y, cv=kfold)
# print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

### Make predictions on the test data

In [None]:
y_test_final = model.predict([X_test_padded], batch_size=1024)
# model.load_weights('pre_trained_glove_model.h5')
y_test_metrics = model.evaluate(X_test_padded, y_test_final, batch_size=128, verbose=1)

In [None]:
y_test_metrics # [loss, accuracy] # Result in percentages

In [None]:
print('Logloss: {:.3f}'.format(calculate_logloss(y_test, y_test_final)))

### Check that submission is ready

In [None]:
# Check the final submission values
y_test_final[0:3]

In [None]:
y_test_final.shape == submission.shape

### Create a submission CSV

In [None]:
TEMP_SUBMISSION_FILE_PATH = '../input/temp_submission.csv'
temp_submission = pd.read_csv(TEMP_SUBMISSION_FILE_PATH)

temp_submission[['EAP', 'HPL', 'MWS']] = y_test_final
temp_submission.to_csv('../submissions/001_submission.csv', index=False)

In [None]:
# A CNN model

In [None]:
# A plain RNN model

In [None]:
# An RNN model with LSTM layers from Keras

In [None]:
# An RNN model with GRU layers from Keras

## Refinement

## Model Evaluation and Validation

Each model will be evaluated based on the logloss metric using either 5-fold or 10-fold cross validation; the lower the logloss, the better the model. Apart from evaluation, for either the top two performing algorithms or for each algorithm, I plan to run random search to tune certain hyperparameters for each algorithm for at least 60 iterations (but I’ll lower that number if it ends up taking way too long) in order to find the best model for this multiclass classification problem of authorship attribution.

## Justification

## Free-Form Visualization