# Spooky Author Identification: GloVe Models

## Implementation

### Import packages

In [1]:
# Render matplotlib figures inline, directly beneath the cell that creates them.
%matplotlib inline
# Load the autoreload extension and switch it to mode 2 so that edits to the
# project's helper modules are picked up without restarting the kernel.
# (Comments are kept on their own lines: text after a magic is passed to it.)
%load_ext autoreload
%autoreload 2

In [2]:
from packages import *

Using TensorFlow backend.


In [3]:
# --- Text / embedding hyperparameters ---
EMBEDDING_DIM = 300  # Matches the 300d GloVe file selected below
MAX_FEATURES = 12000  # Vocabulary cap: only the most common words are kept
MAX_SEQUENCE_LENGTH = 900  # The longest sentence has 861 words; 34 is another value to try

# --- Input locations (directory constants keep their trailing slash) ---
INPUT_DIR = '../input/'
EMBEDDINGS_DIR = f'{INPUT_DIR}embeddings/'

TRAIN_FILE_PATH = f'{INPUT_DIR}train.csv'
TEST_FILE_PATH = f'{INPUT_DIR}test.csv'
SAMPLE_SUBMISSION_FILE_PATH = f'{INPUT_DIR}sample_submission.csv'

# `glove.840B.300d.txt` is an alternative embedding file worth trying
GLOVE_EMBEDDINGS_FILE_PATH = f'{EMBEDDINGS_DIR}glove.6B.300d.txt'

### Load data

In [4]:
# Load the training set, the test set and the sample submission in one call.
train, test, submission = load_data(
    TRAIN_FILE_PATH,
    TEST_FILE_PATH,
    SAMPLE_SUBMISSION_FILE_PATH,
)

### Preprocess text

In [5]:
# The raw sentences are passed on unchanged for now.
# Todo: Decide whether or not to perform custom text preprocessing beforehand, e.g.
# X_train_sequences = list(train['text'].apply(lambda x: process_text(x)).values)
# X_test_sequences = list(test['text'].apply(lambda x: process_text(x)).values)
X_train_sequences = train['text'].tolist()
X_test_sequences = test['text'].tolist()

# Tokenize and pad both sets of sentences; the helper also returns the word index
# that the embedding matrix is built from further down.
(X_train_tokenized,
 X_test_tokenized,
 word_index) = compute_word_index(X_train_sequences, X_test_sequences,
                                  MAX_FEATURES, MAX_SEQUENCE_LENGTH)

Found 25943 unique tokens.


### Prepare the embedding layer

In [6]:
# Read the pre-trained GloVe vectors from the configured file.
# NOTE(review): presumably a word -> vector mapping, judging by the name and the
# "Found 400000 word vectors." output — confirm against `load_glove_embeddings`.
embeddings_index = load_glove_embeddings(GLOVE_EMBEDDINGS_FILE_PATH)

400000it [00:37, 10712.33it/s]

Found 400000 word vectors.





In [7]:
# Combine the tokenizer's word index with the GloVe vectors; the resulting matrix
# and vocabulary size are handed to `build_embedding_layer` during training.
embedding_matrix, vocab_size = construct_embedding_matrix(
    word_index, embeddings_index, EMBEDDING_DIM
)

### Train the model

In [13]:
# NOTE(review): this import sits in the middle of the notebook; consider moving it
# next to the other imports so a fresh "Run All" surfaces every dependency up front.
from models import build_embedding_layer, build_cnn_model

# Fix a random seed for reproducibility. The same value is reused as the
# `random_state` of the cross-validation splitter below.
# NOTE(review): only NumPy's global RNG is seeded here; the Keras/TensorFlow
# backend presumably keeps its own RNG state, so runs may still differ — confirm.
seed = 42
np.random.seed(seed)

In [10]:
# The author labels need to be converted to integers so that
# EAP --> 0
# HPL --> 1
# MWS --> 2
# These integer labels drive the stratified splitting and are one-hot encoded
# before being fed to the network.
y_train_integers = integer_encode_classes(train['author'].values)

In [29]:
# Stratified 3-fold cross-validation of the CNN built on the GloVe embedding layer.
# For every fold a fresh model is trained, its final-epoch scores are recorded and
# the model with the lowest validation loss is kept in `best_model`.

# Early stopping stays disabled while `epochs=1`. To enable it, uncomment this
# definition (it must exist before `model.fit` is called) together with the
# `callbacks` argument inside the loop.
# stopper = EarlyStopping(monitor='val_loss',
#                         min_delta=0,
#                         patience=3,
#                         verbose=0,
#                         mode='auto')

kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
metrics = ['val_loss', 'val_acc', 'loss', 'acc']
cv_scores = {metric: [] for metric in metrics}
best_model = None
best_val_loss = np.inf

# The target classes need to be one-hot encoded so that
# EAP --> 0 --> [1, 0, 0]
# HPL --> 1 --> [0, 1, 0]
# MWS --> 2 --> [0, 0, 1]
# The encoding is the same for every fold, so it is computed once up front.
y_train_encoded = one_hot_encode_classes(y_train_integers)

for train_index, test_index in kfold.split(X_train_tokenized, y_train_integers):
    # Prepare the splits of data
    X_train, y_train = X_train_tokenized[train_index], y_train_encoded[train_index]
    X_valid, y_valid = X_train_tokenized[test_index], y_train_encoded[test_index]

    # Build a fresh embedding layer and model so no weights carry over between folds
    embedding_layer = build_embedding_layer(embedding_matrix,
                                            vocab_size,
                                            EMBEDDING_DIM,
                                            MAX_SEQUENCE_LENGTH)
    model = build_cnn_model(embedding_layer, MAX_SEQUENCE_LENGTH)

    # Train the model
    history = model.fit(X_train,
                        y_train,
                        batch_size=64,  # 32, 64, 128, 256, 512
                        epochs=1,  # 100
                        verbose=1,
                        # callbacks=[stopper],
                        validation_data=(X_valid, y_valid),  # Keras documents a tuple here
                        shuffle=True)

    # Save the scores for later evaluation. The last epoch is used because it
    # describes the model as it is returned by `fit`; with `epochs=1` this is the
    # same entry as index 0.
    for metric in metrics:
        cv_scores[metric].append(history.history[metric][-1])

    # Keep the model of the fold with the lowest validation loss so far
    fold_val_loss = history.history['val_loss'][-1]
    if fold_val_loss < best_val_loss:
        best_val_loss = fold_val_loss
        best_model = model

# Calculate mean and standard deviation across all folds' scores
for metric in metrics:
    mean = np.mean(cv_scores[metric])
    std = np.std(cv_scores[metric])
    print('{} mean and std: {:.4f} (+/- {:.4f})'.format(metric, mean, std))

Train on 13051 samples, validate on 6528 samples
Epoch 1/1
Train on 13053 samples, validate on 6526 samples
Epoch 1/1
Train on 13054 samples, validate on 6525 samples
Epoch 1/1
val_loss mean and std: 0.8530 (+/- 0.0093)
val_acc mean and std: 0.6238 (+/- 0.0083)
loss mean and std: 0.9074 (+/- 0.0031)
acc mean and std: 0.5790 (+/- 0.0015)


### Display the results

In [24]:
# Validation loss recorded for the first epoch of the most recent `fit` call.
# `history` is the variable left behind by the cross-validation loop, so this
# reflects the last fold that was trained, not an average over folds.
history.history['val_loss'][0]

0.8470147880375157

In [None]:
# Plot the per-epoch training curves of the most recent `fit` call.
# `history.history` maps each metric name to a list with one score per epoch, e.g.
# {'acc': [0.9771225346821195, 0.982747603137452],
#  'loss': [0.06964566658561674, 0.04653985751459787],
#  'val_acc': [0.9809708705834689, 0.981837727556194],
#  'val_loss': [0.05274372364098244, 0.0492371146594362]}

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)  # Epochs are numbered from 1, e.g. range(1, 3)

# Accuracy figure. The validation series gets a marker as well as a line: a bare
# line through a single point (a one-epoch run) would not be drawn at all.
acc_fig, acc_ax = plt.subplots()
acc_ax.plot(epochs, acc, 'bo', label='Training acc')
acc_ax.plot(epochs, val_acc, 'b^-', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

# Loss figure, drawn on its own axes so the two scales do not interfere
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(epochs, loss, 'bo', label='Training loss')
loss_ax.plot(epochs, val_loss, 'b^-', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

plt.show()

In [None]:
# seed = 7123478
# estimator = KerasClassifier(build_fn=create_lstm_model, epochs=2, batch_size=32)
# kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# results = cross_val_score(estimator, X_train, y, cv=kfold)
# print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

### Make predictions on the test data

In [None]:
# Predict the author scores for every test sentence (one column per author).
# `X_test_tokenized` is the padded test matrix produced by the preprocessing cell
# (no `X_test_padded` is defined anywhere in this notebook), and `best_model` is
# the model retained by the cross-validation loop.
y_test_final = best_model.predict(X_test_tokenized, batch_size=1024)
# model.load_weights('pre_trained_glove_model.h5')

# Scoring predictions against themselves says nothing about model quality, so the
# metrics are computed on held-out labelled data instead. `model`, `X_valid` and
# `y_valid` are left over from the final cross-validation fold, which means this
# split was not used to train `model`. The variable name is kept so the cell that
# displays `y_test_metrics` keeps working.
y_test_metrics = model.evaluate(X_valid, y_valid, batch_size=128, verbose=1)

In [None]:
# Display the value returned by `model.evaluate`.
# NOTE(review): presumably [loss, accuracy], given the 'acc' key in the training
# history; Keras usually reports accuracy as a fraction in [0, 1] rather than a
# percentage — confirm against the model's compile() call.
y_test_metrics

In [None]:
# Report the log loss of the test predictions, formatted to three decimals.
# NOTE(review): `y_test` is not defined in any cell visible in this notebook, so
# this raises a NameError on a fresh run — confirm where the true test labels are
# supposed to come from (or score a labelled validation split instead).
print('Logloss: {:.3f}'.format(calculate_logloss(y_test, y_test_final)))

### Check that submission is ready

In [None]:
# Check the final submission values
# Peek at the first three prediction rows
y_test_final[:3]

In [None]:
# The predictions must provide one row per submission row and one column per
# author (EAP, HPL, MWS). Comparing against `submission.shape` directly would also
# count any non-prediction column (presumably an id column — confirm), which makes
# the check fail even when the predictions are well-formed.
y_test_final.shape == (len(submission), 3)

### Create a submission CSV

In [None]:
import os

# Template that receives the predictions. It lives in the shared input directory,
# so the path is derived from `INPUT_DIR` instead of repeating the literal.
# NOTE(review): this reads a separate `temp_submission.csv` rather than reusing the
# already loaded `submission` frame — confirm that the extra file is still needed.
TEMP_SUBMISSION_FILE_PATH = f'{INPUT_DIR}temp_submission.csv'
SUBMISSIONS_DIR = '../submissions/'
temp_submission = pd.read_csv(TEMP_SUBMISSION_FILE_PATH)

# The column order follows the integer encoding of the classes (EAP=0, HPL=1, MWS=2)
temp_submission[['EAP', 'HPL', 'MWS']] = y_test_final

# `to_csv` does not create missing directories, so make sure the target exists
os.makedirs(SUBMISSIONS_DIR, exist_ok=True)
temp_submission.to_csv(f'{SUBMISSIONS_DIR}001_submission.csv', index=False)

In [None]:
# A CNN model

In [None]:
# A plain RNN model

In [None]:
# An RNN model with LSTM layers from Keras

In [None]:
# An RNN model with GRU layers from Keras

## Refinement

## Model Evaluation and Validation

Each model will be evaluated on the log loss metric using either 5-fold or 10-fold cross-validation; the lower the log loss, the better the model. Beyond evaluation, I plan to run a random search over selected hyperparameters — either for the two best-performing algorithms or for every algorithm — for at least 60 iterations each (fewer if that turns out to take too long), in order to find the best model for this multiclass authorship-attribution problem.

## Justification

## Free-Form Visualization