# Spooky Author Identification: GloVe Models

## Implementation

### Import packages

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [19]:
from packages import *

### Load data

In [6]:
train, test, submission = load_data()

In [7]:
# Fit and return the encoded labels
le = LabelEncoder()
y_values = le.fit_transform(train['author'].values)
print(le.classes_)

['EAP' 'HPL' 'MWS']


In [8]:
# Split the training data into training and testing splits
X_train, X_test, y_train, y_test = train_test_split(train['text'].values,
                                                    y_values,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=y_values)

In [None]:
X_train[0:3]

### Preprocess text

In [9]:
# We can still perform custom text processing beforehand
# list_sentences_train = train['text'].apply(lambda x: process_text(x)) # Series (19579,)
# list_sentences_test = test['text'].apply(lambda x: process_text(x))
list_sentences_train = list(X_train) # ndarray (15663,)
list_sentences_test = list(X_test)

In [10]:
GLOVE_EMBEDDINGS_FILE_PATH = '../input/embeddings/glove.6B.300d.txt' # Use `glove.840B.300d.txt`
EMBEDDING_DIM = 300
MAX_FEATURES = 12000 # The features are words, in this case
# count    19579.000000
# mean        26.730477
# std         19.048353
# min          2.000000
# 25%         15.000000
# 50%         23.000000
# 75%         34.000000
# max        861.000000
# Pad documents to a max length of 900 since the max length is 861.
# These input sequences should be padded so that they all
# have the same length in a batch of input data.
MAX_SEQUENCE_LENGTH = 900 # 70

# Only include the top `num_words` most common words
tokenizer = Tokenizer(num_words=MAX_FEATURES)
# Build the word index, requiring a list argument
tokenizer.fit_on_texts(list_sentences_train)

# Turn strings into a list of lists of integer indices such as [[688, 75, 1], [...]]
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# `data` will later be used in `model.fit` call
# sequences is a list of lists
# Pad with 0.0 BEFORE each sequence
# Remove values, BEFORE each sequence, from sequences larger than `maxlen`
# Turn a list of integers into a 2D integer tensor of shape (samples, maxlen)
# --> (159571, 100) such as array([[0, 0, 0, 688], [0, 0, 0, 589]])
X_train_padded = pad_sequences(list_tokenized_train, 
                               maxlen=MAX_SEQUENCE_LENGTH, 
                               padding='pre', 
                               truncating='pre', 
                               value=0.0)
X_test_padded = pad_sequences(list_tokenized_test, 
                              maxlen=MAX_SEQUENCE_LENGTH, 
                              padding='pre', 
                              truncating='pre', 
                              value=0.0)

# Recover the computed word index, which appears as {'necessary': 1234, ...}
word_index = tokenizer.word_index
print('Found {} unique tokens.'.format(len(word_index)))

Found 23677 unique tokens.


### Prepare the embedding layer

In [11]:
# Load the GloVe embeddings into a dictionary
# This maps words (as strings) to their vector representation (as float vectors)
embeddings_index = {}
# f = open(os.path.join(GLOVE_DIR, 'glove_filename'))
f = open(GLOVE_EMBEDDINGS_FILE_PATH, encoding='utf-8')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found {} word vectors.'.format(len(embeddings_index)))

400000it [00:47, 8482.65it/s]

Found 400000 word vectors.





In [None]:
# Note: Only use this if creating a randomly initialized embedding matrix
# np.stack() --> 
#   array([[0.32, 0.7 ],
#          [0.42, 0.1 ]], dtype=float32)
# all_embeddings = np.stack(embeddings_index.values())
# embedding_mean, embedding_std = all_embeddings.mean(), all_embeddings.std()
# print(embedding_mean, embedding_std)

In [12]:
# Compute the embedding matrix using our training words `word_index` and
# the pre-trained embeddings `embeddings_index`
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

# Create an embedding matrix with random initialization for words that aren't in GloVe,
#   using the mean and stdev of the GloVe embeddings
# embedding_matrix = np.random.normal(loc=embedding_mean,
#                                     scale=embedding_std,
#                                     size=(vocab_size, EMBEDDING_DIM))

# Loop over each of the first `MAX_FEATURES` words of the `word_index` built from
# the dataset and retrieve its embedding vector from the GloVe `embeddings_index`
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    # Words not found in the `embeddings_index` will have their vectors in `embedding_matrix`
    # remain as all zeros
    # -- or --
    # remain as a random normalization of the mean and stdev of the GloVe embeddings

In [13]:
# Input: Sequences of integers with input shape: (samples, indices)
# Output: A 3D tensor of shape (samples, sequence_length, embedding_dim)
# Layer is frozen so that its weights, the embedding vectors,
# will not be updated during training.
# Note: May remove `weights` and `trainable` to train the embedding
embedding_layer = Embedding(vocab_size,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            weights=[embedding_matrix],
                            trainable=False)

### Build the model

In [14]:
# We load this embedding matrix into an Embedding layer.
# Note that we set trainable=False to prevent the weights from being updated during training.

# Custom metrics can be passed at the compilation step.
# The function would need to take (y_true, y_pred) as arguments and return 
# a single tensor value.
# def mean_pred(y_true, y_pred):
#     return K.mean(y_pred)

def create_cnn_model(embedding_layer):
    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='input_layer')
    x = embedding_layer(input_layer)
    
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
#     x = Conv1D(128, 5, activation='relu')(x)
#     x = MaxPooling1D(5)(x)
#     x = Conv1D(128, 5, activation='relu')(x)
#     x = MaxPooling1D(5)(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)

    output_layer = Dense(3, activation='softmax')(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', # Try 'rmsprop'
                  metrics=['accuracy'])
#                   metrics=['accuracy', mean_pred])
    model.summary()

    return model

In [None]:
# scaler = StandardScaler()
# Sentence vectors are passed in here
# X_train_scaled = scaler.fit_transform(X_train) # Then converted to a numpy array with
# X_test_scaled = scaler.transform(X_test) # np.array()

In [15]:
# The target classes need to be one-hot encoded
y_train_encoded = to_categorical(y_train)
y_test_encoded = to_categorical(y_test)

In [None]:
# y_train_encoded[0:4]

In [None]:
# y_train_encoded.shape

In [16]:
model = create_cnn_model(embedding_layer)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     (None, 900)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 900, 300)          7103400   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 896, 128)          192128    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 179, 128)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 22912)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               2932864   
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 387       
Total para

### Train the model

In [17]:
# stopper = EarlyStopping(monitor='val_loss',
#                         min_delta=0,
#                         patience=3,
#                         verbose=0,
#                         mode='auto')

history = model.fit(X_train_padded,
#                     X_train_scaled, # Or should it be scaled?
                    y_train_encoded,
                    batch_size=64, # 32, 128, 512
                    epochs=2, # 2, 5, 100
                    verbose=1,
#                     callbacks=[stopper],
                    validation_split=0.2,
#                     validation_data=[X_test_scaled, y_test_encoded], # Overrides split
                    shuffle=True)
model.save_weights('./models/cnn_glove_model.h5')

Train on 12530 samples, validate on 3133 samples
Epoch 1/2
Epoch 2/2


### Display the results

In [None]:
history.history

In [None]:
# history.history
"""
{'acc': [0.9771225346821195, 0.982747603137452], # Score per epoch
 'loss': [0.06964566658561674, 0.04653985751459787],
 'val_acc': [0.9809708705834689, 0.981837727556194],
 'val_loss': [0.05274372364098244, 0.0492371146594362]}
"""

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1) # range(1, 3)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

In [None]:
# seed = 7123478
# estimator = KerasClassifier(build_fn=create_lstm_model, epochs=2, batch_size=32)
# kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# results = cross_val_score(estimator, X_train, y, cv=kfold)
# print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

### Make predictions on the test data

In [None]:
y_test_final = model.predict([X_test_padded], batch_size=1024)
# model.load_weights('pre_trained_glove_model.h5')
y_test_metrics = model.evaluate(X_test_padded, y_test_final, batch_size=128, verbose=1)

In [None]:
y_test_metrics # [loss, accuracy] # Result in percentages

In [None]:
print('Logloss: {:.3f}'.format(calculate_logloss(y_test, y_test_final)))

### Check that submission is ready

In [None]:
# Check the final submission values
y_test_final[0:3]

In [None]:
y_test_final.shape

In [None]:
submission.shape

In [None]:
X_test[0:6]

In [None]:
y_test_encoded[0:6]
# [1, 0, 0] --> EAP
# [0, 1, 0] --> HPL
# [0, 0, 1] --> MWS

### Create a submission CSV

In [None]:
TEMP_SUBMISSION_FILE_PATH = '../input/temp_submission.csv'
temp_submission = pd.read_csv(TEMP_SUBMISSION_FILE_PATH)

temp_submission[['EAP', 'HPL', 'MWS']] = y_test_final
temp_submission.to_csv('../submissions/001_submission.csv', index=False)

In [None]:
# A CNN model

In [None]:
# A plain RNN model

In [None]:
# An RNN model with LSTM layers from Keras

In [None]:
# An RNN model with GRU layers from Keras

## Refinement

## Model Evaluation and Validation

Each model will be evaluated based on the logloss metric using either 5-fold or 10-fold cross validation; the lower the logloss, the better the model. Apart from evaluation, for either the top two performing algorithms or for each algorithm, I plan to run random search to tune certain hyperparameters for each algorithm for at least 60 iterations (but I’ll lower that number if it ends up taking way too long) in order to find the best model for this multiclass classification problem of authorship attribution.

## Justification

## Free-Form Visualization