# Spooky Author Identification: GloVe Models

## Implementation

### Import packages

In [1]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Load the autoreload extension and reload imported modules before each cell
# runs, so edits to local helper modules are picked up without a kernel restart
%load_ext autoreload
%autoreload 2

In [2]:
import os

# GPU selection, applied through environment variables in a single update.
os.environ.update({
    # Order the CUDA devices by PCI bus ID so that device indices are stable
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
    # '' to run on CPU, '0' to run on the first GPU
    'CUDA_VISIBLE_DEVICES': '',
})

In [3]:
# If you don't already have these packages, run this cell.
# Each call downloads one NLTK resource into the local nltk_data directory and
# is a no-op when the resource is already up to date. The resources are
# presumably needed by the text preprocessing helpers (not visible here).
# The cell's displayed result is the return value of the last call.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/marifel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/marifel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/marifel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/marifel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
from packages import *

Using TensorFlow backend.


In [5]:
# Name of this experiment; every output artifact is written below
# `{OUTPUT_DIR}{MODEL_NAME}/`
MODEL_NAME = 'test_glove'

# Input locations. All directory constants end with a trailing slash because
# file paths are built from them by plain string concatenation.
INPUT_DIR = '../input/'
TRAIN_FILE_PATH = f'{INPUT_DIR}train.csv'
TEST_FILE_PATH = f'{INPUT_DIR}test.csv'
SAMPLE_SUBMISSION_FILE_PATH = f'{INPUT_DIR}sample_submission.csv'

EMBEDDINGS_DIR = f'{INPUT_DIR}embeddings/'
EMBEDDINGS_FILE_PATH = f'{EMBEDDINGS_DIR}glove.840B.300d.txt'

# Output locations, one sub-directory per artifact type
OUTPUT_DIR = '../output/'
OUTPUT_LOGS_DIR = f'{OUTPUT_DIR}{MODEL_NAME}/logs/'
OUTPUT_MODELS_DIR = f'{OUTPUT_DIR}{MODEL_NAME}/models/'
OUTPUT_SCORES_DIR = f'{OUTPUT_DIR}{MODEL_NAME}/scores/'
OUTPUT_SUBMISSIONS_DIR = f'{OUTPUT_DIR}{MODEL_NAME}/submissions/'
OUTPUT_SUMMARIES_DIR = f'{OUTPUT_DIR}{MODEL_NAME}/summaries/'

# Create the output directories if they do not exist. `os.makedirs` creates
# any missing intermediate directories by itself and accepts the trailing
# slash; `exist_ok=True` makes re-running this cell harmless.
for output_dir in (OUTPUT_LOGS_DIR,
                   OUTPUT_MODELS_DIR,
                   OUTPUT_SCORES_DIR,
                   OUTPUT_SUBMISSIONS_DIR,
                   OUTPUT_SUMMARIES_DIR):
    os.makedirs(output_dir, exist_ok=True)

EMBEDDING_DIM = 300  # Must match the dimensionality of the vectors in EMBEDDINGS_FILE_PATH
MAX_FEATURES = None # The top most common words if an integer; otherwise, all words are used
MAX_SEQUENCE_LENGTH = 128
N_SPLITS = 10

# Fix a random seed for reproducibility. Note that this seeds NumPy's global
# random number generator only.
SEED = 42
np.random.seed(SEED)

In [6]:
# Load the training set, the test set and the sample submission from the
# configured CSV paths. Downstream cells read `train['text']`,
# `train['author']` and `test['text']`, and fill the author columns of
# `submission` with the predictions.
train, test, submission = load_data(TRAIN_FILE_PATH, 
                                    TEST_FILE_PATH, 
                                    SAMPLE_SUBMISSION_FILE_PATH)

In [7]:
# Text preprocessing options shared by the train and test sets. Defining them
# once guarantees that both sets are always processed identically.
TEXT_PROCESSING_OPTIONS = {
    'lower': False,
    'remove_punc': False,
    'normalize_spelling': True,
    'stem': False,
    'lemmatize': False,
    'remove_stopwords': False,
}


def preprocess_texts(texts):
    """Apply `process_text` with the shared options to every sentence.

    Args:
        texts: pandas Series of raw sentences (e.g. `train['text']`).

    Returns:
        A list with one processed entry per input sentence, in input order.
    """
    processed = texts.apply(lambda text: process_text(text, **TEXT_PROCESSING_OPTIONS))
    return list(processed.values)


# Apply text preprocessing on each sentence
X_train_sequences = preprocess_texts(train['text'])
X_test_sequences = preprocess_texts(test['text'])

# Tokenize and pad the sentences; `word_index` is used later to build the
# embedding matrix
X_train_tokenized, X_test_tokenized, word_index = compute_word_index(X_train_sequences,
                                                                     X_test_sequences,
                                                                     MAX_FEATURES,
                                                                     MAX_SEQUENCE_LENGTH)

Found 27675 unique tokens.


In [8]:
# Load the pre-trained word vectors from disk. This is the slow step of the
# notebook: the recorded run took about two minutes to read ~2.2M vectors.
embeddings_index = load_embeddings(EMBEDDINGS_FILE_PATH)

2196017it [02:05, 17430.79it/s]

Found 2196016 word vectors.





In [9]:
# Build the embedding matrix for the vocabulary and count how many vocabulary
# words have no pre-trained vector
(embedding_matrix,
 vocab_size,
 num_unknown) = construct_embedding_matrix(word_index, embeddings_index, EMBEDDING_DIM)

# `vocab_size` is the actual number of tokens plus 1 (the extra entry accounts
# for masking in the embedding matrix), so the real token count is one less
num_tokens = vocab_size - 1
unknown_word_percentage = (num_unknown / num_tokens) * 100
unknown_word_lines = (
    'Number of vocabulary words not found in the pre-trained embeddings: '
    f'{num_unknown} of {num_tokens} '
    f'({unknown_word_percentage:.2f}%)'
)
print(unknown_word_lines)

# Persist the same line to the preprocessing log ('w' is passed as the file mode)
preprocessing_file_path = f'{OUTPUT_LOGS_DIR}preprocessing.log.txt'
save_line_to_file(unknown_word_lines, preprocessing_file_path, 'w')

Number of vocabulary words not found in the pre-trained embeddings: 1634 of 27675 (5.90%)


In [10]:
# The target classes need to be converted to integers so that
# EAP --> 0
# HPL --> 1
# MWS --> 2
# (the helper prints the original class labels, see the output below)
y_train_integers = integer_encode_classes(train['author'].values)

Original class labels: ['EAP' 'HPL' 'MWS']


In [11]:
# The target classes need to be one-hot encoded so that
# EAP --> 0 --> [1, 0, 0]
# HPL --> 1 --> [0, 1, 0]
# MWS --> 2 --> [0, 0, 1]
# The encoded labels are the training targets passed to `model.fit` below.
y_train_encoded = one_hot_encode_classes(y_train_integers)

In [12]:
# Import model-dependent files
from models import build_embedding_layer, build_model_callbacks, save_model_summary
from models import get_random_cnn_params as get_random_model_params
from models import build_cnn_model as build_model

In [13]:
# Set this to the best number of epochs based on the evaluation phase.
# It is passed as `epochs` to the final `model.fit` call.
final_num_epochs = 3

In [14]:
# Hyperparameters of the final model, selected during the evaluation phase.
# CNN configuration (active)
final_model_params = dict(
    batch_size=64,
    filters=250,
    kernel_size=3,
    dropout_rate=0.2,
    optimizer='adam',
    use_special_arch=True,
    normal_arch_params=dict(),
)
# RNN configuration (inactive alternative, kept for reference; presumably it
# requires importing the matching RNN model builder instead of the CNN one)
# final_model_params = dict(
#     batch_size=64,
#     use_gru_layer=True,
#     use_global_max_pooling_layer=True,
#     units=128,
#     spatial_dropout_rate=0.2,
#     optimizer='adam',
#     num_rnn_stacks=1,
# )

# The batch size is reused for both training and prediction
final_batch_size = final_model_params['batch_size']

In [15]:
# Build the embedding layer from the pre-trained embedding matrix
embedding_layer = build_embedding_layer(embedding_matrix, 
                                        vocab_size, 
                                        EMBEDDING_DIM, 
                                        MAX_SEQUENCE_LENGTH)
# Build the model with the best model params
model = build_model(embedding_layer, MAX_SEQUENCE_LENGTH, final_model_params)
# Save the model architecture, weights, and optimizer state to file.
# NOTE(review): this runs before `model.fit`, so the saved file holds the
# freshly built (untrained) model; the trained model is not saved anywhere in
# the visible cells. Confirm whether that is intended.
model.save(f'{OUTPUT_MODELS_DIR}final.model.hdf5')
# Save the model summary to file
save_model_summary(model, f'{OUTPUT_SUMMARIES_DIR}final.model_summary.txt')

In [16]:
# Train the model on the full training set (no validation data is passed) for
# the number of epochs and the batch size selected during the evaluation phase
history = model.fit(X_train_tokenized,
                    y_train_encoded,
                    batch_size=final_batch_size,
                    epochs=final_num_epochs,
                    verbose=1,
                    shuffle=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [17]:
# Predict the class probabilities for the test set: one row per test sentence
# and one column per author (see the preview in the next cell)
predictions = model.predict(X_test_tokenized, batch_size=final_batch_size, verbose=1)



In [18]:
# Sanity check: preview the first three rows of the final submission values
predictions[:3]

array([[0.05645953, 0.00371291, 0.9398276 ],
       [0.97289103, 0.02237941, 0.00472952],
       [0.00460288, 0.9932817 , 0.00211538]], dtype=float32)

In [19]:
# Fill the author probability columns of the sample submission. The column
# order must match the integer class encoding (EAP --> 0, HPL --> 1, MWS --> 2).
author_columns = ['EAP', 'HPL', 'MWS']
submission[author_columns] = predictions

# Submission files are named `<zero-padded running number>_<description>.csv`
submission_num = 26
submission_description = 'glove_best_manual_cnn_after_tp_i_2'
submission_filename = f'{submission_num:03d}_{submission_description}.csv'
submission_file_path = f'{OUTPUT_SUBMISSIONS_DIR}{submission_filename}'

# Write the submission without the DataFrame index column
submission.to_csv(submission_file_path, index=False)

In [20]:
# Combine the two log loss scores obtained for this submission.
# NOTE(review): the recorded output (0.47776) equals
# 0.7 * 0.47272 + 0.3 * 0.48951, i.e. a 70/30 weighted mean rather than a
# plain average. The meaning of the two scores (presumably the two leaderboard
# scores) is not visible here -- confirm against `calculate_mean_logloss`.
calculate_mean_logloss(0.47272, 0.48951)

Mean logloss: 0.47776
