# NAACL 2018 Shared Task - Metaphor Detection

Add description

## Prerequisites 

- Facebook FastText Embeddings for English
- https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md

## Preflight Checks

- Installed requirements.txt
- (optional) Download vuamc.zip http://ota.ahds.ac.uk/headers/2541.xml

https://github.com/EducationalTestingService/metaphor/tree/master/NAACL-FLP-shared-task

In [2]:
import utils
import corpus
import evaluate
import features
import numpy

import os
import collections

from keras.utils import to_categorical
from keras.layers import TimeDistributed, Bidirectional, LSTM, Input, Masking, Dense
from keras.models import Model
from keras import backend as kerasbackend
from sklearn.model_selection import KFold

In [3]:
# Check for VUAMC CSV files and generate if necessary

if not os.path.exists('source/vuamc_corpus_test.csv') and not os.path.exists('source/vuamc_corpus_train.csv'):
    print('VUAMC training and test data not found. Generating...')
    # utils.download_vuamc_xml()
    # utils.generate_vuamc_csv()

In [4]:
# Load Train Corpus from CSV
c_train = corpus.VUAMC('source/vuamc_corpus_train.csv', 'source/verb_tokens_train_gold_labels.csv')
c_train.validate_corpus()
print('Loaded and validated training corpus')

# Load Test Corpus from CSV
c_test = corpus.VUAMC('source/vuamc_corpus_test.csv', 'source/verb_tokens_test.csv', mode='test')
c_test.validate_corpus()
print('Loaded and validated test corpus')

Loaded and validated training corpus
Loaded and validated test corpus


In [5]:
# Shows that we got imbalanced classes in the training data
number_of_all_labels = len(c_train.label_list)
count_of_label_classes = collections.Counter(c_train.label_list)

percentage_of_non_metaphor_tokens = round(count_of_label_classes[0] / number_of_all_labels * 100)
percentage_of_metaphor_tokens = round(count_of_label_classes[1] / number_of_all_labels * 100)
ratio = utils.simplify_ratio(percentage_of_non_metaphor_tokens, percentage_of_metaphor_tokens)
assert(percentage_of_non_metaphor_tokens + percentage_of_metaphor_tokens == 100)

print('Percentage of metaphor tokens: {}'.format(percentage_of_metaphor_tokens))
print('Percentage of non-metaphor tokens: {}'.format(percentage_of_non_metaphor_tokens))
print('Ratio: {}:{}'.format(ratio[0], ratio[1]))

Percentage of metaphor tokens: 3
Percentage of non-metaphor tokens: 97
Ratio: 1:32


In [12]:
# Global configuration
MAX_SENTENCE_LENGTH = 50
EMBEDDING_DIM = 300
KFOLD_SPLIT = 5
KERAS_OPTIMIZER = 'rmsprop'
KERAS_METRICS = [utils.f1]
KERAS_EPOCHS = 1
KERAS_BATCH_SIZE = 32

In [7]:
# embeddings = features.Word2Vec()
embeddings = features.DummyEmbeddings(EMBEDDING_DIM)
x, y = features.generate_input_and_labels(c_train.sentences, Vectors=embeddings)
x_test, y_test = features.generate_input_and_labels(c_test.sentences, Vectors=embeddings)

# Free up some memory
del embeddings
print('Deleted Word Embeddings')

# Input data and categorical labels
x_input = x
y_labels = to_categorical(y, 2)

100%|██████████| 7873/7873 [00:01<00:00, 7172.75it/s]
100%|██████████| 2694/2694 [00:00<00:00, 9546.51it/s]


Deleted Word Embeddings


In [8]:
# Generate loss_weight, since out dataset contains 97% non-metaphor tokens
KERAS_LOSS = utils.weighted_categorical_crossentropy(ratio)
print('loss_weights: {}'.format(ratio))

loss_weights: (1, 32)


In [14]:
# Create and compile model
inputs = Input(shape=(MAX_SENTENCE_LENGTH, EMBEDDING_DIM))
model = Masking(mask_value=[-1] * EMBEDDING_DIM)(inputs)
model = Bidirectional(LSTM(100, return_sequences=True, dropout=0, recurrent_dropout=0.25))(model)
outputs = TimeDistributed(Dense(2, activation='softmax'))(model)
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=KERAS_OPTIMIZER, loss=KERAS_LOSS, metrics=KERAS_METRICS)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 50, 300)           0         
_________________________________________________________________
masking_3 (Masking)          (None, 50, 300)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 50, 200)           320800    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 50, 2)             402       
Total params: 321,202
Trainable params: 321,202
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Generate Training and Validation split
kfold = KFold(n_splits=KFOLD_SPLIT, shuffle=True, random_state=1337)
for train, test in kfold.split(x_input, y_labels):
    x_train = x_input[train]
    x_val = x_input[test]
    y_train = y_labels[train]
    y_val = y_labels[test]

    # Fit the model for each split
    model.fit(x_train, y_train,
              batch_size=KERAS_BATCH_SIZE,
              epochs=KERAS_EPOCHS,
              validation_data=(x_val, y_val))

    scores = model.evaluate(x_val, y_val)
    print('Test score: {:.2%}'.format(scores[0]))
    print('Test accuracy: {:.2%}'.format(scores[1]))

Train on 6298 samples, validate on 1575 samples
Epoch 1/1
Test score: 52.16%
Test accuracy: 7163.81%
Train on 6298 samples, validate on 1575 samples
Epoch 1/1
Test score: 53.82%
Test accuracy: 6691.30%
Train on 6298 samples, validate on 1575 samples
Epoch 1/1
Test score: 50.25%
Test accuracy: 7044.19%
Train on 6299 samples, validate on 1574 samples
Epoch 1/1
Test score: 50.73%
Test accuracy: 7919.19%
Train on 6299 samples, validate on 1574 samples
Epoch 1/1
Test score: 52.37%
Test accuracy: 8140.66%


In [16]:
# Generate list of label predictions for each sentence
float_predictions = model.predict(x_test, batch_size=KERAS_BATCH_SIZE)
binary_predictions = kerasbackend.argmax(float_predictions)
label_predictions = kerasbackend.eval(binary_predictions)

# Write prediction to CSV file
predictions_file = 'predictions.csv'
standard_file = 'source/verb_tokens_test_gold_labels.csv'

rows = evaluate.corpus_evaluation(c_test, label_predictions, MAX_SENTENCE_LENGTH)
evaluate.csv_evalutation(rows, predictions_file)
results = evaluate.precision_recall_f1(predictions_file, standard_file)

print(results)

Result(precision=0.28904249871991805, recall=0.6846573681018799, f1=0.40648064806480644)


# Plots

In [18]:
import plotly 
plotly.offline.init_notebook_mode(connected=True)

loss_p = plotly.graph_objs.Scatter(
    y = model.history.history['loss'],
    mode = 'lines+markers',
    name = 'Loss'
)

val_loss_p = plotly.graph_objs.Scatter(
    y = model.history.history['val_loss'],
    mode = 'lines+markers',
    name = 'Validation Loss'
)

acc_p = plotly.graph_objs.Scatter(
    y = model.history.history['f1'],
    mode = 'lines+markers',
    name = 'Categorical Accuracy'
)

val_acc_p = plotly.graph_objs.Scatter(
    y = model.history.history['val_f1'],
    mode = 'lines+markers',
    name = 'Validation Categorical Accuracy'
)

layout = plotly.graph_objs.Layout(title="Training History",
                yaxis=dict(title='Value'),
                xaxis=dict(title='Epoch'))

data = [loss_p, acc_p, val_loss_p, val_acc_p]
fig = plotly.graph_objs.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, filename='jupyter-train-history')