# Event detection BiLSTM on Histo dataset

## Imports

In [None]:
!pip install git+https://github.com/guillaumegenthial/tf_metrics.git

In [1]:
# Mount Google Drive at /content/drive; the project folder used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
# Make the project folder importable (bilstm and buildGloveVocab are imported from it below).
sys.path.append('/content/drive/My Drive/Colab Notebooks/mhc/')

In [3]:
# Work from the project folder so the relative data/, wordemb/ and results/ paths used below resolve.
%cd "/content/drive/My Drive/Colab Notebooks/mhc/"

/content/drive/My Drive/Colab Notebooks/mhc


In [None]:
# Colab-specific magic requesting TensorFlow 1.x (the notebook relies on
# tf.estimator and tf.compat.v1 APIs); the version is printed as a sanity check.
%tensorflow_version 1.x
import tensorflow as tf
print(tf.__version__)

In [None]:
import functools
import logging
from tf_metrics import precision, recall, f1

In [None]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
import json

from sklearn.metrics import f1_score, precision_score, recall_score

from bilstm import BiLSTM
from buildGloveVocab import GloveVocab

## Pre-processing

Build the vocabulary files and the compressed word-embedding representations.

In [None]:
# Root of the datasets, relative to the project folder; keeps its trailing slash
# because the cells below build paths by plain string concatenation.
data_dir = "data/"

In [None]:
# Create the words files, then the tags files, for the histoclass train/test/dev
# splits. The train call leaves out the third argument and so relies on the
# helper's default; test and dev pass their split name explicitly.
histoclass_dir = data_dir + "histo/histoclass"
for make_file in (GloveVocab.create_words_file, GloveVocab.create_tags_file):
    make_file(histoclass_dir + "/train.txt", histoclass_dir)
    for split in ("test", "dev"):
        make_file(histoclass_dir + "/" + split + ".txt", histoclass_dir, split)

In [None]:
# Create the words files, then the tags files, for the histomention train/test/dev
# splits. The train call leaves out the third argument and so relies on the
# helper's default; test and dev pass their split name explicitly.
histomention_dir = data_dir + "histo/histomention"
for make_file in (GloveVocab.create_words_file, GloveVocab.create_tags_file):
    make_file(histomention_dir + "/train.txt", histomention_dir)
    for split in ("test", "dev"):
        make_file(histomention_dir + "/" + split + ".txt", histomention_dir, split)

In [None]:
# Path prefixes (no file extension) of the histoclass train/dev/test splits.
histoclass_prefix = data_dir + "histo/histoclass/"
data_paths = [histoclass_prefix + split for split in ("train", "dev", "test")]

In [None]:
# Build the word, character and tag vocabularies of the histoclass splits.
GloveVocab.build_vocab(
    data_paths,
    path_vocab_words='data/histo/histoclass/vocab.words.txt',
    path_vocab_chars='data/histo/histoclass/vocab.chars.txt',
    path_vocab_tags='data/histo/histoclass/vocab.tags.txt',
)

Build vocab words (may take a while)
- done. Kept 9825 out of 9825
Build vocab chars
- done. Found 100 chars
Build vocab tags (may take a while)
- done. Found 45 tags.


In [None]:
# Look up the histoclass vocabulary in the GloVe 840B vectors. No output_name is
# given, so the helper's default is used (presumably the glove.npz that the
# "Glove 840B" experiments load; confirm in buildGloveVocab).
GloveVocab.build_glove(
    path_vocab_words='data/histo/histoclass/vocab.words.txt',
    path_glove_txt="wordemb/glove.840B.300d.txt",
)

Reading GloVe file (may take a while)
- At line 0
- At line 100000
- At line 200000
- At line 300000
- At line 400000
- At line 500000
- At line 600000
- At line 700000
- At line 800000
- At line 900000
- At line 1000000
- At line 1100000
- At line 1200000
- At line 1300000
- At line 1400000
- At line 1500000
- At line 1600000
- At line 1700000
- At line 1800000
- At line 1900000
- At line 2000000
- At line 2100000
- done. Found 9625 vectors for 9825 words


In [None]:
# Same lookup against the HistoGlove vectors, stored under a separate output name.
GloveVocab.build_glove(
    path_vocab_words='data/histo/histoclass/vocab.words.txt',
    path_glove_txt="wordemb/HistoGlove.txt",
    output_name="glove_hist.npz",
)

Reading GloVe file (may take a while)
- At line 0
- At line 100000
- done. Found 7310 vectors for 9825 words


In [None]:
# Path prefixes (no file extension) of the histomention train/dev/test splits.
histomention_prefix = data_dir + "histo/histomention/"
data_paths = [histomention_prefix + split for split in ("train", "dev", "test")]

In [None]:
# Build the word, character and tag vocabularies of the histomention splits.
GloveVocab.build_vocab(
    data_paths,
    path_vocab_words='data/histo/histomention/vocab.words.txt',
    path_vocab_chars='data/histo/histomention/vocab.chars.txt',
    path_vocab_tags='data/histo/histomention/vocab.tags.txt',
)

Build vocab words (may take a while)
- done. Kept 9825 out of 9825
Build vocab chars
- done. Found 100 chars
Build vocab tags (may take a while)
- done. Found 3 tags.


In [None]:
# Look up the histomention vocabulary in the GloVe 840B vectors. No output_name is
# given, so the helper's default is used (presumably the glove.npz that the
# "Glove 840B" experiments load; confirm in buildGloveVocab).
GloveVocab.build_glove(
    path_vocab_words='data/histo/histomention/vocab.words.txt',
    path_glove_txt="wordemb/glove.840B.300d.txt",
)

Reading GloVe file (may take a while)
- At line 0
- At line 100000
- At line 200000
- At line 300000
- At line 400000
- At line 500000
- At line 600000
- At line 700000
- At line 800000
- At line 900000
- At line 1000000
- At line 1100000
- At line 1200000
- At line 1300000
- At line 1400000
- At line 1500000
- At line 1600000
- At line 1700000
- At line 1800000
- At line 1900000
- At line 2000000
- At line 2100000
- done. Found 9625 vectors for 9825 words


In [None]:
# Same lookup against the HistoGlove vectors, stored under a separate output name.
GloveVocab.build_glove(
    path_vocab_words='data/histo/histomention/vocab.words.txt',
    path_glove_txt="wordemb/HistoGlove.txt",
    output_name="glove_hist.npz",
)

Reading GloVe file (may take a while)
- At line 0
- At line 100000
- done. Found 7310 vectors for 9825 words


## Experiments

### Multiclass with Glove 840B

In [None]:
DATADIR = 'data/histo/histoclass'

# Logging
# parents=True: without it mkdir() raises FileNotFoundError when results/glove/
# (or results/) does not exist yet.
Path('results/glove/multiclass').mkdir(parents=True, exist_ok=True)
tf.compat.v1.logging.set_verbosity(logging.INFO)
handlers = [
    logging.FileHandler('results/glove/multiclass/main.log'),
    logging.StreamHandler(sys.stdout)
]
tf_logger = logging.getLogger('tensorflow')
# Close file handlers left over from a previous experiment (or an earlier run of
# this cell) before they are replaced, so their log files do not stay open.
for stale_handler in tf_logger.handlers:
    if isinstance(stale_handler, logging.FileHandler):
        stale_handler.close()
# Send TensorFlow's log records to this experiment's main.log and to stdout.
tf_logger.handlers = handlers

In [None]:
    # Hyper-parameters plus the vocabulary / embedding files located under DATADIR.
    params = {
        'dim': 300,
        'dropout': 0.5,
        'num_oov_buckets': 1,
        'epochs': 25,
        'batch_size': 20,
        'buffer': 15000,
        'lstm_size': 100,
        'words': str(Path(DATADIR) / 'vocab.words.txt'),
        'chars': str(Path(DATADIR) / 'vocab.chars.txt'),
        'tags': str(Path(DATADIR) / 'vocab.tags.txt'),
        'glove': str(Path(DATADIR) / 'glove.npz')
    }
    # Keep a copy of the configuration next to the run's outputs.
    Path('results/glove/multiclass/params.json').write_text(
        json.dumps(params, indent=4, sort_keys=True))

    # Input functions: training reads the train split with params and
    # shuffle_and_repeat=True; evaluation reads the test split with
    # BiLSTM.input_fn's remaining arguments left at their defaults.
    train_inpf = functools.partial(
        BiLSTM.input_fn,
        BiLSTM.fwords('train', DATADIR),
        BiLSTM.ftags('train', DATADIR),
        params,
        shuffle_and_repeat=True,
    )
    eval_inpf = functools.partial(
        BiLSTM.input_fn,
        BiLSTM.fwords('test', DATADIR),
        BiLSTM.ftags('test', DATADIR),
    )

    # Estimator that checkpoints every 120 seconds into the experiment's model folder.
    cfg = tf.estimator.RunConfig(save_checkpoints_secs=120)
    estimator = tf.estimator.Estimator(
        model_fn=BiLSTM.model_fn,
        model_dir='results/glove/multiclass/model',
        config=cfg,
        params=params,
    )
    Path(estimator.eval_dir()).mkdir(parents=True, exist_ok=True)

    # Train and evaluate; the returned value is displayed as the cell's output.
    train_spec = tf.estimator.TrainSpec(input_fn=train_inpf)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_inpf, throttle_secs=120)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

Using config: {'_model_dir': 'results/model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 120, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f4534441a20>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Not using Distribute Coordinator.
Running training and evaluation locally (non-distributed).
Start train and eval

({'acc': 0.90115476,
  'f1': 0.5642268,
  'global_step': 2480,
  'loss': 8.172856,
  'precision': 0.61587304,
  'recall': 0.5205724},
 [])

In [None]:
# Run the trained estimator over every split and store the predictions under the
# experiment's results folder (the evaluation section reads score/test.preds.txt).
for name in ('train', 'dev', 'test'):
    BiLSTM.write_predictions(name, estimator, DATADIR, "results/glove/multiclass")

Calling model_fn.
From /content/drive/My Drive/Colab Notebooks/mhc/bilstm.py:90: to_int64 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
Done calling model_fn.
Graph was finalized.
Restoring parameters from results/model/model.ckpt-2480
Running local_init_op.
Done running local_init_op.
Calling model_fn.
Done calling model_fn.
Graph was finalized.
Restoring parameters from results/model/model.ckpt-2480
Running local_init_op.
Done running local_init_op.
Calling model_fn.
Done calling model_fn.
Graph was finalized.
Restoring parameters from results/model/model.ckpt-2480
Running local_init_op.
Done running local_init_op.


### Multiclass with HistoGlove

In [None]:
DATADIR = 'data/histo/histoclass'

# Logging
# parents=True: without it mkdir() raises FileNotFoundError when results/histo/
# (or results/) does not exist yet.
Path('results/histo/multiclass').mkdir(parents=True, exist_ok=True)
tf.compat.v1.logging.set_verbosity(logging.INFO)
handlers = [
    logging.FileHandler('results/histo/multiclass/main.log'),
    logging.StreamHandler(sys.stdout)
]
tf_logger = logging.getLogger('tensorflow')
# Close file handlers left over from a previous experiment (or an earlier run of
# this cell) before they are replaced, so their log files do not stay open.
for stale_handler in tf_logger.handlers:
    if isinstance(stale_handler, logging.FileHandler):
        stale_handler.close()
# Send TensorFlow's log records to this experiment's main.log and to stdout.
tf_logger.handlers = handlers

In [None]:
    # Hyper-parameters plus the vocabulary / embedding files located under DATADIR.
    params = {
        'dim': 300,
        'dropout': 0.5,
        'num_oov_buckets': 1,
        'epochs': 25,
        'batch_size': 20,
        'buffer': 15000,
        'lstm_size': 100,
        'words': str(Path(DATADIR) / 'vocab.words.txt'),
        'chars': str(Path(DATADIR) / 'vocab.chars.txt'),
        'tags': str(Path(DATADIR) / 'vocab.tags.txt'),
        'glove': str(Path(DATADIR) / 'glove_hist.npz')
    }
    # Keep a copy of the configuration next to the run's outputs.
    Path('results/histo/multiclass/params.json').write_text(
        json.dumps(params, indent=4, sort_keys=True))

    # Input functions: training reads the train split with params and
    # shuffle_and_repeat=True; evaluation reads the test split with
    # BiLSTM.input_fn's remaining arguments left at their defaults.
    train_inpf = functools.partial(
        BiLSTM.input_fn,
        BiLSTM.fwords('train', DATADIR),
        BiLSTM.ftags('train', DATADIR),
        params,
        shuffle_and_repeat=True,
    )
    eval_inpf = functools.partial(
        BiLSTM.input_fn,
        BiLSTM.fwords('test', DATADIR),
        BiLSTM.ftags('test', DATADIR),
    )

    # Estimator that checkpoints every 120 seconds into the experiment's model folder.
    cfg = tf.estimator.RunConfig(save_checkpoints_secs=120)
    estimator = tf.estimator.Estimator(
        model_fn=BiLSTM.model_fn,
        model_dir='results/histo/multiclass/model',
        config=cfg,
        params=params,
    )
    Path(estimator.eval_dir()).mkdir(parents=True, exist_ok=True)

    # Train and evaluate; the returned value is displayed as the cell's output.
    train_spec = tf.estimator.TrainSpec(input_fn=train_inpf)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_inpf, throttle_secs=120)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

Using config: {'_model_dir': 'resultshisto/model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 120, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0b94edc080>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Not using Distribute Coordinator.
Running training and evaluation locally (non-distributed).
Start train and

({'acc': 0.8922248,
  'f1': 0.52380955,
  'global_step': 2480,
  'loss': 9.343881,
  'precision': 0.60397196,
  'recall': 0.46243292},
 [])

In [None]:
# Run the trained estimator over every split and store the predictions under the
# experiment's results folder (the evaluation section reads score/test.preds.txt).
for name in ('train', 'dev', 'test'):
    BiLSTM.write_predictions(name, estimator, DATADIR, "results/histo/multiclass")

Calling model_fn.
From /content/drive/My Drive/Colab Notebooks/mhc/bilstm.py:90: to_int64 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
Done calling model_fn.
Graph was finalized.
Restoring parameters from resultshisto/model/model.ckpt-2480
Running local_init_op.
Done running local_init_op.
Calling model_fn.
Done calling model_fn.
Graph was finalized.
Restoring parameters from resultshisto/model/model.ckpt-2480
Running local_init_op.
Done running local_init_op.
Calling model_fn.
Done calling model_fn.
Graph was finalized.
Restoring parameters from resultshisto/model/model.ckpt-2480
Running local_init_op.
Done running local_init_op.


### Binary with Glove 840B

In [None]:
DATADIR = 'data/histo/histomention'

# Logging
# parents=True: without it mkdir() raises FileNotFoundError when results/glove/
# (or results/) does not exist yet.
Path('results/glove/binary').mkdir(parents=True, exist_ok=True)
tf.compat.v1.logging.set_verbosity(logging.INFO)
handlers = [
    logging.FileHandler('results/glove/binary/main.log'),
    logging.StreamHandler(sys.stdout)
]
tf_logger = logging.getLogger('tensorflow')
# Close file handlers left over from a previous experiment (or an earlier run of
# this cell) before they are replaced, so their log files do not stay open.
for stale_handler in tf_logger.handlers:
    if isinstance(stale_handler, logging.FileHandler):
        stale_handler.close()
# Send TensorFlow's log records to this experiment's main.log and to stdout.
tf_logger.handlers = handlers

In [None]:
    # Hyper-parameters plus the vocabulary / embedding files located under DATADIR.
    params = {
        'dim': 300,
        'dropout': 0.5,
        'num_oov_buckets': 1,
        'epochs': 25,
        'batch_size': 20,
        'buffer': 15000,
        'lstm_size': 100,
        'words': str(Path(DATADIR) / 'vocab.words.txt'),
        'chars': str(Path(DATADIR) / 'vocab.chars.txt'),
        'tags': str(Path(DATADIR) / 'vocab.tags.txt'),
        'glove': str(Path(DATADIR) / 'glove.npz')
    }
    # Keep a copy of the configuration next to the run's outputs.
    Path('results/glove/binary/params.json').write_text(
        json.dumps(params, indent=4, sort_keys=True))

    # Input functions: training reads the train split with params and
    # shuffle_and_repeat=True; evaluation reads the test split with
    # BiLSTM.input_fn's remaining arguments left at their defaults.
    train_inpf = functools.partial(
        BiLSTM.input_fn,
        BiLSTM.fwords('train', DATADIR),
        BiLSTM.ftags('train', DATADIR),
        params,
        shuffle_and_repeat=True,
    )
    eval_inpf = functools.partial(
        BiLSTM.input_fn,
        BiLSTM.fwords('test', DATADIR),
        BiLSTM.ftags('test', DATADIR),
    )

    # Estimator that checkpoints every 120 seconds into the experiment's model folder.
    cfg = tf.estimator.RunConfig(save_checkpoints_secs=120)
    estimator = tf.estimator.Estimator(
        model_fn=BiLSTM.model_fn,
        model_dir='results/glove/binary/model',
        config=cfg,
        params=params,
    )
    Path(estimator.eval_dir()).mkdir(parents=True, exist_ok=True)

    # Train and evaluate; the returned value is displayed as the cell's output.
    train_spec = tf.estimator.TrainSpec(input_fn=train_inpf)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_inpf, throttle_secs=120)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

Using config: {'_model_dir': 'results/glove/binary/model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 120, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f5324ffaa90>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Not using Distribute Coordinator.
Running training and evaluation locally (non-distributed).
Start t

({'acc': 0.9427252,
  'f1': 0.8157407,
  'global_step': 2480,
  'loss': 3.5514815,
  'precision': 0.83665717,
  'recall': 0.7958446},
 [])

In [None]:
# Run the trained estimator over every split and store the predictions under the
# experiment's results folder (the evaluation section reads score/test.preds.txt).
for name in ('train', 'dev', 'test'):
    BiLSTM.write_predictions(name, estimator, DATADIR, "results/glove/binary")

Calling model_fn.
From /content/drive/My Drive/Colab Notebooks/mhc/bilstm.py:95: to_int64 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.
Done calling model_fn.
Graph was finalized.
Restoring parameters from results/glove/binary/model/model.ckpt-2480
Running local_init_op.
Done running local_init_op.
Calling model_fn.
Done calling model_fn.
Graph was finalized.
Restoring parameters from results/glove/binary/model/model.ckpt-2480
Running local_init_op.
Done running local_init_op.
Calling model_fn.
Done calling model_fn.
Graph was finalized.
Restoring parameters from results/glove/binary/model/model.ckpt-2480
Running local_init_op.
Done running local_init_op.


### Binary with HistoGlove

In [None]:
DATADIR = 'data/histo/histomention'

# Logging
# parents=True: without it mkdir() raises FileNotFoundError when results/histo/
# (or results/) does not exist yet.
Path('results/histo/binary').mkdir(parents=True, exist_ok=True)
tf.compat.v1.logging.set_verbosity(logging.INFO)
handlers = [
    logging.FileHandler('results/histo/binary/main.log'),
    logging.StreamHandler(sys.stdout)
]
tf_logger = logging.getLogger('tensorflow')
# Close file handlers left over from a previous experiment (or an earlier run of
# this cell) before they are replaced, so their log files do not stay open.
for stale_handler in tf_logger.handlers:
    if isinstance(stale_handler, logging.FileHandler):
        stale_handler.close()
# Send TensorFlow's log records to this experiment's main.log and to stdout.
tf_logger.handlers = handlers

In [None]:
    # Hyper-parameters plus the vocabulary / embedding files located under DATADIR.
    params = {
        'dim': 300,
        'dropout': 0.5,
        'num_oov_buckets': 1,
        'epochs': 25,
        'batch_size': 20,
        'buffer': 15000,
        'lstm_size': 100,
        'words': str(Path(DATADIR) / 'vocab.words.txt'),
        'chars': str(Path(DATADIR) / 'vocab.chars.txt'),
        'tags': str(Path(DATADIR) / 'vocab.tags.txt'),
        'glove': str(Path(DATADIR) / 'glove_hist.npz')
    }
    # Keep a copy of the configuration next to the run's outputs.
    Path('results/histo/binary/params.json').write_text(
        json.dumps(params, indent=4, sort_keys=True))

    # Input functions: training reads the train split with params and
    # shuffle_and_repeat=True; evaluation reads the test split with
    # BiLSTM.input_fn's remaining arguments left at their defaults.
    train_inpf = functools.partial(
        BiLSTM.input_fn,
        BiLSTM.fwords('train', DATADIR),
        BiLSTM.ftags('train', DATADIR),
        params,
        shuffle_and_repeat=True,
    )
    eval_inpf = functools.partial(
        BiLSTM.input_fn,
        BiLSTM.fwords('test', DATADIR),
        BiLSTM.ftags('test', DATADIR),
    )

    # Estimator that checkpoints every 120 seconds into the experiment's model folder.
    cfg = tf.estimator.RunConfig(save_checkpoints_secs=120)
    estimator = tf.estimator.Estimator(
        model_fn=BiLSTM.model_fn,
        model_dir='results/histo/binary/model',
        config=cfg,
        params=params,
    )
    Path(estimator.eval_dir()).mkdir(parents=True, exist_ok=True)

    # Train and evaluate; the returned value is displayed as the cell's output.
    train_spec = tf.estimator.TrainSpec(input_fn=train_inpf)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_inpf, throttle_secs=120)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

Using config: {'_model_dir': 'results/histo/binary/model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 120, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f52a4964e10>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Not using Distribute Coordinator.
Running training and evaluation locally (non-distributed).
Start t

({'acc': 0.93425715,
  'f1': 0.7808418,
  'global_step': 2480,
  'loss': 4.5190563,
  'precision': 0.840625,
  'recall': 0.7289973},
 [])

In [None]:
# Run the trained estimator over every split and store the predictions under the
# experiment's results folder (the evaluation section reads score/test.preds.txt).
for name in ('train', 'dev', 'test'):
    BiLSTM.write_predictions(name, estimator, DATADIR, "results/histo/binary")

Calling model_fn.
Done calling model_fn.
Graph was finalized.
Restoring parameters from results/histo/binary/model/model.ckpt-2480
Running local_init_op.
Done running local_init_op.
Calling model_fn.
Done calling model_fn.
Graph was finalized.
Restoring parameters from results/histo/binary/model/model.ckpt-2480
Running local_init_op.
Done running local_init_op.
Calling model_fn.
Done calling model_fn.
Graph was finalized.
Restoring parameters from results/histo/binary/model/model.ckpt-2480
Running local_init_op.
Done running local_init_op.


## Evaluation

In [None]:
def red_pred(path):
    """Read gold and predicted tags from a predictions file.

    Every non-blank line is split on whitespace; its second field is collected
    as the gold tag and its third field as the predicted tag. Blank lines are
    skipped. The file is decoded as UTF-8 explicitly, so reading tokens with
    non-ASCII characters does not depend on the platform's locale.

    Args:
        path: Location of the predictions file (str or path-like).

    Returns:
        A tuple ``(y, y_pred)`` of two equally long lists of tag strings, in
        file order.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    y, y_pred = [], []
    with Path(path).open("r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line.
                continue
            y.append(fields[1])
            y_pred.append(fields[2])
    return y, y_pred

In [None]:
# Gold and predicted test-split tags of the multiclass GloVe 840B run.
y, y_pred = red_pred("results/glove/multiclass/score/test.preds.txt")

In [None]:
# Support-weighted averages over every tag present in y / y_pred.
# zero_division=0 makes the treatment of tags the model never predicts explicit
# (they score 0, which is also what the default does) instead of emitting an
# UndefinedMetricWarning on each call.
# NOTE(review): the average presumably includes the majority 'O' tag, so these
# values are not comparable to the f1 reported by the estimator during
# training; confirm which metric should be reported.
f1_a = f1_score(y, y_pred, average='weighted', zero_division=0)
prec_a = precision_score(y, y_pred, average='weighted', zero_division=0)
rec_a = recall_score(y, y_pred, average='weighted', zero_division=0)
print(f1_a, prec_a, rec_a)

0.8917272518912011 0.8891516370522373 0.9011547344110854


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Gold and predicted test-split tags of the multiclass HistoGlove run.
y, y_pred = red_pred("results/histo/multiclass/score/test.preds.txt")

In [None]:
# Support-weighted averages over every tag present in y / y_pred.
# zero_division=0 makes the treatment of tags the model never predicts explicit
# (they score 0, which is also what the default does) instead of emitting an
# UndefinedMetricWarning on each call.
# NOTE(review): the average presumably includes the majority 'O' tag, so these
# values are not comparable to the f1 reported by the estimator during
# training; confirm which metric should be reported.
f1_b = f1_score(y, y_pred, average='weighted', zero_division=0)
prec_b = precision_score(y, y_pred, average='weighted', zero_division=0)
rec_b = recall_score(y, y_pred, average='weighted', zero_division=0)
print(f1_b, prec_b, rec_b)

0.878688859838361 0.8755559611760211 0.8922247882986913


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Gold and predicted test-split tags of the binary GloVe 840B run.
y, y_pred = red_pred("results/glove/binary/score/test.preds.txt")

In [None]:
# Support-weighted F1, precision and recall of the binary GloVe 840B run,
# computed in that order.
f1_c, prec_c, rec_c = (
    metric(y, y_pred, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)
print(f1_c, prec_c, rec_c)

0.942337161597166 0.9422481042226836 0.9427251732101617


In [None]:
# Gold and predicted test-split tags of the binary HistoGlove run.
y, y_pred = red_pred("results/histo/binary/score/test.preds.txt")

In [None]:
# Support-weighted F1, precision and recall of the binary HistoGlove run,
# computed in that order.
f1_d, prec_d, rec_d = (
    metric(y, y_pred, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)
print(f1_d, prec_d, rec_d)

0.9324479222677828 0.9324654274309786 0.9342571208622017


In [None]:
# One row per experiment, holding the weighted test-set scores computed above.
result_rows = [
    ["Multiclass Glove 840B", prec_a, rec_a, f1_a],
    ["Multiclass HistoGlove", prec_b, rec_b, f1_b],
    ["Binary Glove 840B", prec_c, rec_c, f1_c],
    ["Binary HistoGlove", prec_d, rec_d, f1_d],
]
res_df = pd.DataFrame(
    result_rows,
    columns=["Experiment", "Precision", "Recall", "F1 Score"],
)
res_df

Unnamed: 0,Experiment,Precision,Recall,F1 Score
0,Multiclass Glove 840B,0.889152,0.901155,0.891727
1,Multiclass HistoGlove,0.875556,0.892225,0.878689
2,Binary Glove 840B,0.942248,0.942725,0.942337
3,Binary HistoGlove,0.932465,0.934257,0.932448


In [None]:
# Print the LaTeX source rather than displaying the string's repr, so the
# table appears with real line breaks and single backslashes and can be pasted
# into a document as-is.
print(res_df.to_latex(index=False))

'\\begin{tabular}{lrrr}\n\\toprule\n            Experiment &  Precision &    Recall &  F1 Score \\\\\n\\midrule\n Multiclass Glove 840B &   0.889152 &  0.901155 &  0.891727 \\\\\n Multiclass HistoGlove &   0.875556 &  0.892225 &  0.878689 \\\\\n     Binary Glove 840B &   0.942248 &  0.942725 &  0.942337 \\\\\n     Binary HistoGlove &   0.932465 &  0.934257 &  0.932448 \\\\\n\\bottomrule\n\\end{tabular}\n'

# Event detection Wiki dataset

In [None]:
# Two sample sentences to tag with the trained multiclass GloVe 840B model.
lines = ['The transfer began on April 9 1942 after the three-month Battle of Bataan in the Philippines during World War II',
         "Astor Pantaleón Piazzolla was an Argentine tango composer bandoneon player and arranger"]
datadir = 'data/histo/histoclass'
# Named params_path rather than params, so the hyper-parameter dict `params`
# defined by the training cells is not overwritten with a string.
params_path = 'results/glove/multiclass/params.json'
modeldir = 'results/glove/multiclass/model'

preds = BiLSTM.predict(lines, params_path, datadir, modeldir)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'results/glove/multiclass/model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f95436de9e8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:ten

In [None]:
# Show each sentence's words with the predicted tag for every word.
BiLSTM.pretty_print(preds)

words: The transfer began  on April 9 1942 after the three-month Battle      of Bataan in the Philippines during World War         II
preds: O   O        B-TIME O  O     O O    O     O   O           B-HOSTILITY O  O      O  O   O           O      O     B-HOSTILITY O 
words: Astor Pantaleón Piazzolla was                  an Argentine tango composer bandoneon player and arranger
preds: O     O         O         B-EXISTENCECAUSATION O  O         O     O        O         O      O   O       


In [11]:
import spacy
from spacy import displacy

# Run spaCy's small English pipeline on the first sample sentence and render
# the entities it finds inline in the notebook.
sentence = 'The transfer began on April 9 1942 after the three-month Battle of Bataan in the Philippines during World War II'
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)

displacy.render(doc, style="ent", jupyter=True)