In [4]:
# dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

import tensorflow as tf

from datetime import datetime
import os
import pathlib
from pathlib import Path
import json

In [5]:
# this code by Alexander Fred Ojala https://github.com/ikhlaqsidhu/data-x/blob/master/06b-tools-tensorflow/intro-to-tf_v2_afo.ipynb

# TensorBoard Graph visualizer in notebook
import numpy as np
from IPython.display import clear_output, Image, display, HTML

def strip_consts(graph_def, max_const_size=32):
    """Return a copy of ``graph_def`` with large constant payloads replaced.

    Every node of ``graph_def`` is merged into a fresh message of the same
    type. For ``Const`` nodes whose ``tensor_content`` is longer than
    ``max_const_size`` bytes, the payload of the copy is replaced by a short
    ``<stripped N bytes>`` placeholder so the graph stays small enough to
    embed in a notebook.

    Args:
        graph_def: GraphDef-like protobuf message exposing a repeated
            ``node`` field.
        max_const_size: Longest ``tensor_content`` (in bytes) kept verbatim.

    Returns:
        A new message of the same type as ``graph_def``.
    """
    # Build the result from the input's own class instead of `tf.GraphDef`:
    # that symbol is not available at top level in TensorFlow 2.x, and the
    # result must be the same message type as the input anyway.
    stripped = type(graph_def)()
    for src_node in graph_def.node:
        node = stripped.node.add()
        node.MergeFrom(src_node)
        if node.op != 'Const':
            continue
        tensor = node.attr['value'].tensor
        size = len(tensor.tensor_content)
        if size > max_const_size:
            # `tensor_content` is a protobuf bytes field; assigning a str
            # raises TypeError on Python 3, so the placeholder must be bytes.
            tensor.tensor_content = b"<stripped %d bytes>" % size
    return stripped

def show_graph(graph_def, max_const_size=32):
    """Display a TensorFlow graph inside the notebook.

    ``graph_def`` may be a GraphDef or any object offering ``as_graph_def()``
    (it is converted first). Large constants are stripped via
    ``strip_consts`` and the textual graph is handed to a
    ``<tf-graph-basic>`` element, which is wrapped in an ``<iframe>`` and
    rendered with ``IPython.display``.

    Args:
        graph_def: GraphDef, or an object with an ``as_graph_def()`` method.
        max_const_size: Forwarded to ``strip_consts``.

    Returns:
        None. The graph is shown as a side effect.
    """
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    stripped = strip_consts(graph_def, max_const_size=max_const_size)

    # Python-literal form of the text-format graph, dropped into the script below.
    pbtxt_literal = repr(str(stripped))
    # Random suffix so that several rendered graphs get distinct element ids.
    element_id = 'graph' + str(np.random.rand())

    widget_template = """
        <script src="//cdnjs.cloudflare.com/ajax/libs/polymer/0.3.3/platform.js"></script>
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """
    widget_html = widget_template.format(data=pbtxt_literal, id=element_id)

    iframe_template = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """
    # Double quotes must be escaped because the markup lives in the srcdoc attribute.
    escaped_widget = widget_html.replace('"', '&quot;')
    display(HTML(iframe_template.format(escaped_widget)))



In [9]:
# Load the corpus: read the NYT data dictionary from JSON and wrap it in a Corpus.
from ner.ner.corpus import Corpus

data_dict = json.loads(Path("ner/nyt_data_dict.json").read_text())

corp = Corpus(data_dict)

In [10]:
# Default ("out-of-the-box") CNN model built on the loaded corpus.
from ner.ner.network import NER

model_params = dict(
    use_batch_norm=True,
    use_crf=True,
    net_type='cnn',
    use_capitalization=True,
)

net = NER(corp, **model_params)

Number of parameters: 
Embeddings 6216912
ConvNet 168192
Classifier 1028
transitions:0 16
Total number of parameters equal 6386148


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [12]:
# Train the default CNN model for 5 epochs. The printed log reports validation
# metrics after every epoch, followed by a final evaluation on the train and
# validation data. `results` keeps whatever fit() returns.
results = net.fit(epochs=5)

Epoch 0
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 3112 phrases; correct: 2502.

precision:  80.40%; recall:  83.62%; FB1:  81.98


Epoch 1
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 2899 phrases; correct: 2445.

precision:  84.34%; recall:  81.72%; FB1:  83.01


Epoch 2
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 3140 phrases; correct: 2545.

precision:  81.05%; recall:  85.06%; FB1:  83.01


Epoch 3
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 2671 phrases; correct: 2335.

precision:  87.42%; recall:  78.04%; FB1:  82.47


Epoch 4
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 2989 phrases; correct: 2456.

precision:  82.17%; recall:  82.09%; FB1:  82.13


Eval on train:
processed 779446 tokens with 14562 phrases; found: 14562 phrases; correct: 14314.

precision:  98.30%; recall:  98.30%; FB1:  98.30

	Name: precision:  98.30%; recall:  98.30%; F1:  98.30 14562


Eval on valid:
proc

In [21]:
# Save `net` (the default CNN configuration) to ../nm/ner/cnn_default.
# NOTE(review): the path is relative to the notebook's working directory — confirm it exists.
net.save('../nm/ner/cnn_default')

In [35]:
# Preview the whitespace tokenization of the test sentence as a one-sentence batch.
# Note that 'week,' keeps its comma and the final '.' is a token of its own.
['This week, DC Comics released a flock of doves .'.split()]

[['This',
  'week,',
  'DC',
  'Comics',
  'released',
  'a',
  'flock',
  'of',
  'doves',
  '.']]

In [38]:
# Test on a new sentence.
# Whitespace splitting yields 10 tokens (the trailing '.' is its own token), so
# the tag sequence needs 10 entries too; the earlier list had only 9.
new_tokens = ['This week, DC Comics released a flock of doves .'.split()]
new_tags = [['O', 'O', 'B-Name', 'I-Name', 'O', 'O', 'O', 'O', 'O', 'O']]

# Fail fast on a token/tag mismatch instead of feeding misaligned labels downstream.
assert len(new_tokens) == len(new_tags), "need one tag sequence per sentence"
assert all(len(toks) == len(tags) for toks, tags in zip(new_tokens, new_tags)), \
    "each sentence needs exactly one tag per token"

new_x, new_y = corp.tokens_batch_to_numpy_batch(new_tokens, new_tags)
net.predict(new_x)

[['O', 'O', 'B-Name', 'I-Name', 'O', 'O', 'O', 'O', 'O', 'O']]

In [16]:
# RNN model with LSTM cells, built on the same corpus.
net2_params = dict(
    use_crf=True,
    net_type='rnn',
    use_capitalization=True,
    cell_type='lstm',
)
net2 = NER(corp, **net2_params)

Number of parameters: 
Embeddings 6216912
RNN_layer_0 315392
RNN_layer_1 1050624
Classifier 2052
transitions:0 16
Total number of parameters equal 7584996


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [17]:
# RNN training. Early results suggest this is far slower than the CNN: the run
# was interrupted manually after ~20 min, during the third epoch ("Epoch 2").
# Because fit() was interrupted, `results2` was never assigned by this run.
results2 = net2.fit(epochs=5)

Epoch 0
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 3071 phrases; correct: 2429.

precision:  79.09%; recall:  81.18%; FB1:  80.13


Epoch 1
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 3233 phrases; correct: 2548.

precision:  78.81%; recall:  85.16%; FB1:  81.86


Epoch 2


KeyboardInterrupt: 

In [22]:
# cnn_highway model: same switches as the default CNN configuration, only
# net_type differs.
net3_params = dict(
    use_batch_norm=True,
    use_crf=True,
    net_type='cnn_highway',
    use_capitalization=True,
)
net3 = NER(corp, **net3_params)

Number of parameters: 
Embeddings 6216912
ConvNet 168192
Classifier 1028
transitions:0 16
Total number of parameters equal 6386148


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [23]:
# Train the cnn_highway model for 5 epochs. The return value is not assigned,
# so the notebook shows it as this cell's output.
net3.fit(epochs=5)

Epoch 0
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 2863 phrases; correct: 2359.

precision:  82.40%; recall:  78.84%; FB1:  80.58


Epoch 1
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 2423 phrases; correct: 2142.

precision:  88.40%; recall:  71.59%; FB1:  79.11


Epoch 2
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 3203 phrases; correct: 2601.

precision:  81.21%; recall:  86.93%; FB1:  83.97


Epoch 3
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 3289 phrases; correct: 2629.

precision:  79.93%; recall:  87.87%; FB1:  83.71


Epoch 4
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 3001 phrases; correct: 2515.

precision:  83.81%; recall:  84.06%; FB1:  83.93


Eval on train:
processed 779446 tokens with 14562 phrases; found: 14736 phrases; correct: 14332.

precision:  97.26%; recall:  98.42%; FB1:  97.84

	Name: precision:  97.26%; recall:  98.42%; F1:  97.84 14736


Eval on valid:
proc

OrderedDict([('Name',
              OrderedDict([('precision', 83.70786516853933),
                           ('recall', 84.49905482041588),
                           ('f1', 84.10159924741298),
                           ('n_predicted_entities', 3204),
                           ('n_true_entities', 3174)])),
             ('__total__',
              OrderedDict([('n_predicted_entities', 3204),
                           ('n_true_entities', 3174),
                           ('precision', 83.70786516853933),
                           ('recall', 84.49905482041588),
                           ('f1', 84.10159924741298)]))])

In [24]:
# Save `net3` (the cnn_highway configuration) to ../nm/ner/cnn_highway_default.
# NOTE(review): the path is relative to the notebook's working directory — confirm it exists.
net3.save('../nm/ner/cnn_highway_default')

In [None]:
# parameter tuning
# Candidate values for the model options (presumably NER constructor keyword
# arguments, as with the configurations used earlier — confirm against NER).
model_params = {"filter_width": np.arange(3, 7), # if cnn
                "embeddings_dropout": [True, False],
                "use_batch_norm": [True, False], # if cnn
                "use_crf": [True, False],
                "net_type": ['cnn', 'rnn', 'cnn_highway'],
                "use_capitalization": [True, False],
                "cell_type": ['lstm', 'gru'] # if rnn
               }
# Candidate values for the training options (presumably fit() keyword
# arguments; only `epochs` is exercised elsewhere in this notebook — confirm).
training_params = {'dropout_rate': np.arange(0.1, 0.91, 0.2),
                   'epochs': np.arange(10, 101, 30),
                   'learning_rate':np.linspace(0.0001, 0.001, 10),
                   'batch_size': np.arange(4, 65, 20),
                   # linspace pins the last value at exactly 1.0, whereas
                   # np.arange(0.5, 1.1, 0.1) overshoots to ~1.1 because the
                   # element count is derived from a rounded float division.
                   'learning_rate_decay': np.linspace(0.5, 1.0, 6)}

# Number of points in the exhaustive grid over both dictionaries. Worth checking
# before launching a search: a single 5-epoch fit already takes minutes here.
n_grid_combinations = int(np.prod(
    [len(values) for values in list(model_params.values()) + list(training_params.values())]
))

# TODO: wire up the actual search. GridSearchCV requires `estimator` and
# `param_grid`, so the former bare `GridSearchCV()` call could only raise
# TypeError.
# NOTE(review): NER is built as NER(corp, **params) and trained with
# fit(epochs=...) in this notebook, which does not look like the scikit-learn
# estimator contract — confirm whether an adapter (or a manual loop) is needed.
n_grid_combinations