In [None]:
"""  
Author: Keeley Takimoto (ktakimoto on github)
Code for the neural network architecture in /ner/ is credited to
Anh, L. T., Arkhipov, M. Y., & Burtsev, M. S.,
https://arxiv.org/pdf/1709.09686.pdf, code at https://github.com/deepmipt/ner"""
# dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

import tensorflow as tf

from datetime import datetime
import os
import pathlib
from pathlib import Path
import json

from ner.ner.corpus import Corpus
from ner.ner.network import NER


In [2]:
# load corpus
# Read the pre-built data dictionary from JSON and wrap it in a Corpus.
# The encoding is pinned to UTF-8 so the load does not depend on the
# platform's default locale encoding.
with open("ner/nyt_data_dict.json", encoding="utf-8") as f:
    data_dict = json.load(f)

corp = Corpus(data_dict)

In [3]:
# initialize CNN
# Constructor flags for the convolutional variant (net_type='cnn'):
# use_batch_norm and use_crf switched on, use_capitalization switched off.
model_params = dict(
    use_batch_norm=True,
    use_crf=True,
    net_type='cnn',
    use_capitalization=False,
)

# Build the network on top of the loaded corpus.
net = NER(corp, **model_params)

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
Number of parameters: 
Embeddings 6216912
ConvNet 167808
Classifier 1028
transitions:0 16
Total number of parameters equal 6385764


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [4]:
# CNN training and eval on train, validation data
# Trains `net` for 5 epochs. The logged output reports precision / recall / FB1
# on the validation set after each epoch, followed by a final eval on train.
# In the recorded run validation FB1 peaked at epoch 2 (82.87) while train FB1
# reached 97.86, which suggests the later epochs were overfitting.
results = net.fit(epochs=5)

Epoch 0
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 2689 phrases; correct: 2186.

precision:  81.29%; recall:  73.06%; FB1:  76.96


Epoch 1
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 3448 phrases; correct: 2625.

precision:  76.13%; recall:  87.73%; FB1:  81.52


Epoch 2
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 3285 phrases; correct: 2601.

precision:  79.18%; recall:  86.93%; FB1:  82.87


Epoch 3
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 3049 phrases; correct: 2477.

precision:  81.24%; recall:  82.79%; FB1:  82.01


Epoch 4
Eval on valid:
processed 167043 tokens with 2992 phrases; found: 3342 phrases; correct: 2574.

precision:  77.02%; recall:  86.03%; FB1:  81.28


Eval on train:
processed 779446 tokens with 14562 phrases; found: 14869 phrases; correct: 14401.

precision:  96.85%; recall:  98.89%; FB1:  97.86

	Name: precision:  96.85%; recall:  98.89%; F1:  97.86 14869


Eval on valid:
proc

In [None]:
# save model
# Writes the trained CNN (`net`) to disk; the same path is passed as
# `pretrained_model_filepath` when the model is reloaded later in the notebook.
# The path is relative to the notebook's working directory.
net.save('../nm/ner/cnn_default')

In [None]:
# Reload the saved CNN from disk (author-reported score ~0.825; the training
# log reports precision / recall / FB1 rather than accuracy).
# The constructor flags must match the ones the checkpoint was trained with,
# so the same `model_params` dict is reused.
net1 = NER(
    corpus=corp,
    pretrained_model_filepath='../nm/ner/cnn_default',
    **model_params
)

In [None]:
# get integer-coded matrices for new sentence
# Tokens and tags are kept in two parallel lists, one tag per token.
# Only the span "Sleep Train" is labelled as a Name (B- opens the span,
# I- continues it); every other token, including the final ".", is 'O'.
new_tokens = 'Colorless green ideas sleep furiously thanks to Sleep Train .'.split()
new_tags = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Name', 'I-Name', 'O']

# Guard against the tokens and tags drifting out of alignment when edited.
assert len(new_tokens) == len(new_tags), "each token needs exactly one tag"

# The corpus helper takes a batch (list of sentences), hence the outer lists.
new_x, new_y = corp.tokens_batch_to_numpy_batch([new_tokens], [new_tags])
new_x

In [None]:
# get name prediction for new sentence
# Runs the reloaded CNN (`net1`) on the encoded batch `new_x`; the cell displays
# whatever `predict` returns (presumably the predicted tag sequence — confirm).
net1.predict(new_x)

In [None]:
# Print the parameter-count summary for the reloaded CNN (`net1`).
net1.print_number_of_parameters()


In [None]:
# initialize RNN
# Constructor flags for the recurrent variant (net_type='rnn') with
# cell_type='lstm'; use_crf and use_capitalization are both switched on.
net2_params = dict(
    use_crf=True,
    net_type='rnn',
    use_capitalization=True,
    cell_type='lstm',
)

net2 = NER(corp, **net2_params)

In [None]:
# fit RNN
# terminated early due to prohibitively slow speed (~20 min for 2 epochs)
# NOTE: because the run was interrupted before `fit` returned, `results2` may
# never have been assigned — do not rely on it in later cells.
results2 = net2.fit(epochs=5)

In [None]:
# initialize CNN highway
# Constructor flags for the net_type='cnn_highway' variant; use_batch_norm,
# use_crf and use_capitalization are all switched on.
net3_params = dict(
    use_batch_norm=True,
    use_crf=True,
    net_type='cnn_highway',
    use_capitalization=True,
)

net3 = NER(corp, **net3_params)

In [None]:
# fit CNN highway
# Trains `net3` for 10 epochs. The return value is captured, matching `results`
# and `results2`, so the outcome of the run is not discarded and the raw return
# value is not echoed as cell output.
results3 = net3.fit(epochs=10)

In [None]:
# save CNN highway
# Writes the trained highway network (`net3`) to disk.
# The path is relative to the notebook's working directory.
net3.save('../nm/ner/cnn_highway_default')

In [None]:
# parameter tuning
# The grid search itself was never run due to runtime constraints; this cell
# only declares the candidate values such a search would sweep.

# Candidate constructor options for NER(...). Stored under its own name so the
# `model_params` dict used earlier to build and reload the CNN is not
# overwritten when cells are re-run.
model_param_grid = {"filter_width": np.arange(3, 7),  # if cnn
                    "embeddings_dropout": [True, False],
                    "use_batch_norm": [True, False],  # if cnn
                    "use_crf": [True, False],
                    "net_type": ['cnn', 'rnn', 'cnn_highway'],
                    "use_capitalization": [True, False],
                    "cell_type": ['lstm', 'gru']  # if rnn
                    }

# Candidate training options (presumably consumed by `fit` — confirm signature).
training_params = {'dropout_rate': np.arange(0.1, 0.91, 0.2),
                   'epochs': np.arange(10, 101, 30),
                   'learning_rate': np.linspace(0.0001, 0.001, 10),
                   'batch_size': np.arange(4, 65, 20),
                   # linspace rather than arange: with a float step,
                   # np.arange(0.5, 1.1, 0.1) rounds up to seven steps and
                   # emits a spurious ~1.1 next to the intended 0.5 .. 1.0.
                   'learning_rate_decay': np.linspace(0.5, 1.0, 6)}

# TODO: wire up the search. A bare GridSearchCV() raises TypeError because
# `estimator` and `param_grid` are required arguments, and it expects a
# scikit-learn style estimator, so NER would need a wrapper before it can be
# passed in.