In [14]:
import matplotlib

matplotlib.use('Agg')
import train
import dataset as ds
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

import utils
import os
import conll2brat
import glob
import codecs
import shutil
import time
import copy
import evaluate
import random
import pickle
import brat2conll
import numpy as np
import utils_nlp
import distutils.util as distutils_util
import configparser
from pprint import pprint

In [1]:
# Configuration dictionary read by the later cells (dataset loading, session and
# model construction, training and prediction). Keys are listed in their original
# order.
parameters = dict(
    pretrained_model_folder='../model',
    dataset_text_folder='../../../ML_EntityData/data/en',
    character_embedding_dimension=25,
    character_lstm_hidden_state_dimension=25,
    check_for_digits_replaced_with_zeros=True,
    check_for_lowercase=True,
    debug=False,
    dropout_rate=0.5,
    experiment_name='test',
    freeze_token_embeddings=False,
    gradient_clipping_value=5.0,
    learning_rate=0.005,
    load_only_pretrained_token_embeddings=False,
    load_all_pretrained_token_embeddings=False,
    main_evaluation_mode='conll',
    maximum_number_of_epochs=500,
    number_of_cpu_threads=8,
    number_of_gpus=0,
    optimizer='sgd',
    output_folder='../../../ML_EntityData/output',
    patience=10,
    plot_format='pdf',
    reload_character_embeddings=True,
    reload_character_lstm=True,
    reload_crf=True,
    reload_feedforward=True,
    reload_token_embeddings=True,
    reload_token_lstm=True,
    remap_unknown_tokens_to_unk=True,
    spacylanguage='en',
    tagging_format='bioes',
    token_embedding_dimension=100,
    token_lstm_hidden_state_dimension=100,
    token_pretrained_embedding_filepath='../../../ML_EntityData/embedding/glove.6B.100d.txt',
    tokenizer='spacy',
    train_model=True,
    use_character_lstm=True,
    use_crf=True,
    use_pretrained_model=False,
    verbose=False,
)

In [2]:
import utils
import dataset as ds
# Load dataset
# Resolve the dataset file paths from the configuration, then build the Dataset
# object. The verbose/debug switches are read from `parameters` (both False
# there) instead of being hard-coded, so the configuration cell stays the single
# place where they are set.
dataset_filepaths, dataset_brat_folders = utils.get_valid_dataset_filepaths(parameters)
dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
# The returned `token_to_vector` is passed to
# model.load_pretrained_token_embeddings in a later cell.
token_to_vector = dataset.load_dataset(dataset_filepaths, parameters)

Checking the validity of BRAT-formatted train set... Done.
Checking compatibility between CONLL and BRAT for train_compatible_with_brat set ... Done.
Checking validity of CONLL BIOES format... Done.
Checking the validity of BRAT-formatted valid set... Done.
Checking compatibility between CONLL and BRAT for valid_compatible_with_brat set ... Done.
Checking validity of CONLL BIOES format... Done.
Checking the validity of BRAT-formatted test set... Done.
Checking compatibility between CONLL and BRAT for test_compatible_with_brat set ... Done.
Checking validity of CONLL BIOES format... Done.
Load dataset... done (43.67 seconds)


In [3]:
import tensorflow as tf
from BLSTM_CRF import Char_BLSTM_CRF
# Create model lstm+crf

# --- TensorFlow session ---
cpu_thread_count = parameters['number_of_cpu_threads']
device_limits = {'CPU': 1, 'GPU': parameters['number_of_gpus']}
session_conf = tf.ConfigProto(
    intra_op_parallelism_threads=cpu_thread_count,
    inter_op_parallelism_threads=cpu_thread_count,
    device_count=device_limits,
    # automatically choose an existing and supported device to run the
    # operations in case the specified one doesn't exist
    allow_soft_placement=True,
    log_device_placement=False,
)
sess = tf.Session(config=session_conf)

# --- Model ---
# Each hyper-parameter is forwarded to the constructor under the same name it
# has in `parameters`.
model_hyperparameter_names = (
    'token_embedding_dimension',
    'character_lstm_hidden_state_dimension',
    'token_lstm_hidden_state_dimension',
    'character_embedding_dimension',
    'gradient_clipping_value',
    'learning_rate',
    'freeze_token_embeddings',
    'optimizer',
    'maximum_number_of_epochs',
)
model_hyperparameters = {name: parameters[name] for name in model_hyperparameter_names}

with sess.as_default():
    # Instantiate the model while `sess` is the default session.
    model = Char_BLSTM_CRF(dataset=dataset, **model_hyperparameters)

# Run the global variable initializer in the session.
sess.run(tf.global_variables_initializer())

  from ._conv import register_converters as _register_converters


embedded_characters: Tensor("character_embedding/embedded_characters:0", shape=(?, ?, 25), dtype=float32)
embedded_tokens: Tensor("token_embedding/embedding_lookup:0", shape=(?, 100), dtype=float32)
token_lstm_input: Tensor("concatenate_token_and_character_vectors/token_lstm_input:0", shape=(?, 150), dtype=float32)
token_lstm_input_drop: Tensor("dropout/token_lstm_input_drop/mul:0", shape=(?, 150), dtype=float32)
token_lstm_input_drop_expanded: Tensor("dropout/token_lstm_input_drop_expanded:0", shape=(1, ?, 150), dtype=float32)
unary_scores_expanded: Tensor("crf/unary_scores_expanded:0", shape=(1, ?, 19), dtype=float32)
input_label_indices_flat_batch: Tensor("crf/input_label_indices_flat_batch:0", shape=(1, ?), dtype=int32)
sequence_lengths: Tensor("crf/sequence_lengths:0", shape=(1,), dtype=int32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [5]:
import numpy as np
# Load the pretrained token embeddings into the model, using the lowercase and
# digit-replacement switches from the configuration.
model.load_pretrained_token_embeddings(
    sess,
    dataset,
    embedding_filepath=parameters['token_pretrained_embedding_filepath'],
    check_lowercase=parameters['check_for_lowercase'],
    check_digits=parameters['check_for_digits_replaced_with_zeros'],
    token_to_vector=token_to_vector,
)

# Initial params_train: a random square matrix with one row/column per unique
# label plus two extra.
transition_matrix_size = len(dataset.unique_labels) + 2
transition_params_trained = np.random.rand(transition_matrix_size, transition_matrix_size)

# `token_to_vector` is not referenced again in the cells shown; drop the name.
del token_to_vector

Load token embeddings... done (0.17 seconds)
number_of_token_original_case_found: 14618
number_of_token_lowercase_found: 11723
number_of_token_digits_replaced_with_zeros_found: 119
number_of_token_lowercase_and_digits_replaced_with_zeros_found: 16
number_of_loaded_word_vectors: 26476
dataset.vocabulary_size: 28984


In [10]:
import time
import copy
import os
import pickle
# Obtain the stats/graph folder for this run together with the experiment timestamp.
stats_graph_folder, experiment_timestamp = utils.create_stats_graph_folder(parameters)

# Initialize and save execution details
start_time = time.time()
results = {
    'epoch': {},
    'execution_details': {
        'train_start': start_time,
        'time_stamp': experiment_timestamp,
        'early_stop': False,
        'keyboard_interrupt': False,
        'num_epochs': 0,
    },
    # Shallow copy, so later changes to `parameters` keys do not alter the record.
    'model_options': copy.copy(parameters),
}

model_folder = os.path.join(stats_graph_folder, 'model')
utils.create_folder_if_not_exists(model_folder)

# Pickle the dataset into the model folder. The `with` block guarantees the file
# handle is flushed and closed even if pickling raises.
with open(os.path.join(model_folder, 'dataset.pickle'), 'wb') as dataset_file:
    pickle.dump(dataset, dataset_file)

In [15]:


# These two counters are initialised here but are not read or updated by the loop below.
bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
previous_best_valid_f1_score = 0

# Epochs 0..10 inclusive. Epoch 0 skips the training pass, so the model is only
# run through prediction and checkpointing in its initial state.
for epoch_number in range(11):
    print('\nStarting epoch {0}'.format(epoch_number))
    epoch_start_time = time.time()

    step = 0
    if epoch_number > 0:
        # Train model: visit every training sequence once, in a fresh random order.
        sequence_numbers = list(range(len(dataset.token_indices['train'])))
        random.shuffle(sequence_numbers)
        for step, sequence_number in enumerate(sequence_numbers, start=1):
            transition_params_trained = train.train_step(
                sess, dataset, sequence_number, model, parameters['dropout_rate'])
            if step % 10 == 0:
                print('Training {0:.2f}% done'.format(step / len(sequence_numbers) * 100), end='\r', flush=True)

    epoch_elapsed_training_time = time.time() - epoch_start_time
    print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

    # Predict labels with the current model and transition parameters.
    y_pred, y_true, output_filepaths = train.predict_labels_lite(
        sess=sess,
        model=model,
        transition_params_trained=transition_params_trained,
        dataset=dataset,
        epoch_number=epoch_number,
        stats_graph_folder=stats_graph_folder,
        dataset_filepaths=dataset_filepaths,
        tagging_format=parameters['tagging_format'],
        main_evaluation_mode=parameters['main_evaluation_mode'],
        use_crf=parameters['use_crf'],
    )

    # One checkpoint per epoch, named after the zero-padded epoch number.
    checkpoint_filepath = os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number))
    model.saver.save(sess, checkpoint_filepath)


Starting epoch 0
Training completed in 0.00 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

      B-LOC     0.0085    0.0365    0.0138      1041
     B-MISC     0.0049    0.0058    0.0053       858
      B-ORG     0.0058    0.0020    0.0030      2485
      B-PER     0.0211    0.0609    0.0314      4284
      E-LOC     0.0141    0.0211    0.0169      1041
     E-MISC     0.0067    0.0058    0.0063       858
      E-ORG     0.0131    0.0294    0.0182      2485
      E-PER     0.0667    0.0037    0.0071      4284
      I-LOC     0.0003    0.0431    0.0005       116
     I-MISC     0.0000    0.0000    0.0000       297
      I-ORG     0.0016    0.0049    0.0024      1219
      I-PER     0.0015    0.0369    0.0029       244
          O     0.0000    0.0000    0.0000    168382
      S-LOC     0.0162    0.0600    0.0255      6099
     S-MISC     0.0192    0.0016    0.0029      2580
      S-ORG     0.0130    0.4179    0.0253      3836
      S-PER  

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

      B-LOC     0.8560    0.6282    0.7247      1041
     B-MISC     0.7652    0.4709    0.5830       858
      B-ORG     0.8258    0.6141    0.7044      2485
      B-PER     0.9256    0.9414    0.9335      4284
      E-LOC     0.8446    0.6215    0.7161      1041
     E-MISC     0.6949    0.4406    0.5392       858
      E-ORG     0.8450    0.6318    0.7230      2485
      E-PER     0.9425    0.9458    0.9442      4284
      I-LOC     0.0000    0.0000    0.0000       116
     I-MISC     0.8000    0.0135    0.0265       297
      I-ORG     0.8185    0.4512    0.5817      1219
      I-PER     0.8587    0.3238    0.4702       244
          O     0.0000    0.0000    0.0000       471
      S-LOC     0.8897    0.9061    0.8978      6099
     S-MISC     0.8906    0.6213    0.7320      2580
      S-ORG     0.9052    0.7140    0.7983      3836
      S-PER     0.9298    0.7094    0.8048      2316

avg / total     0.8684    0.7362    0.7874  


Evaluate model on the test set
             precision    recall  f1-score   support

      B-LOC     0.8120    0.8190    0.8155       232
     B-MISC     0.6687    0.6271    0.6472       177
      B-ORG     0.8191    0.8290    0.8240       579
      B-PER     0.9605    0.9843    0.9723      1086
      E-LOC     0.7958    0.8233    0.8093       232
     E-MISC     0.6905    0.6554    0.6725       177
      E-ORG     0.8365    0.8480    0.8422       579
      E-PER     0.9629    0.9807    0.9717      1086
      I-LOC     0.8333    0.2000    0.3226        25
     I-MISC     0.7586    0.5641    0.6471        39
      I-ORG     0.7852    0.7852    0.7852       256
      I-PER     0.9815    0.7571    0.8548        70
          O     0.0000    0.0000    0.0000       225
      S-LOC     0.9103    0.9325    0.9212      1436
     S-MISC     0.8708    0.7962    0.8318       525
      S-ORG     0.8915    0.8660    0.8786      1082
      S-PER     0.8858    0.8475    0.8662       531

avg / total 


Evaluate model on the valid set
             precision    recall  f1-score   support

      B-LOC     0.8718    0.8718    0.8718       234
     B-MISC     0.8390    0.6693    0.7446       257
      B-ORG     0.7852    0.7800    0.7826       450
      B-PER     0.9650    0.9822    0.9735      1234
      E-LOC     0.8793    0.8718    0.8755       234
     E-MISC     0.8738    0.7004    0.7775       257
      E-ORG     0.8180    0.8089    0.8134       450
      E-PER     0.9650    0.9822    0.9735      1234
      I-LOC     0.8421    0.6957    0.7619        23
     I-MISC     0.8235    0.4719    0.6000        89
      I-ORG     0.6417    0.9103    0.7527       301
      I-PER     0.9123    0.7123    0.8000        73
          O     0.0000    0.0000    0.0000       174
      S-LOC     0.9598    0.9688    0.9643      1603
     S-MISC     0.9481    0.8782    0.9118       665
      S-ORG     0.9455    0.8575    0.8994       891
      S-PER     0.8297    0.9375    0.8803       608

avg / total



Starting epoch 9
Training completed in 281.41 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

      B-LOC     0.9466    0.9366    0.9416      1041
     B-MISC     0.9136    0.8753    0.8940       858
      B-ORG     0.9453    0.9380    0.9416      2485
      B-PER     0.9839    0.9874    0.9857      4284
      E-LOC     0.9427    0.9328    0.9377      1041
     E-MISC     0.9113    0.8741    0.8923       858
      E-ORG     0.9457    0.9384    0.9420      2485
      E-PER     0.9865    0.9897    0.9881      4284
      I-LOC     0.8641    0.7672    0.8128       116
     I-MISC     0.9114    0.7273    0.8090       297
      I-ORG     0.9354    0.9147    0.9249      1219
      I-PER     0.9492    0.9180    0.9333       244
          O     0.0000    0.0000    0.0000       292
      S-LOC     0.9758    0.9731    0.9745      6099
     S-MISC     0.9145    0.9411    0.9276      2580
      S-ORG     0.9667    0.9320    0.9490      3836
      S-PE

In [16]:


# Used by predict() to name the temporary deploy file and passed to
# train.prediction_step; it is not incremented anywhere in this cell.
prediction_count = 0


def predict(text):
    """Annotate a raw text string with the trained model and return its entities.

    The text is written to a fresh BRAT 'deploy' folder, the notebook-level
    `dataset` is updated with that deploy set, labels are predicted with
    `train.prediction_step`, the prediction is converted to BRAT, and the
    entities are read back from the generated annotation file.

    Relies on the notebook-level objects `parameters`, `dataset`, `sess`,
    `model`, `transition_params_trained` and `prediction_count`.

    Args:
        text: the raw text to annotate.

    Returns:
        The entities returned by `brat2conll.get_entities_from_brat` for the
        annotated text.

    Raises:
        AssertionError: if the text read back from the BRAT output differs
            from `text`.
    """
    # Work on a copy of the configuration: overwriting the notebook-level
    # `parameters['dataset_text_folder']` would silently redirect any earlier
    # cell that is re-run after a prediction.
    deploy_parameters = dict(parameters, dataset_text_folder=os.path.join('..', 'data', 'temp'))
    # Called on every invocation of predict().
    stats_graph_folder, _ = utils.create_stats_graph_folder(deploy_parameters)

    # Update the deploy folder, file, and dataset
    dataset_type = 'deploy'
    deploy_text_folder = deploy_parameters['dataset_text_folder']

    ### Delete all deployment data
    for filepath in glob.glob(os.path.join(deploy_text_folder, '{0}*'.format(dataset_type))):
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)

    ### Create brat folder and file
    dataset_brat_deploy_folder = os.path.join(deploy_text_folder, dataset_type)
    utils.create_folder_if_not_exists(dataset_brat_deploy_folder)
    dataset_brat_deploy_filepath = os.path.join(
        dataset_brat_deploy_folder,
        'temp_{0}.txt'.format(str(prediction_count).zfill(5)))
    with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f:
        f.write(text)

    ### Resolve the deploy filepaths (local to this call)
    dataset_filepaths, dataset_brat_folders = utils.get_valid_dataset_filepaths(
        deploy_parameters, dataset_types=[dataset_type])

    ### Update the dataset for the new deploy set
    dataset.update_dataset(dataset_filepaths, [dataset_type])

    # Predict labels and output brat
    prediction_output = train.prediction_step(sess, dataset, dataset_type, model,
                                              transition_params_trained, stats_graph_folder,
                                              prediction_count, dataset_filepaths,
                                              deploy_parameters['tagging_format'],
                                              deploy_parameters['main_evaluation_mode'])
    # Only the third element (the output filepath) is needed here.
    _, _, deploy_output_filepath = prediction_output
    output_filepaths = {dataset_type: deploy_output_filepath}
    conll2brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=True)

    # Print and output result
    brat_output_folder = os.path.join(stats_graph_folder, 'brat', 'deploy')
    text_filepath = os.path.join(brat_output_folder, os.path.basename(dataset_brat_deploy_filepath))
    annotation_filepath = os.path.join(brat_output_folder, '{0}.ann'.format(
        utils.get_basename_without_extension(dataset_brat_deploy_filepath)))
    text2, entities = brat2conll.get_entities_from_brat(text_filepath, annotation_filepath, verbose=True)
    # The text read back from the BRAT output must match the input text.
    assert text == text2, 'Text read back from the BRAT output differs from the input text'
    return entities


In [19]:
# Try the trained model on a single sentence; the returned list of entity dicts
# is displayed as the cell output.
predict('my name Is Ngoc Linh')

Formatting deploy set from BRAT to CONLL... Done.
Converting CONLL from BIO to BIOES format... Done.
Predict labels for the deploy set
Formatting 000_deploy set from CONLL to BRAT... Done.

text:
my name Is Ngoc Linh

entity: {'id': 'T1', 'type': 'PER', 'start': 11, 'end': 20, 'text': 'Ngoc Linh'}





[{'end': 20, 'id': 'T1', 'start': 11, 'text': 'Ngoc Linh', 'type': 'PER'}]