In [1]:
import matplotlib

matplotlib.use('Agg')
import train
import dataset as ds
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

import utils
import os
import conll2brat
import glob
import codecs
import shutil
import time
import copy
import evaluate
import random
import pickle
import brat2conll
import numpy as np
import utils_nlp
import distutils.util as distutils_util
import configparser
from pprint import pprint

In [4]:
# Experiment configuration. The cells below read this dict when loading the
# dataset, building the session/model, training and predicting.
parameters = {
    'pretrained_model_folder': '../model',
    'dataset_text_folder': '../../../ML_EntityData/data/en',
    'character_embedding_dimension': 25,
    'character_lstm_hidden_state_dimension': 25,
    'check_for_digits_replaced_with_zeros': True,
    'check_for_lowercase': True,
    'debug': False,
    'dropout_rate': 0.5,  # handed to train.train_step for every training sequence
    'experiment_name': 'test',
    'freeze_token_embeddings': False,
    'gradient_clipping_value': 5.0,
    'learning_rate': 0.005,
    'load_only_pretrained_token_embeddings': False,
    'load_all_pretrained_token_embeddings': False,
    'main_evaluation_mode': 'conll',
    'maximum_number_of_epochs': 500,
    'number_of_cpu_threads': 8,  # used for both intra- and inter-op parallelism of the TF session
    'number_of_gpus': 0,
    'optimizer': 'sgd',
    'output_folder': '../../../ML_EntityData/output',
    'patience': 10,
    'plot_format': 'pdf',
    'reload_character_embeddings': True,
    'reload_character_lstm': True,
    'reload_crf': True,
    'reload_feedforward': True,
    'reload_token_embeddings': True,
    'reload_token_lstm': True,
    'remap_unknown_tokens_to_unk': True,
    'spacylanguage': 'en',
    'tagging_format': 'bioes',
    'token_embedding_dimension': 100,
    'token_lstm_hidden_state_dimension': 100,
    'token_pretrained_embedding_filepath': '../../../ML_EntityData/embedding/en/glove.6B.100d.txt',
    'tokenizer': 'spacy',
    'train_model': True,
    'use_character_lstm': True,
    'use_crf': True,
    'use_pretrained_model': False,
    'verbose': False,
}

In [5]:
import utils
import dataset as ds
# Load dataset
# Resolve the dataset files for the folder named in the configuration; the
# helper returns both the dataset file paths and the matching BRAT folders.
dataset_filepaths, dataset_brat_folders = utils.get_valid_dataset_filepaths(parameters)

# Take the verbosity/debug switches from the configuration instead of
# hard-coding them, so that editing `parameters` actually affects the dataset.
dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])

# load_dataset returns the token -> vector mapping that is later handed to
# model.load_pretrained_token_embeddings.
token_to_vector = dataset.load_dataset(dataset_filepaths, parameters)

Checking the validity of BRAT-formatted train set... Done.
Checking compatibility between CONLL and BRAT for train_compatible_with_brat set ... Done.
Checking validity of CONLL BIOES format... Done.
Checking the validity of BRAT-formatted valid set... Done.
Checking compatibility between CONLL and BRAT for valid_compatible_with_brat set ... Done.
Checking validity of CONLL BIOES format... Done.
Checking the validity of BRAT-formatted test set... Done.
Checking compatibility between CONLL and BRAT for test_compatible_with_brat set ... Done.
Checking validity of CONLL BIOES format... Done.
Load dataset... done (55.48 seconds)


In [6]:
import tensorflow as tf
from BLSTM_CRF import Char_BLSTM_CRF
# Build the TensorFlow session, then the LSTM+CRF model inside it.
cpu_thread_count = parameters['number_of_cpu_threads']
session_conf = tf.ConfigProto(
    intra_op_parallelism_threads=cpu_thread_count,
    inter_op_parallelism_threads=cpu_thread_count,
    device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
    # Let TensorFlow pick an existing, supported device when the requested one does not exist.
    allow_soft_placement=True,
    log_device_placement=False,
)
sess = tf.Session(config=session_conf)

# Every one of these constructor keywords receives the configuration entry of
# the same name, so they are collected once and forwarded as keyword arguments.
model_hyperparameter_names = (
    'token_embedding_dimension',
    'character_lstm_hidden_state_dimension',
    'token_lstm_hidden_state_dimension',
    'character_embedding_dimension',
    'gradient_clipping_value',
    'learning_rate',
    'freeze_token_embeddings',
    'optimizer',
    'maximum_number_of_epochs',
)
model_hyperparameters = {name: parameters[name] for name in model_hyperparameter_names}

with sess.as_default():
    # Instantiate the model while `sess` is the default session.
    model = Char_BLSTM_CRF(dataset=dataset, **model_hyperparameters)

# Give every graph variable its initial value before any training or prediction.
sess.run(tf.global_variables_initializer())

embedded_characters: Tensor("character_embedding/embedded_characters:0", shape=(?, ?, 25), dtype=float32)
embedded_tokens: Tensor("token_embedding/embedding_lookup:0", shape=(?, 100), dtype=float32)
token_lstm_input: Tensor("concatenate_token_and_character_vectors/token_lstm_input:0", shape=(?, 150), dtype=float32)
token_lstm_input_drop: Tensor("dropout/token_lstm_input_drop/mul:0", shape=(?, 150), dtype=float32)
token_lstm_input_drop_expanded: Tensor("dropout/token_lstm_input_drop_expanded:0", shape=(1, ?, 150), dtype=float32)
unary_scores_expanded: Tensor("crf/unary_scores_expanded:0", shape=(1, ?, 19), dtype=float32)
input_label_indices_flat_batch: Tensor("crf/input_label_indices_flat_batch:0", shape=(1, ?), dtype=int32)
sequence_lengths: Tensor("crf/sequence_lengths:0", shape=(1,), dtype=int32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [7]:
import numpy as np
# Load the pretrained token embeddings into the model, reusing the
# token_to_vector mapping returned by dataset.load_dataset.
model.load_pretrained_token_embeddings(
    sess,
    dataset,
    embedding_filepath=parameters['token_pretrained_embedding_filepath'],
    check_lowercase=parameters['check_for_lowercase'],
    check_digits=parameters['check_for_digits_replaced_with_zeros'],
    token_to_vector=token_to_vector,
)

# Random square starting value for the transition parameters; it is replaced
# by the value returned from train.train_step once training begins.
# NOTE(review): the "+ 2" presumably accounts for extra start/end labels — confirm.
transition_matrix_size = len(dataset.unique_labels) + 2
transition_params_trained = np.random.rand(transition_matrix_size, transition_matrix_size)

# The mapping is no longer needed after this point; drop the name so the
# memory can be reclaimed. Re-running this cell therefore requires re-running
# the dataset-loading cell first.
del token_to_vector

Load token embeddings... done (0.19 seconds)
number_of_token_original_case_found: 14618
number_of_token_lowercase_found: 11723
number_of_token_digits_replaced_with_zeros_found: 119
number_of_token_lowercase_and_digits_replaced_with_zeros_found: 16
number_of_loaded_word_vectors: 26476
dataset.vocabulary_size: 28984


In [8]:
import time
import copy
import os
import pickle
# Create the output folder for this run; the helper also returns the
# experiment timestamp.
stats_graph_folder, experiment_timestamp = utils.create_stats_graph_folder(parameters)

# Wall-clock time at which the experiment started.
start_time = time.time()

# Checkpoints and the pickled dataset live in a 'model' sub-folder of the run folder.
model_folder = os.path.join(stats_graph_folder, 'model')
utils.create_folder_if_not_exists(model_folder)

# Persist the dataset object next to the checkpoints. The context manager
# guarantees the file is flushed and closed even if pickling fails.
with open(os.path.join(model_folder, 'dataset.pickle'), 'wb') as dataset_pickle_file:
    pickle.dump(dataset, dataset_pickle_file)

In [9]:


# Early-stopping bookkeeping.
# NOTE(review): neither value is read or updated by the loop below.
bad_counter = 0  # number of epochs with no improvement on the validation set in terms of F1-score
previous_best_valid_f1_score = 0

# Runs epochs 0 through 11 inclusive.
# NOTE(review): this bound is hard-coded; parameters['maximum_number_of_epochs']
# and parameters['patience'] are not consulted here.
for epoch_number in range(12):
    step = 0
    print('\nStarting epoch {0}'.format(epoch_number))

    epoch_start_time = time.time()

    # Epoch 0 performs no training, so the first evaluation below runs on the
    # freshly initialised model.
    if epoch_number != 0:
        # Visit every training sequence exactly once, in a new random order each epoch.
        sequence_numbers = list(range(len(dataset.token_indices['train'])))
        random.shuffle(sequence_numbers)
        training_sequence_count = len(sequence_numbers)
        for sequence_number in sequence_numbers:
            transition_params_trained = train.train_step(
                sess, dataset, sequence_number, model, parameters['dropout_rate'])
            step += 1
            # Refresh the single-line progress indicator every 10 sequences.
            if step % 10 == 0:
                progress_message = 'Training {0:.2f}% done'.format(step / training_sequence_count * 100)
                print(progress_message, end='\r', flush=True)

    epoch_elapsed_training_time = time.time() - epoch_start_time
    print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

    # Predict labels with the current model and transition parameters; the
    # returned values are not used further in this cell.
    y_pred, y_true, output_filepaths = train.predict_labels_lite(
        sess=sess,
        model=model,
        transition_params_trained=transition_params_trained,
        dataset=dataset,
        epoch_number=epoch_number,
        stats_graph_folder=stats_graph_folder,
        dataset_filepaths=dataset_filepaths,
        tagging_format=parameters['tagging_format'],
        main_evaluation_mode=parameters['main_evaluation_mode'],
        use_crf=parameters['use_crf'],
    )

    # Write a checkpoint on every fifth epoch (0, 5, 10).
    if epoch_number % 5 == 0:
        checkpoint_name = 'model_{0:05d}.ckpt'.format(epoch_number)
        model.saver.save(sess, os.path.join(model_folder, checkpoint_name))


Starting epoch 0
Training completed in 0.00 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

      B-LOC     0.0025    0.0317    0.0046      1041
     B-MISC     0.0020    0.1503    0.0040       858
      B-ORG     0.0132    0.0004    0.0008      2485
      B-PER     0.0064    0.0049    0.0056      4284
      E-LOC     0.0066    0.0029    0.0040      1041
     E-MISC     0.0000    0.0000    0.0000       858
      E-ORG     0.0000    0.0000    0.0000      2485
      E-PER     0.0353    0.0299    0.0323      4284
      I-LOC     0.0000    0.0000    0.0000       116
     I-MISC     0.0182    0.0034    0.0057       297
      I-ORG     0.0021    0.0033    0.0026      1219
      I-PER     0.0002    0.0041    0.0003       244
          O     0.0000    0.0000    0.0000    163169
      S-LOC     0.0075    0.0021    0.0033      6099
     S-MISC     0.0108    0.0023    0.0038      2580
      S-ORG     0.0102    0.2487    0.0196      3836
      S-PER  

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

      B-LOC     0.8180    0.6388    0.7174      1041
     B-MISC     0.7781    0.3473    0.4803       858
      B-ORG     0.8572    0.5557    0.6743      2485
      B-PER     0.9355    0.9307    0.9331      4284
      E-LOC     0.8136    0.6330    0.7120      1041
     E-MISC     0.7401    0.3252    0.4518       858
      E-ORG     0.8717    0.5658    0.6862      2485
      E-PER     0.9452    0.9381    0.9417      4284
      I-LOC     0.0000    0.0000    0.0000       116
     I-MISC     0.0000    0.0000    0.0000       297
      I-ORG     0.8228    0.3733    0.5135      1219
      I-PER     0.8000    0.3115    0.4484       244
          O     0.0000    0.0000    0.0000       335
      S-LOC     0.8895    0.9003    0.8949      6099
     S-MISC     0.8696    0.6279    0.7292      2580
      S-ORG     0.9039    0.6794    0.7757      3836
      S-PER     0.9092    0.7008    0.7915      2316

avg / total     0.8667    0.7146    0.7733  


Evaluate model on the test set
             precision    recall  f1-score   support

      B-LOC     0.8018    0.7672    0.7841       232
     B-MISC     0.6066    0.6271    0.6167       177
      B-ORG     0.8000    0.8428    0.8209       579
      B-PER     0.9692    0.9853    0.9772      1086
      E-LOC     0.8053    0.7845    0.7948       232
     E-MISC     0.6484    0.6667    0.6574       177
      E-ORG     0.8062    0.8480    0.8266       579
      E-PER     0.9753    0.9825    0.9789      1086
      I-LOC     0.7500    0.1200    0.2069        25
     I-MISC     0.5714    0.5128    0.5405        39
      I-ORG     0.7390    0.8516    0.7913       256
      I-PER     0.9833    0.8429    0.9077        70
          O     0.0000    0.0000    0.0000       257
      S-LOC     0.8979    0.9366    0.9168      1436
     S-MISC     0.8760    0.8076    0.8404       525
      S-ORG     0.9137    0.8420    0.8764      1082
      S-PER     0.9163    0.8456    0.8795       531

avg / total 


Evaluate model on the valid set
             precision    recall  f1-score   support

      B-LOC     0.9434    0.8547    0.8969       234
     B-MISC     0.8542    0.6381    0.7305       257
      B-ORG     0.7891    0.8400    0.8138       450
      B-PER     0.9689    0.9862    0.9775      1234
      E-LOC     0.9384    0.8462    0.8899       234
     E-MISC     0.9119    0.6848    0.7822       257
      E-ORG     0.8063    0.8600    0.8323       450
      E-PER     0.9697    0.9862    0.9779      1234
      I-LOC     0.8000    0.5217    0.6316        23
     I-MISC     0.8571    0.4719    0.6087        89
      I-ORG     0.7166    0.8738    0.7874       301
      I-PER     0.9298    0.7260    0.8154        73
          O     0.0000    0.0000    0.0000       169
      S-LOC     0.9524    0.9744    0.9633      1603
     S-MISC     0.9183    0.8962    0.9072       665
      S-ORG     0.9427    0.8866    0.9138       891
      S-PER     0.8343    0.9523    0.8894       608

avg / total



Starting epoch 9
Training completed in 565.00 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

      B-LOC     0.9513    0.9193    0.9350      1041
     B-MISC     0.9476    0.8217    0.8801       858
      B-ORG     0.9155    0.9457    0.9303      2485
      B-PER     0.9767    0.9886    0.9826      4284
      E-LOC     0.9486    0.9212    0.9347      1041
     E-MISC     0.9502    0.8228    0.8819       858
      E-ORG     0.9151    0.9453    0.9299      2485
      E-PER     0.9795    0.9904    0.9849      4284
      I-LOC     0.8387    0.6724    0.7464       116
     I-MISC     0.9293    0.6195    0.7434       297
      I-ORG     0.9048    0.9048    0.9048      1219
      I-PER     0.9120    0.9344    0.9231       244
          O     0.0000    0.0000    0.0000       378
      S-LOC     0.9757    0.9759    0.9758      6099
     S-MISC     0.9466    0.9283    0.9374      2580
      S-ORG     0.9552    0.9445    0.9498      3836
      S-PE


Evaluate model on the test set
             precision    recall  f1-score   support

      B-LOC     0.7786    0.9095    0.8390       232
     B-MISC     0.6178    0.6667    0.6413       177
      B-ORG     0.8790    0.8152    0.8459       579
      B-PER     0.9783    0.9954    0.9868      1086
      E-LOC     0.7786    0.9095    0.8390       232
     E-MISC     0.6387    0.6893    0.6630       177
      E-ORG     0.8901    0.8256    0.8566       579
      E-PER     0.9764    0.9917    0.9840      1086
      I-LOC     0.4872    0.7600    0.5938        25
     I-MISC     0.5854    0.6154    0.6000        39
      I-ORG     0.8167    0.8008    0.8087       256
      I-PER     0.9324    0.9857    0.9583        70
          O     0.0000    0.0000    0.0000       265
      S-LOC     0.9334    0.9366    0.9350      1436
     S-MISC     0.8477    0.8267    0.8370       525
      S-ORG     0.9061    0.8919    0.8989      1082
      S-PER     0.9241    0.8945    0.9091       531

avg / total 

In [10]:


# Index used to name the deploy text file and the prediction outputs.
# NOTE(review): nothing in this cell increments it, so every call to
# predict() reuses index 0 (temp_00000.txt).
prediction_count = 0


def predict(text):
    """Run the trained model on a raw string and return the entities found.

    The text is written to ``../data/temp/deploy/temp_<count>.txt``, the
    shared ``dataset`` is updated with that single-file "deploy" set, labels
    are predicted with the current ``sess``/``model``/
    ``transition_params_trained``, and the prediction is converted to BRAT
    and read back.

    Side effects: any existing ``deploy*`` file or folder under
    ``../data/temp`` is deleted, a new stats/graph folder is created on every
    call, and ``dataset`` is updated in place. The shared ``parameters`` dict
    is left untouched.

    Args:
        text: the raw text to annotate.

    Returns:
        The entities returned by ``brat2conll.get_entities_from_brat`` for the
        predicted annotation file (in the recorded run: a list of dicts with
        ``id``, ``type``, ``start``, ``end`` and ``text`` keys).

    Raises:
        AssertionError: if the text read back from the BRAT output differs
            from ``text``.
    """
    # Work on a shallow copy so that redirecting the dataset folder to the
    # temporary deploy location does not leak into the shared configuration
    # (which the dataset-loading and stats-folder cells read on re-run).
    deploy_parameters = dict(parameters)
    deploy_parameters['dataset_text_folder'] = os.path.join('..', 'data', 'temp')
    deploy_text_folder = deploy_parameters['dataset_text_folder']
    stats_graph_folder, _ = utils.create_stats_graph_folder(deploy_parameters)

    # Update the deploy folder, file, and dataset
    dataset_type = 'deploy'
    ### Delete all deployment data left over from a previous call
    for filepath in glob.glob(os.path.join(deploy_text_folder, '{0}*'.format(dataset_type))):
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)
    ### Create brat folder and file
    dataset_brat_deploy_folder = os.path.join(deploy_text_folder, dataset_type)
    utils.create_folder_if_not_exists(dataset_brat_deploy_folder)
    dataset_brat_deploy_filepath = os.path.join(
        dataset_brat_deploy_folder, 'temp_{0}.txt'.format(str(prediction_count).zfill(5)))
    with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f:
        f.write(text)
    ### Resolve the deploy filepaths (local to this call; the module-level
    ### dataset_filepaths / dataset_brat_folders are not modified)
    dataset_filepaths, dataset_brat_folders = utils.get_valid_dataset_filepaths(
        deploy_parameters, dataset_types=[dataset_type])
    ### Update the dataset for the new deploy set
    dataset.update_dataset(dataset_filepaths, [dataset_type])

    # Predict labels and output brat
    output_filepaths = {}
    prediction_output = train.prediction_step_lite(sess, dataset, dataset_type, model,
                                                   transition_params_trained, stats_graph_folder,
                                                   prediction_count, dataset_filepaths,
                                                   deploy_parameters['tagging_format'],
                                                   deploy_parameters['main_evaluation_mode'])
    # Only the third element (the output file path) is needed here.
    _, _, output_filepaths[dataset_type] = prediction_output
    conll2brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=True)

    # Print and output result
    brat_deploy_output_folder = os.path.join(stats_graph_folder, 'brat', 'deploy')
    text_filepath = os.path.join(brat_deploy_output_folder,
                                 os.path.basename(dataset_brat_deploy_filepath))
    annotation_filepath = os.path.join(brat_deploy_output_folder, '{0}.ann'.format(
        utils.get_basename_without_extension(dataset_brat_deploy_filepath)))
    text2, entities = brat2conll.get_entities_from_brat(text_filepath, annotation_filepath, verbose=True)
    # Sanity check: the round trip through BRAT must preserve the input text.
    assert text == text2, 'Text read back from the BRAT output differs from the input text'
    return entities


In [16]:
# Try the prediction path on a single sentence; the cell displays the list of
# entities returned by predict().
# NOTE(review): the output recorded below shows the text
# "her name are Pham Ngoc Linh" (entity offsets 13-27), i.e. it was produced
# by a different input string than the one in this call — re-run the cell to
# refresh the output.
predict('her name i Pham Ngoc Linh')

Formatting deploy set from BRAT to CONLL... Done.
Converting CONLL from BIO to BIOES format... Done.
Predict labels for the deploy set


  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

      B-LOC     0.0000    0.0000    0.0000         0
     B-MISC     0.0000    0.0000    0.0000         0
      B-ORG     0.0000    0.0000    0.0000         0
      B-PER     0.0000    0.0000    0.0000         3

avg / total     0.0000    0.0000    0.0000         3

Formatting 000_deploy set from CONLL to BRAT... Done.

text:
her name are Pham Ngoc Linh

entity: {'id': 'T1', 'type': 'ORG', 'start': 13, 'end': 27, 'text': 'Pham Ngoc Linh'}





[{'end': 27, 'id': 'T1', 'start': 13, 'text': 'Pham Ngoc Linh', 'type': 'ORG'}]