In [1]:
import matplotlib

matplotlib.use('Agg')
import train
import dataset as ds
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

import utils
import os
import conll2brat
import glob
import codecs
import shutil
import time
import copy
import evaluate
import random
import pickle
import brat2conll
import numpy as np
import utils_nlp
import distutils.util as distutils_util
import configparser
from pprint import pprint

  from ._conv import register_converters as _register_converters


In [2]:
# Single configuration mapping for this notebook: the dataset-loading, model-construction,
# embedding-loading and training cells below all read their settings from it.
parameters = dict(
    pretrained_model_folder='../model',
    dataset_text_folder='../../../ML_EntityData/data/en',
    character_embedding_dimension=25,
    character_lstm_hidden_state_dimension=25,
    check_for_digits_replaced_with_zeros=True,
    check_for_lowercase=True,
    debug=False,
    dropout_rate=0.5,
    experiment_name='test',
    freeze_token_embeddings=False,
    gradient_clipping_value=5.0,
    learning_rate=0.005,
    load_only_pretrained_token_embeddings=False,
    load_all_pretrained_token_embeddings=False,
    main_evaluation_mode='conll',
    maximum_number_of_epochs=500,
    number_of_cpu_threads=8,
    number_of_gpus=0,
    optimizer='sgd',
    output_folder='../../../ML_EntityData/output',
    patience=10,
    plot_format='pdf',
    reload_character_embeddings=True,
    reload_character_lstm=True,
    reload_crf=True,
    reload_feedforward=True,
    reload_token_embeddings=True,
    reload_token_lstm=True,
    remap_unknown_tokens_to_unk=True,
    spacylanguage='en',
    tagging_format='bioes',
    token_embedding_dimension=100,
    token_lstm_hidden_state_dimension=100,
    token_pretrained_embedding_filepath='../../../ML_EntityData/embedding/en/glove.6B.100d.txt',
    tokenizer='spacy',
    train_model=True,
    use_character_lstm=True,
    use_crf=True,
    use_pretrained_model=False,
    verbose=False,
)

In [3]:
import utils
import dataset as ds
# Load dataset
# Resolve the dataset files (and their BRAT folders) from the configured locations.
dataset_filepaths, dataset_brat_folders = utils.get_valid_dataset_filepaths(parameters)
# Take the verbosity/debug switches from the shared configuration instead of hard-coding
# them, so flipping `parameters['verbose']` or `parameters['debug']` actually affects loading.
dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
# `load_dataset` returns the token -> vector mapping that the embedding-loading cell consumes.
token_to_vector = dataset.load_dataset(dataset_filepaths, parameters)

Checking the validity of BRAT-formatted train set... Done.
Checking compatibility between CONLL and BRAT for train_compatible_with_brat set ... Done.
Checking validity of CONLL BIOES format... Done.
Checking the validity of BRAT-formatted valid set... Done.
Checking compatibility between CONLL and BRAT for valid_compatible_with_brat set ... Done.
Checking validity of CONLL BIOES format... Done.
Checking the validity of BRAT-formatted test set... Done.
Checking compatibility between CONLL and BRAT for test_compatible_with_brat set ... Done.
Checking validity of CONLL BIOES format... Done.
Load dataset... done (47.02 seconds)


In [4]:
import tensorflow as tf
from BLSTM_CRF import Char_BLSTM_CRF
# Create the TensorFlow session and the character-level BLSTM + CRF model.
cpu_thread_count = parameters['number_of_cpu_threads']
session_conf = tf.ConfigProto(
    intra_op_parallelism_threads=cpu_thread_count,
    inter_op_parallelism_threads=cpu_thread_count,
    device_count={'CPU': 2, 'GPU': parameters['number_of_gpus']},
    # Automatically fall back to an existing, supported device when the requested one
    # does not exist.
    allow_soft_placement=True,
    log_device_placement=False,
)
sess = tf.Session(config=session_conf)

# Each of these constructor keywords has the same name as its `parameters` key, so the
# keyword arguments are assembled straight from the configuration.
model_hyperparameter_names = (
    'token_embedding_dimension',
    'character_lstm_hidden_state_dimension',
    'token_lstm_hidden_state_dimension',
    'character_embedding_dimension',
    'gradient_clipping_value',
    'learning_rate',
    'freeze_token_embeddings',
    'optimizer',
    'maximum_number_of_epochs',
)

with sess.as_default():
    # Instantiate the model while `sess` is the default session.
    model = Char_BLSTM_CRF(
        dataset=dataset,
        **{name: parameters[name] for name in model_hyperparameter_names}
    )

# Initialise every variable of the graph that was just built.
sess.run(tf.global_variables_initializer())

embedded_characters: Tensor("character_embedding/embedded_characters:0", shape=(?, ?, 25), dtype=float32)
embedded_tokens: Tensor("token_embedding/embedding_lookup:0", shape=(?, 100), dtype=float32)
token_lstm_input: Tensor("concatenate_token_and_character_vectors/token_lstm_input:0", shape=(?, 150), dtype=float32)
token_lstm_input_drop: Tensor("dropout/token_lstm_input_drop/mul:0", shape=(?, 150), dtype=float32)
token_lstm_input_drop_expanded: Tensor("dropout/token_lstm_input_drop_expanded:0", shape=(1, ?, 150), dtype=float32)
unary_scores_expanded: Tensor("crf/unary_scores_expanded:0", shape=(1, ?, 19), dtype=float32)
input_label_indices_flat_batch: Tensor("crf/input_label_indices_flat_batch:0", shape=(1, ?), dtype=int32)
sequence_lengths: Tensor("crf/sequence_lengths:0", shape=(1,), dtype=int32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [5]:
import numpy as np
# Load the pretrained token embeddings into the model, reusing the token -> vector
# mapping returned when the dataset was loaded.
model.load_pretrained_token_embeddings(
    sess,
    dataset,
    embedding_filepath=parameters['token_pretrained_embedding_filepath'],
    check_lowercase=parameters['check_for_lowercase'],
    check_digits=parameters['check_for_digits_replaced_with_zeros'],
    token_to_vector=token_to_vector,
)

# Random initial transition matrix, used for the epoch-0 evaluation before any training
# step has produced trained values.
# NOTE(review): the "+ 2" presumably accounts for extra CRF start/end labels - confirm
# against BLSTM_CRF.
transition_matrix_size = len(dataset.unique_labels) + 2
transition_params_trained = np.random.rand(transition_matrix_size, transition_matrix_size)

# The mapping is not needed after this point; drop the reference to free memory.
del token_to_vector

Load token embeddings... done (0.37 seconds)
number_of_token_original_case_found: 14618
number_of_token_lowercase_found: 11723
number_of_token_digits_replaced_with_zeros_found: 119
number_of_token_lowercase_and_digits_replaced_with_zeros_found: 16
number_of_loaded_word_vectors: 26476
dataset.vocabulary_size: 28984


In [6]:
import time
import copy
import os
import pickle
# Create the folder that receives this run's statistics/graphs; the helper also returns
# the experiment timestamp.
stats_graph_folder, experiment_timestamp = utils.create_stats_graph_folder(parameters)

# Wall-clock start of the run.
start_time = time.time()

# Checkpoints and the pickled dataset live in a `model` sub-folder of the stats folder.
model_folder = os.path.join(stats_graph_folder, 'model')
utils.create_folder_if_not_exists(model_folder)

# Persist the dataset object next to the checkpoints. The context manager guarantees the
# file handle is flushed and closed even if pickling fails.
with open(os.path.join(model_folder, 'dataset.pickle'), 'wb') as dataset_file:
    pickle.dump(dataset, dataset_file)

In [7]:


# These two trackers are initialised here but are not updated anywhere in this loop.
bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
previous_best_valid_f1_score = 0

# Epoch 0 skips training and only evaluates the initial weights; epochs 1 to 6 each make
# one shuffled pass over the training sequences and then evaluate.
for epoch_number in range(7):
    step = 0
    print('\nStarting epoch {0}'.format(epoch_number))

    epoch_start_time = time.time()

    if epoch_number > 0:
        # Visit every training sequence exactly once, in a fresh random order.
        sequence_numbers = list(range(len(dataset.token_indices['train'])))
        random.shuffle(sequence_numbers)
        total_sequences = len(sequence_numbers)
        for step, sequence_number in enumerate(sequence_numbers, start=1):
            transition_params_trained = train.train_step(
                sess, dataset, sequence_number, model, parameters['dropout_rate'])
            # Refresh the in-place progress line every 10 sequences.
            if step % 10 == 0:
                print('Training {0:.2f}% done'.format(step / total_sequences * 100), end='\r', flush=True)

    epoch_elapsed_training_time = time.time() - epoch_start_time
    print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

    # Predict with the current weights and transition parameters.
    y_pred, y_true, output_filepaths = train.predict_labels_lite(
        sess=sess,
        model=model,
        transition_params_trained=transition_params_trained,
        dataset=dataset,
        epoch_number=epoch_number,
        stats_graph_folder=stats_graph_folder,
        dataset_filepaths=dataset_filepaths,
        tagging_format=parameters['tagging_format'],
        main_evaluation_mode=parameters['main_evaluation_mode'],
        use_crf=parameters['use_crf'],
    )

    # Write a checkpoint on every third epoch (0, 3, 6).
    if epoch_number % 3 == 0:
        checkpoint_name = 'model_{0:05d}.ckpt'.format(epoch_number)
        model.saver.save(sess, os.path.join(model_folder, checkpoint_name))


Starting epoch 0
Training completed in 0.00 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

      B-LOC     0.0013    0.0096    0.0022      1041
     B-MISC     0.0039    0.1597    0.0076       858
      B-ORG     0.0094    0.0249    0.0137      2485
      B-PER     0.0756    0.0929    0.0834      4284
      E-LOC     0.0000    0.0000    0.0000      1041
     E-MISC     0.0000    0.0000    0.0000       858
      E-ORG     0.0101    0.0266    0.0146      2485
      E-PER     0.0119    0.2054    0.0225      4284
      I-LOC     0.0016    0.0431    0.0030       116
     I-MISC     0.0032    0.0640    0.0061       297
      I-ORG     0.0020    0.0049    0.0028      1219
      I-PER     0.0009    0.0369    0.0018       244
          O     0.0000    0.0000    0.0000    167594
      S-LOC     0.0377    0.0087    0.0141      6099
     S-MISC     0.0100    0.1446    0.0187      2580
      S-ORG     0.0831    0.1025    0.0918      3836
      S-PER  

  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

      B-LOC     0.7743    0.7118    0.7417      1041
     B-MISC     0.7000    0.5303    0.6034       858
      B-ORG     0.8161    0.6197    0.7045      2485
      B-PER     0.9228    0.9573    0.9397      4284
      E-LOC     0.7588    0.7012    0.7289      1041
     E-MISC     0.6822    0.5303    0.5967       858
      E-ORG     0.8319    0.6394    0.7231      2485
      E-PER     0.9268    0.9601    0.9431      4284
      I-LOC     0.0000    0.0000    0.0000       116
     I-MISC     1.0000    0.0202    0.0396       297
      I-ORG     0.6807    0.5176    0.5881      1219
      I-PER     0.8559    0.3893    0.5352       244
          O     0.0000    0.0000    0.0000       797
      S-LOC     0.8764    0.9311    0.9029      6099
     S-MISC     0.9135    0.6225    0.7404      2580
      S-ORG     0.8970    0.7560    0.8205      3836
      S-PER     0.9159    0.7526    0.8263      2316

avg / total     0.8439    0.7573    0.7897  


Evaluate model on the test set
             precision    recall  f1-score   support

      B-LOC     0.8009    0.7974    0.7991       232
     B-MISC     0.6532    0.6384    0.6457       177
      B-ORG     0.7473    0.8377    0.7899       579
      B-PER     0.9656    0.9834    0.9745      1086
      E-LOC     0.7741    0.7974    0.7856       232
     E-MISC     0.6744    0.6554    0.6648       177
      E-ORG     0.7593    0.8497    0.8020       579
      E-PER     0.9683    0.9834    0.9758      1086
      I-LOC     0.0000    0.0000    0.0000        25
     I-MISC     0.7778    0.5385    0.6364        39
      I-ORG     0.7209    0.8477    0.7792       256
      I-PER     0.9355    0.8286    0.8788        70
          O     0.0000    0.0000    0.0000       349
      S-LOC     0.9033    0.9304    0.9166      1436
     S-MISC     0.8720    0.8171    0.8437       525
      S-ORG     0.8763    0.8771    0.8767      1082
      S-PER     0.9202    0.8475    0.8824       531

avg / total 


Evaluate model on the valid set
             precision    recall  f1-score   support

      B-LOC     0.8655    0.8803    0.8729       234
     B-MISC     0.8137    0.6459    0.7202       257
      B-ORG     0.8213    0.7867    0.8036       450
      B-PER     0.9745    0.9911    0.9827      1234
      E-LOC     0.8589    0.8846    0.8716       234
     E-MISC     0.8689    0.6965    0.7732       257
      E-ORG     0.8472    0.8133    0.8299       450
      E-PER     0.9752    0.9887    0.9819      1234
      I-LOC     0.9474    0.7826    0.8571        23
     I-MISC     0.8571    0.4719    0.6087        89
      I-ORG     0.7346    0.8738    0.7982       301
      I-PER     0.9623    0.6986    0.8095        73
          O     0.0000    0.0000    0.0000       141
      S-LOC     0.9641    0.9713    0.9677      1603
     S-MISC     0.9134    0.8887    0.9009       665
      S-ORG     0.9352    0.8743    0.9037       891
      S-PER     0.8455    0.9539    0.8964       608

avg / total

In [14]:


# Counter read by predict() to name the temporary deploy file and label its outputs.
prediction_count = 0


def predict(text):
    """Tag ``text`` with the trained model and return the entities found in it.

    The text is written to a temporary BRAT ``deploy`` folder, loaded into the
    notebook-level ``dataset``, labelled with the notebook-level ``model`` and
    ``transition_params_trained``, converted back to BRAT, and finally parsed
    into entities.

    Relies on these notebook globals: ``parameters``, ``dataset``, ``sess``,
    ``model``, ``transition_params_trained`` and ``prediction_count`` (which
    is only read here, never incremented).

    Args:
        text: raw text to annotate.

    Returns:
        The entities returned by ``brat2conll.get_entities_from_brat`` for the
        annotated text.

    Raises:
        AssertionError: if the text read back from the BRAT output differs
            from ``text``.
    """
    # Work on a copy: the notebook-wide `parameters` must keep pointing at the real
    # dataset folder so that re-running earlier cells is unaffected by a prediction.
    deploy_parameters = dict(parameters)
    deploy_parameters['dataset_text_folder'] = os.path.join('..', 'data', 'temp')
    stats_graph_folder, _ = utils.create_stats_graph_folder(deploy_parameters)

    # Update the deploy folder, file, and dataset
    dataset_type = 'deploy'
    ### Delete all deployment data left over from a previous call
    stale_pattern = os.path.join(deploy_parameters['dataset_text_folder'], '{0}*'.format(dataset_type))
    for filepath in glob.glob(stale_pattern):
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)

    ### Create brat folder and file
    dataset_brat_deploy_folder = os.path.join(deploy_parameters['dataset_text_folder'], dataset_type)
    utils.create_folder_if_not_exists(dataset_brat_deploy_folder)
    dataset_brat_deploy_filepath = os.path.join(
        dataset_brat_deploy_folder,
        'temp_{0}.txt'.format(str(prediction_count).zfill(5)))
    with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f:
        f.write(text)

    ### Resolve the deploy filepaths. Distinct local names avoid shadowing the notebook's
    ### `dataset_filepaths` / `dataset_brat_folders`.
    deploy_filepaths, deploy_brat_folders = utils.get_valid_dataset_filepaths(
        deploy_parameters, dataset_types=[dataset_type])

    ### Update the dataset for the new deploy set
    dataset.update_dataset(deploy_filepaths, [dataset_type])

    # Predict labels and output brat
    output_filepaths = {}
    prediction_output = train.prediction_step_lite(sess, dataset, dataset_type, model,
                                                   transition_params_trained, stats_graph_folder,
                                                   prediction_count, deploy_filepaths,
                                                   deploy_parameters['tagging_format'],
                                                   deploy_parameters['main_evaluation_mode'])
    predictions, _, output_filepaths[dataset_type] = prediction_output

    print([dataset.index_to_label[prediction] for prediction in predictions])
    conll2brat.output_brat(output_filepaths, deploy_brat_folders, stats_graph_folder, overwrite=True)

    # Print and output result
    brat_output_folder = os.path.join(stats_graph_folder, 'brat', 'deploy')
    text_filepath = os.path.join(brat_output_folder, os.path.basename(dataset_brat_deploy_filepath))
    annotation_filepath = os.path.join(brat_output_folder, '{0}.ann'.format(
        utils.get_basename_without_extension(dataset_brat_deploy_filepath)))
    text2, entities = brat2conll.get_entities_from_brat(text_filepath, annotation_filepath, verbose=True)
    # The text must survive the round trip unchanged, otherwise entity offsets are unreliable.
    assert text == text2, 'text read back from the BRAT output differs from the input text'
    return entities


In [15]:
# Smoke-test predict() on a single sentence; the returned entity list is the cell's output.
predict('her name is Pham Ngoc Linh')

Formatting deploy set from BRAT to CONLL... Done.
Converting CONLL from BIO to BIOES format... Done.
Predict labels for the deploy set


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

      B-LOC     0.0000    0.0000    0.0000         0
     B-MISC     0.0000    0.0000    0.0000         0
      B-ORG     0.0000    0.0000    0.0000         0
      B-PER     0.0000    0.0000    0.0000         3

avg / total     0.0000    0.0000    0.0000         3

['O', 'O', 'O', 'B-PER', 'I-PER', 'E-PER']
Formatting 000_deploy set from CONLL to BRAT... Done.

text:
her name is Pham Ngoc Linh

entity: {'id': 'T1', 'type': 'PER', 'start': 12, 'end': 26, 'text': 'Pham Ngoc Linh'}





[{'end': 26, 'id': 'T1', 'start': 12, 'text': 'Pham Ngoc Linh', 'type': 'PER'}]

In [40]:
import en_core_web_sm
# Load the spaCy pipeline once at notebook level; get_sentences() reads this global `nlp`.
nlp = en_core_web_sm.load()
def get_sentences(text):
    """Split ``text`` into sentences of token dictionaries using the global ``nlp``.

    Args:
        text: raw text to segment and tokenize.

    Returns:
        A list with one entry per sentence. Each entry is a list of dicts with
        keys ``'start'`` and ``'end'`` (character offsets into ``text``) and
        ``'text'`` (the token text, with any inner space replaced by a hyphen).
        Whitespace-only tokens are dropped; a sentence made only of such tokens
        yields an empty list.
    """
    document = nlp(text)
    all_sentences = []
    for sentence_span in document.sents:
        kept_tokens = []
        for token in sentence_span:
            begin = token.idx
            finish = begin + len(token)
            surface = text[begin:finish]

            # Skip tokens that are nothing but whitespace.
            if not surface.strip():
                continue

            # Make sure that the token text does not contain any space
            if ' ' in surface:
                hyphenated = surface.replace(' ', '-')
                print(
                    "WARNING: the text of the token contains space character, replaced with hyphen\n\t{0}\n\t{1}".format(
                        surface,
                        hyphenated))
                surface = hyphenated

            kept_tokens.append({'start': begin, 'end': finish, 'text': surface})
        all_sentences.append(kept_tokens)
    return all_sentences

In [46]:
# Show the token dictionaries (character offsets and text) produced for a sample sentence.
get_sentences('her name is Pham Ngoc Linh')

[[{'end': 3, 'start': 0, 'text': 'her'},
  {'end': 8, 'start': 4, 'text': 'name'},
  {'end': 11, 'start': 9, 'text': 'is'},
  {'end': 16, 'start': 12, 'text': 'Pham'},
  {'end': 21, 'start': 17, 'text': 'Ngoc'},
  {'end': 26, 'start': 22, 'text': 'Linh'}]]