In [1]:
import matplotlib

matplotlib.use('Agg')
import train_P as train
import dataset as ds
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

import utils
import os
import conll2brat
import glob
import codecs
import shutil
import time
import copy
import evaluate
import random
import pickle
import brat2conll
import numpy as np
import utils_nlp
import distutils.util as distutils_util
import configparser
from pprint import pprint

  from ._conv import register_converters as _register_converters


In [2]:
parameters = {'pretrained_model_folder':'../model',
                      'dataset_text_folder':'../../../ML_EntityData/data/en',
                      'character_embedding_dimension':25,
                      'character_lstm_hidden_state_dimension':25,
                      'check_for_digits_replaced_with_zeros':True,
                      'check_for_lowercase':True,
                      'debug':False,
                      'dropout_rate':0.5,
                      'experiment_name':'test',
                      'freeze_token_embeddings':False,
                      'gradient_clipping_value':5.0,
                      'learning_rate':0.005,
                      'load_only_pretrained_token_embeddings':False,
                      'load_all_pretrained_token_embeddings':False,
#                       'main_evaluation_mode':'conll',
                      'maximum_number_of_epochs':500,
                      'number_of_cpu_threads':8,
                      'number_of_gpus':0,
                      'optimizer':'sgd',
                      'output_folder':'../../../ML_EntityData/output',
                      'patience':10,
#                       'plot_format':'pdf',
                      'reload_character_embeddings':True,
                      'reload_character_lstm':True,
                      'reload_crf':True,
                      'reload_feedforward':True,
                      'reload_token_embeddings':True,
                      'reload_token_lstm':True,
                      'remap_unknown_tokens_to_unk':True,
#                       'spacylanguage':'en',
                      'tagging_format':'bioes',
                      'token_embedding_dimension':100,
                      'token_lstm_hidden_state_dimension':100,
                      'token_pretrained_embedding_filepath':'../../../ML_EntityData/embedding/en/glove.6B.100d.txt',
#                       'tokenizer':'spacy',
                      'train_model':True,
                      'use_character_lstm':True,
#                       'use_crf':True,
                      'use_pretrained_model':False,
                      'verbose':False}

In [3]:
import utils
import utils_data as ds
# Load dataset
dataset_filepaths, dataset_brat_folders = utils.get_valid_dataset_filepaths(parameters)
dataset = ds.DatasetP(verbose=False, debug=False)
token_to_vector = dataset.load_dataset(dataset_filepaths, parameters)

Checking the validity of BRAT-formatted train set... Done.
Checking compatibility between CONLL and BRAT for train_compatible_with_brat set ... Done.
Checking validity of CONLL BIOES format... Done.
Checking the validity of BRAT-formatted valid set... Done.
Checking compatibility between CONLL and BRAT for valid_compatible_with_brat set ... Done.
Checking validity of CONLL BIOES format... Done.
Checking the validity of BRAT-formatted test set... Done.
Checking compatibility between CONLL and BRAT for test_compatible_with_brat set ... Done.
Checking validity of CONLL BIOES format... Done.
Load dataset... done (57.36 seconds)


In [4]:
import tensorflow as tf
from BLSTM_CRF_P import BLSTM_CRF
# Create model lstm+crf
session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={'CPU': 2, 'GPU': parameters['number_of_gpus']},
            allow_soft_placement=True,
            # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
            log_device_placement=False
        )
sess = tf.Session(config=session_conf)

with sess.as_default():
    # Create model and initialize or load pretrained model
    ### Instantiate the model
    model = BLSTM_CRF(dataset, token_embedding_dimension=parameters['token_embedding_dimension'],
                       character_lstm_hidden_state_dimension=parameters['character_lstm_hidden_state_dimension'],
                       token_lstm_hidden_state_dimension=parameters['token_lstm_hidden_state_dimension'],
                       character_embedding_dimension=parameters['character_embedding_dimension'],
                       gradient_clipping_value=parameters['gradient_clipping_value'],
                       learning_rate=parameters['learning_rate'],
                       freeze_token_embeddings=parameters['freeze_token_embeddings'],
                       optimizer=parameters['optimizer'],
                       maximum_number_of_epochs=parameters['maximum_number_of_epochs'])

sess.run(tf.global_variables_initializer())

embedded_characters: Tensor("character_embedding/embedded_characters:0", shape=(?, ?, 25), dtype=float32)
embedded_tokens: Tensor("token_embedding/embedding_lookup:0", shape=(?, 100), dtype=float32)
token_lstm_input: Tensor("concatenate_token_and_character_vectors/token_lstm_input:0", shape=(?, 163), dtype=float32)
token_lstm_input_drop: Tensor("dropout/token_lstm_input_drop/mul:0", shape=(?, 163), dtype=float32)
token_lstm_input_drop_expanded: Tensor("dropout/token_lstm_input_drop_expanded:0", shape=(1, ?, 163), dtype=float32)
unary_scores_expanded: Tensor("crf/unary_scores_expanded:0", shape=(1, ?, 19), dtype=float32)
input_label_indices_flat_batch: Tensor("crf/input_label_indices_flat_batch:0", shape=(1, ?), dtype=int32)
sequence_lengths: Tensor("crf/sequence_lengths:0", shape=(1,), dtype=int32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [6]:
import numpy as np
model.load_pretrained_token_embeddings(sess, dataset,embedding_filepath=parameters['token_pretrained_embedding_filepath'],
                                                       check_lowercase= parameters['check_for_lowercase'],check_digits=parameters['check_for_digits_replaced_with_zeros'],
                                                       token_to_vector=token_to_vector)
# Initial params_train
transition_params_trained = np.random.rand(len(dataset.unique_labels) + 2,len(dataset.unique_labels) + 2)

del token_to_vector

Load token embeddings... done (0.22 seconds)
number_of_token_original_case_found: 14618
number_of_token_lowercase_found: 11723
number_of_token_digits_replaced_with_zeros_found: 119
number_of_token_lowercase_and_digits_replaced_with_zeros_found: 16
number_of_loaded_word_vectors: 26476
dataset.vocabulary_size: 28984


In [5]:
import numpy as np
transition_params_trained = model.restore_from_pretrained_model(dataset, sess , model_pathfile=('../model/en/model_00000.ckpt'),
                                                                                     dataset_pathfile=('../model/en/dataset.pickle'),
                                                                                     embedding_filepath= parameters['token_pretrained_embedding_filepath'],
                                                                                     character_dimension = parameters['character_embedding_dimension'],
                                                                                     token_dimension=parameters['token_embedding_dimension'],token_to_vector=token_to_vector)
del token_to_vector

INFO:tensorflow:Restoring parameters from ../model/en/model_00000.ckpt


In [6]:
import time
import copy
import os
import pickle
stats_graph_folder, experiment_timestamp = utils.create_stats_graph_folder(parameters)

        # Initialize and save execution details
start_time = time.time()
# results = {}
# results['epoch'] = {}
# results['execution_details'] = {}
# results['execution_details']['train_start'] = start_time
# results['execution_details']['time_stamp'] = experiment_timestamp
# results['execution_details']['early_stop'] = False
# results['execution_details']['keyboard_interrupt'] = False
# results['execution_details']['num_epochs'] = 0
# results['model_options'] = copy.copy(parameters)

model_folder = os.path.join(stats_graph_folder, 'model')
utils.create_folder_if_not_exists(model_folder)

pickle.dump(dataset, open(os.path.join(model_folder, 'dataset.pickle'), 'wb'))

In [18]:
bad_counter = 0  # number of epochs with no improvement on the validation test in terms of F1-score
previous_best_valid_f1_score = 0
epoch_number = -1

while True:

    step = 0
    epoch_number += 1
    print('\nStarting epoch {0}'.format(epoch_number))

    epoch_start_time = time.time()

    if epoch_number != 0:
        # Train model: loop over all sequences of training set with shuffling
        sequence_numbers = list(range(len(dataset.token_indices['train'])))
        random.shuffle(sequence_numbers)
        for sequence_number in sequence_numbers:
            transition_params_trained = train.train_step(sess, dataset, sequence_number, model, parameters['dropout_rate'])
            step += 1
            if step % 10 == 0:
                print('Training {0:.2f}% done'.format(step / len(sequence_numbers) * 100), end='\r', flush=True)

    epoch_elapsed_training_time = time.time() - epoch_start_time
    print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

    y_pred, y_true, output_filepaths = train.predict_labels(sess=sess,model= model,transition_params_trained= transition_params_trained,
                                                                         dataset=dataset,epoch_number= epoch_number,
                                                                        stats_graph_folder= stats_graph_folder,dataset_filepaths= dataset_filepaths,
                                                                        tagging_format= parameters['tagging_format'])

       
#     if epoch_number % 3 ==0:
    model.saver.save(sess, os.path.join(model_folder, 'model.ckpt'))
        
    if epoch_number > 2 :
        break


Starting epoch 0
Training completed in 0.00 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

      B-LOC     0.9839    0.9971    0.9905      1041
     B-MISC     0.9584    0.9942    0.9760       858
      B-ORG     0.9963    0.9859    0.9911      2485
      B-PER     0.9967    0.9979    0.9973      4284
      E-LOC     0.9829    0.9962    0.9895      1041
     E-MISC     0.9618    0.9977    0.9794       858
      E-ORG     0.9972    0.9867    0.9919      2485
      E-PER     0.9977    0.9988    0.9983      4284
      I-LOC     0.9915    1.0000    0.9957       116
     I-MISC     0.9643    1.0000    0.9818       297
      I-ORG     0.9870    0.9951    0.9910      1219
      I-PER     0.9760    1.0000    0.9879       244
          O     0.0000    0.0000    0.0000       130
      S-LOC     0.9909    0.9970    0.9940      6099
     S-MISC     0.9801    0.9907    0.9854      2580
      S-ORG     0.9979    0.9804    0.9891      3836
      S-PER  


Evaluate model on the test set
             precision    recall  f1-score   support

      B-LOC     0.7414    0.9267    0.8238       232
     B-MISC     0.6166    0.6723    0.6432       177
      B-ORG     0.8515    0.8515    0.8515       579
      B-PER     0.9791    0.9917    0.9854      1086
      E-LOC     0.7405    0.9224    0.8215       232
     E-MISC     0.6684    0.7288    0.6973       177
      E-ORG     0.8547    0.8532    0.8539       579
      E-PER     0.9764    0.9899    0.9831      1086
      I-LOC     0.4878    0.8000    0.6061        25
     I-MISC     0.5476    0.5897    0.5679        39
      I-ORG     0.7270    0.8633    0.7893       256
      I-PER     0.9706    0.9429    0.9565        70
          O     0.0000    0.0000    0.0000       344
      S-LOC     0.9302    0.9366    0.9334      1436
     S-MISC     0.8887    0.8362    0.8616       525
      S-ORG     0.9102    0.8993    0.9047      1082
      S-PER     0.9281    0.8757    0.9012       531

avg / total 

In [12]:
prediction_count=0
def predict(text):
    #         if prediction_count == 1:
    parameters['dataset_text_folder'] = os.path.join('..', 'data', 'temp')
    stats_graph_folder, _ = utils.create_stats_graph_folder(parameters)

    # Update the deploy folder, file, and dataset
    dataset_type = 'deploy'
    ### Delete all deployment data
    for filepath in glob.glob(os.path.join(parameters['dataset_text_folder'], '{0}*'.format(dataset_type))):
        if os.path.isdir(filepath):
            shutil.rmtree(filepath)
        else:
            os.remove(filepath)
    ### Create brat folder and file
    dataset_brat_deploy_folder = os.path.join(parameters['dataset_text_folder'], dataset_type)
    utils.create_folder_if_not_exists(dataset_brat_deploy_folder)
    dataset_brat_deploy_filepath = os.path.join(dataset_brat_deploy_folder, 'temp_{0}.txt'.format(
        str(prediction_count).zfill(5)))  # self._get_dataset_brat_deploy_filepath(dataset_brat_deploy_folder)
    with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f:
        f.write(text)
    ### Update deploy filepaths
    dataset_filepaths, dataset_brat_folders = utils.get_valid_dataset_filepaths(parameters,
                                                                           dataset_types=[dataset_type])
    dataset_filepaths.update(dataset_filepaths)
    dataset_brat_folders.update(dataset_brat_folders)
    ### Update the dataset for the new deploy set
    dataset.update_dataset(dataset_filepaths, [dataset_type])

    # Predict labels and output brat
    output_filepaths = {}
    prediction_output = train.prediction_step(sess, dataset, dataset_type, model,
                                              transition_params_trained, stats_graph_folder,
                                              prediction_count, dataset_filepaths, parameters['tagging_format'])
    predictions , _, output_filepaths[dataset_type] = prediction_output
    
#     print([dataset.index_to_label[prediction] for prediction in predictions])
    conll2brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder, overwrite=True)

    # Print and output result
    text_filepath = os.path.join(stats_graph_folder, 'brat', 'deploy',
                                 os.path.basename(dataset_brat_deploy_filepath))
    annotation_filepath = os.path.join(stats_graph_folder, 'brat', 'deploy', '{0}.ann'.format(
        utils.get_basename_without_extension(dataset_brat_deploy_filepath)))
    text2, entities = brat2conll.get_entities_from_brat(text_filepath, annotation_filepath, verbose=True)
    assert (text == text2)
    return entities


In [21]:
predict('my name is Phạm  Ngoc Linh')

Formatting deploy set from BRAT to CONLL... Done.
Converting CONLL from BIO to BIOES format... Done.
Predict labels for the deploy set
Formatting 000_deploy set from CONLL to BRAT... Done.

text:
my name is Phạm  Ngoc Linh

entity: {'id': 'T1', 'type': 'PER', 'start': 17, 'end': 26, 'text': 'Ngoc Linh'}





[{'end': 26, 'id': 'T1', 'start': 17, 'text': 'Ngoc Linh', 'type': 'PER'}]

In [30]:
dataset.token_indices['deploy']

[[0, 0, 0, 12506, 28, 8, 2468, 67, 4958, 11492, 9, 1],
 [0, 28, 8, 2468, 67, 4958, 11492, 9, 218],
 [0,
  0,
  28,
  218,
  361,
  341,
  28015,
  0,
  21,
  186,
  6658,
  9,
  1,
  1638,
  0,
  0,
  0,
  1,
  86,
  708,
  31,
  0,
  0,
  0,
  10,
  0,
  0,
  0,
  1,
  19,
  7,
  12506,
  6508,
  16611,
  335,
  43,
  19,
  5526,
  10,
  755,
  1522,
  4,
  3,
  2572,
  108,
  482,
  4,
  9303,
  2],
 [0,
  19,
  86,
  427,
  163,
  8,
  0,
  9,
  10,
  156,
  8,
  0,
  9,
  4,
  3,
  540,
  365,
  4,
  9303,
  8,
  839,
  9303,
  9,
  2],
 [85,
  19,
  7,
  1222,
  1980,
  5,
  3,
  0,
  4,
  3,
  540,
  365,
  4,
  9303,
  5,
  15100,
  1,
  31,
  316,
  31,
  3,
  1810,
  14,
  2861,
  4,
  9303,
  8,
  0,
  9,
  10,
  3,
  0,
  0,
  8,
  0,
  93,
  0,
  9,
  230,
  3,
  9303,
  1284,
  2],
 [0,
  0,
  0,
  629,
  3,
  0,
  0,
  923,
  1138,
  26,
  0,
  0,
  1,
  11851,
  3,
  6508,
  16,
  1611,
  540,
  365,
  4,
  9303,
  5,
  15100,
  10,
  7191,
  3,
  315,
  558,
  5,
  0,
 

In [37]:
[dataset.index_to_token[i] for i in dataset.token_indices['deploy'][2]]

['UNK',
 'UNK',
 ':',
 ';',
 '19',
 'May',
 '1890',
 'UNK',
 '2',
 'September',
 '1969',
 ')',
 ',',
 'born',
 'UNK',
 'UNK',
 'UNK',
 ',',
 'also',
 'known',
 'as',
 'UNK',
 'UNK',
 'UNK',
 'and',
 'UNK',
 'UNK',
 'UNK',
 ',',
 'was',
 'a',
 'Vietnamese',
 'Communist',
 'revolutionary',
 'leader',
 'who',
 'was',
 'Chairman',
 'and',
 'First',
 'Secretary',
 'of',
 'the',
 'Workers',
 "'",
 'Party',
 'of',
 'Vietnam',
 '.']

In [29]:
predict("Hồ Chí Minh Vietnamese:  (About this sound listen), Saigon:  (About this sound listen); Chữ nôm: ; 19 May 1890 – 2 September 1969), born Nguyễn Sinh Cung, also known as Nguyễn Tất Thành and Nguyễn Ái Quốc, was a Vietnamese Communist revolutionary leader who was Chairman and First Secretary of the Workers' Party of Vietnam. Hồ was also Prime Minister (1945–55) and President (1945–69) of the Democratic Republic of Vietnam (North Vietnam). He was a key figure in the foundation of the Democratic Republic of Vietnam in 1945, as well as the People's Army of Vietnam (PAVN) and the Việt Cộng (NLF or VC) during the Vietnam War.Hồ Chí Minh led the Việt Minh independence movement from 1941 onward, establishing the Communist-ruled Democratic Republic of Vietnam in 1945 and defeating the French Union in 1954 at the battle of Điện Biên Phủ. He officially stepped down from power in 1965 due to health problems. After the war, Saigon, the former capital of the Republic of Vietnam, was renamed Hồ Chí Minh City.Any description of Ho's life before he came to power in Vietnam is necessarily fraught with ambiguity. He is known to have used at least 50 and perhaps as many as 200 pseudonyms. (Duiker says at least 75.) His place of birth and date of birth are products of academic consensus since neither is known with certainty. \"Official biographies and there are at least four, vary on names, dates, places and other hard facts. Unofficial biographies vary even more widely.")

Formatting deploy set from BRAT to CONLL... Done.
Converting CONLL from BIO to BIOES format... Done.
Predict labels for the deploy set
Formatting 000_deploy set from CONLL to BRAT... Done.

text:
Hồ Chí Minh Vietnamese:  (About this sound listen), Saigon:  (About this sound listen); Chữ nôm: ; 19 May 1890 – 2 September 1969), born Nguyễn Sinh Cung, also known as Nguyễn Tất Thành and Nguyễn Ái Quốc, was a Vietnamese Communist revolutionary leader who was Chairman and First Secretary of the Workers' Party of Vietnam. Hồ was also Prime Minister (1945–55) and President (1945–69) of the Democratic Republic of Vietnam (North Vietnam). He was a key figure in the foundation of the Democratic Republic of Vietnam in 1945, as well as the People's Army of Vietnam (PAVN) and the Việt Cộng (NLF or VC) during the Vietnam War.Hồ Chí Minh led the Việt Minh independence movement from 1941 onward, establishing the Communist-ruled Democratic Republic of Vietnam in 1945 and defeating the French Union in 19

[{'end': 22,
  'id': 'T1',
  'start': 0,
  'text': 'Hồ Chí Minh Vietnamese',
  'type': 'MISC'},
 {'end': 153,
  'id': 'T2',
  'start': 137,
  'text': 'Nguyễn Sinh Cung',
  'type': 'PER'},
 {'end': 185, 'id': 'T3', 'start': 176, 'text': 'Tất Thành', 'type': 'PER'},
 {'end': 204,
  'id': 'T4',
  'start': 190,
  'text': 'Nguyễn Ái Quốc',
  'type': 'PER'},
 {'end': 232,
  'id': 'T5',
  'start': 212,
  'text': 'Vietnamese Communist',
  'type': 'MISC'},
 {'end': 323,
  'id': 'T6',
  'start': 298,
  'text': "Workers' Party of Vietnam",
  'type': 'ORG'},
 {'end': 423,
  'id': 'T7',
  'start': 393,
  'text': 'Democratic Republic of Vietnam',
  'type': 'ORG'},
 {'end': 438,
  'id': 'T8',
  'start': 425,
  'text': 'North Vietnam',
  'type': 'LOC'},
 {'end': 516,
  'id': 'T9',
  'start': 486,
  'text': 'Democratic Republic of Vietnam',
  'type': 'ORG'},
 {'end': 547, 'id': 'T10', 'start': 541, 'text': 'People', 'type': 'MISC'},
 {'end': 565,
  'id': 'T11',
  'start': 550,
  'text': 'Army of Vietna