# Load packages

In [1]:
!pip install git+https://github.com/keras-team/keras.git -U
!pip install seqeval
!pip install tensorflow-gpu==2.0.0-alpha0
!pip install tensorflow-hub

Collecting git+https://github.com/keras-team/keras.git
  Cloning https://github.com/keras-team/keras.git to /tmp/pip-req-build-00km_d_f
  Running command git clone -q https://github.com/keras-team/keras.git /tmp/pip-req-build-00km_d_f
Building wheels for collected packages: Keras
  Building wheel for Keras (setup.py) ... [?25l[?25hdone
  Stored in directory: /tmp/pip-ephem-wheel-cache-cc1_ocf5/wheels/da/a4/7e/6b7bd9af18cc2e23b8dd5ed6de07a7e13bd80a17214eb88932
Successfully built Keras
Installing collected packages: Keras
  Found existing installation: Keras 2.2.4
    Uninstalling Keras-2.2.4:
      Successfully uninstalled Keras-2.2.4
Successfully installed Keras-2.2.4


In [0]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tensorflow.keras import Input
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import TimeDistributed, Dense, Embedding, Input, Dropout, LSTM, Bidirectional, Lambda
from tensorflow.keras.layers import concatenate
# from sklearn.metrics import recall_score, precision_score
from seqeval.metrics import classification_report as classification_report_entity
from seqeval.metrics import precision_score, recall_score, f1_score
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Initialize parameters

In [3]:
# Mount Google Drive at /content/gdrive; the model folder configured in `loc` lives under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import os

# Folder on the mounted Google Drive that is joined with the model file name at load time.
loc = '/content/gdrive/My Drive/tf2.0'

# Tag vocabulary: a tag's position in the list is its integer class index.
idx2label = dict(enumerate(['B-START', 'O', 'B-OTHER', 'I-OTHER', 'I-START']))
# Reverse lookup: tag string -> class index.
label2idx = dict(zip(idx2label.values(), idx2label.keys()))
n_labels = len(idx2label)

# Load trained model

In [0]:
# File name (.h5) of the saved model; it is joined with `loc` when the model is loaded.
model_name = "fiction_lstm_elmo_v1_0504_1631_epoch30.h5"

In [6]:
from tensorflow.keras.models import load_model
import os

# Restore the saved model from <loc>/<model_name>; the prediction helpers call `model_load.predict`.
model_load = load_model(os.path.join(loc, model_name))

W0504 16:43:45.028355 140144470103936 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f7579d4d358>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.
W0504 16:43:45.040663 140144470103936 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f7579d4dc50>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.
W0504 16:43:45.044879 140144470103936 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f7578d0c518>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.
W0504 16:43:45.051020 140144470103936 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x7f75cfb28b38>: Note that this layer is not optimized for performance. Please use tf.keras.laye

# Prepare data

In [0]:
import re
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex

In [0]:
def custom_tokenizer(sent):
    """Split a sentence into word and punctuation tokens.

    The text is cut at every comma, period, question mark, colon, semicolon,
    straight or curly quote, backtick, tilde and space. Punctuation
    delimiters are kept as tokens of their own; spaces and empty fragments
    are dropped.

    Args:
        sent: The sentence string to tokenize.

    Returns:
        A list of non-empty token strings in their original order.
    """
    # Raw string: none of these delimiters needs escaping inside a character
    # class, and the previous non-raw '\.'-style escapes trigger
    # invalid-escape-sequence warnings on current Python versions.
    pieces = re.split(r'([,.?:;‘’`“”"\'~ ])', sent)
    return [piece for piece in pieces if piece and piece != ' ']

# Prediction

In [0]:
# Fixed sequence length: inputs are truncated or padded to this many items before prediction.
max_len = 10

In [0]:
def pred2label(pred):
    """Translate predicted class indices into tag strings.

    Args:
        pred: Iterable of sequences; every element of a sequence is a class
            index that is a key of ``idx2label``.

    Returns:
        A list of lists of tag strings, one inner list per input sequence.
        Any "PAD" substring inside a tag is rewritten to "O".
    """
    return [
        [idx2label[class_idx].replace("PAD", "O") for class_idx in sequence]
        for sequence in pred
    ]

In [0]:
def y2label(y):
    """Translate nested class indices into one flat tag list per sequence.

    Args:
        y: Iterable of sequences; every step of a sequence is itself an
            iterable of class indices that are keys of ``idx2label``.

    Returns:
        A list of lists of tag strings. The indices of all steps of a
        sequence are flattened, in order, into that sequence's list; any
        "PAD" substring inside a tag is rewritten to "O".
    """
    labelled = []
    for sequence in y:
        flat_tags = [
            idx2label[class_idx].replace("PAD", "O")
            for step in sequence
            for class_idx in step
        ]
        labelled.append(flat_tags)
    return labelled

In [0]:
def predict_sentence(test_sentence_, batch_size=None):
    """Tag the tokens of one sentence with the loaded model.

    The sentence is tokenized with ``custom_tokenizer``, truncated or
    right-padded with "__PAD__" to ``max_len`` tokens, repeated
    ``batch_size`` times, and passed to ``model_load.predict``. Only the
    prediction for the first copy is used.

    Args:
        test_sentence_: The raw sentence string.
        batch_size: How many copies of the padded sequence to send to the
            model. When omitted, a notebook-level ``batch_size`` variable is
            used if one exists, otherwise 1.
            NOTE(review): presumably the model expects a fixed batch size,
            which is why the input is replicated; confirm against training.

    Returns:
        A list of ``(token, tag)`` pairs. It has at most ``max_len`` entries
        and never more entries than the sentence has tokens.
    """
    if batch_size is None:
        # The previous body read a global `batch_size` that no visible cell
        # defines (NameError on a fresh kernel). Honour it when it exists so
        # sessions that do define it behave exactly as before.
        batch_size = globals().get("batch_size", 1)

    tokens = custom_tokenizer(test_sentence_)

    # Truncate to max_len tokens, or right-pad with the "__PAD__" marker.
    padded = tokens[:max_len] + ["__PAD__"] * (max_len - len(tokens))

    batch = np.array([padded] * batch_size)
    scores = model_load.predict(batch)[0]
    tag_ids = np.argmax(scores, axis=-1)
    tags = [idx2label[tag_id].replace("PAD", "O") for tag_id in tag_ids]

    # zip() stops at the shorter input, so padding positions (and any tokens
    # beyond max_len) do not appear in the result.
    return list(zip(tokens, tags))

In [0]:
from bert_embedding import BertEmbedding
# Shared embedder instance, constructed with no arguments; bertSentenceEmbedding calls it once per paragraph.
bert_embedding = BertEmbedding()

In [0]:
def bertSentenceEmbedding(x, embedder=None, embed_dim=768):
    """Embed every sentence as the mean of its token vectors.

    Args:
        x: Iterable of paragraphs; each paragraph is a list of sentence
            strings.
        embedder: Callable that takes a list of sentences and returns one
            tuple per sentence whose element at index 1 is the sequence of
            token vectors. Defaults to the notebook-level ``bert_embedding``.
        embed_dim: Length of the zero vector returned for a sentence that
            yields no token vectors. Defaults to 768, the size the previous
            accumulator was hard-coded to.

    Returns:
        A list with one entry per paragraph; each entry is a list holding
        one float64 vector per sentence of that paragraph.
    """
    if embedder is None:
        embedder = bert_embedding

    embeddings = []
    for para in x:
        para_embed = []
        # Read the token vectors straight from each result tuple instead of
        # going through np.asarray(...)[:, 1]: sentences have different token
        # counts, and recent NumPy refuses to build an array from such
        # ragged input.
        for sentence_result in embedder(para):
            token_vectors = sentence_result[1]
            if len(token_vectors) == 0:
                # No tokens (e.g. an empty string): return zeros instead of
                # failing on a division by a zero token count.
                para_embed.append(np.zeros(embed_dim))
            else:
                # float64 matches what the previous int-list accumulator
                # promoted the sum to.
                stacked = np.asarray(token_vectors, dtype=np.float64)
                para_embed.append(np.mean(stacked, axis=0))
        embeddings.append(para_embed)
    return embeddings

In [20]:
# Demo input: two paragraphs, each given as a list of sentence strings.
test_sentence = [['hi hello world joyce','hi joyce'],['hi hello joyce','joyce']]

# Bring every paragraph to exactly max_len sentences: truncate, or right-pad
# with "PAD". Explicit slicing replaces the previous bare `except:` so real
# errors are no longer swallowed.
X = [list(seq[:max_len]) + ["PAD"] * (max_len - len(seq)) for seq in test_sentence]

# One mean-pooled vector per sentence, grouped by paragraph.
res = bertSentenceEmbedding(X)

pred_num = model_load.predict(np.asarray(res))
pred = np.argmax(pred_num, axis=-1)
pred_labels = pred2label(pred)
pred_labels

[['O', 'I-OTHER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B-OTHER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]

In [0]:
def predict_para(test_para):
    """Predict one tag per sentence slot for each paragraph.

    Every paragraph is truncated or right-padded with "PAD" to ``max_len``
    sentences, embedded with ``bertSentenceEmbedding``, and passed to
    ``model_load.predict``. The highest-scoring class of each slot is mapped
    to its tag with ``pred2label``.

    Args:
        test_para: Iterable of paragraphs; each paragraph is a sequence of
            sentence strings.

    Returns:
        A list with one list of tag strings per paragraph.
    """
    # Iterate the argument: the previous body looped over the global
    # `test_sentence`, so the parameter was silently ignored.
    X = [list(seq[:max_len]) + ["PAD"] * (max_len - len(seq)) for seq in test_para]

    res = bertSentenceEmbedding(X)

    pred_num = model_load.predict(np.asarray(res))
    pred = np.argmax(pred_num, axis=-1)
    return pred2label(pred)

In [23]:
# Run the paragraph-level helper on the demo paragraphs and display the resulting tags.
predict_para(test_sentence)

[['O', 'I-OTHER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B-OTHER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]