## Step 1: Read in the data that you will be making predictions on. Let’s use this dataset containing titles, text, etc of Medium articles: https://www.kaggle.com/hsankesara/medium-articles#articles.csv.

In [None]:
# `pd` is needed here and `np` in the prediction step, but no import for either
# is visible in this notebook, so bring both in before first use.
import numpy as np
import pandas as pd

# Load the Medium articles; the CSV is decoded as Latin-1.
df = pd.read_csv("articles.csv", encoding="latin1")

## Step 2: Clean the dataset that the predictions will be made on. The column ‘text’ contains the content of each Medium article, so we will only need to clean that column. First examine the column to see which cleaning steps need to be taken. I applied the following changes:

In [None]:
# Collapse every run of whitespace into a single space.
# `regex=True` is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default, which would make this a silent no-op.
df['text'] = df['text'].str.replace(r'\s+', ' ', regex=True)

In [None]:
# Replace escaped newline/tab sequences that appear as literal text (a
# backslash followed by `n` or `t`) with a space.
# `regex=True` is required: with the pandas >= 2.0 default (regex=False) the
# pattern would be searched for verbatim, i.e. as two backslashes plus a letter.
df['text'] = df['text'].str.replace(r'\\[nt]', ' ', regex=True)

In [None]:
# Drop rows whose text is the empty string. `.copy()` makes the filtered frame
# independent, so the column assignments in later cells do not write into a
# slice of the original frame (SettingWithCopyWarning).
df = df[df['text'] != ''].copy()

In [None]:
# Strip every run of non-ASCII characters.
# `regex=True` is explicit because pandas >= 2.0 no longer interprets the
# pattern as a regular expression by default.
df['text'] = df['text'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

In [None]:
# Remove HTML comments. The match is non-greedy (`.*?`) so each comment is
# removed on its own; a greedy `.*` would also delete all article text lying
# between the first `<!--` and the last `-->`.
# `regex=True` is explicit because pandas >= 2.0 defaults to literal matching.
df['text'] = df['text'].str.replace(r'<!--.*?-->', '', regex=True)

In [None]:
# Drop rows with missing text. `.copy()` keeps the result independent of the
# unfiltered frame, so adding a column later (the "Entities" column) does not
# assign into a slice.
df = df[df['text'].notna()].copy()

## Step 3: Load the pickled tags.

In [None]:
import pickle

# Start from None, then replace it with whatever object tags.pickle holds.
tags = None
with open('tags.pickle', 'rb') as tags_file:
    tags = pickle.load(tags_file)

# Number of tags; used later to size the model's output layer.
n_tags = len(tags)

## Step 4: Read in some more of the code used in the training steps.

### Start a tensorflow session.

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K

In [None]:
# Create a TensorFlow session and register it as the session Keras uses.
sess = tf.Session()
K.set_session(sess)

### Download bi-directional LSTM model pretrained with ELMo word embeddings to learn both word (e.g., syntax and semantics) and linguistic context of a training dataset.

In [None]:
# Load the ELMo v2 module from TensorFlow Hub.
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
# Run the global-variable and table initializers in the session created earlier.
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

### Create a function that vectorizes a sequence of strings with the ELMo embedding.

In [None]:
def ElmoEmbedding(x):
    """Embed a batch of token strings with the ELMo module.

    ``x`` is cast to string and squeezed before being passed as the module's
    "tokens" input. The "sequence_len" input is a constant holding
    ``batch_size`` entries, each equal to ``max_len`` (both are module-level
    names read at call time). Returns the module's "elmo" output.
    """
    token_tensor = tf.squeeze(tf.cast(x, tf.string))
    length_tensor = tf.constant(batch_size*[max_len])
    module_outputs = elmo_model(
        inputs={"tokens": token_tensor, "sequence_len": length_tensor},
        signature="tokens",
        as_dict=True,
    )
    return module_outputs["elmo"]

## Step 5: Set the max sentence length. It will be different than in the training script.

In [None]:
# Tokens per sentence: used for the model's Input shape and for every entry of
# ELMo's sequence_len.
max_len = 200

## Step 6: Set batch size. It will be different than in the training script. It must be divisible by 32 because sequence_len is an int32 tensor with one entry per batch element, so with a batch size of 32 it has shape=(32,).

In [None]:
# Number of entries in the sequence_len constant built inside ElmoEmbedding.
batch_size = 32

### Create a residual LSTM network with an ELMo embedding layer.

In [None]:
from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda

In [None]:
# Input: max_len string tokens per row.
input_text = Input(shape=(max_len,), dtype=tf.string)
# Embed the tokens with ELMo; the declared output shape is (max_len, 1024).
embedding = Lambda(ElmoEmbedding, output_shape=(max_len, 1024))(input_text)
# First bidirectional LSTM, returning an output for every position.
x = Bidirectional(LSTM(units=512, return_sequences=True,
                       recurrent_dropout=0.2, dropout=0.2))(embedding)
# Second bidirectional LSTM stacked on the first.
x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)
x = add([x, x_rnn])  # residual connection to the first biLSTM
# Per-position softmax over the n_tags classes.
out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)

In [None]:
# Wrap the graph from input_text to out in a Keras Model.
model = Model(input_text, out)

In [None]:
# Compile with the Adam optimizer and sparse categorical cross-entropy loss,
# reporting accuracy as the metric.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

## Step 7: Load in the trained model that you previously saved.

In [None]:
# Load the saved weights from './testmodel_weights' into the compiled model.
model.load_weights('./testmodel_weights')

## Step 8: Function which adds word padding to prediction dataset. Different from training script.

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from typing import List

def add_word_padding(words: List[str], words_per_sent = 200, pad_str = '__PAD__') -> List[str]:
    """Return ``words`` fitted to exactly ``words_per_sent`` tokens.

    Sentences shorter than ``words_per_sent`` are right-padded with
    ``pad_str``. Longer sentences are truncated, so every sentence ends up
    with the same length and the rows can be stacked into a rectangular array.
    The input list is left unmodified.
    """
    limit = max(words_per_sent, 0)
    kept = list(words[:limit])
    # Honour the caller's pad token instead of a hard-coded literal.
    return kept + [pad_str] * (limit - len(kept))

## Step 9: Function which adds sentence padding to prediction dataset. 

### Each article is padded to a length of 512 sentences. Both batch size and sent_per_article must be divisible by 32 because sequence_len is an int32 tensor of shape=(32,), i.e. it expects batches of exactly 32 sentences.

### If you know that the maximum number of sentences per article is quite a bit below 512, you should definitely lower sent_per_article. Of course, the fewer the sentences, the faster the prediction will run. However, make sure that you keep sent_per_article divisible by 32, so if the longest article has 300 sentences, change sent_per_article to the next number above that which is divisible by 32 (so, in that case, 320).

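### If you know that the maximum number of words in any sentence of any article is, say, 100, lower words_per_sent to 100 (and set max_len in Step 5 to the same value, since the model input is max_len tokens wide). There is no need for words_per_sent to be divisible by a particular number.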

In [None]:
def gen_pad_sentences(sentences: List[List[str]], words_per_sent = 200, sent_per_article = 512, pad_str = '__PAD__') -> List[List[str]]:
    """Build the all-padding sentences needed to fill an article.

    Only the extra rows are returned, not ``sentences`` itself; the caller
    appends them. There are ``sent_per_article - len(sentences)`` rows (none
    if the article is already that long or longer), each consisting of
    ``words_per_sent`` copies of ``pad_str``.
    """
    num_pad_sentences = max(sent_per_article - len(sentences), 0)
    # Build a fresh list per row so the rows do not alias one shared object.
    return [[pad_str] * words_per_sent for _ in range(num_pad_sentences)]

## Step 10: Preprocessing function which splits an article into a list of sentences, then splits each sentence into a list of words, and then applies word and sentence padding.

In [None]:
def create_nn_input(article: str, words_per_sent = 200, sent_per_article = 512) -> List:
    """Turn an article into a fixed-size grid of word tokens.

    The article is split into sentences and each sentence into words. The
    result is then forced to exactly ``sent_per_article`` rows of
    ``words_per_sent`` tokens each: longer articles and sentences are
    truncated, shorter ones are padded.

    ``words_per_sent`` is expected to equal the ``max_len`` the model was
    built with -- TODO confirm whenever either value is changed.
    """
    # split article into list of sentences, keeping at most sent_per_article
    sentences: List[str] = sent_tokenize(article)[:sent_per_article]

    # split sentences into list of words
    sentences_split_into_words = [word_tokenize(sentence) for sentence in sentences]

    # Truncate before padding so an over-long sentence cannot produce a ragged
    # row, whichever padding implementation is in use.
    pad_words = [
        add_word_padding(words[:words_per_sent], words_per_sent)
        for words in sentences_split_into_words
    ]

    # add padding sentences up to sent_per_article rows
    return pad_words + gen_pad_sentences(pad_words, words_per_sent, sent_per_article)

## Step 11: Predicting function

In [None]:
def predict_article(article):
    """Tag every word of ``article`` and return the non-'O' predictions.

    The article is preprocessed with ``create_nn_input`` and run through
    ``model``. Each predicted class index is mapped to its tag via ``tags``.
    The result is a list of ``(tag, word)`` tuples, in reading order, for
    every position whose tag is not ``'O'``.
    """
    # Local import: no numpy import is visible elsewhere in this notebook.
    import numpy as np

    preprocessed_article = np.array(create_nn_input(article))
    # Pass batch_size explicitly: ElmoEmbedding builds sequence_len with
    # exactly batch_size entries, so prediction batches must have that size.
    predictions = model.predict(preprocessed_article, batch_size=batch_size)

    # picks best tag index for each word, then maps each index to its tag
    predicted_label_indices = np.argmax(predictions, axis=-1)
    convert_tags = np.vectorize(lambda x: tags[x])
    predicted_tags = convert_tags(predicted_label_indices)

    #todo: merge b and i pairs
    # pair every tag with the word at the same position
    combine_tags_words = zip(predicted_tags.flatten(), preprocessed_article.flatten())

    # keep only the pairs whose tag is not 'O'
    return [pair for pair in combine_tags_words if pair[0] != 'O']

## Step 12: Apply predicting function on each text row in dataset.

In [None]:
# Run predict_article on the 'text' value of every row of df.
prediction = df.apply(lambda row: predict_article(row['text']), axis=1)

### Now you have made the predictions and they are in lists of tuples. Each tuple contains a (tag, word) pair. Each list of tuples corresponds to a different Medium article.

## Step 13: Next we will format the predictions and add them to the predicting dataset under a new column called “Entities”. Formatting them will allow us to more clearly see the entities extracted from each Medium article.

### First convert prediction into a pandas dataframe:

In [None]:
# Wrap the predictions in a DataFrame so the formatting cells below can read
# each article's value as row[0].
prediction = pd.DataFrame(prediction)

### Make a bunch of functions that format the predictions

In [None]:
from itertools import groupby

In [None]:
def merge_tags(x):
    """Group an article's (tag, word) tuples into one flat list per entity.

    A new group starts at every tuple whose tag begins with ``'B-'``; the
    tuples that follow are appended to that group until the next ``'B-'`` tag.
    Tuples that come before the first ``'B-'`` tag form a leading group of
    their own. Each group is a flat list ``[tag, word, tag, word, ...]``.

    Only the tag (first element of each tuple) decides where groups start, so
    a word that happens to start with ``'B-'`` or equals ``'split'`` is kept
    as ordinary data.
    """
    groups = []
    for pair in x:
        tag = pair[0]
        # Open a new group at a B- tag, or for the very first tuple.
        if tag[:2] == 'B-' or not groups:
            groups.append([])
        groups[-1].extend(pair)
    return groups

In [None]:
def apply_merge_tags(x):
    """Return a new list holding ``replace_B(group)`` for every group in ``x``.

    Note that, despite its name, this function calls ``replace_B``.
    """
    return list(map(replace_B, x))

In [None]:
def replace_B(x):
    """Drop the ``'I-'`` entries from a group, retyping its first element if needed.

    Every element whose first two characters are ``'I-'`` is removed. When
    exactly two such elements were present, the first remaining element is
    replaced by ``'B'`` plus the first ``'I-'`` element without its leading
    character (e.g. ``'I-org'`` becomes ``'B-org'``). A new list is returned.
    """
    inside_entries = [entry for entry in x if entry[:2] == 'I-']
    remaining = [entry for entry in x if entry[:2] != 'I-']

    if len(inside_entries) == 2:
        remaining[0] = 'B' + inside_entries[-2][1:]

    return remaining

In [None]:
def change_label(label):
    """Return the readable name for a known ``B-`` tag, else ``label`` itself."""
    # Compared with == rather than looked up in a dict, so values that are not
    # hashable (e.g. lists) still pass through unchanged.
    readable_names = (
        ('B-geo', 'Location:'),
        ('B-gpe', 'Geopolitical Entity:'),
        ('B-org', 'Company:'),
        ('B-per', 'Person:'),
        ('B-tim', 'Time Period:'),
    )
    for tag, display_name in readable_names:
        if label == tag:
            return display_name
    return label

In [None]:
def apply_change_label(y):
    """Replace, in place, the first element of every list in ``y``.

    The new value is ``change_label`` applied to the old first element. The
    lists are mutated and nothing is returned.
    """
    for group in y:
        first_entry = group[0]
        group[0] = change_label(first_entry)

In [None]:
def join_string(y):
    """Turn each list in ``y`` into a two-element list ``[first, rest]``.

    ``first`` is the list's first element and ``rest`` is all remaining
    elements joined with single spaces.
    """
    joined = []
    for group in y:
        label = group[0]
        text = ' '.join(group[1:])
        joined.append([label, text])
    return joined

In [None]:
def merge_lists(x):
    """Group two-element items by their first element.

    Returns a ``defaultdict(list)`` mapping each distinct first element to
    the list of second elements that appeared with it, in input order.
    """
    from collections import defaultdict

    grouped = defaultdict(list)
    for pair in x:
        label, text = pair
        grouped[label].append(text)
    return grouped

### Apply the functions to each article

In [None]:
# Split each article's (tag, word) tuples into per-entity groups with merge_tags.
prediction = prediction.apply(lambda row: merge_tags(row[0]), axis=1)

In [None]:
# Wrap the result of the previous apply in a DataFrame again so the next cell
# can read each article's value as row[0].
prediction = pd.DataFrame(prediction)

In [None]:
# Run apply_merge_tags on every article; it calls replace_B on each group.
prediction = prediction.apply(lambda row: apply_merge_tags(row[0]), axis=1)

In [None]:
# Wrap the result of the previous apply in a DataFrame again so the next cell
# can read each article's value as row[0].
prediction = pd.DataFrame(prediction)

In [None]:
# NOTE(review): row[0] is already a list of groups here, so replace_B sees
# lists rather than tag strings and none of its 'I-' prefix comparisons can
# match. This step appears to return only a copy of the outer list -- confirm
# whether it is needed.
prediction = prediction.apply(lambda row: replace_B(row[0]), axis=1)

In [None]:
# Wrap the result of the previous apply in a DataFrame again so the next cell
# can read each article's value as row[0].
prediction = pd.DataFrame(prediction)

In [None]:
# NOTE(review): change_label compares its argument against tag strings, but
# row[0] is a list of groups here, so no branch matches and the value is
# returned unchanged. The per-group relabelling happens in the
# apply_change_label cell below -- confirm whether this step is needed.
prediction = prediction.apply(lambda row: change_label(row[0]), axis=1)

In [None]:
# Wrap the result of the previous apply in a DataFrame again so the next cell
# can read each article's value as row[0].
prediction = pd.DataFrame(prediction)

In [None]:
# apply_change_label rewrites the first element of each group in place and
# returns nothing, so the result of this apply is not assigned. The relabelled
# values are the mutated lists already held in prediction.
prediction.apply(lambda row: apply_change_label(row[0]), axis=1)

In [None]:
# Collapse each group to [label, "word word ..."] with join_string.
prediction = prediction.apply(lambda row: join_string(row[0]), axis=1)

In [None]:
# Wrap the result of the previous apply in a DataFrame again so the next cell
# can read each article's value as row[0].
prediction = pd.DataFrame(prediction)

### Apply final function to prediction and apply it to new column in df called Entities

In [None]:
# Group each article's [label, text] pairs by label with merge_lists and store
# the resulting mapping in a new "Entities" column of df.
df["Entities"] = prediction.apply(lambda row: merge_lists(row[0]), axis=1)

### Now you have a dataset of Medium articles with a new Entities column containing the formatted entities for each article. Each entry is a dictionary that maps a label (e.g. "Person:") to the list of entity strings found for it. These entities can be viewed on the front end by converting the dictionaries into JSON objects.