<a href="https://colab.research.google.com/github/pedrofuentes79/Algoritmos-y-Estructuras-de-Datos-UBA/blob/master/Named-Entity-Recognition/ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional


In [None]:
# NOTE(review): `drive` is not imported in the import cell at the top of this
# notebook; on Colab it normally comes from `google.colab` -- confirm it is in
# scope before this cell runs on a fresh kernel.
drive.mount("/content/drive")


# Load the whole CoNLL file into memory as a single string.
with open("/content/drive/MyDrive/ColabProjects/ner/wikigold/wikigold.conll.txt", 'r', encoding='utf-8') as file:
    dataset_text = file.read()

# Trim leading/trailing whitespace from the file, then cut it into lines.
# Splitting on '\n' already removes every newline, so no per-line stripping
# is needed afterwards.
lines = dataset_text.strip().split('\n')

# Each line is split on single spaces into its fields; the frame below labels
# the first two fields as Token and Entity.
data = [line.split(' ') for line in lines]

df = pd.DataFrame(data, columns=['Token', 'Entity'])


Mounted at /content/drive


In [None]:
# Drop rows in which either column is missing and renumber the rows 0..n-1.
df = df.dropna().reset_index(drop=True)

# Split "double words": tokens joined by a '-' character become one row per part.
# Example: "New-York" -> ["New", "York"]
# A token that starts or ends with '-' also yields empty-string parts
# (e.g. "-DOCSTART-" -> ["", "DOCSTART", ""]).
df['Token'] = df['Token'].str.split('-')

# Explode the whole frame, not just the Token column: every part of a split
# token gets its own row and keeps the Entity of the token it came from.
# Exploding only the column and assigning it back would align on the old index,
# shifting tokens against their entities and silently dropping the overflow.
df = df.explode('Token').reset_index(drop=True)


In [None]:
# check the label types
# (only rendered by the notebook when it is the last expression of a cell)
df["Entity"].value_counts()

# Map the entity labels to integer ids, in order of first appearance.
# Ids start at 1 so that 0 stays free as the "empty" value, and both
# dictionaries use exactly the ids stored in df["Label"]: index_to_label is the
# true inverse of the column and can be used directly to decode label ids.
labels = df["Entity"].unique()
label_to_index = {label: i for i, label in enumerate(labels, start=1)}
index_to_label = {i: label for label, i in label_to_index.items()}

df["Label"] = df["Entity"].map(label_to_index)

In [None]:
# Collect the row labels of every document-start marker. Both spellings are
# matched: the raw "-DOCSTART-" and the bare "DOCSTART".
is_docstart = df['Token'].isin(['-DOCSTART-', 'DOCSTART'])
docstart_positions = df.index[is_docstart.to_numpy()].tolist()
print(docstart_positions)

[142, 576, 1888, 2163, 3770, 3886, 4855, 5838, 5914, 6044, 6098, 6209, 6245, 6557, 7036, 7074, 7212, 7349, 7559, 7646, 8794, 8962, 9797, 11087, 11126, 11653, 11998, 12031, 12122, 12219, 12951, 12966, 13218, 13233, 13306, 13460, 13475, 13710, 14033, 14073, 14230, 14584, 15018, 15343, 15402, 15626, 15694, 15734, 17619, 17781, 17893, 18534, 18852, 18919, 18995, 19026, 19079, 19089, 19200, 19252, 19293, 19308, 20711, 20933, 21088, 21416, 21680, 21945, 21988, 22046, 22136, 22383, 22811, 22872, 23373, 23767, 23922, 24192, 24319, 24439, 25018, 25194, 25206, 25283, 25482, 25623, 25693, 25741, 25912, 26296, 26746, 26791, 26861, 26876, 26934, 27023, 27774, 27852, 27909, 28327, 28410, 28486, 28674, 29202, 29349, 29400, 29467, 29556, 29666, 29755, 29787, 30064, 30353, 30839, 30899, 31072, 31367, 31447, 32192, 32619, 32669, 32978, 33014, 33905, 34798, 34858, 35706, 35950, 35967, 36180, 36597, 36986, 37145, 37250, 37401, 37414, 37501, 37534, 37698, 37793, 38555, 38886, 39133, 39142, 39151]


In [None]:
# Create a new column listing the group number for each token.
# The counter starts at 0 and advances by one on every row whose index is in
# docstart_positions, so each group spans the rows from one marker up to (but
# not including) the next.
# A cumulative sum over the boolean "is a marker row" mask gives that running
# count in one vectorised pass instead of a Python loop over every row.
sentence_ids = df.index.isin(docstart_positions).cumsum()

# Stored as float64 so existing consumers of this column see the same dtype
# as before.
df['Sentence #'] = sentence_ids.astype(float)

In [None]:
# Remove rows that carry no real token: both spellings of the document-start
# marker and empty strings. Then renumber the remaining rows 0..n-1.
unwanted_tokens = ['-DOCSTART-', 'DOCSTART', '']
df = df[~df['Token'].isin(unwanted_tokens)].reset_index(drop=True)


In [None]:
def get_dict_map(data):
    """Build token <-> index lookup tables for the distinct values in ``data``.

    Indices start at 0 and are assigned in order of first appearance, so the
    mapping is identical on every run. (Iterating a ``set`` of strings is not
    stable across interpreter sessions, which would make the ids irreproducible.)

    Parameters
    ----------
    data : iterable of hashable
        The tokens, e.g. a pandas Series or a plain list. Duplicates are allowed.

    Returns
    -------
    tuple of (dict, dict)
        ``tok2idx`` maps each distinct token to its index; ``idx2tok`` is the
        inverse mapping.
    """
    # Series expose to_list(); any other iterable is consumed as-is.
    values = data.to_list() if hasattr(data, "to_list") else data

    # dict.fromkeys de-duplicates while keeping first-seen order.
    vocab = list(dict.fromkeys(values))

    tok2idx = {tok: idx for idx, tok in enumerate(vocab)}
    idx2tok = {idx: tok for idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok


# Build the vocabulary lookups from the token column.
token2idx, idx2token = get_dict_map(df["Token"])

# Store each token's integer id next to it. Despite the column name, these are
# vocabulary indices, not embedding vectors.
df["Embeddings"] = df["Token"].map(token2idx.get)
df.head()

In [None]:
# Collapse the token-level frame to one row per 'Sentence #' value: each of the
# three columns below is gathered into a Python list for its group.
list_columns = dict.fromkeys(['Token', 'Label', 'Embeddings'], list)
df_grouped = df.groupby('Sentence #').agg(list_columns).reset_index()

Unnamed: 0,Sentence #,Token,Label
0,0.0,"[010, is, the, tenth, album, from, Japanese, P...","[0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 2, 2, 2, 1, ..."
1,1.0,"[The, 139th, was, formed, at, Camp, Howe, ,, n...","[1, 2, 1, 1, 1, 4, 4, 1, 1, 4, 1, 1, 1, 1, 1, ..."
2,2.0,"[The, 1896, Atlantic, City, rail, crash, occur...","[1, 1, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,3.0,"[The, 2007, Bowling, Green, Falcons, football,...","[1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, ..."
4,4.0,"[30, Seconds, to, Mars, (, or, Thirty, Seconds...","[2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, ..."
...,...,...,...
140,140.0,"[EZ2DJ, is, a, series, of, music, video, games...","[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 2, ..."
141,141.0,"[The, Fairfax, Connector, is, a, public, bus, ...","[1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 4, 4, 1, 4, 1, ..."
142,142.0,"[Fairmont, State, University, is, a, public, u...","[2, 2, 2, 1, 1, 1, 1, 1, 1, 4, 1, 4, 4, 1, 1, ..."
143,143.0,"[A, list, of, works, by, Justin, Broadrick, .]","[1, 1, 1, 1, 1, 3, 3, 1]"


In [None]:
# Fit a Keras Tokenizer on the per-group token lists and keep its vocabulary.
# NOTE(review): the Tokenizer is created with default options; if the default
# lower-casing applies to these token lists, this vocabulary is case-folded
# while token2idx is case-sensitive -- confirm which one the model should use.
token_lists = df_grouped['Token'].values
tokenizer = Tokenizer()
tokenizer.fit_on_texts(token_lists)
word_index = tokenizer.word_index

In [None]:
# Length of the longest token list across all groups.
input_length = max(len(tokens) for tokens in df_grouped['Token'])
# Number of distinct token ids, plus one extra slot.
input_dim = len(set(df['Embeddings'])) + 1
# Hard-coded size (100) reported below as the output dimension.
output_dim = 100
# Number of distinct label ids, plus one extra slot.
n_classes = len(set(df['Label'])) + 1

# Report the four values, one per line.
for caption, value in (
    ("Input dimensions: ", input_dim),
    ("Output dimensions: ", output_dim),
    ("Input length: ", input_length),
    ("Number of classes: ", n_classes),
):
    print(caption, value)

In [None]:
# TRAIN TEST SPLIT

In [None]:
# DEFINE MODEL

# Bidirectional LSTM tagger: a frozen embedding layer, one BiLSTM that returns
# an output per time step, and a softmax over the label ids at each step.
# NOTE(review): `embedding_dim`, `embedding_matrix` and `max_seq_length` are not
# defined in the cells visible above -- confirm they are created before this
# cell runs, otherwise it raises NameError on a fresh kernel.
rnn = Sequential()
rnn.add(Embedding(
    input_dim=len(word_index) + 1,  # one row more than the tokenizer vocabulary
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_seq_length,
    trainable=False))  # the supplied embedding weights are not updated in training

rnn.add(Bidirectional(LSTM(units=128, return_sequences=True)))

# Label ids run from 1 to len(labels), with 0 reserved as the empty value, so
# the softmax needs n_classes (= len(labels) + 1) units. With only len(labels)
# units the highest label id would fall outside the range that
# sparse_categorical_crossentropy accepts.
rnn.add(Dense(n_classes, activation='softmax'))

rnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])