<a href="https://colab.research.google.com/github/pedrofuentes79/RNNs/blob/master/Named-Entity-Recognition/ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional, Masking, TimeDistributed
from tensorflow.keras.optimizers import Adam

from google.colab import drive

!pip install fasttext




In [5]:
drive.mount("/content/drive")


# Read the dataset as text
with open("/content/drive/MyDrive/ColabProjects/ner/wikigold/wikigold.conll.txt", 'r', encoding='utf-8') as file:
    dataset_text = file.read()

# Pre-process the dataset to remove trailing "\n" characters
lines = dataset_text.strip().split('\n')
lines = [line.rstrip('\n') for line in lines]

data = []
for line in lines:
    row = line.split(' ')
    data.append(row)

df = pd.DataFrame(data, columns=['Token', 'Entity'])


Mounted at /content/drive


In [6]:
# remove nan
df = df.dropna().reset_index(drop=True)

# Split "double words". these words are two words joined by a - character. I want them in separated rows.
# Example: "New-York" -> ["New", "York"]
df['Token'] = df['Token'].apply(lambda x: x.split('-') if '-' in x else x)

# Explode the dataframe to have one word per row. This will separate all elements of rows that have a list.
df["Token"] = df["Token"].explode().reset_index(drop=True)


In [7]:
# drop all "DOCSTART" tokens
df = df[~df['Token'].isin(['-DOCSTART-', 'DOCSTART'])]

# Drop all "" values
df = df[df['Token'] != ''].reset_index(drop=True)


In [8]:
# Set Labels

# map the labels to numbers
labels = df["Entity"].unique()
label_to_index = {}
index_to_label = {}
for i, label in enumerate(labels):
    label_to_index[label] = i
    index_to_label[i] = label

df["Label"] = df["Entity"].map(label_to_index)
df["Label"] = df["Label"].apply(lambda x: int(x)+1)

In [9]:
# I think this assigns one number to each word.
def get_dict_map(data):
    tok2idx = {}
    idx2tok = {}

    vocab = list(set(data.to_list()))

    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok


token2idx, idx2token = get_dict_map(df["Token"])

df["Embeddings"] = df["Token"].map(token2idx)

In [10]:
# Create Sentence # Column

# This function will count the number of sentences in the dataset.
def get_sentence_number(data):
    sentence_number = 1
    sentence_number_list = []
    for word in data:
        if word == ".":
            sentence_number += 1
        sentence_number_list.append(sentence_number)
    return sentence_number_list

df["Sentence #"] = get_sentence_number(df["Token"])
df.head()

Unnamed: 0,Token,Entity,Label,Embeddings,Sentence #
0,010,I-MISC,1,2269,1
1,is,O,2,6995,1
2,the,O,2,2607,1
3,tenth,O,2,5306,1
4,album,O,2,7979,1


In [11]:
# Remove every row that is a "." in the token column
df = df[df["Token"] != "."].reset_index(drop=True)

In [12]:
# group by sentence number and join tokens into a sentence
df_grouped = df.groupby('Sentence #').agg({'Token': list, 'Label': list, "Embeddings": list}).reset_index()
df_grouped.head()

Unnamed: 0,Sentence #,Token,Label,Embeddings
0,1,"[010, is, the, tenth, album, from, Japanese, P...","[1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 3, 3, 3, 3]","[2269, 6995, 2607, 5306, 7979, 4992, 3269, 803..."
1,2,"[This, album, proved, to, be, more, commercial...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, ...","[5651, 7979, 7804, 4190, 444, 7491, 4841, 1840..."
2,3,"[Founding, member, Kojima, Minoru, played, gui...","[4, 4, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, ...","[2366, 3270, 5984, 5958, 549, 574, 4526, 2016,..."
3,4,"[XXX, can, of, This, had, a, different, meanin...","[1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[6627, 7293, 7034, 5651, 3578, 6695, 4689, 642..."
4,5,"[it, was, later, explained, that, the, song, w...","[2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, ...","[6161, 4692, 6080, 1116, 1235, 2607, 6754, 469..."


In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_grouped['Token'].values)
word_index = tokenizer.word_index

In [15]:
# FastText word embeddings
import fasttext
import numpy as np

# Load the trained FastText model
model = fasttext.load_model('/content/drive/MyDrive/ColabProjects/ner/FastText.bin')

# Create the embedding matrix
embedding_matrix = np.zeros((len(token2idx)+1, 100))
for word, i in word_index.items():
    embedding_matrix[i] = model.get_word_vector(word)

# Create another "Embeddings_FastText" column
df_grouped["Embeddings_FastText"] = df_grouped["Token"].apply(lambda x: [embedding_matrix[token2idx[word]] for word in x])



In [16]:
n_words = len(set(df['Token'].explode().to_list()))+1 # why the +1?
output_dim = 100
input_length = max(df_grouped['Token'].apply(len))
n_classes = len(set(df['Label'].to_list()))+1

print("Number of words: ", n_words)
print("Output dimensions: ", output_dim)
print("Input length: ", input_length)
print("Number of classes: ", n_classes)

Number of words:  8263
Output dimensions:  100
Input length:  144
Number of classes:  6


In [17]:
from tensorflow.keras.utils import to_categorical

# One-Hot encode the labels
y = df_grouped['Label'].values
y = [to_categorical(i, num_classes=n_classes) for i in y]

In [18]:
# PAD sequences
X = pad_sequences(df_grouped["Embeddings_FastText"].values, maxlen=input_length, padding='post', value=0)
X = np.squeeze(X) #?

In [19]:
# Pad the sequences so that the new ones look like this: [1,0,0,0,0,0]
for i in range(len(X)):
    for j in range(len(X[i])):
        # check if the token is a padding token
        if np.all(X[i][j]== 0) and len(y[i]) < 144:
            y[i] = np.append(y[i], [[1,0,0,0,0,0]], axis=0)


    #if X[i][j] and len(y[i]) < 144:
    #y[i] = np.append(y[i], [[1,0,0,0,0,0]], axis=0)

y = np.array(y)


In [20]:
# Train Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.3)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, shuffle=True, test_size=0.5)

In [26]:
# define the model
rnn = Sequential()

rnn.add(Masking(mask_value=0.0, input_shape=(input_length, output_dim)))

rnn.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.5)))

rnn.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

rnn.add(TimeDistributed(Dense(n_classes, activation="relu")))

adam = Adam(beta_1=0.9, beta_2=0.999)

rnn.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])