<a href="https://colab.research.google.com/github/pedrofuentes79/RNNs/blob/master/Named-Entity-Recognition/ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional, Masking, TimeDistributed
from tensorflow.keras.optimizers import Adam

from google.colab import drive

!pip install fasttext




In [2]:
drive.mount("/content/drive")


# Read the dataset as text
with open("/content/drive/MyDrive/ColabProjects/ner/wikigold/wikigold.conll.txt", 'r', encoding='utf-8') as file:
    dataset_text = file.read()

# Pre-process the dataset to remove trailing "\n" characters
lines = dataset_text.strip().split('\n')
lines = [line.rstrip('\n') for line in lines]

data = []
for line in lines:
    row = line.split(' ')
    data.append(row)

df = pd.DataFrame(data, columns=['Token', 'Entity'])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# remove nan
df = df.dropna().reset_index(drop=True)

# Split "double words". these words are two words joined by a - character. I want them in separated rows.
# Example: "New-York" -> ["New", "York"]
df['Token'] = df['Token'].apply(lambda x: x.split('-') if '-' in x else x)

# Explode the dataframe to have one word per row. This will separate all elements of rows that have a list.
df["Token"] = df["Token"].explode().reset_index(drop=True)


In [4]:
# drop all "DOCSTART" tokens
df = df[~df['Token'].isin(['-DOCSTART-', 'DOCSTART'])]

# Drop all "" values
df = df[df['Token'] != ''].reset_index(drop=True)


In [5]:
# Set Labels

# map the labels to numbers
labels = df["Entity"].unique()
label_to_index = {}
index_to_label = {}
for i, label in enumerate(labels):
    label_to_index[label] = i
    index_to_label[i] = label

df["Label"] = df["Entity"].map(label_to_index)


In [6]:
# I think this assigns one number to each word.
def get_dict_map(data):
    tok2idx = {}
    idx2tok = {}

    vocab = list(set(data.to_list()))

    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok


token2idx, idx2token = get_dict_map(df["Token"])

df["Embeddings"] = df["Token"].map(token2idx)

In [7]:
# Create Sentence # Column

# This function will count the number of sentences in the dataset.
def get_sentence_number(data):
    sentence_number = 1
    sentence_number_list = []
    for word in data:
        if word == ".":
            sentence_number += 1
        sentence_number_list.append(sentence_number)
    return sentence_number_list

df["Sentence #"] = get_sentence_number(df["Token"])
df.head()

Unnamed: 0,Token,Entity,Label,Embeddings,Sentence #
0,010,I-MISC,0,3041,1
1,is,O,1,859,1
2,the,O,1,6936,1
3,tenth,O,1,7635,1
4,album,O,1,1978,1


In [8]:
# Remove every row that is a "." in the token column
df = df[df["Token"] != "."].reset_index(drop=True)

In [9]:
# group by sentence number and join tokens into a sentence
df_grouped = df.groupby('Sentence #').agg({'Token': list, 'Label': list, "Embeddings": list}).reset_index()
df_grouped.head()

Unnamed: 0,Sentence #,Token,Label,Embeddings
0,1,"[010, is, the, tenth, album, from, Japanese, P...","[0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 2, 2, 2]","[3041, 859, 6936, 7635, 1978, 5591, 1547, 6380..."
1,2,"[This, album, proved, to, be, more, commercial...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ...","[539, 1978, 4283, 5061, 5575, 6392, 1036, 8145..."
2,3,"[Founding, member, Kojima, Minoru, played, gui...","[3, 3, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, ...","[8259, 7294, 3631, 4764, 6248, 5734, 2858, 188..."
3,4,"[XXX, can, of, This, had, a, different, meanin...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4750, 1951, 8193, 539, 5282, 3507, 5387, 2521..."
4,5,"[it, was, later, explained, that, the, song, w...","[1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, ...","[5717, 1939, 7653, 7578, 3771, 6936, 5045, 193..."


In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_grouped['Token'].values)
word_index = tokenizer.word_index

In [11]:
# FastText word embeddings
import fasttext
import numpy as np

# Load the trained FastText model
model = fasttext.load_model('/content/drive/MyDrive/ColabProjects/ner/FastText.bin')

# Create the embedding matrix
embedding_matrix = np.zeros((len(token2idx)+1, 100))
for word, i in word_index.items():
    embedding_matrix[i] = model.get_word_vector(word)




In [12]:
n_words = len(set(df['Token'].explode().to_list()))+1 # why the +1?
output_dim = 100
input_length = max(df_grouped['Token'].apply(len))
n_classes = len(set(df['Label'].to_list()))

print("Number of words: ", n_words)
print("Output dimensions: ", output_dim)
print("Input length: ", input_length)
print("Number of classes: ", n_classes)


Number of words:  8263
Output dimensions:  100
Input length:  144
Number of classes:  5


In [13]:
from tensorflow.keras.utils import to_categorical

X = df_grouped["Token"].apply(lambda x: pad_sequences(
                                                    tokenizer.texts_to_sequences(x),
                                                    maxlen=max(df_grouped['Token'].apply(len)),
                                                    padding="post",
                                                    truncating="post"))



In [14]:
# One-Hot encode the labels
y = df_grouped['Label'].values
y = [to_categorical(i, num_classes=n_classes) for i in y]

for i in range(len(y)):
    while len(y[i]) < input_length: # 144
        y[i] = np.append(y[i], [[1, 0, 0, 0, 0]], axis=0)



y = np.array(y)


In [15]:
# Train Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.3)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, shuffle=True, test_size=0.5)

In [16]:
# define the model
rnn = Sequential()

rnn.add(Masking(mask_value=0.0, input_shape=(input_length,)))

rnn.add(Embedding(
    input_dim=len(embedding_matrix),
    output_dim=output_dim,  # Dimension of FastText embeddings (100)
    weights=[embedding_matrix],
    input_length=input_length,
    trainable=False  # Set to False to keep embeddings fixed
))

rnn.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.5)))

rnn.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

rnn.add(TimeDistributed(Dense(n_classes, activation="relu")))

adam = Adam(beta_1=0.9, beta_2=0.999)

rnn.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])