In [4]:
from keras.models import Model
from tensorflow.keras.layers import Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

### Necessary Functions

In [5]:
def get_sentences(dataset):
    n_sent = 1
    # print(dataset[0])
    grouped = dataset.groupby("sentence_idx").apply(lambda s: [(w, t) for w, t in zip(s["word"].values.tolist(),
                                                                                      s["tag"].values.tolist())])
    sentences = [s for s in grouped]

    def get_next():
        nonlocal n_sent
        try:
            s = grouped["Sentence: {}".format(n_sent)]
            n_sent += 1
            return s
        except:
            return None

    return sentences, get_next


def get_max_len(sentences):
    return max([len(s) for s in sentences])


def add_sentence_id_column(dataset):
    sentence_idx = 1
    sentence_indices = []
    for word in dataset['word']:
        sentence_indices.append(sentence_idx)
        if word == '.':
            sentence_idx += 1
    dataset.insert(0, 'sentence_idx', sentence_indices)
    return dataset

In [14]:
def encoding(data):
    # dframe = lstmFun.add_sentence_id_column(self.data)
    sentences, get_next = get_sentences(data)
    maxlen = get_max_len(sentences)
    print('Maximum sequence length:', maxlen)

    words = list(set(data["word"].values))
    words = ["ENDPAD"] + words
    n_words = len(words);

    tags = list(set(data["tag"].values))
    tags = ["O"] + tags
    n_tags = len(tags)

    word2idx = {w: i for i, w in enumerate(words)}
    tag2idx = {t: i for i, t in enumerate(tags)}

    x = [[word2idx[w[0]] for w in s] for s in sentences]
    x = pad_sequences(maxlen=maxlen, sequences=x, padding="post", value=n_words - 1)

    y = [[tag2idx[w[1]] for w in s] for s in sentences]
    y = pad_sequences(maxlen=maxlen, sequences=y, padding="post", value=tag2idx["O"])
    y = [to_categorical(i, num_classes=n_tags) for i in y]
    return x, y, maxlen, n_words, n_tags,tags,tag2idx

### encoding train and test data

In [30]:
train_data = pd.read_csv("work/data/label_data/ner_train_data.csv", encoding = "ISO-8859-1")
test_data = pd.read_csv("work/data/label_data/ner_test_data.csv", encoding = "ISO-8859-1")
x_train, y_train, train_maxlen, train_n_words, train_n_tag,train_tag,tag2idx_train = encoding(train_data)
x_test, y_test, test_maxlen, test_n_words, test_n_tag,test_tag,tag2idx_test = encoding(test_data)

Maximum sequence length: 104
Maximum sequence length: 104


### Train the model

In [8]:
from keras.layers import Reshape
input_t = Input(shape=(train_maxlen,))
model = Embedding(input_dim=train_n_words, output_dim=100, input_length=train_maxlen)(input_t)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(train_n_tag, activation="softmax"))(model)  # softmax output layer

In [9]:
model = Model(input_t, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [10]:
history = model.fit(x_train, np.array(y_train), batch_size=32, epochs=1, verbose=1)



In [12]:
print(input_t.shape)
print(x_train.shape)
print(np.array(y_train).shape)

(None, 104)
(38367, 104)
(38367, 104, 18)


In [13]:
dir = "/home/jovyan/work/data/output/LSTM_1batch"
model.save(dir)

2023-06-12 12:20:22.560483: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,100]
	 [[{{node while/Placeholder_2}}]]
2023-06-12 12:20:22.768306: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,100]
	 [[{{node while/Placeholder_2}}]]
2023-06-12 12:20:22.812983: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and 

INFO:tensorflow:Assets written to: /home/jovyan/work/data/output/LSTM_1batch/assets


INFO:tensorflow:Assets written to: /home/jovyan/work/data/output/LSTM_1batch/assets


In [17]:
# Evaluation
loss, accuracy = model.evaluate(x_test, np.array(y_test))
print("Loss: {:.4f}".format(loss))
print("Accuracy: {:.2f}%".format(accuracy * 100))

Loss: 0.0254
Accuracy: 99.27%


### Evaluation per tag

In [16]:

# Get the predicted labels
y_pred = model.predict(x_test)
y_pred_indices = np.argmax(y_pred, axis=-1)
y_train_indices = np.argmax(y_train, axis=-1)

y_true_flattened = np.argmax(y_test, axis=-1).flatten()
y_pred_flattened = y_pred_indices.flatten()
Y_train_flatten = y_train_indices.flatten()

# Get the list of unique tags
unique_tags = list(set(test_tag))


# Initialize dictionaries to store evaluation metrics per tag
precision_per_tag = {}
recall_per_tag = {}
f1_per_tag = {}
accuracy_per_tag = {}


# Calculate evaluation metrics for each tag
for tag in unique_tags:
    y_true_tag = y_true_flattened[y_true_flattened == tag]
    y_pred_tag = y_pred_flattened[y_pred_flattened == tag]
    #
    mask = (y_true_flattened == tag2idx_test[tag])
    y_true_tag = y_true_flattened[mask]
    y_pred_tag = y_pred_flattened[mask]


    # Calculate F1 score
    f1 = f1_score(y_true_tag, y_pred_tag, average='weighted')

    # Calculate accuracy
    accuracy = accuracy_score(y_true_tag, y_pred_tag)

    f1_per_tag[tag] = f1
    accuracy_per_tag[tag] = accuracy

# Print the evaluation metrics per tag
for tag in unique_tags:
    print("Tag:", tag)
    print("F1 Score:", f1_per_tag[tag])
    print("Accuracy:", accuracy_per_tag[tag])
    print()




  num = Y_train_flatten[Y_train_flatten == tag].shape
  y_true_tag = y_true_flattened[y_true_flattened == tag]
  y_pred_tag = y_pred_flattened[y_pred_flattened == tag]


(0, 3990168) O
(0, 3990168) I-nat
(0, 3990168) B-per
(0, 3990168) I-tim
(0, 3990168) B-org
(0, 3990168) B-gpe
(0, 3990168) B-art
(0, 3990168) B-tim
(0, 3990168) I-eve
(0, 3990168) I-art
(0, 3990168) B-eve
(0, 3990168) I-geo
(0, 3990168) I-org
(0, 3990168) B-nat
(0, 3990168) I-per
(0, 3990168) B-geo
(0, 3990168) I-gpe
Tag: O
F1 Score: 0.9995795267071785
Accuracy: 0.9991594068613221

Tag: I-nat
F1 Score: 0.0
Accuracy: 0.0

Tag: B-per
F1 Score: 0.8909365189383128
Accuracy: 0.8033232850525697

Tag: I-tim
F1 Score: 0.7617243012790147
Accuracy: 0.6151491966335119

Tag: B-org
F1 Score: 0.7842696629213483
Accuracy: 0.6451016635859519

Tag: B-gpe
F1 Score: 0.9566039286594887
Accuracy: 0.9168176447420244

Tag: B-art
F1 Score: 0.0
Accuracy: 0.0

Tag: B-tim
F1 Score: 0.885382798863753
Accuracy: 0.7943380004912798

Tag: I-eve
F1 Score: 0.0
Accuracy: 0.0

Tag: I-art
F1 Score: 0.0
Accuracy: 0.0

Tag: B-eve
F1 Score: 0.0
Accuracy: 0.0

Tag: I-geo
F1 Score: 0.8128980891719745
Accuracy: 0.68477531857813

  num = Y_train_flatten[Y_train_flatten == tag].shape
  y_true_tag = y_true_flattened[y_true_flattened == tag]
  y_pred_tag = y_pred_flattened[y_pred_flattened == tag]
  num = Y_train_flatten[Y_train_flatten == tag].shape
  y_true_tag = y_true_flattened[y_true_flattened == tag]
  y_pred_tag = y_pred_flattened[y_pred_flattened == tag]
  num = Y_train_flatten[Y_train_flatten == tag].shape
  y_true_tag = y_true_flattened[y_true_flattened == tag]
  y_pred_tag = y_pred_flattened[y_pred_flattened == tag]
  num = Y_train_flatten[Y_train_flatten == tag].shape
  y_true_tag = y_true_flattened[y_true_flattened == tag]
  y_pred_tag = y_pred_flattened[y_pred_flattened == tag]
  num = Y_train_flatten[Y_train_flatten == tag].shape
  y_true_tag = y_true_flattened[y_true_flattened == tag]
  y_pred_tag = y_pred_flattened[y_pred_flattened == tag]
  num = Y_train_flatten[Y_train_flatten == tag].shape
  y_true_tag = y_true_flattened[y_true_flattened == tag]
  y_pred_tag = y_pred_flattened[y_pred_flattened 

#### Read models and  Compair two models with different epoch

In [31]:
import tensorflow as tf
lstm_1_epochs = 'work/data/output/LSTM_1batch'
model1e = tf.keras.models.load_model(lstm_1_epochs)


In [32]:
loss, accuracy = model1e.evaluate(x_test, np.array(y_test))
print("Loss: {:.4f}".format(loss))
print("Accuracy: {:.2f}%".format(accuracy * 100))

Loss: 0.0254
Accuracy: 99.27%


In [29]:
lstm_40_epochs = 'work/data/output/LSTM_'
model40e = tf.keras.models.load_model(lstm_40_epochs)
loss, accuracy = model1e.evaluate(x_test, np.array(y_test))
print("Loss: {:.4f}".format(loss))
print("Accuracy: {:.2f}%".format(accuracy * 100))


Loss: 0.0254
Accuracy: 99.27%


In [38]:
    # accuracy_per_tag[tag] = accuracy
# number_sample_train={}
# for tag in unique_tags:
tag_count = train_data["tag"].value_counts()
print(tag_count)

tag
O        711031
B-geo     29980
B-tim     16284
B-org     16230
I-per     13806
B-per     13601
I-org     13469
B-gpe     12695
I-geo      5964
I-tim      5228
B-art       316
B-eve       248
I-art       239
I-eve       200
I-gpe       159
B-nat       151
I-nat        39
Name: count, dtype: int64
