In [38]:
from keras.models import Model
from tensorflow.keras.layers import Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

In [39]:
def get_sentences(dataset):
    n_sent = 1
    # print(dataset[0])
    grouped = dataset.groupby("sentence_idx").apply(lambda s: [(w, t) for w, t in zip(s["word"].values.tolist(),
                                                                                      s["tag"].values.tolist())])
    sentences = [s for s in grouped]

    def get_next():
        nonlocal n_sent
        try:
            s = grouped["Sentence: {}".format(n_sent)]
            n_sent += 1
            return s
        except:
            return None

    return sentences, get_next


def get_max_len(sentences):
    return max([len(s) for s in sentences])


def add_sentence_id_column(dataset):
    sentence_idx = 1
    sentence_indices = []
    for word in dataset['word']:
        sentence_indices.append(sentence_idx)
        if word == '.':
            sentence_idx += 1
    dataset.insert(0, 'sentence_idx', sentence_indices)
    return dataset

In [40]:
def encoding(data):
    # dframe = lstmFun.add_sentence_id_column(self.data)
    sentences, get_next = get_sentences(data)
    maxlen = get_max_len(sentences)
    print('Maximum sequence length:', maxlen)

    words = list(set(data["word"].values))
    words = ["ENDPAD"] + words
    n_words = len(words);

    tags = list(set(data["tag"].values))
    tags = ["O"] + tags
    n_tags = len(tags)

    word2idx = {w: i for i, w in enumerate(words)}
    tag2idx = {t: i for i, t in enumerate(tags)}

    x = [[word2idx[w[0]] for w in s] for s in sentences]
    x = pad_sequences(maxlen=maxlen, sequences=x, padding="post", value=n_words - 1)

    y = [[tag2idx[w[1]] for w in s] for s in sentences]
    y = pad_sequences(maxlen=maxlen, sequences=y, padding="post", value=tag2idx["O"])
    y = [to_categorical(i, num_classes=n_tags) for i in y]
    return x, y, maxlen, n_words, n_tags

In [41]:
train_data = pd.read_csv("work/data/label_data/ner_train_data.csv", encoding = "ISO-8859-1")
test_data = pd.read_csv("work/data/label_data/ner_test_data.csv", encoding = "ISO-8859-1")
x_train, y_train, train_maxlen, train_n_words, train_n_tag = encoding(train_data)
x_test, y_test, test_maxlen, test_n_words, test_tag = encoding(test_data)

Maximum sequence length: 104
Maximum sequence length: 104


In [58]:
from keras.layers import Reshape

input_t = Input(shape=(train_maxlen,))
model = Embedding(input_dim=train_n_words, output_dim=100, input_length=train_maxlen)(input_t)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(train_n_tag, activation="softmax"))(model)  # softmax output layer

In [59]:
model = Model(input_t, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [60]:
history = model.fit(x_train, np.array(y_train), batch_size=32, epochs=1, verbose=1)



In [61]:
print(input.shape)
print(x_train.shape)
print(np.array(y_train).shape)

(None, 104)
(38367, 104)
(38367, 104, 18)


In [62]:
dir = "/home/jovyan/work/data/output/yek"
model.save(dir)

2023-06-11 22:13:58.383638: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,100]
	 [[{{node while/Placeholder_2}}]]
2023-06-11 22:13:58.568318: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,100]
	 [[{{node while/Placeholder_2}}]]
2023-06-11 22:13:58.610676: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and 

INFO:tensorflow:Assets written to: /home/jovyan/work/data/output/yek/assets


INFO:tensorflow:Assets written to: /home/jovyan/work/data/output/yek/assets
