In [19]:
with open("dataset/keps_keyword-extraction-prosa/train_preprocess.txt", 'r') as f:
    lines = f.readlines()

doc = []
sent = []
for e in lines:
    if e != "\n":
        split_sent = e.split("\t")
        label = split_sent[1].split("\n")[0]
        sent.append((split_sent[0], label))
    else:
        doc.append(sent)
        sent = []
doc.append(sent)

with open("dataset/keps_keyword-extraction-prosa/valid_preprocess.txt", 'r') as f:
    lines = f.readlines()

validation_doc = []
sent = []
for e in lines:
    if e != "\n":
        split_sent = e.split("\t")
        label = split_sent[1].split("\n")[0]
        sent.append((split_sent[0], label))
    else:
        validation_doc.append(sent)
        sent = []
validation_doc.append(sent)

print(validation_doc[1])

with open("dataset/keps_keyword-extraction-prosa/test_preprocess_masked_label.txt", 'r') as f:
    lines = f.readlines()

test_doc = []
sent = []
for e in lines:
    if e != "\n":
        split_sent = e.split("\t")
        label = split_sent[1].split("\n")[0]
        sent.append((split_sent[0], label))
    else:
        test_doc.append(sent)
        sent = []
test_doc.append(sent)


[('admin', 'O'), ('@halobca', 'B'), ('kok', 'O'), ('susah', 'B'), ('dihubungi', 'B'), ('ya', 'O'), ('apa', 'O'), ('sedang', 'O'), ('gangguan', 'B')]


In [20]:
X_train = [] # store input sequence
Y_train = [] # store output sequence
for sentence in doc:
    X_sentence = []
    Y_sentence = []
    for entity in sentence: 
        X_sentence.append(entity[0]) # entity[0] contains the word
        Y_sentence.append(entity[1]) # entity[1] contains corresponding tag
 
    X_train.append(X_sentence)
    Y_train.append(Y_sentence)

X_validation = [] # store input sequence
Y_validation = [] # store output sequence
for sentence in validation_doc:
    X_sentence = []
    Y_sentence = []
    for entity in sentence: 
        X_sentence.append(entity[0]) # entity[0] contains the word
        Y_sentence.append(entity[1]) # entity[1] contains corresponding tag
 
    X_validation.append(X_sentence)
    Y_validation.append(Y_sentence)

X_test = [] # store input sequence
Y_test = [] # store output sequence
for sentence in test_doc:
    X_sentence = []
    Y_sentence = []
    for entity in sentence: 
        X_sentence.append(entity[0]) # entity[0] contains the word
        Y_sentence.append(entity[1]) # entity[1] contains corresponding tag
 
    X_test.append(X_sentence)
    Y_test.append(Y_sentence)

num_words = len(set([word.lower() for sentence in X_train for word in sentence]))
num_tags   = len(set([word.lower() for sentence in Y_train for word in sentence]))
print("Total number of tagged sentences: {}".format(len(X_train + X_validation)))
print("Vocabulary size: {}".format(num_words))
print("Total number of tags: {}".format(num_tags))

Total number of tagged sentences: 1002
Vocabulary size: 1843
Total number of tags: 3


In [21]:
print("sample X: ", X_train[0], "\n")
print("sample Y: ", Y_train[0], "\n")

sample X:  ['Setelah', 'melalui', 'proses', 'telepon', 'yang', 'panjang', 'tutup', 'sudah', 'kartu', 'kredit', 'bca', 'Ribet'] 

sample Y:  ['O', 'B', 'B', 'I', 'O', 'O', 'B', 'O', 'B', 'I', 'I', 'B'] 



In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer

# encode X and Y
word_tokenizer = Tokenizer() # instantiate tokeniser
word_tokenizer.fit_on_texts(X_train+X_validation+X_test) # fit tokeniser on data
X_train_encoded = word_tokenizer.texts_to_sequences(X_train)  # use the tokeniser to encode input sequence

tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(Y_train+Y_validation+Y_test)
Y_train_encoded = tag_tokenizer.texts_to_sequences(Y_train)

X_validation_encoded = word_tokenizer.texts_to_sequences(X_validation)
Y_validation_encoded = tag_tokenizer.texts_to_sequences(Y_validation)
X_test_encoded = word_tokenizer.texts_to_sequences(X_test)
Y_test_encoded = tag_tokenizer.texts_to_sequences(Y_test)

In [5]:
# sentence vector padding

from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_SEQ_LENGTH = 39  # sequences greater than maxlenght in length will be truncated
X_train_padded = pad_sequences(X_train_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
Y_train_padded = pad_sequences(Y_train_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")

X_validation_padded = pad_sequences(X_validation_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
Y_validation_padded = pad_sequences(Y_validation_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")

X_test_padded = pad_sequences(X_test_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
Y_test_padded = pad_sequences(Y_test_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")

In [6]:
# print the first sequence
print(X_train_padded[0], "\n")
print(Y_train_padded[0])
print(doc[0])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0 354  97 248  56   6 282  49   9  19
  23   1 114] 

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 2 3 1 1 2 1 2 3
 3 2]
[('Setelah', 'O'), ('melalui', 'B'), ('proses', 'B'), ('telepon', 'I'), ('yang', 'O'), ('panjang', 'O'), ('tutup', 'B'), ('sudah', 'O'), ('kartu', 'B'), ('kredit', 'I'), ('bca', 'I'), ('Ribet', 'B')]


RNN will learn the zero to zero mapping while training. So we don't need to worry about the padded zeroes. Please note that zero is not reserved for any word or tag, it's only reserved for padding.

In [7]:
# assign padded sequences to X and Y
X_train, Y_train = X_train_padded, Y_train_padded
X_validation, Y_validation = X_validation_padded, Y_validation_padded
X_test, Y_test = X_test_padded, Y_test_padded

In [8]:
# user pretrained word2vec model
# https://github.com/deryrahman/word2vec-bahasa-indonesia

import gensim

path = 'idwiki_word2vec_100.model'
word2vec = gensim.models.word2vec.Word2Vec.load(path)

FileNotFoundError: [Errno 2] No such file or directory: 'idwiki_word2vec_100.model'

In [None]:
word2vec.most_similar('bca')


In [7]:
# assign word vectors from word2vec model

import numpy as np

EMBEDDING_SIZE  = 100  # each word in word2vec model is represented using a 100 dimensional vector
VOCABULARY_SIZE = len(word_tokenizer.word_index) + 1

# create an empty embedding matix
embedding_weights = np.zeros((VOCABULARY_SIZE, EMBEDDING_SIZE))

# create a word to index dictionary mapping
word2id = word_tokenizer.word_index

# copy vectors from word2vec model to the words present in corpus
for word, index in word2id.items():
    try:
        embedding_weights[index, :] = word2vec[word]
    except KeyError:
        pass

# check embedding dimension
print("Embeddings shape: {}".format(embedding_weights.shape))

NameError: name 'word2vec' is not defined

In [None]:
# let's look at an embedding of a word
embedding_weights[word_tokenizer.word_index['bca']]

In [9]:
Y_train

[['O', 'B', 'B', 'I', 'O', 'O', 'B', 'O', 'B', 'I', 'I', 'B'],
 ['B',
  'O',
  'O',
  'B',
  'B',
  'I',
  'O',
  'O',
  'O',
  'B',
  'I',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'B', 'B', 'I', 'O', 'B'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O'],
 ['B', 'O', 'B', 'B', 'B', 'I'],
 ['B', 'B', 'O', 'B'],
 ['O', 'B', 'B', 'B', 'O', 'B', 'I'],
 ['O', 'B', 'B', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'B', 'B', 'B'],
 ['O', 'O', 'B', 'B', 'B', 'O', 'O', 'B'],
 ['O', 'B', 'B', 'B', 'I'],
 ['O', 'B', 'I', 'O', 'B', 'B', 'I', 'B', 'O', 'B', 'I', 'O', 'O'],
 ['B', 'I', 'B', 'I', 'O', 'B'],
 ['O', 'O', 'O', 'B', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'B', 'B', 'I', 'I', 'B', 'O'],
 ['B', 'I', 'I', 'B', 'O'],
 ['O', 'O', 'O', 'B', 'O'],
 ['B', 'B', 'B', 'O', 'B', 'O', 'B', 'I'],
 ['O', 'B', 'B', 'O', 'O', 'B', 'B', 'O', 'B', 'I'],
 ['O', 'B', 'O', 'B', 'B', 'O', 'B', 'I', 'I'],
 ['O', 'B', 'B', 'B', 'I', 'I', 'O', 'O

In [8]:
# use Keras' to_categorical function to one-hot encode Y
from tensorflow.keras.utils import to_categorical

Y_train = to_categorical(Y_train)
Y_validation = to_categorical(Y_validation)
Y_test = to_categorical(Y_test)
print(Y_train.shape)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (801,) + inhomogeneous part.

In [None]:
import tensorflow.keras.backend as K

def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, TimeDistributed


# total number of tags
NUM_CLASSES = Y_train.shape[2]

# create vanilla RNN architecture
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=VOCABULARY_SIZE, # vocabulary size - number of unique words in data
                        output_dim=EMBEDDING_SIZE, # length of vector with which each word is represented
                        input_length=MAX_SEQ_LENGTH, # length of input sequence
                        trainable=False # False - don't update the embeddings
))

# add an RNN layer which contains 64 RNN cells
rnn_model.add(SimpleRNN(64, 
              return_sequences=True  # True - return whole sequence; False - return single output of the end of the sequence
))

# add time distributed (output at each sequence) layer
rnn_model.add(TimeDistributed(Dense(NUM_CLASSES, activation='softmax')))

rnn_model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=[get_f1])
rnn_model.summary()


In [None]:
rnn_training = rnn_model.fit(X_train, Y_train, batch_size=128, epochs=100, validation_data=(X_validation, Y_validation))

In [None]:
# visualise training history
from matplotlib import pyplot as plt

plt.plot(rnn_training.history['get_f1'])
plt.plot(rnn_training.history['val_get_f1'])
plt.title('model f1')
plt.ylabel('f1')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc="lower right")
plt.show()

In [None]:
# LSTM architecture
from tensorflow.keras.layers import LSTM

lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=VOCABULARY_SIZE,
                         output_dim=EMBEDDING_SIZE,
                         input_length=MAX_SEQ_LENGTH,
                         weights=[embedding_weights],
                         trainable=True 
))
lstm_model.add(LSTM(64, return_sequences=True))
lstm_model.add(TimeDistributed(Dense(NUM_CLASSES, activation='softmax')))
lstm_model.compile(loss='categorical_crossentropy',
                   optimizer='adam',
                   metrics=[get_f1])
lstm_model.summary()


In [None]:
lstm_training = lstm_model.fit(X_train, Y_train, batch_size=128, epochs=20, validation_data=(X_validation, Y_validation))

In [None]:
# visualise training history
from matplotlib import pyplot as plt

plt.plot(lstm_training.history['get_f1'])
plt.plot(lstm_training.history['val_get_f1'])
plt.title('model f1')
plt.ylabel('f1')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc="lower right")
plt.show()

In [None]:
# Save the model
from tensorflow.keras.models import save_model, load_model

filepath = './lstm_saved_model'
save_model(lstm_model, filepath)

filepath = './rnn_saved_model'
save_model(rnn_model, filepath)

In [None]:
# decode back the one hot vector category to OBI format

predictions = lstm_model.predict(X_test)
i2label_dict = {
    0: "-",
    1: "O",
    2: "B",
    3: "I"
}
all_labels = []
for j, sentence in enumerate(predictions):
    label = []
    for w_label in sentence:
        max = -1
        maxi = 0
        for i, labelval in enumerate(w_label):
            if i == 0:
                continue
            if labelval > max:
                max = labelval
                maxi = i
        if maxi != 0:
            label.append(i2label_dict[maxi])
    leng = len(test_doc[j])
    all_labels.append(label[-leng:])

In [None]:
import csv 
    
# field names 
fields = ['index', 'label'] 
    
# data rows of csv file 
rows = [[i, str(e)] for i, e in enumerate(all_labels)] 
    
# name of csv file 
filename = "pred.txt"
    
# writing to csv file 
with open(filename, 'w') as csvfile: 
    # creating a csv writer object 
    csvwriter = csv.writer(csvfile) 
        
    # writing the fields 
    csvwriter.writerow(fields) 

    # writing the data rows 
    csvwriter.writerows(rows)
