In [1]:
import numpy as np
import logging
from gensim.models import Word2Vec
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from nergrit_non_contextual_embeddings import NergritNonContextualEmbeddings


app = NergritNonContextualEmbeddings()

In [2]:
app.init_data(ignore_case=True)

saving result to cache '.cache/nergrit:train_df.pkl'
saving result to cache '.cache/nergrit:validation_df.pkl'
saving result to cache '.cache/nergrit:test_series.pkl'
saving result to cache '.cache/nergrit:test_sentence_series.pkl'
saving result to cache '.cache/nergrit:uncased_vocab_series.pkl'


In [3]:
w2vmodel = app.build_word2vec_model()
words = app.get_words()

saving result to cache '.cache/nergrit-non-contextual-embeddings:vocab_words.pkl'
saving result to cache '.cache/nergrit-non-contextual-embeddings:word2vec_model.pkl'
using cached result from '.cache/nergrit-non-contextual-embeddings:vocab_words.pkl'


In [4]:
X_train, y_train = app.build_Xy_train(w2vmodel)
X_train

saving result to cache '.cache/nergrit-non-contextual-embeddings:word2vec_train.pkl'


[[0.002105476800352335,
  -0.006237909663468599,
  0.0029140478000044823,
  0.004091741517186165,
  -0.000488733930978924,
  0.004350798204541206,
  -0.003914132714271545,
  -0.005875679664313793,
  -0.009151228703558445,
  0.00020552166097331792,
  0.004056098870933056,
  0.007063276134431362,
  0.003701130859553814,
  0.008776221424341202,
  0.0019351186929270625,
  -0.005401507951319218,
  0.003991950303316116,
  -0.002034802921116352,
  0.004072307143360376,
  -0.006925111636519432,
  -0.007616884540766478,
  -0.004861050751060247,
  -0.008193422108888626,
  -0.0034995097666978836,
  -0.0018212193390354514,
  -0.00047369764070026577,
  0.00753423385322094,
  -0.003060092218220234,
  -0.0015402584103867412,
  -0.0038482456002384424,
  -0.003992485348135233,
  0.005828313063830137,
  0.0028188624419271946,
  0.006583340931683779,
  -0.0070097423158586025,
  0.005737274419516325,
  0.005780327599495649,
  -0.0052564130164682865,
  0.007105777971446514,
  -0.008502118289470673,
  -0.00

In [5]:
from tensorflow.keras.utils import to_categorical

X_train, y_train = app.build_Xy_train(w2vmodel)
X_validation, y_validation = app.build_Xy_validation(w2vmodel)
le = app.build_label_encoder()

def reshape(X):
    m, n = np.shape(X)
    return np.reshape(X, (m,n,1))

def convert_categorical(Y):
    return to_categorical(Y, len(set(Y)))

X_train = reshape(X_train)
y_train = convert_categorical(y_train)

X_validation = reshape(X_validation)
y_validation = convert_categorical(y_validation)

print(np.shape(X_train), np.shape(y_train))
print(np.shape(X_validation), np.shape(y_validation))

using cached result from '.cache/nergrit-non-contextual-embeddings:word2vec_train.pkl'
saving result to cache '.cache/nergrit-non-contextual-embeddings:word2vec_validation.pkl'
(322644, 100, 1) (322644, 7)
(6983, 100, 1) (6983, 7)


In [6]:
le.inverse_transform([0,1,2,3,4,5,6])

array(['B-ORGANISATION', 'B-PERSON', 'B-PLACE', 'I-ORGANISATION',
       'I-PERSON', 'I-PLACE', 'O'], dtype='<U14')

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, SimpleRNN, TimeDistributed, Dense
from tensorflow.keras import models

model = Sequential()
model.add(Embedding(
    input_dim=len(app.uncased_vocab_series),
))
model.add(LSTM(32, activation='tanh', input_shape = X_train.shape[1:]))
model.add(Dense(7, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# model = models.load_model("lstm.model")

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 32)          3200      
_________________________________________________________________
lstm_4 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_4 (Dense)              (None, 7)                 231       
Total params: 11,751
Trainable params: 11,751
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.fit(
    X_train,
    y_train,
    batch_size=512,
    epochs=1,
    validation_data=(X_validation, y_validation),
)



<tensorflow.python.keras.callbacks.History at 0x7f9c914b0910>

In [17]:
model.save("lstm.model")

2022-03-27 15:51:06.574117: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: lstm.model/assets


INFO:tensorflow:Assets written to: lstm.model/assets


In [18]:
y_pred = model.predict_classes(X_validation)



y_pred = rnn_model.predict(X_validation)

In [19]:
pred_labels = le.inverse_transform(y_pred)

In [20]:

X_validation, y_validation = app.build_Xy_validation(w2vmodel)
np.shape(y_validation)
y_validation

using cached result from '.cache/nergrit-non-contextual-embeddings:word2vec_validation.pkl'


array([6, 6, 6, ..., 6, 6, 6])

In [21]:
accuracy = accuracy_score(y_validation, y_pred)
precision = precision_score(y_validation, y_pred, average='macro')
recall = recall_score(y_validation, y_pred, average='macro')
f1 = f1_score(y_validation, y_pred, average='macro')

print("Accuracy = {0:.2f}".format(accuracy))
print("Precision = {0:.2f}".format(precision))
print("Recall = {0:.2f}".format(recall))
print("F1 = {0:.2f}".format(f1))

Accuracy = 0.02
Precision = 0.00
Recall = 0.14
F1 = 0.00


  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
y_test_preds = []

for X_test in app.build_X_sentences_test_iter(w2vmodel):
    X_test = reshape(X_test)
    y_test = model.predict_classes(X_test)
    test_labels = le.inverse_transform(y_test)
    y_test_preds.append(test_labels)



In [23]:
import pandas as pd
import numpy as np
np.set_string_function(lambda x: repr(list(x)), repr=False)
np.set_printoptions(linewidth=np.inf)

result_df = pd.DataFrame({'label': y_test_preds}).reset_index()
result_df.to_csv('pred.txt', index=False)

In [24]:
le.inverse_transform(model.predict_classes(reshape([w2vmodel.wv['karangasem']])))

array(['I-ORGANISATION'], dtype='<U14')